xref: /illumos-gate/usr/src/uts/common/fs/zfs/metaslab.c (revision c4ab0d3f46036e85ad0700125c5a83cc139f55a3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25  * Copyright (c) 2014 Integros [integros.com]
26  * Copyright (c) 2017, Intel Corporation.
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/dmu.h>
31 #include <sys/dmu_tx.h>
32 #include <sys/space_map.h>
33 #include <sys/metaslab_impl.h>
34 #include <sys/vdev_impl.h>
35 #include <sys/zio.h>
36 #include <sys/spa_impl.h>
37 #include <sys/zfeature.h>
38 #include <sys/vdev_indirect_mapping.h>
39 #include <sys/zap.h>
40 
41 #define	GANG_ALLOCATION(flags) \
42 	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
43 
44 uint64_t metaslab_aliquot = 512ULL << 10;
45 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
46 
47 /*
48  * Since we can touch multiple metaslabs (and their respective space maps)
49  * with each transaction group, we benefit from having a smaller space map
50  * block size since it allows us to issue more I/O operations scattered
51  * around the disk.
52  */
53 int zfs_metaslab_sm_blksz = (1 << 12);
54 
55 /*
56  * The in-core space map representation is more compact than its on-disk form.
57  * The zfs_condense_pct determines how much more compact the in-core
58  * space map representation must be before we compact it on-disk.
59  * Values should be greater than or equal to 100.
60  */
61 int zfs_condense_pct = 200;
62 
63 /*
64  * Condensing a metaslab is not guaranteed to actually reduce the amount of
65  * space used on disk. In particular, a space map uses data in increments of
66  * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
67  * same number of blocks after condensing. Since the goal of condensing is to
68  * reduce the number of IOPs required to read the space map, we only want to
69  * condense when we can be sure we will reduce the number of blocks used by the
70  * space map. Unfortunately, we cannot precisely compute whether or not this is
71  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
72  * we apply the following heuristic: do not condense a spacemap unless the
73  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
74  * blocks.
75  */
76 int zfs_metaslab_condense_block_threshold = 4;
77 
78 /*
79  * The zfs_mg_noalloc_threshold defines which metaslab groups should
80  * be eligible for allocation. The value is defined as a percentage of
81  * free space. Metaslab groups that have more free space than
82  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
83  * a metaslab group's free space is less than or equal to the
84  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
85  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
86  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
87  * groups are allowed to accept allocations. Gang blocks are always
88  * eligible to allocate on any metaslab group. The default value of 0 means
89  * no metaslab group will be excluded based on this criterion.
90  */
91 int zfs_mg_noalloc_threshold = 0;
92 
93 /*
94  * Metaslab groups are considered eligible for allocations if their
95  * fragmenation metric (measured as a percentage) is less than or equal to
96  * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
97  * then it will be skipped unless all metaslab groups within the metaslab
98  * class have also crossed this threshold.
99  */
100 int zfs_mg_fragmentation_threshold = 85;
101 
102 /*
103  * Allow metaslabs to keep their active state as long as their fragmentation
104  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
105  * active metaslab that exceeds this threshold will no longer keep its active
106  * status allowing better metaslabs to be selected.
107  */
108 int zfs_metaslab_fragmentation_threshold = 70;
109 
110 /*
111  * When set will load all metaslabs when pool is first opened.
112  */
113 int metaslab_debug_load = 0;
114 
115 /*
116  * When set will prevent metaslabs from being unloaded.
117  */
118 int metaslab_debug_unload = 0;
119 
120 /*
121  * Minimum size which forces the dynamic allocator to change
122  * it's allocation strategy.  Once the space map cannot satisfy
123  * an allocation of this size then it switches to using more
124  * aggressive strategy (i.e search by size rather than offset).
125  */
126 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
127 
128 /*
129  * The minimum free space, in percent, which must be available
130  * in a space map to continue allocations in a first-fit fashion.
131  * Once the space map's free space drops below this level we dynamically
132  * switch to using best-fit allocations.
133  */
134 int metaslab_df_free_pct = 4;
135 
136 /*
137  * A metaslab is considered "free" if it contains a contiguous
138  * segment which is greater than metaslab_min_alloc_size.
139  */
140 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
141 
142 /*
143  * Percentage of all cpus that can be used by the metaslab taskq.
144  */
145 int metaslab_load_pct = 50;
146 
147 /*
148  * Determines how many txgs a metaslab may remain loaded without having any
149  * allocations from it. As long as a metaslab continues to be used we will
150  * keep it loaded.
151  */
152 int metaslab_unload_delay = TXG_SIZE * 2;
153 
154 /*
155  * Max number of metaslabs per group to preload.
156  */
157 int metaslab_preload_limit = SPA_DVAS_PER_BP;
158 
159 /*
160  * Enable/disable preloading of metaslab.
161  */
162 boolean_t metaslab_preload_enabled = B_TRUE;
163 
164 /*
165  * Enable/disable fragmentation weighting on metaslabs.
166  */
167 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
168 
169 /*
170  * Enable/disable lba weighting (i.e. outer tracks are given preference).
171  */
172 boolean_t metaslab_lba_weighting_enabled = B_TRUE;
173 
174 /*
175  * Enable/disable metaslab group biasing.
176  */
177 boolean_t metaslab_bias_enabled = B_TRUE;
178 
179 /*
180  * Enable/disable remapping of indirect DVAs to their concrete vdevs.
181  */
182 boolean_t zfs_remap_blkptr_enable = B_TRUE;
183 
184 /*
185  * Enable/disable segment-based metaslab selection.
186  */
187 boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
188 
189 /*
190  * When using segment-based metaslab selection, we will continue
191  * allocating from the active metaslab until we have exhausted
192  * zfs_metaslab_switch_threshold of its buckets.
193  */
194 int zfs_metaslab_switch_threshold = 2;
195 
196 /*
197  * Internal switch to enable/disable the metaslab allocation tracing
198  * facility.
199  */
200 boolean_t metaslab_trace_enabled = B_TRUE;
201 
202 /*
203  * Maximum entries that the metaslab allocation tracing facility will keep
204  * in a given list when running in non-debug mode. We limit the number
205  * of entries in non-debug mode to prevent us from using up too much memory.
206  * The limit should be sufficiently large that we don't expect any allocation
207  * to every exceed this value. In debug mode, the system will panic if this
208  * limit is ever reached allowing for further investigation.
209  */
210 uint64_t metaslab_trace_max_entries = 5000;
211 
212 static uint64_t metaslab_weight(metaslab_t *);
213 static void metaslab_set_fragmentation(metaslab_t *);
214 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
215 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
216 static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
217 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
218 
219 kmem_cache_t *metaslab_alloc_trace_cache;
220 
221 /*
222  * ==========================================================================
223  * Metaslab classes
224  * ==========================================================================
225  */
226 metaslab_class_t *
227 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
228 {
229 	metaslab_class_t *mc;
230 
231 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
232 
233 	mc->mc_spa = spa;
234 	mc->mc_rotor = NULL;
235 	mc->mc_ops = ops;
236 	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
237 	mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
238 	    sizeof (zfs_refcount_t), KM_SLEEP);
239 	mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
240 	    sizeof (uint64_t), KM_SLEEP);
241 	for (int i = 0; i < spa->spa_alloc_count; i++)
242 		zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);
243 
244 	return (mc);
245 }
246 
247 void
248 metaslab_class_destroy(metaslab_class_t *mc)
249 {
250 	ASSERT(mc->mc_rotor == NULL);
251 	ASSERT(mc->mc_alloc == 0);
252 	ASSERT(mc->mc_deferred == 0);
253 	ASSERT(mc->mc_space == 0);
254 	ASSERT(mc->mc_dspace == 0);
255 
256 	for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
257 		zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
258 	kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
259 	    sizeof (zfs_refcount_t));
260 	kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
261 	    sizeof (uint64_t));
262 	mutex_destroy(&mc->mc_lock);
263 	kmem_free(mc, sizeof (metaslab_class_t));
264 }
265 
266 int
267 metaslab_class_validate(metaslab_class_t *mc)
268 {
269 	metaslab_group_t *mg;
270 	vdev_t *vd;
271 
272 	/*
273 	 * Must hold one of the spa_config locks.
274 	 */
275 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
276 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
277 
278 	if ((mg = mc->mc_rotor) == NULL)
279 		return (0);
280 
281 	do {
282 		vd = mg->mg_vd;
283 		ASSERT(vd->vdev_mg != NULL);
284 		ASSERT3P(vd->vdev_top, ==, vd);
285 		ASSERT3P(mg->mg_class, ==, mc);
286 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
287 	} while ((mg = mg->mg_next) != mc->mc_rotor);
288 
289 	return (0);
290 }
291 
292 static void
293 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
294     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
295 {
296 	atomic_add_64(&mc->mc_alloc, alloc_delta);
297 	atomic_add_64(&mc->mc_deferred, defer_delta);
298 	atomic_add_64(&mc->mc_space, space_delta);
299 	atomic_add_64(&mc->mc_dspace, dspace_delta);
300 }
301 
302 uint64_t
303 metaslab_class_get_alloc(metaslab_class_t *mc)
304 {
305 	return (mc->mc_alloc);
306 }
307 
308 uint64_t
309 metaslab_class_get_deferred(metaslab_class_t *mc)
310 {
311 	return (mc->mc_deferred);
312 }
313 
314 uint64_t
315 metaslab_class_get_space(metaslab_class_t *mc)
316 {
317 	return (mc->mc_space);
318 }
319 
320 uint64_t
321 metaslab_class_get_dspace(metaslab_class_t *mc)
322 {
323 	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
324 }
325 
326 void
327 metaslab_class_histogram_verify(metaslab_class_t *mc)
328 {
329 	spa_t *spa = mc->mc_spa;
330 	vdev_t *rvd = spa->spa_root_vdev;
331 	uint64_t *mc_hist;
332 	int i;
333 
334 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
335 		return;
336 
337 	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
338 	    KM_SLEEP);
339 
340 	for (int c = 0; c < rvd->vdev_children; c++) {
341 		vdev_t *tvd = rvd->vdev_child[c];
342 		metaslab_group_t *mg = tvd->vdev_mg;
343 
344 		/*
345 		 * Skip any holes, uninitialized top-levels, or
346 		 * vdevs that are not in this metalab class.
347 		 */
348 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
349 		    mg->mg_class != mc) {
350 			continue;
351 		}
352 
353 		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
354 			mc_hist[i] += mg->mg_histogram[i];
355 	}
356 
357 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
358 		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
359 
360 	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
361 }
362 
363 /*
364  * Calculate the metaslab class's fragmentation metric. The metric
365  * is weighted based on the space contribution of each metaslab group.
366  * The return value will be a number between 0 and 100 (inclusive), or
367  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
368  * zfs_frag_table for more information about the metric.
369  */
370 uint64_t
371 metaslab_class_fragmentation(metaslab_class_t *mc)
372 {
373 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
374 	uint64_t fragmentation = 0;
375 
376 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
377 
378 	for (int c = 0; c < rvd->vdev_children; c++) {
379 		vdev_t *tvd = rvd->vdev_child[c];
380 		metaslab_group_t *mg = tvd->vdev_mg;
381 
382 		/*
383 		 * Skip any holes, uninitialized top-levels,
384 		 * or vdevs that are not in this metalab class.
385 		 */
386 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
387 		    mg->mg_class != mc) {
388 			continue;
389 		}
390 
391 		/*
392 		 * If a metaslab group does not contain a fragmentation
393 		 * metric then just bail out.
394 		 */
395 		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
396 			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
397 			return (ZFS_FRAG_INVALID);
398 		}
399 
400 		/*
401 		 * Determine how much this metaslab_group is contributing
402 		 * to the overall pool fragmentation metric.
403 		 */
404 		fragmentation += mg->mg_fragmentation *
405 		    metaslab_group_get_space(mg);
406 	}
407 	fragmentation /= metaslab_class_get_space(mc);
408 
409 	ASSERT3U(fragmentation, <=, 100);
410 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
411 	return (fragmentation);
412 }
413 
414 /*
415  * Calculate the amount of expandable space that is available in
416  * this metaslab class. If a device is expanded then its expandable
417  * space will be the amount of allocatable space that is currently not
418  * part of this metaslab class.
419  */
420 uint64_t
421 metaslab_class_expandable_space(metaslab_class_t *mc)
422 {
423 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
424 	uint64_t space = 0;
425 
426 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
427 	for (int c = 0; c < rvd->vdev_children; c++) {
428 		uint64_t tspace;
429 		vdev_t *tvd = rvd->vdev_child[c];
430 		metaslab_group_t *mg = tvd->vdev_mg;
431 
432 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
433 		    mg->mg_class != mc) {
434 			continue;
435 		}
436 
437 		/*
438 		 * Calculate if we have enough space to add additional
439 		 * metaslabs. We report the expandable space in terms
440 		 * of the metaslab size since that's the unit of expansion.
441 		 * Adjust by efi system partition size.
442 		 */
443 		tspace = tvd->vdev_max_asize - tvd->vdev_asize;
444 		if (tspace > mc->mc_spa->spa_bootsize) {
445 			tspace -= mc->mc_spa->spa_bootsize;
446 		}
447 		space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
448 	}
449 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
450 	return (space);
451 }
452 
453 static int
454 metaslab_compare(const void *x1, const void *x2)
455 {
456 	const metaslab_t *m1 = (const metaslab_t *)x1;
457 	const metaslab_t *m2 = (const metaslab_t *)x2;
458 
459 	int sort1 = 0;
460 	int sort2 = 0;
461 	if (m1->ms_allocator != -1 && m1->ms_primary)
462 		sort1 = 1;
463 	else if (m1->ms_allocator != -1 && !m1->ms_primary)
464 		sort1 = 2;
465 	if (m2->ms_allocator != -1 && m2->ms_primary)
466 		sort2 = 1;
467 	else if (m2->ms_allocator != -1 && !m2->ms_primary)
468 		sort2 = 2;
469 
470 	/*
471 	 * Sort inactive metaslabs first, then primaries, then secondaries. When
472 	 * selecting a metaslab to allocate from, an allocator first tries its
473 	 * primary, then secondary active metaslab. If it doesn't have active
474 	 * metaslabs, or can't allocate from them, it searches for an inactive
475 	 * metaslab to activate. If it can't find a suitable one, it will steal
476 	 * a primary or secondary metaslab from another allocator.
477 	 */
478 	if (sort1 < sort2)
479 		return (-1);
480 	if (sort1 > sort2)
481 		return (1);
482 
483 	int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
484 	if (likely(cmp))
485 		return (cmp);
486 
487 	IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
488 
489 	return (AVL_CMP(m1->ms_start, m2->ms_start));
490 }
491 
492 uint64_t
493 metaslab_allocated_space(metaslab_t *msp)
494 {
495 	return (msp->ms_allocated_space);
496 }
497 
498 /*
499  * Verify that the space accounting on disk matches the in-core range_trees.
500  */
501 static void
502 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
503 {
504 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
505 	uint64_t allocating = 0;
506 	uint64_t sm_free_space, msp_free_space;
507 
508 	ASSERT(MUTEX_HELD(&msp->ms_lock));
509 	ASSERT(!msp->ms_condensing);
510 
511 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
512 		return;
513 
514 	/*
515 	 * We can only verify the metaslab space when we're called
516 	 * from syncing context with a loaded metaslab that has an
517 	 * allocated space map. Calling this in non-syncing context
518 	 * does not provide a consistent view of the metaslab since
519 	 * we're performing allocations in the future.
520 	 */
521 	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
522 	    !msp->ms_loaded)
523 		return;
524 
525 	/*
526 	 * Even though the smp_alloc field can get negative (e.g.
527 	 * see vdev_checkpoint_sm), that should never be the case
528 	 * when it come's to a metaslab's space map.
529 	 */
530 	ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
531 
532 	sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
533 
534 	/*
535 	 * Account for future allocations since we would have
536 	 * already deducted that space from the ms_allocatable.
537 	 */
538 	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
539 		allocating +=
540 		    range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
541 	}
542 
543 	ASSERT3U(msp->ms_deferspace, ==,
544 	    range_tree_space(msp->ms_defer[0]) +
545 	    range_tree_space(msp->ms_defer[1]));
546 
547 	msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
548 	    msp->ms_deferspace + range_tree_space(msp->ms_freed);
549 
550 	VERIFY3U(sm_free_space, ==, msp_free_space);
551 }
552 
553 /*
554  * ==========================================================================
555  * Metaslab groups
556  * ==========================================================================
557  */
558 /*
559  * Update the allocatable flag and the metaslab group's capacity.
560  * The allocatable flag is set to true if the capacity is below
561  * the zfs_mg_noalloc_threshold or has a fragmentation value that is
562  * greater than zfs_mg_fragmentation_threshold. If a metaslab group
563  * transitions from allocatable to non-allocatable or vice versa then the
564  * metaslab group's class is updated to reflect the transition.
565  */
566 static void
567 metaslab_group_alloc_update(metaslab_group_t *mg)
568 {
569 	vdev_t *vd = mg->mg_vd;
570 	metaslab_class_t *mc = mg->mg_class;
571 	vdev_stat_t *vs = &vd->vdev_stat;
572 	boolean_t was_allocatable;
573 	boolean_t was_initialized;
574 
575 	ASSERT(vd == vd->vdev_top);
576 	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
577 	    SCL_ALLOC);
578 
579 	mutex_enter(&mg->mg_lock);
580 	was_allocatable = mg->mg_allocatable;
581 	was_initialized = mg->mg_initialized;
582 
583 	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
584 	    (vs->vs_space + 1);
585 
586 	mutex_enter(&mc->mc_lock);
587 
588 	/*
589 	 * If the metaslab group was just added then it won't
590 	 * have any space until we finish syncing out this txg.
591 	 * At that point we will consider it initialized and available
592 	 * for allocations.  We also don't consider non-activated
593 	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
594 	 * to be initialized, because they can't be used for allocation.
595 	 */
596 	mg->mg_initialized = metaslab_group_initialized(mg);
597 	if (!was_initialized && mg->mg_initialized) {
598 		mc->mc_groups++;
599 	} else if (was_initialized && !mg->mg_initialized) {
600 		ASSERT3U(mc->mc_groups, >, 0);
601 		mc->mc_groups--;
602 	}
603 	if (mg->mg_initialized)
604 		mg->mg_no_free_space = B_FALSE;
605 
606 	/*
607 	 * A metaslab group is considered allocatable if it has plenty
608 	 * of free space or is not heavily fragmented. We only take
609 	 * fragmentation into account if the metaslab group has a valid
610 	 * fragmentation metric (i.e. a value between 0 and 100).
611 	 */
612 	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
613 	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
614 	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
615 	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
616 
617 	/*
618 	 * The mc_alloc_groups maintains a count of the number of
619 	 * groups in this metaslab class that are still above the
620 	 * zfs_mg_noalloc_threshold. This is used by the allocating
621 	 * threads to determine if they should avoid allocations to
622 	 * a given group. The allocator will avoid allocations to a group
623 	 * if that group has reached or is below the zfs_mg_noalloc_threshold
624 	 * and there are still other groups that are above the threshold.
625 	 * When a group transitions from allocatable to non-allocatable or
626 	 * vice versa we update the metaslab class to reflect that change.
627 	 * When the mc_alloc_groups value drops to 0 that means that all
628 	 * groups have reached the zfs_mg_noalloc_threshold making all groups
629 	 * eligible for allocations. This effectively means that all devices
630 	 * are balanced again.
631 	 */
632 	if (was_allocatable && !mg->mg_allocatable)
633 		mc->mc_alloc_groups--;
634 	else if (!was_allocatable && mg->mg_allocatable)
635 		mc->mc_alloc_groups++;
636 	mutex_exit(&mc->mc_lock);
637 
638 	mutex_exit(&mg->mg_lock);
639 }
640 
641 metaslab_group_t *
642 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
643 {
644 	metaslab_group_t *mg;
645 
646 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
647 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
648 	mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
649 	cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
650 	mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
651 	    KM_SLEEP);
652 	mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
653 	    KM_SLEEP);
654 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
655 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
656 	mg->mg_vd = vd;
657 	mg->mg_class = mc;
658 	mg->mg_activation_count = 0;
659 	mg->mg_initialized = B_FALSE;
660 	mg->mg_no_free_space = B_TRUE;
661 	mg->mg_allocators = allocators;
662 
663 	mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
664 	    sizeof (zfs_refcount_t), KM_SLEEP);
665 	mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
666 	    sizeof (uint64_t), KM_SLEEP);
667 	for (int i = 0; i < allocators; i++) {
668 		zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
669 		mg->mg_cur_max_alloc_queue_depth[i] = 0;
670 	}
671 
672 	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
673 	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
674 
675 	return (mg);
676 }
677 
678 void
679 metaslab_group_destroy(metaslab_group_t *mg)
680 {
681 	ASSERT(mg->mg_prev == NULL);
682 	ASSERT(mg->mg_next == NULL);
683 	/*
684 	 * We may have gone below zero with the activation count
685 	 * either because we never activated in the first place or
686 	 * because we're done, and possibly removing the vdev.
687 	 */
688 	ASSERT(mg->mg_activation_count <= 0);
689 
690 	taskq_destroy(mg->mg_taskq);
691 	avl_destroy(&mg->mg_metaslab_tree);
692 	kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
693 	kmem_free(mg->mg_secondaries, mg->mg_allocators *
694 	    sizeof (metaslab_t *));
695 	mutex_destroy(&mg->mg_lock);
696 	mutex_destroy(&mg->mg_ms_initialize_lock);
697 	cv_destroy(&mg->mg_ms_initialize_cv);
698 
699 	for (int i = 0; i < mg->mg_allocators; i++) {
700 		zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
701 		mg->mg_cur_max_alloc_queue_depth[i] = 0;
702 	}
703 	kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
704 	    sizeof (zfs_refcount_t));
705 	kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
706 	    sizeof (uint64_t));
707 
708 	kmem_free(mg, sizeof (metaslab_group_t));
709 }
710 
711 void
712 metaslab_group_activate(metaslab_group_t *mg)
713 {
714 	metaslab_class_t *mc = mg->mg_class;
715 	metaslab_group_t *mgprev, *mgnext;
716 
717 	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
718 
719 	ASSERT(mc->mc_rotor != mg);
720 	ASSERT(mg->mg_prev == NULL);
721 	ASSERT(mg->mg_next == NULL);
722 	ASSERT(mg->mg_activation_count <= 0);
723 
724 	if (++mg->mg_activation_count <= 0)
725 		return;
726 
727 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
728 	metaslab_group_alloc_update(mg);
729 
730 	if ((mgprev = mc->mc_rotor) == NULL) {
731 		mg->mg_prev = mg;
732 		mg->mg_next = mg;
733 	} else {
734 		mgnext = mgprev->mg_next;
735 		mg->mg_prev = mgprev;
736 		mg->mg_next = mgnext;
737 		mgprev->mg_next = mg;
738 		mgnext->mg_prev = mg;
739 	}
740 	mc->mc_rotor = mg;
741 }
742 
743 /*
744  * Passivate a metaslab group and remove it from the allocation rotor.
745  * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
746  * a metaslab group. This function will momentarily drop spa_config_locks
747  * that are lower than the SCL_ALLOC lock (see comment below).
748  */
749 void
750 metaslab_group_passivate(metaslab_group_t *mg)
751 {
752 	metaslab_class_t *mc = mg->mg_class;
753 	spa_t *spa = mc->mc_spa;
754 	metaslab_group_t *mgprev, *mgnext;
755 	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
756 
757 	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
758 	    (SCL_ALLOC | SCL_ZIO));
759 
760 	if (--mg->mg_activation_count != 0) {
761 		ASSERT(mc->mc_rotor != mg);
762 		ASSERT(mg->mg_prev == NULL);
763 		ASSERT(mg->mg_next == NULL);
764 		ASSERT(mg->mg_activation_count < 0);
765 		return;
766 	}
767 
768 	/*
769 	 * The spa_config_lock is an array of rwlocks, ordered as
770 	 * follows (from highest to lowest):
771 	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
772 	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
773 	 * (For more information about the spa_config_lock see spa_misc.c)
774 	 * The higher the lock, the broader its coverage. When we passivate
775 	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
776 	 * config locks. However, the metaslab group's taskq might be trying
777 	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
778 	 * lower locks to allow the I/O to complete. At a minimum,
779 	 * we continue to hold the SCL_ALLOC lock, which prevents any future
780 	 * allocations from taking place and any changes to the vdev tree.
781 	 */
782 	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
783 	taskq_wait(mg->mg_taskq);
784 	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
785 	metaslab_group_alloc_update(mg);
786 	for (int i = 0; i < mg->mg_allocators; i++) {
787 		metaslab_t *msp = mg->mg_primaries[i];
788 		if (msp != NULL) {
789 			mutex_enter(&msp->ms_lock);
790 			metaslab_passivate(msp,
791 			    metaslab_weight_from_range_tree(msp));
792 			mutex_exit(&msp->ms_lock);
793 		}
794 		msp = mg->mg_secondaries[i];
795 		if (msp != NULL) {
796 			mutex_enter(&msp->ms_lock);
797 			metaslab_passivate(msp,
798 			    metaslab_weight_from_range_tree(msp));
799 			mutex_exit(&msp->ms_lock);
800 		}
801 	}
802 
803 	mgprev = mg->mg_prev;
804 	mgnext = mg->mg_next;
805 
806 	if (mg == mgnext) {
807 		mc->mc_rotor = NULL;
808 	} else {
809 		mc->mc_rotor = mgnext;
810 		mgprev->mg_next = mgnext;
811 		mgnext->mg_prev = mgprev;
812 	}
813 
814 	mg->mg_prev = NULL;
815 	mg->mg_next = NULL;
816 }
817 
818 boolean_t
819 metaslab_group_initialized(metaslab_group_t *mg)
820 {
821 	vdev_t *vd = mg->mg_vd;
822 	vdev_stat_t *vs = &vd->vdev_stat;
823 
824 	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
825 }
826 
827 uint64_t
828 metaslab_group_get_space(metaslab_group_t *mg)
829 {
830 	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
831 }
832 
833 void
834 metaslab_group_histogram_verify(metaslab_group_t *mg)
835 {
836 	uint64_t *mg_hist;
837 	vdev_t *vd = mg->mg_vd;
838 	uint64_t ashift = vd->vdev_ashift;
839 	int i;
840 
841 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
842 		return;
843 
844 	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
845 	    KM_SLEEP);
846 
847 	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
848 	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
849 
850 	for (int m = 0; m < vd->vdev_ms_count; m++) {
851 		metaslab_t *msp = vd->vdev_ms[m];
852 		ASSERT(msp != NULL);
853 
854 		/* skip if not active or not a member */
855 		if (msp->ms_sm == NULL || msp->ms_group != mg)
856 			continue;
857 
858 		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
859 			mg_hist[i + ashift] +=
860 			    msp->ms_sm->sm_phys->smp_histogram[i];
861 	}
862 
863 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
864 		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
865 
866 	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
867 }
868 
869 static void
870 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
871 {
872 	metaslab_class_t *mc = mg->mg_class;
873 	uint64_t ashift = mg->mg_vd->vdev_ashift;
874 
875 	ASSERT(MUTEX_HELD(&msp->ms_lock));
876 	if (msp->ms_sm == NULL)
877 		return;
878 
879 	mutex_enter(&mg->mg_lock);
880 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
881 		mg->mg_histogram[i + ashift] +=
882 		    msp->ms_sm->sm_phys->smp_histogram[i];
883 		mc->mc_histogram[i + ashift] +=
884 		    msp->ms_sm->sm_phys->smp_histogram[i];
885 	}
886 	mutex_exit(&mg->mg_lock);
887 }
888 
889 void
890 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
891 {
892 	metaslab_class_t *mc = mg->mg_class;
893 	uint64_t ashift = mg->mg_vd->vdev_ashift;
894 
895 	ASSERT(MUTEX_HELD(&msp->ms_lock));
896 	if (msp->ms_sm == NULL)
897 		return;
898 
899 	mutex_enter(&mg->mg_lock);
900 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
901 		ASSERT3U(mg->mg_histogram[i + ashift], >=,
902 		    msp->ms_sm->sm_phys->smp_histogram[i]);
903 		ASSERT3U(mc->mc_histogram[i + ashift], >=,
904 		    msp->ms_sm->sm_phys->smp_histogram[i]);
905 
906 		mg->mg_histogram[i + ashift] -=
907 		    msp->ms_sm->sm_phys->smp_histogram[i];
908 		mc->mc_histogram[i + ashift] -=
909 		    msp->ms_sm->sm_phys->smp_histogram[i];
910 	}
911 	mutex_exit(&mg->mg_lock);
912 }
913 
914 static void
915 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
916 {
917 	ASSERT(msp->ms_group == NULL);
918 	mutex_enter(&mg->mg_lock);
919 	msp->ms_group = mg;
920 	msp->ms_weight = 0;
921 	avl_add(&mg->mg_metaslab_tree, msp);
922 	mutex_exit(&mg->mg_lock);
923 
924 	mutex_enter(&msp->ms_lock);
925 	metaslab_group_histogram_add(mg, msp);
926 	mutex_exit(&msp->ms_lock);
927 }
928 
929 static void
930 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
931 {
932 	mutex_enter(&msp->ms_lock);
933 	metaslab_group_histogram_remove(mg, msp);
934 	mutex_exit(&msp->ms_lock);
935 
936 	mutex_enter(&mg->mg_lock);
937 	ASSERT(msp->ms_group == mg);
938 	avl_remove(&mg->mg_metaslab_tree, msp);
939 	msp->ms_group = NULL;
940 	mutex_exit(&mg->mg_lock);
941 }
942 
943 static void
944 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
945 {
946 	ASSERT(MUTEX_HELD(&mg->mg_lock));
947 	ASSERT(msp->ms_group == mg);
948 	avl_remove(&mg->mg_metaslab_tree, msp);
949 	msp->ms_weight = weight;
950 	avl_add(&mg->mg_metaslab_tree, msp);
951 
952 }
953 
954 static void
955 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
956 {
957 	/*
958 	 * Although in principle the weight can be any value, in
959 	 * practice we do not use values in the range [1, 511].
960 	 */
961 	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
962 	ASSERT(MUTEX_HELD(&msp->ms_lock));
963 
964 	mutex_enter(&mg->mg_lock);
965 	metaslab_group_sort_impl(mg, msp, weight);
966 	mutex_exit(&mg->mg_lock);
967 }
968 
969 /*
970  * Calculate the fragmentation for a given metaslab group. We can use
971  * a simple average here since all metaslabs within the group must have
972  * the same size. The return value will be a value between 0 and 100
973  * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
974  * group have a fragmentation metric.
975  */
976 uint64_t
977 metaslab_group_fragmentation(metaslab_group_t *mg)
978 {
979 	vdev_t *vd = mg->mg_vd;
980 	uint64_t fragmentation = 0;
981 	uint64_t valid_ms = 0;
982 
983 	for (int m = 0; m < vd->vdev_ms_count; m++) {
984 		metaslab_t *msp = vd->vdev_ms[m];
985 
986 		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
987 			continue;
988 		if (msp->ms_group != mg)
989 			continue;
990 
991 		valid_ms++;
992 		fragmentation += msp->ms_fragmentation;
993 	}
994 
995 	if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
996 		return (ZFS_FRAG_INVALID);
997 
998 	fragmentation /= valid_ms;
999 	ASSERT3U(fragmentation, <=, 100);
1000 	return (fragmentation);
1001 }
1002 
1003 /*
1004  * Determine if a given metaslab group should skip allocations. A metaslab
1005  * group should avoid allocations if its free capacity is less than the
1006  * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
1007  * zfs_mg_fragmentation_threshold and there is at least one metaslab group
1008  * that can still handle allocations. If the allocation throttle is enabled
1009  * then we skip allocations to devices that have reached their maximum
1010  * allocation queue depth unless the selected metaslab group is the only
1011  * eligible group remaining.
1012  */
1013 static boolean_t
1014 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
1015     uint64_t psize, int allocator)
1016 {
1017 	spa_t *spa = mg->mg_vd->vdev_spa;
1018 	metaslab_class_t *mc = mg->mg_class;
1019 
1020 	/*
1021 	 * We can only consider skipping this metaslab group if it's
1022 	 * in the normal metaslab class and there are other metaslab
1023 	 * groups to select from. Otherwise, we always consider it eligible
1024 	 * for allocations.
1025 	 */
1026 	if ((mc != spa_normal_class(spa) &&
1027 	    mc != spa_special_class(spa) &&
1028 	    mc != spa_dedup_class(spa)) ||
1029 	    mc->mc_groups <= 1)
1030 		return (B_TRUE);
1031 
1032 	/*
1033 	 * If the metaslab group's mg_allocatable flag is set (see comments
1034 	 * in metaslab_group_alloc_update() for more information) and
1035 	 * the allocation throttle is disabled then allow allocations to this
1036 	 * device. However, if the allocation throttle is enabled then
1037 	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
1038 	 * to determine if we should allow allocations to this metaslab group.
1039 	 * If all metaslab groups are no longer considered allocatable
1040 	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
1041 	 * gang block size then we allow allocations on this metaslab group
1042 	 * regardless of the mg_allocatable or throttle settings.
1043 	 */
1044 	if (mg->mg_allocatable) {
1045 		metaslab_group_t *mgp;
1046 		int64_t qdepth;
1047 		uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
1048 
1049 		if (!mc->mc_alloc_throttle_enabled)
1050 			return (B_TRUE);
1051 
1052 		/*
1053 		 * If this metaslab group does not have any free space, then
1054 		 * there is no point in looking further.
1055 		 */
1056 		if (mg->mg_no_free_space)
1057 			return (B_FALSE);
1058 
1059 		qdepth = zfs_refcount_count(
1060 		    &mg->mg_alloc_queue_depth[allocator]);
1061 
1062 		/*
1063 		 * If this metaslab group is below its qmax or it's
1064 		 * the only allocatable metasable group, then attempt
1065 		 * to allocate from it.
1066 		 */
1067 		if (qdepth < qmax || mc->mc_alloc_groups == 1)
1068 			return (B_TRUE);
1069 		ASSERT3U(mc->mc_alloc_groups, >, 1);
1070 
1071 		/*
1072 		 * Since this metaslab group is at or over its qmax, we
1073 		 * need to determine if there are metaslab groups after this
1074 		 * one that might be able to handle this allocation. This is
1075 		 * racy since we can't hold the locks for all metaslab
1076 		 * groups at the same time when we make this check.
1077 		 */
1078 		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
1079 			qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
1080 
1081 			qdepth = zfs_refcount_count(
1082 			    &mgp->mg_alloc_queue_depth[allocator]);
1083 
1084 			/*
1085 			 * If there is another metaslab group that
1086 			 * might be able to handle the allocation, then
1087 			 * we return false so that we skip this group.
1088 			 */
1089 			if (qdepth < qmax && !mgp->mg_no_free_space)
1090 				return (B_FALSE);
1091 		}
1092 
1093 		/*
1094 		 * We didn't find another group to handle the allocation
1095 		 * so we can't skip this metaslab group even though
1096 		 * we are at or over our qmax.
1097 		 */
1098 		return (B_TRUE);
1099 
1100 	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
1101 		return (B_TRUE);
1102 	}
1103 	return (B_FALSE);
1104 }
1105 
1106 /*
1107  * ==========================================================================
1108  * Range tree callbacks
1109  * ==========================================================================
1110  */
1111 
1112 /*
1113  * Comparison function for the private size-ordered tree. Tree is sorted
1114  * by size, larger sizes at the end of the tree.
1115  */
1116 static int
1117 metaslab_rangesize_compare(const void *x1, const void *x2)
1118 {
1119 	const range_seg_t *r1 = x1;
1120 	const range_seg_t *r2 = x2;
1121 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1122 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1123 
1124 	int cmp = AVL_CMP(rs_size1, rs_size2);
1125 	if (likely(cmp))
1126 		return (cmp);
1127 
1128 	return (AVL_CMP(r1->rs_start, r2->rs_start));
1129 }
1130 
1131 /*
1132  * Create any block allocator specific components. The current allocators
1133  * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
1134  */
1135 static void
1136 metaslab_rt_create(range_tree_t *rt, void *arg)
1137 {
1138 	metaslab_t *msp = arg;
1139 
1140 	ASSERT3P(rt->rt_arg, ==, msp);
1141 	ASSERT(msp->ms_allocatable == NULL);
1142 
1143 	avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
1144 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1145 }
1146 
1147 /*
1148  * Destroy the block allocator specific components.
1149  */
1150 static void
1151 metaslab_rt_destroy(range_tree_t *rt, void *arg)
1152 {
1153 	metaslab_t *msp = arg;
1154 
1155 	ASSERT3P(rt->rt_arg, ==, msp);
1156 	ASSERT3P(msp->ms_allocatable, ==, rt);
1157 	ASSERT0(avl_numnodes(&msp->ms_allocatable_by_size));
1158 
1159 	avl_destroy(&msp->ms_allocatable_by_size);
1160 }
1161 
1162 static void
1163 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
1164 {
1165 	metaslab_t *msp = arg;
1166 
1167 	ASSERT3P(rt->rt_arg, ==, msp);
1168 	ASSERT3P(msp->ms_allocatable, ==, rt);
1169 	VERIFY(!msp->ms_condensing);
1170 	avl_add(&msp->ms_allocatable_by_size, rs);
1171 }
1172 
1173 static void
1174 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
1175 {
1176 	metaslab_t *msp = arg;
1177 
1178 	ASSERT3P(rt->rt_arg, ==, msp);
1179 	ASSERT3P(msp->ms_allocatable, ==, rt);
1180 	VERIFY(!msp->ms_condensing);
1181 	avl_remove(&msp->ms_allocatable_by_size, rs);
1182 }
1183 
1184 static void
1185 metaslab_rt_vacate(range_tree_t *rt, void *arg)
1186 {
1187 	metaslab_t *msp = arg;
1188 
1189 	ASSERT3P(rt->rt_arg, ==, msp);
1190 	ASSERT3P(msp->ms_allocatable, ==, rt);
1191 
1192 	/*
1193 	 * Normally one would walk the tree freeing nodes along the way.
1194 	 * Since the nodes are shared with the range trees we can avoid
1195 	 * walking all nodes and just reinitialize the avl tree. The nodes
1196 	 * will be freed by the range tree, so we don't want to free them here.
1197 	 */
1198 	avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
1199 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1200 }
1201 
1202 static range_tree_ops_t metaslab_rt_ops = {
1203 	metaslab_rt_create,
1204 	metaslab_rt_destroy,
1205 	metaslab_rt_add,
1206 	metaslab_rt_remove,
1207 	metaslab_rt_vacate
1208 };
1209 
1210 /*
1211  * ==========================================================================
1212  * Common allocator routines
1213  * ==========================================================================
1214  */
1215 
1216 /*
1217  * Return the maximum contiguous segment within the metaslab.
1218  */
1219 uint64_t
1220 metaslab_block_maxsize(metaslab_t *msp)
1221 {
1222 	avl_tree_t *t = &msp->ms_allocatable_by_size;
1223 	range_seg_t *rs;
1224 
1225 	if (t == NULL || (rs = avl_last(t)) == NULL)
1226 		return (0ULL);
1227 
1228 	return (rs->rs_end - rs->rs_start);
1229 }
1230 
1231 static range_seg_t *
1232 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
1233 {
1234 	range_seg_t *rs, rsearch;
1235 	avl_index_t where;
1236 
1237 	rsearch.rs_start = start;
1238 	rsearch.rs_end = start + size;
1239 
1240 	rs = avl_find(t, &rsearch, &where);
1241 	if (rs == NULL) {
1242 		rs = avl_nearest(t, where, AVL_AFTER);
1243 	}
1244 
1245 	return (rs);
1246 }
1247 
1248 /*
1249  * This is a helper function that can be used by the allocator to find
1250  * a suitable block to allocate. This will search the specified AVL
1251  * tree looking for a block that matches the specified criteria.
1252  */
1253 static uint64_t
1254 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
1255     uint64_t align)
1256 {
1257 	range_seg_t *rs = metaslab_block_find(t, *cursor, size);
1258 
1259 	while (rs != NULL) {
1260 		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
1261 
1262 		if (offset + size <= rs->rs_end) {
1263 			*cursor = offset + size;
1264 			return (offset);
1265 		}
1266 		rs = AVL_NEXT(t, rs);
1267 	}
1268 
1269 	/*
1270 	 * If we know we've searched the whole map (*cursor == 0), give up.
1271 	 * Otherwise, reset the cursor to the beginning and try again.
1272 	 */
1273 	if (*cursor == 0)
1274 		return (-1ULL);
1275 
1276 	*cursor = 0;
1277 	return (metaslab_block_picker(t, cursor, size, align));
1278 }
1279 
1280 /*
1281  * ==========================================================================
1282  * The first-fit block allocator
1283  * ==========================================================================
1284  */
1285 static uint64_t
1286 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
1287 {
1288 	/*
1289 	 * Find the largest power of 2 block size that evenly divides the
1290 	 * requested size. This is used to try to allocate blocks with similar
1291 	 * alignment from the same area of the metaslab (i.e. same cursor
1292 	 * bucket) but it does not guarantee that other allocations sizes
1293 	 * may exist in the same region.
1294 	 */
1295 	uint64_t align = size & -size;
1296 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1297 	avl_tree_t *t = &msp->ms_allocatable->rt_root;
1298 
1299 	return (metaslab_block_picker(t, cursor, size, align));
1300 }
1301 
1302 static metaslab_ops_t metaslab_ff_ops = {
1303 	metaslab_ff_alloc
1304 };
1305 
1306 /*
1307  * ==========================================================================
1308  * Dynamic block allocator -
1309  * Uses the first fit allocation scheme until space get low and then
1310  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1311  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
1312  * ==========================================================================
1313  */
1314 static uint64_t
1315 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1316 {
1317 	/*
1318 	 * Find the largest power of 2 block size that evenly divides the
1319 	 * requested size. This is used to try to allocate blocks with similar
1320 	 * alignment from the same area of the metaslab (i.e. same cursor
1321 	 * bucket) but it does not guarantee that other allocations sizes
1322 	 * may exist in the same region.
1323 	 */
1324 	uint64_t align = size & -size;
1325 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1326 	range_tree_t *rt = msp->ms_allocatable;
1327 	avl_tree_t *t = &rt->rt_root;
1328 	uint64_t max_size = metaslab_block_maxsize(msp);
1329 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1330 
1331 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1332 	ASSERT3U(avl_numnodes(t), ==,
1333 	    avl_numnodes(&msp->ms_allocatable_by_size));
1334 
1335 	if (max_size < size)
1336 		return (-1ULL);
1337 
1338 	/*
1339 	 * If we're running low on space switch to using the size
1340 	 * sorted AVL tree (best-fit).
1341 	 */
1342 	if (max_size < metaslab_df_alloc_threshold ||
1343 	    free_pct < metaslab_df_free_pct) {
1344 		t = &msp->ms_allocatable_by_size;
1345 		*cursor = 0;
1346 	}
1347 
1348 	return (metaslab_block_picker(t, cursor, size, 1ULL));
1349 }
1350 
1351 static metaslab_ops_t metaslab_df_ops = {
1352 	metaslab_df_alloc
1353 };
1354 
1355 /*
1356  * ==========================================================================
1357  * Cursor fit block allocator -
1358  * Select the largest region in the metaslab, set the cursor to the beginning
1359  * of the range and the cursor_end to the end of the range. As allocations
1360  * are made advance the cursor. Continue allocating from the cursor until
1361  * the range is exhausted and then find a new range.
1362  * ==========================================================================
1363  */
1364 static uint64_t
1365 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1366 {
1367 	range_tree_t *rt = msp->ms_allocatable;
1368 	avl_tree_t *t = &msp->ms_allocatable_by_size;
1369 	uint64_t *cursor = &msp->ms_lbas[0];
1370 	uint64_t *cursor_end = &msp->ms_lbas[1];
1371 	uint64_t offset = 0;
1372 
1373 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1374 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1375 
1376 	ASSERT3U(*cursor_end, >=, *cursor);
1377 
1378 	if ((*cursor + size) > *cursor_end) {
1379 		range_seg_t *rs;
1380 
1381 		rs = avl_last(&msp->ms_allocatable_by_size);
1382 		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1383 			return (-1ULL);
1384 
1385 		*cursor = rs->rs_start;
1386 		*cursor_end = rs->rs_end;
1387 	}
1388 
1389 	offset = *cursor;
1390 	*cursor += size;
1391 
1392 	return (offset);
1393 }
1394 
1395 static metaslab_ops_t metaslab_cf_ops = {
1396 	metaslab_cf_alloc
1397 };
1398 
1399 /*
1400  * ==========================================================================
1401  * New dynamic fit allocator -
1402  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1403  * contiguous blocks. If no region is found then just use the largest segment
1404  * that remains.
1405  * ==========================================================================
1406  */
1407 
1408 /*
1409  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1410  * to request from the allocator.
1411  */
1412 uint64_t metaslab_ndf_clump_shift = 4;
1413 
1414 static uint64_t
1415 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1416 {
1417 	avl_tree_t *t = &msp->ms_allocatable->rt_root;
1418 	avl_index_t where;
1419 	range_seg_t *rs, rsearch;
1420 	uint64_t hbit = highbit64(size);
1421 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1422 	uint64_t max_size = metaslab_block_maxsize(msp);
1423 
1424 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1425 	ASSERT3U(avl_numnodes(t), ==,
1426 	    avl_numnodes(&msp->ms_allocatable_by_size));
1427 
1428 	if (max_size < size)
1429 		return (-1ULL);
1430 
1431 	rsearch.rs_start = *cursor;
1432 	rsearch.rs_end = *cursor + size;
1433 
1434 	rs = avl_find(t, &rsearch, &where);
1435 	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
1436 		t = &msp->ms_allocatable_by_size;
1437 
1438 		rsearch.rs_start = 0;
1439 		rsearch.rs_end = MIN(max_size,
1440 		    1ULL << (hbit + metaslab_ndf_clump_shift));
1441 		rs = avl_find(t, &rsearch, &where);
1442 		if (rs == NULL)
1443 			rs = avl_nearest(t, where, AVL_AFTER);
1444 		ASSERT(rs != NULL);
1445 	}
1446 
1447 	if ((rs->rs_end - rs->rs_start) >= size) {
1448 		*cursor = rs->rs_start + size;
1449 		return (rs->rs_start);
1450 	}
1451 	return (-1ULL);
1452 }
1453 
1454 static metaslab_ops_t metaslab_ndf_ops = {
1455 	metaslab_ndf_alloc
1456 };
1457 
1458 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1459 
1460 /*
1461  * ==========================================================================
1462  * Metaslabs
1463  * ==========================================================================
1464  */
1465 
1466 static void
1467 metaslab_aux_histograms_clear(metaslab_t *msp)
1468 {
1469 	/*
1470 	 * Auxiliary histograms are only cleared when resetting them,
1471 	 * which can only happen while the metaslab is loaded.
1472 	 */
1473 	ASSERT(msp->ms_loaded);
1474 
1475 	bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
1476 	for (int t = 0; t < TXG_DEFER_SIZE; t++)
1477 		bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
1478 }
1479 
1480 static void
1481 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
1482     range_tree_t *rt)
1483 {
1484 	/*
1485 	 * This is modeled after space_map_histogram_add(), so refer to that
1486 	 * function for implementation details. We want this to work like
1487 	 * the space map histogram, and not the range tree histogram, as we
1488 	 * are essentially constructing a delta that will be later subtracted
1489 	 * from the space map histogram.
1490 	 */
1491 	int idx = 0;
1492 	for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
1493 		ASSERT3U(i, >=, idx + shift);
1494 		histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
1495 
1496 		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
1497 			ASSERT3U(idx + shift, ==, i);
1498 			idx++;
1499 			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
1500 		}
1501 	}
1502 }
1503 
1504 /*
1505  * Called at every sync pass that the metaslab gets synced.
1506  *
1507  * The reason is that we want our auxiliary histograms to be updated
1508  * wherever the metaslab's space map histogram is updated. This way
1509  * we stay consistent on which parts of the metaslab space map's
1510  * histogram are currently not available for allocations (e.g because
1511  * they are in the defer, freed, and freeing trees).
1512  */
1513 static void
1514 metaslab_aux_histograms_update(metaslab_t *msp)
1515 {
1516 	space_map_t *sm = msp->ms_sm;
1517 	ASSERT(sm != NULL);
1518 
1519 	/*
1520 	 * This is similar to the metaslab's space map histogram updates
1521 	 * that take place in metaslab_sync(). The only difference is that
1522 	 * we only care about segments that haven't made it into the
1523 	 * ms_allocatable tree yet.
1524 	 */
1525 	if (msp->ms_loaded) {
1526 		metaslab_aux_histograms_clear(msp);
1527 
1528 		metaslab_aux_histogram_add(msp->ms_synchist,
1529 		    sm->sm_shift, msp->ms_freed);
1530 
1531 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1532 			metaslab_aux_histogram_add(msp->ms_deferhist[t],
1533 			    sm->sm_shift, msp->ms_defer[t]);
1534 		}
1535 	}
1536 
1537 	metaslab_aux_histogram_add(msp->ms_synchist,
1538 	    sm->sm_shift, msp->ms_freeing);
1539 }
1540 
1541 /*
1542  * Called every time we are done syncing (writing to) the metaslab,
1543  * i.e. at the end of each sync pass.
1544  * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
1545  */
1546 static void
1547 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
1548 {
1549 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1550 	space_map_t *sm = msp->ms_sm;
1551 
1552 	if (sm == NULL) {
1553 		/*
1554 		 * We came here from metaslab_init() when creating/opening a
1555 		 * pool, looking at a metaslab that hasn't had any allocations
1556 		 * yet.
1557 		 */
1558 		return;
1559 	}
1560 
1561 	/*
1562 	 * This is similar to the actions that we take for the ms_freed
1563 	 * and ms_defer trees in metaslab_sync_done().
1564 	 */
1565 	uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
1566 	if (defer_allowed) {
1567 		bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
1568 		    sizeof (msp->ms_synchist));
1569 	} else {
1570 		bzero(msp->ms_deferhist[hist_index],
1571 		    sizeof (msp->ms_deferhist[hist_index]));
1572 	}
1573 	bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
1574 }
1575 
1576 /*
1577  * Ensure that the metaslab's weight and fragmentation are consistent
1578  * with the contents of the histogram (either the range tree's histogram
1579  * or the space map's depending whether the metaslab is loaded).
1580  */
1581 static void
1582 metaslab_verify_weight_and_frag(metaslab_t *msp)
1583 {
1584 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1585 
1586 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
1587 		return;
1588 
1589 	/* see comment in metaslab_verify_unflushed_changes() */
1590 	if (msp->ms_group == NULL)
1591 		return;
1592 
1593 	/*
1594 	 * Devices being removed always return a weight of 0 and leave
1595 	 * fragmentation and ms_max_size as is - there is nothing for
1596 	 * us to verify here.
1597 	 */
1598 	vdev_t *vd = msp->ms_group->mg_vd;
1599 	if (vd->vdev_removing)
1600 		return;
1601 
1602 	/*
1603 	 * If the metaslab is dirty it probably means that we've done
1604 	 * some allocations or frees that have changed our histograms
1605 	 * and thus the weight.
1606 	 */
1607 	for (int t = 0; t < TXG_SIZE; t++) {
1608 		if (txg_list_member(&vd->vdev_ms_list, msp, t))
1609 			return;
1610 	}
1611 
1612 	/*
1613 	 * This verification checks that our in-memory state is consistent
1614 	 * with what's on disk. If the pool is read-only then there aren't
1615 	 * any changes and we just have the initially-loaded state.
1616 	 */
1617 	if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
1618 		return;
1619 
1620 	/* some extra verification for in-core tree if you can */
1621 	if (msp->ms_loaded) {
1622 		range_tree_stat_verify(msp->ms_allocatable);
1623 		VERIFY(space_map_histogram_verify(msp->ms_sm,
1624 		    msp->ms_allocatable));
1625 	}
1626 
1627 	uint64_t weight = msp->ms_weight;
1628 	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1629 	boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
1630 	uint64_t frag = msp->ms_fragmentation;
1631 	uint64_t max_segsize = msp->ms_max_size;
1632 
1633 	msp->ms_weight = 0;
1634 	msp->ms_fragmentation = 0;
1635 	msp->ms_max_size = 0;
1636 
1637 	/*
1638 	 * This function is used for verification purposes. Regardless of
1639 	 * whether metaslab_weight() thinks this metaslab should be active or
1640 	 * not, we want to ensure that the actual weight (and therefore the
1641 	 * value of ms_weight) would be the same if it was to be recalculated
1642 	 * at this point.
1643 	 */
1644 	msp->ms_weight = metaslab_weight(msp) | was_active;
1645 
1646 	VERIFY3U(max_segsize, ==, msp->ms_max_size);
1647 
1648 	/*
1649 	 * If the weight type changed then there is no point in doing
1650 	 * verification. Revert fields to their original values.
1651 	 */
1652 	if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
1653 	    (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
1654 		msp->ms_fragmentation = frag;
1655 		msp->ms_weight = weight;
1656 		return;
1657 	}
1658 
1659 	VERIFY3U(msp->ms_fragmentation, ==, frag);
1660 	VERIFY3U(msp->ms_weight, ==, weight);
1661 }
1662 
1663 /*
1664  * Wait for any in-progress metaslab loads to complete.
1665  */
1666 static void
1667 metaslab_load_wait(metaslab_t *msp)
1668 {
1669 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1670 
1671 	while (msp->ms_loading) {
1672 		ASSERT(!msp->ms_loaded);
1673 		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1674 	}
1675 }
1676 
1677 static int
1678 metaslab_load_impl(metaslab_t *msp)
1679 {
1680 	int error = 0;
1681 
1682 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1683 	ASSERT(msp->ms_loading);
1684 	ASSERT(!msp->ms_condensing);
1685 
1686 	/*
1687 	 * We temporarily drop the lock to unblock other operations while we
1688 	 * are reading the space map. Therefore, metaslab_sync() and
1689 	 * metaslab_sync_done() can run at the same time as we do.
1690 	 *
1691 	 * metaslab_sync() can append to the space map while we are loading.
1692 	 * Therefore we load only entries that existed when we started the
1693 	 * load. Additionally, metaslab_sync_done() has to wait for the load
1694 	 * to complete because there are potential races like metaslab_load()
1695 	 * loading parts of the space map that are currently being appended
1696 	 * by metaslab_sync(). If we didn't, the ms_allocatable would have
1697 	 * entries that metaslab_sync_done() would try to re-add later.
1698 	 *
1699 	 * That's why before dropping the lock we remember the synced length
1700 	 * of the metaslab and read up to that point of the space map,
1701 	 * ignoring entries appended by metaslab_sync() that happen after we
1702 	 * drop the lock.
1703 	 */
1704 	uint64_t length = msp->ms_synced_length;
1705 	mutex_exit(&msp->ms_lock);
1706 
1707 	if (msp->ms_sm != NULL) {
1708 		error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
1709 		    SM_FREE, length);
1710 	} else {
1711 		/*
1712 		 * The space map has not been allocated yet, so treat
1713 		 * all the space in the metaslab as free and add it to the
1714 		 * ms_allocatable tree.
1715 		 */
1716 		range_tree_add(msp->ms_allocatable,
1717 		    msp->ms_start, msp->ms_size);
1718 	}
1719 
1720 	/*
1721 	 * We need to grab the ms_sync_lock to prevent metaslab_sync() from
1722 	 * changing the ms_sm and the metaslab's range trees while we are
1723 	 * about to use them and populate the ms_allocatable. The ms_lock
1724 	 * is insufficient for this because metaslab_sync() doesn't hold
1725 	 * the ms_lock while writing the ms_checkpointing tree to disk.
1726 	 */
1727 	mutex_enter(&msp->ms_sync_lock);
1728 	mutex_enter(&msp->ms_lock);
1729 	ASSERT(!msp->ms_condensing);
1730 
1731 	if (error != 0) {
1732 		mutex_exit(&msp->ms_sync_lock);
1733 		return (error);
1734 	}
1735 
1736 	ASSERT3P(msp->ms_group, !=, NULL);
1737 	msp->ms_loaded = B_TRUE;
1738 
1739 	/*
1740 	 * The ms_allocatable contains the segments that exist in the
1741 	 * ms_defer trees [see ms_synced_length]. Thus we need to remove
1742 	 * them from ms_allocatable as they will be added again in
1743 	 * metaslab_sync_done().
1744 	 */
1745 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1746 		range_tree_walk(msp->ms_defer[t],
1747 		    range_tree_remove, msp->ms_allocatable);
1748 	}
1749 
1750 	/*
1751 	 * Call metaslab_recalculate_weight_and_sort() now that the
1752 	 * metaslab is loaded so we get the metaslab's real weight.
1753 	 *
1754 	 * Unless this metaslab was created with older software and
1755 	 * has not yet been converted to use segment-based weight, we
1756 	 * expect the new weight to be better or equal to the weight
1757 	 * that the metaslab had while it was not loaded. This is
1758 	 * because the old weight does not take into account the
1759 	 * consolidation of adjacent segments between TXGs. [see
1760 	 * comment for ms_synchist and ms_deferhist[] for more info]
1761 	 */
1762 	uint64_t weight = msp->ms_weight;
1763 	metaslab_recalculate_weight_and_sort(msp);
1764 	if (!WEIGHT_IS_SPACEBASED(weight))
1765 		ASSERT3U(weight, <=, msp->ms_weight);
1766 	msp->ms_max_size = metaslab_block_maxsize(msp);
1767 
1768 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1769 	metaslab_verify_space(msp, spa_syncing_txg(spa));
1770 	mutex_exit(&msp->ms_sync_lock);
1771 
1772 	return (0);
1773 }
1774 
1775 int
1776 metaslab_load(metaslab_t *msp)
1777 {
1778 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1779 
1780 	/*
1781 	 * There may be another thread loading the same metaslab, if that's
1782 	 * the case just wait until the other thread is done and return.
1783 	 */
1784 	metaslab_load_wait(msp);
1785 	if (msp->ms_loaded)
1786 		return (0);
1787 	VERIFY(!msp->ms_loading);
1788 	ASSERT(!msp->ms_condensing);
1789 
1790 	msp->ms_loading = B_TRUE;
1791 	int error = metaslab_load_impl(msp);
1792 	msp->ms_loading = B_FALSE;
1793 	cv_broadcast(&msp->ms_load_cv);
1794 
1795 	return (error);
1796 }
1797 
1798 void
1799 metaslab_unload(metaslab_t *msp)
1800 {
1801 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1802 
1803 	metaslab_verify_weight_and_frag(msp);
1804 
1805 	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
1806 	msp->ms_loaded = B_FALSE;
1807 
1808 	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1809 	msp->ms_max_size = 0;
1810 
1811 	/*
1812 	 * We explicitly recalculate the metaslab's weight based on its space
1813 	 * map (as it is now not loaded). We want unload metaslabs to always
1814 	 * have their weights calculated from the space map histograms, while
1815 	 * loaded ones have it calculated from their in-core range tree
1816 	 * [see metaslab_load()]. This way, the weight reflects the information
1817 	 * available in-core, whether it is loaded or not
1818 	 *
1819 	 * If ms_group == NULL means that we came here from metaslab_fini(),
1820 	 * at which point it doesn't make sense for us to do the recalculation
1821 	 * and the sorting.
1822 	 */
1823 	if (msp->ms_group != NULL)
1824 		metaslab_recalculate_weight_and_sort(msp);
1825 }
1826 
1827 static void
1828 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
1829     int64_t defer_delta, int64_t space_delta)
1830 {
1831 	vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
1832 
1833 	ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
1834 	ASSERT(vd->vdev_ms_count != 0);
1835 
1836 	metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
1837 	    vdev_deflated_space(vd, space_delta));
1838 }
1839 
1840 int
1841 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1842     metaslab_t **msp)
1843 {
1844 	vdev_t *vd = mg->mg_vd;
1845 	spa_t *spa = vd->vdev_spa;
1846 	objset_t *mos = spa->spa_meta_objset;
1847 	metaslab_t *ms;
1848 	int error;
1849 
1850 	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1851 	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1852 	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
1853 	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1854 
1855 	ms->ms_id = id;
1856 	ms->ms_start = id << vd->vdev_ms_shift;
1857 	ms->ms_size = 1ULL << vd->vdev_ms_shift;
1858 	ms->ms_allocator = -1;
1859 	ms->ms_new = B_TRUE;
1860 
1861 	/*
1862 	 * We only open space map objects that already exist. All others
1863 	 * will be opened when we finally allocate an object for it.
1864 	 *
1865 	 * Note:
1866 	 * When called from vdev_expand(), we can't call into the DMU as
1867 	 * we are holding the spa_config_lock as a writer and we would
1868 	 * deadlock [see relevant comment in vdev_metaslab_init()]. in
1869 	 * that case, the object parameter is zero though, so we won't
1870 	 * call into the DMU.
1871 	 */
1872 	if (object != 0) {
1873 		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1874 		    ms->ms_size, vd->vdev_ashift);
1875 
1876 		if (error != 0) {
1877 			kmem_free(ms, sizeof (metaslab_t));
1878 			return (error);
1879 		}
1880 
1881 		ASSERT(ms->ms_sm != NULL);
1882 		ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
1883 		ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
1884 	}
1885 
1886 	/*
1887 	 * We create the ms_allocatable here, but we don't create the
1888 	 * other range trees until metaslab_sync_done().  This serves
1889 	 * two purposes: it allows metaslab_sync_done() to detect the
1890 	 * addition of new space; and for debugging, it ensures that
1891 	 * we'd data fault on any attempt to use this metaslab before
1892 	 * it's ready.
1893 	 */
1894 	ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms);
1895 	metaslab_group_add(mg, ms);
1896 
1897 	metaslab_set_fragmentation(ms);
1898 
1899 	/*
1900 	 * If we're opening an existing pool (txg == 0) or creating
1901 	 * a new one (txg == TXG_INITIAL), all space is available now.
1902 	 * If we're adding space to an existing pool, the new space
1903 	 * does not become available until after this txg has synced.
1904 	 * The metaslab's weight will also be initialized when we sync
1905 	 * out this txg. This ensures that we don't attempt to allocate
1906 	 * from it before we have initialized it completely.
1907 	 */
1908 	if (txg <= TXG_INITIAL) {
1909 		metaslab_sync_done(ms, 0);
1910 		metaslab_space_update(vd, mg->mg_class,
1911 		    metaslab_allocated_space(ms), 0, 0);
1912 	}
1913 
1914 	/*
1915 	 * If metaslab_debug_load is set and we're initializing a metaslab
1916 	 * that has an allocated space map object then load the space map
1917 	 * so that we can verify frees.
1918 	 */
1919 	if (metaslab_debug_load && ms->ms_sm != NULL) {
1920 		mutex_enter(&ms->ms_lock);
1921 		VERIFY0(metaslab_load(ms));
1922 		mutex_exit(&ms->ms_lock);
1923 	}
1924 
1925 	if (txg != 0) {
1926 		vdev_dirty(vd, 0, NULL, txg);
1927 		vdev_dirty(vd, VDD_METASLAB, ms, txg);
1928 	}
1929 
1930 	*msp = ms;
1931 
1932 	return (0);
1933 }
1934 
1935 void
1936 metaslab_fini(metaslab_t *msp)
1937 {
1938 	metaslab_group_t *mg = msp->ms_group;
1939 	vdev_t *vd = mg->mg_vd;
1940 
1941 	metaslab_group_remove(mg, msp);
1942 
1943 	mutex_enter(&msp->ms_lock);
1944 	VERIFY(msp->ms_group == NULL);
1945 	metaslab_space_update(vd, mg->mg_class,
1946 	    -metaslab_allocated_space(msp), 0, -msp->ms_size);
1947 
1948 	space_map_close(msp->ms_sm);
1949 
1950 	metaslab_unload(msp);
1951 
1952 	range_tree_destroy(msp->ms_allocatable);
1953 	range_tree_destroy(msp->ms_freeing);
1954 	range_tree_destroy(msp->ms_freed);
1955 
1956 	for (int t = 0; t < TXG_SIZE; t++) {
1957 		range_tree_destroy(msp->ms_allocating[t]);
1958 	}
1959 
1960 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1961 		range_tree_destroy(msp->ms_defer[t]);
1962 	}
1963 	ASSERT0(msp->ms_deferspace);
1964 
1965 	range_tree_destroy(msp->ms_checkpointing);
1966 
1967 	for (int t = 0; t < TXG_SIZE; t++)
1968 		ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
1969 
1970 	mutex_exit(&msp->ms_lock);
1971 	cv_destroy(&msp->ms_load_cv);
1972 	mutex_destroy(&msp->ms_lock);
1973 	mutex_destroy(&msp->ms_sync_lock);
1974 	ASSERT3U(msp->ms_allocator, ==, -1);
1975 
1976 	kmem_free(msp, sizeof (metaslab_t));
1977 }
1978 
1979 #define	FRAGMENTATION_TABLE_SIZE	17
1980 
1981 /*
1982  * This table defines a segment size based fragmentation metric that will
1983  * allow each metaslab to derive its own fragmentation value. This is done
1984  * by calculating the space in each bucket of the spacemap histogram and
1985  * multiplying that by the fragmentation metric in this table. Doing
1986  * this for all buckets and dividing it by the total amount of free
1987  * space in this metaslab (i.e. the total free space in all buckets) gives
1988  * us the fragmentation metric. This means that a high fragmentation metric
1989  * equates to most of the free space being comprised of small segments.
1990  * Conversely, if the metric is low, then most of the free space is in
1991  * large segments. A 10% change in fragmentation equates to approximately
1992  * double the number of segments.
1993  *
1994  * This table defines 0% fragmented space using 16MB segments. Testing has
1995  * shown that segments that are greater than or equal to 16MB do not suffer
1996  * from drastic performance problems. Using this value, we derive the rest
1997  * of the table. Since the fragmentation value is never stored on disk, it
1998  * is possible to change these calculations in the future.
1999  */
2000 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
2001 	100,	/* 512B	*/
2002 	100,	/* 1K	*/
2003 	98,	/* 2K	*/
2004 	95,	/* 4K	*/
2005 	90,	/* 8K	*/
2006 	80,	/* 16K	*/
2007 	70,	/* 32K	*/
2008 	60,	/* 64K	*/
2009 	50,	/* 128K	*/
2010 	40,	/* 256K	*/
2011 	30,	/* 512K	*/
2012 	20,	/* 1M	*/
2013 	15,	/* 2M	*/
2014 	10,	/* 4M	*/
2015 	5,	/* 8M	*/
2016 	0	/* 16M	*/
2017 };
2018 
2019 /*
2020  * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
2021  * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
2022  * been upgraded and does not support this metric. Otherwise, the return
2023  * value should be in the range [0, 100].
2024  */
2025 static void
2026 metaslab_set_fragmentation(metaslab_t *msp)
2027 {
2028 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2029 	uint64_t fragmentation = 0;
2030 	uint64_t total = 0;
2031 	boolean_t feature_enabled = spa_feature_is_enabled(spa,
2032 	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
2033 
2034 	if (!feature_enabled) {
2035 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
2036 		return;
2037 	}
2038 
2039 	/*
2040 	 * A null space map means that the entire metaslab is free
2041 	 * and thus is not fragmented.
2042 	 */
2043 	if (msp->ms_sm == NULL) {
2044 		msp->ms_fragmentation = 0;
2045 		return;
2046 	}
2047 
2048 	/*
2049 	 * If this metaslab's space map has not been upgraded, flag it
2050 	 * so that we upgrade next time we encounter it.
2051 	 */
2052 	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
2053 		uint64_t txg = spa_syncing_txg(spa);
2054 		vdev_t *vd = msp->ms_group->mg_vd;
2055 
2056 		/*
2057 		 * If we've reached the final dirty txg, then we must
2058 		 * be shutting down the pool. We don't want to dirty
2059 		 * any data past this point so skip setting the condense
2060 		 * flag. We can retry this action the next time the pool
2061 		 * is imported.
2062 		 */
2063 		if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
2064 			msp->ms_condense_wanted = B_TRUE;
2065 			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2066 			zfs_dbgmsg("txg %llu, requesting force condense: "
2067 			    "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
2068 			    vd->vdev_id);
2069 		}
2070 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
2071 		return;
2072 	}
2073 
2074 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
2075 		uint64_t space = 0;
2076 		uint8_t shift = msp->ms_sm->sm_shift;
2077 
2078 		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
2079 		    FRAGMENTATION_TABLE_SIZE - 1);
2080 
2081 		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
2082 			continue;
2083 
2084 		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
2085 		total += space;
2086 
2087 		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
2088 		fragmentation += space * zfs_frag_table[idx];
2089 	}
2090 
2091 	if (total > 0)
2092 		fragmentation /= total;
2093 	ASSERT3U(fragmentation, <=, 100);
2094 
2095 	msp->ms_fragmentation = fragmentation;
2096 }
2097 
2098 /*
2099  * Compute a weight -- a selection preference value -- for the given metaslab.
2100  * This is based on the amount of free space, the level of fragmentation,
2101  * the LBA range, and whether the metaslab is loaded.
2102  */
2103 static uint64_t
2104 metaslab_space_weight(metaslab_t *msp)
2105 {
2106 	metaslab_group_t *mg = msp->ms_group;
2107 	vdev_t *vd = mg->mg_vd;
2108 	uint64_t weight, space;
2109 
2110 	ASSERT(MUTEX_HELD(&msp->ms_lock));
2111 	ASSERT(!vd->vdev_removing);
2112 
2113 	/*
2114 	 * The baseline weight is the metaslab's free space.
2115 	 */
2116 	space = msp->ms_size - metaslab_allocated_space(msp);
2117 
2118 	if (metaslab_fragmentation_factor_enabled &&
2119 	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
2120 		/*
2121 		 * Use the fragmentation information to inversely scale
2122 		 * down the baseline weight. We need to ensure that we
2123 		 * don't exclude this metaslab completely when it's 100%
2124 		 * fragmented. To avoid this we reduce the fragmented value
2125 		 * by 1.
2126 		 */
2127 		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
2128 
2129 		/*
2130 		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
2131 		 * this metaslab again. The fragmentation metric may have
2132 		 * decreased the space to something smaller than
2133 		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
2134 		 * so that we can consume any remaining space.
2135 		 */
2136 		if (space > 0 && space < SPA_MINBLOCKSIZE)
2137 			space = SPA_MINBLOCKSIZE;
2138 	}
2139 	weight = space;
2140 
2141 	/*
2142 	 * Modern disks have uniform bit density and constant angular velocity.
2143 	 * Therefore, the outer recording zones are faster (higher bandwidth)
2144 	 * than the inner zones by the ratio of outer to inner track diameter,
2145 	 * which is typically around 2:1.  We account for this by assigning
2146 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
2147 	 * In effect, this means that we'll select the metaslab with the most
2148 	 * free bandwidth rather than simply the one with the most free space.
2149 	 */
2150 	if (metaslab_lba_weighting_enabled) {
2151 		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
2152 		ASSERT(weight >= space && weight <= 2 * space);
2153 	}
2154 
2155 	/*
2156 	 * If this metaslab is one we're actively using, adjust its
2157 	 * weight to make it preferable to any inactive metaslab so
2158 	 * we'll polish it off. If the fragmentation on this metaslab
2159 	 * has exceed our threshold, then don't mark it active.
2160 	 */
2161 	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
2162 	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
2163 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
2164 	}
2165 
2166 	WEIGHT_SET_SPACEBASED(weight);
2167 	return (weight);
2168 }
2169 
2170 /*
2171  * Return the weight of the specified metaslab, according to the segment-based
2172  * weighting algorithm. The metaslab must be loaded. This function can
2173  * be called within a sync pass since it relies only on the metaslab's
2174  * range tree which is always accurate when the metaslab is loaded.
2175  */
2176 static uint64_t
2177 metaslab_weight_from_range_tree(metaslab_t *msp)
2178 {
2179 	uint64_t weight = 0;
2180 	uint32_t segments = 0;
2181 
2182 	ASSERT(msp->ms_loaded);
2183 
2184 	for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
2185 	    i--) {
2186 		uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
2187 		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
2188 
2189 		segments <<= 1;
2190 		segments += msp->ms_allocatable->rt_histogram[i];
2191 
2192 		/*
2193 		 * The range tree provides more precision than the space map
2194 		 * and must be downgraded so that all values fit within the
2195 		 * space map's histogram. This allows us to compare loaded
2196 		 * vs. unloaded metaslabs to determine which metaslab is
2197 		 * considered "best".
2198 		 */
2199 		if (i > max_idx)
2200 			continue;
2201 
2202 		if (segments != 0) {
2203 			WEIGHT_SET_COUNT(weight, segments);
2204 			WEIGHT_SET_INDEX(weight, i);
2205 			WEIGHT_SET_ACTIVE(weight, 0);
2206 			break;
2207 		}
2208 	}
2209 	return (weight);
2210 }
2211 
2212 /*
2213  * Calculate the weight based on the on-disk histogram. This should only
2214  * be called after a sync pass has completely finished since the on-disk
2215  * information is updated in metaslab_sync().
2216  */
2217 static uint64_t
2218 metaslab_weight_from_spacemap(metaslab_t *msp)
2219 {
2220 	space_map_t *sm = msp->ms_sm;
2221 	ASSERT(!msp->ms_loaded);
2222 	ASSERT(sm != NULL);
2223 	ASSERT3U(space_map_object(sm), !=, 0);
2224 	ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
2225 
2226 	/*
2227 	 * Create a joint histogram from all the segments that have made
2228 	 * it to the metaslab's space map histogram, that are not yet
2229 	 * available for allocation because they are still in the freeing
2230 	 * pipeline (e.g. freeing, freed, and defer trees). Then subtract
2231 	 * these segments from the space map's histogram to get a more
2232 	 * accurate weight.
2233 	 */
2234 	uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
2235 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
2236 		deferspace_histogram[i] += msp->ms_synchist[i];
2237 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2238 		for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
2239 			deferspace_histogram[i] += msp->ms_deferhist[t][i];
2240 		}
2241 	}
2242 
2243 	uint64_t weight = 0;
2244 	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
2245 		ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
2246 		    deferspace_histogram[i]);
2247 		uint64_t count =
2248 		    sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
2249 		if (count != 0) {
2250 			WEIGHT_SET_COUNT(weight, count);
2251 			WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
2252 			WEIGHT_SET_ACTIVE(weight, 0);
2253 			break;
2254 		}
2255 	}
2256 	return (weight);
2257 }
2258 
2259 /*
2260  * Compute a segment-based weight for the specified metaslab. The weight
2261  * is determined by highest bucket in the histogram. The information
2262  * for the highest bucket is encoded into the weight value.
2263  */
2264 static uint64_t
2265 metaslab_segment_weight(metaslab_t *msp)
2266 {
2267 	metaslab_group_t *mg = msp->ms_group;
2268 	uint64_t weight = 0;
2269 	uint8_t shift = mg->mg_vd->vdev_ashift;
2270 
2271 	ASSERT(MUTEX_HELD(&msp->ms_lock));
2272 
2273 	/*
2274 	 * The metaslab is completely free.
2275 	 */
2276 	if (metaslab_allocated_space(msp) == 0) {
2277 		int idx = highbit64(msp->ms_size) - 1;
2278 		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
2279 
2280 		if (idx < max_idx) {
2281 			WEIGHT_SET_COUNT(weight, 1ULL);
2282 			WEIGHT_SET_INDEX(weight, idx);
2283 		} else {
2284 			WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
2285 			WEIGHT_SET_INDEX(weight, max_idx);
2286 		}
2287 		WEIGHT_SET_ACTIVE(weight, 0);
2288 		ASSERT(!WEIGHT_IS_SPACEBASED(weight));
2289 
2290 		return (weight);
2291 	}
2292 
2293 	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
2294 
2295 	/*
2296 	 * If the metaslab is fully allocated then just make the weight 0.
2297 	 */
2298 	if (metaslab_allocated_space(msp) == msp->ms_size)
2299 		return (0);
2300 	/*
2301 	 * If the metaslab is already loaded, then use the range tree to
2302 	 * determine the weight. Otherwise, we rely on the space map information
2303 	 * to generate the weight.
2304 	 */
2305 	if (msp->ms_loaded) {
2306 		weight = metaslab_weight_from_range_tree(msp);
2307 	} else {
2308 		weight = metaslab_weight_from_spacemap(msp);
2309 	}
2310 
2311 	/*
2312 	 * If the metaslab was active the last time we calculated its weight
2313 	 * then keep it active. We want to consume the entire region that
2314 	 * is associated with this weight.
2315 	 */
2316 	if (msp->ms_activation_weight != 0 && weight != 0)
2317 		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
2318 	return (weight);
2319 }
2320 
2321 /*
2322  * Determine if we should attempt to allocate from this metaslab. If the
2323  * metaslab has a maximum size then we can quickly determine if the desired
2324  * allocation size can be satisfied. Otherwise, if we're using segment-based
2325  * weighting then we can determine the maximum allocation that this metaslab
2326  * can accommodate based on the index encoded in the weight. If we're using
2327  * space-based weights then rely on the entire weight (excluding the weight
2328  * type bit).
2329  */
2330 boolean_t
2331 metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
2332 {
2333 	boolean_t should_allocate;
2334 
2335 	if (msp->ms_max_size != 0)
2336 		return (msp->ms_max_size >= asize);
2337 
2338 	if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
2339 		/*
2340 		 * The metaslab segment weight indicates segments in the
2341 		 * range [2^i, 2^(i+1)), where i is the index in the weight.
2342 		 * Since the asize might be in the middle of the range, we
2343 		 * should attempt the allocation if asize < 2^(i+1).
2344 		 */
2345 		should_allocate = (asize <
2346 		    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
2347 	} else {
2348 		should_allocate = (asize <=
2349 		    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
2350 	}
2351 	return (should_allocate);
2352 }
2353 
2354 static uint64_t
2355 metaslab_weight(metaslab_t *msp)
2356 {
2357 	vdev_t *vd = msp->ms_group->mg_vd;
2358 	spa_t *spa = vd->vdev_spa;
2359 	uint64_t weight;
2360 
2361 	ASSERT(MUTEX_HELD(&msp->ms_lock));
2362 
2363 	/*
2364 	 * If this vdev is in the process of being removed, there is nothing
2365 	 * for us to do here.
2366 	 */
2367 	if (vd->vdev_removing)
2368 		return (0);
2369 
2370 	metaslab_set_fragmentation(msp);
2371 
2372 	/*
2373 	 * Update the maximum size if the metaslab is loaded. This will
2374 	 * ensure that we get an accurate maximum size if newly freed space
2375 	 * has been added back into the free tree.
2376 	 */
2377 	if (msp->ms_loaded)
2378 		msp->ms_max_size = metaslab_block_maxsize(msp);
2379 	else
2380 		ASSERT0(msp->ms_max_size);
2381 
2382 	/*
2383 	 * Segment-based weighting requires space map histogram support.
2384 	 */
2385 	if (zfs_metaslab_segment_weight_enabled &&
2386 	    spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
2387 	    (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
2388 	    sizeof (space_map_phys_t))) {
2389 		weight = metaslab_segment_weight(msp);
2390 	} else {
2391 		weight = metaslab_space_weight(msp);
2392 	}
2393 	return (weight);
2394 }
2395 
2396 void
2397 metaslab_recalculate_weight_and_sort(metaslab_t *msp)
2398 {
2399 	/* note: we preserve the mask (e.g. indication of primary, etc..) */
2400 	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2401 	metaslab_group_sort(msp->ms_group, msp,
2402 	    metaslab_weight(msp) | was_active);
2403 }
2404 
2405 static int
2406 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2407     int allocator, uint64_t activation_weight)
2408 {
2409 	/*
2410 	 * If we're activating for the claim code, we don't want to actually
2411 	 * set the metaslab up for a specific allocator.
2412 	 */
2413 	if (activation_weight == METASLAB_WEIGHT_CLAIM)
2414 		return (0);
2415 	metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
2416 	    mg->mg_primaries : mg->mg_secondaries);
2417 
2418 	ASSERT(MUTEX_HELD(&msp->ms_lock));
2419 	mutex_enter(&mg->mg_lock);
2420 	if (arr[allocator] != NULL) {
2421 		mutex_exit(&mg->mg_lock);
2422 		return (EEXIST);
2423 	}
2424 
2425 	arr[allocator] = msp;
2426 	ASSERT3S(msp->ms_allocator, ==, -1);
2427 	msp->ms_allocator = allocator;
2428 	msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
2429 	mutex_exit(&mg->mg_lock);
2430 
2431 	return (0);
2432 }
2433 
2434 static int
2435 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
2436 {
2437 	ASSERT(MUTEX_HELD(&msp->ms_lock));
2438 
2439 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
2440 		int error = metaslab_load(msp);
2441 		if (error != 0) {
2442 			metaslab_group_sort(msp->ms_group, msp, 0);
2443 			return (error);
2444 		}
2445 		if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
2446 			/*
2447 			 * The metaslab was activated for another allocator
2448 			 * while we were waiting, we should reselect.
2449 			 */
2450 			return (EBUSY);
2451 		}
2452 		if ((error = metaslab_activate_allocator(msp->ms_group, msp,
2453 		    allocator, activation_weight)) != 0) {
2454 			return (error);
2455 		}
2456 
2457 		msp->ms_activation_weight = msp->ms_weight;
2458 		metaslab_group_sort(msp->ms_group, msp,
2459 		    msp->ms_weight | activation_weight);
2460 	}
2461 	ASSERT(msp->ms_loaded);
2462 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
2463 
2464 	return (0);
2465 }
2466 
2467 static void
2468 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2469     uint64_t weight)
2470 {
2471 	ASSERT(MUTEX_HELD(&msp->ms_lock));
2472 	if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
2473 		metaslab_group_sort(mg, msp, weight);
2474 		return;
2475 	}
2476 
2477 	mutex_enter(&mg->mg_lock);
2478 	ASSERT3P(msp->ms_group, ==, mg);
2479 	if (msp->ms_primary) {
2480 		ASSERT3U(0, <=, msp->ms_allocator);
2481 		ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
2482 		ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
2483 		ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
2484 		mg->mg_primaries[msp->ms_allocator] = NULL;
2485 	} else {
2486 		ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
2487 		ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
2488 		mg->mg_secondaries[msp->ms_allocator] = NULL;
2489 	}
2490 	msp->ms_allocator = -1;
2491 	metaslab_group_sort_impl(mg, msp, weight);
2492 	mutex_exit(&mg->mg_lock);
2493 }
2494 
2495 static void
2496 metaslab_passivate(metaslab_t *msp, uint64_t weight)
2497 {
2498 	uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
2499 
2500 	/*
2501 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
2502 	 * this metaslab again.  In that case, it had better be empty,
2503 	 * or we would be leaving space on the table.
2504 	 */
2505 	ASSERT(size >= SPA_MINBLOCKSIZE ||
2506 	    range_tree_is_empty(msp->ms_allocatable));
2507 	ASSERT0(weight & METASLAB_ACTIVE_MASK);
2508 
2509 	msp->ms_activation_weight = 0;
2510 	metaslab_passivate_allocator(msp->ms_group, msp, weight);
2511 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
2512 }
2513 
2514 /*
2515  * Segment-based metaslabs are activated once and remain active until
2516  * we either fail an allocation attempt (similar to space-based metaslabs)
2517  * or have exhausted the free space in zfs_metaslab_switch_threshold
2518  * buckets since the metaslab was activated. This function checks to see
2519  * if we've exhaused the zfs_metaslab_switch_threshold buckets in the
2520  * metaslab and passivates it proactively. This will allow us to select a
2521  * metaslabs with larger contiguous region if any remaining within this
2522  * metaslab group. If we're in sync pass > 1, then we continue using this
2523  * metaslab so that we don't dirty more block and cause more sync passes.
2524  */
2525 void
2526 metaslab_segment_may_passivate(metaslab_t *msp)
2527 {
2528 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2529 
2530 	if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
2531 		return;
2532 
2533 	/*
2534 	 * Since we are in the middle of a sync pass, the most accurate
2535 	 * information that is accessible to us is the in-core range tree
2536 	 * histogram; calculate the new weight based on that information.
2537 	 */
2538 	uint64_t weight = metaslab_weight_from_range_tree(msp);
2539 	int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
2540 	int current_idx = WEIGHT_GET_INDEX(weight);
2541 
2542 	if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
2543 		metaslab_passivate(msp, weight);
2544 }
2545 
2546 static void
2547 metaslab_preload(void *arg)
2548 {
2549 	metaslab_t *msp = arg;
2550 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2551 
2552 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
2553 
2554 	mutex_enter(&msp->ms_lock);
2555 	(void) metaslab_load(msp);
2556 	msp->ms_selected_txg = spa_syncing_txg(spa);
2557 	mutex_exit(&msp->ms_lock);
2558 }
2559 
2560 static void
2561 metaslab_group_preload(metaslab_group_t *mg)
2562 {
2563 	spa_t *spa = mg->mg_vd->vdev_spa;
2564 	metaslab_t *msp;
2565 	avl_tree_t *t = &mg->mg_metaslab_tree;
2566 	int m = 0;
2567 
2568 	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
2569 		taskq_wait(mg->mg_taskq);
2570 		return;
2571 	}
2572 
2573 	mutex_enter(&mg->mg_lock);
2574 
2575 	/*
2576 	 * Load the next potential metaslabs
2577 	 */
2578 	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
2579 		ASSERT3P(msp->ms_group, ==, mg);
2580 
2581 		/*
2582 		 * We preload only the maximum number of metaslabs specified
2583 		 * by metaslab_preload_limit. If a metaslab is being forced
2584 		 * to condense then we preload it too. This will ensure
2585 		 * that force condensing happens in the next txg.
2586 		 */
2587 		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
2588 			continue;
2589 		}
2590 
2591 		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
2592 		    msp, TQ_SLEEP) != TASKQID_INVALID);
2593 	}
2594 	mutex_exit(&mg->mg_lock);
2595 }
2596 
2597 /*
2598  * Determine if the space map's on-disk footprint is past our tolerance
2599  * for inefficiency. We would like to use the following criteria to make
2600  * our decision:
2601  *
2602  * 1. The size of the space map object should not dramatically increase as a
2603  * result of writing out the free space range tree.
2604  *
2605  * 2. The minimal on-disk space map representation is zfs_condense_pct/100
2606  * times the size than the free space range tree representation
2607  * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
2608  *
2609  * 3. The on-disk size of the space map should actually decrease.
2610  *
2611  * Unfortunately, we cannot compute the on-disk size of the space map in this
2612  * context because we cannot accurately compute the effects of compression, etc.
2613  * Instead, we apply the heuristic described in the block comment for
2614  * zfs_metaslab_condense_block_threshold - we only condense if the space used
2615  * is greater than a threshold number of blocks.
2616  */
2617 static boolean_t
2618 metaslab_should_condense(metaslab_t *msp)
2619 {
2620 	space_map_t *sm = msp->ms_sm;
2621 	vdev_t *vd = msp->ms_group->mg_vd;
2622 	uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
2623 	uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
2624 
2625 	ASSERT(MUTEX_HELD(&msp->ms_lock));
2626 	ASSERT(msp->ms_loaded);
2627 
2628 	/*
2629 	 * Allocations and frees in early passes are generally more space
2630 	 * efficient (in terms of blocks described in space map entries)
2631 	 * than the ones in later passes (e.g. we don't compress after
2632 	 * sync pass 5) and condensing a metaslab multiple times in a txg
2633 	 * could degrade performance.
2634 	 *
2635 	 * Thus we prefer condensing each metaslab at most once every txg at
2636 	 * the earliest sync pass possible. If a metaslab is eligible for
2637 	 * condensing again after being considered for condensing within the
2638 	 * same txg, it will hopefully be dirty in the next txg where it will
2639 	 * be condensed at an earlier pass.
2640 	 */
2641 	if (msp->ms_condense_checked_txg == current_txg)
2642 		return (B_FALSE);
2643 	msp->ms_condense_checked_txg = current_txg;
2644 
2645 	/*
2646 	 * We always condense metaslabs that are empty and metaslabs for
2647 	 * which a condense request has been made.
2648 	 */
2649 	if (avl_is_empty(&msp->ms_allocatable_by_size) ||
2650 	    msp->ms_condense_wanted)
2651 		return (B_TRUE);
2652 
2653 	uint64_t object_size = space_map_length(msp->ms_sm);
2654 	uint64_t optimal_size = space_map_estimate_optimal_size(sm,
2655 	    msp->ms_allocatable, SM_NO_VDEVID);
2656 
2657 	dmu_object_info_t doi;
2658 	dmu_object_info_from_db(sm->sm_dbuf, &doi);
2659 	uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
2660 
2661 	return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
2662 	    object_size > zfs_metaslab_condense_block_threshold * record_size);
2663 }
2664 
2665 /*
2666  * Condense the on-disk space map representation to its minimized form.
2667  * The minimized form consists of a small number of allocations followed by
2668  * the entries of the free range tree.
2669  */
2670 static void
2671 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
2672 {
2673 	range_tree_t *condense_tree;
2674 	space_map_t *sm = msp->ms_sm;
2675 
2676 	ASSERT(MUTEX_HELD(&msp->ms_lock));
2677 	ASSERT(msp->ms_loaded);
2678 
2679 	zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
2680 	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
2681 	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
2682 	    msp->ms_group->mg_vd->vdev_spa->spa_name,
2683 	    space_map_length(msp->ms_sm),
2684 	    avl_numnodes(&msp->ms_allocatable->rt_root),
2685 	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
2686 
2687 	msp->ms_condense_wanted = B_FALSE;
2688 
2689 	/*
2690 	 * Create an range tree that is 100% allocated. We remove segments
2691 	 * that have been freed in this txg, any deferred frees that exist,
2692 	 * and any allocation in the future. Removing segments should be
2693 	 * a relatively inexpensive operation since we expect these trees to
2694 	 * have a small number of nodes.
2695 	 */
2696 	condense_tree = range_tree_create(NULL, NULL);
2697 	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
2698 
2699 	range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree);
2700 	range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree);
2701 
2702 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2703 		range_tree_walk(msp->ms_defer[t],
2704 		    range_tree_remove, condense_tree);
2705 	}
2706 
2707 	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2708 		range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
2709 		    range_tree_remove, condense_tree);
2710 	}
2711 
2712 	/*
2713 	 * We're about to drop the metaslab's lock thus allowing
2714 	 * other consumers to change it's content. Set the
2715 	 * metaslab's ms_condensing flag to ensure that
2716 	 * allocations on this metaslab do not occur while we're
2717 	 * in the middle of committing it to disk. This is only critical
2718 	 * for ms_allocatable as all other range trees use per txg
2719 	 * views of their content.
2720 	 */
2721 	msp->ms_condensing = B_TRUE;
2722 
2723 	mutex_exit(&msp->ms_lock);
2724 	space_map_truncate(sm, zfs_metaslab_sm_blksz, tx);
2725 
2726 	/*
2727 	 * While we would ideally like to create a space map representation
2728 	 * that consists only of allocation records, doing so can be
2729 	 * prohibitively expensive because the in-core free tree can be
2730 	 * large, and therefore computationally expensive to subtract
2731 	 * from the condense_tree. Instead we sync out two trees, a cheap
2732 	 * allocation only tree followed by the in-core free tree. While not
2733 	 * optimal, this is typically close to optimal, and much cheaper to
2734 	 * compute.
2735 	 */
2736 	space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
2737 	range_tree_vacate(condense_tree, NULL, NULL);
2738 	range_tree_destroy(condense_tree);
2739 
2740 	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
2741 	mutex_enter(&msp->ms_lock);
2742 	msp->ms_condensing = B_FALSE;
2743 }
2744 
2745 /*
2746  * Write a metaslab to disk in the context of the specified transaction group.
2747  */
2748 void
2749 metaslab_sync(metaslab_t *msp, uint64_t txg)
2750 {
2751 	metaslab_group_t *mg = msp->ms_group;
2752 	vdev_t *vd = mg->mg_vd;
2753 	spa_t *spa = vd->vdev_spa;
2754 	objset_t *mos = spa_meta_objset(spa);
2755 	range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
2756 	dmu_tx_t *tx;
2757 	uint64_t object = space_map_object(msp->ms_sm);
2758 
2759 	ASSERT(!vd->vdev_ishole);
2760 
2761 	/*
2762 	 * This metaslab has just been added so there's no work to do now.
2763 	 */
2764 	if (msp->ms_freeing == NULL) {
2765 		ASSERT3P(alloctree, ==, NULL);
2766 		return;
2767 	}
2768 
2769 	ASSERT3P(alloctree, !=, NULL);
2770 	ASSERT3P(msp->ms_freeing, !=, NULL);
2771 	ASSERT3P(msp->ms_freed, !=, NULL);
2772 	ASSERT3P(msp->ms_checkpointing, !=, NULL);
2773 
2774 	/*
2775 	 * Normally, we don't want to process a metaslab if there are no
2776 	 * allocations or frees to perform. However, if the metaslab is being
2777 	 * forced to condense and it's loaded, we need to let it through.
2778 	 */
2779 	if (range_tree_is_empty(alloctree) &&
2780 	    range_tree_is_empty(msp->ms_freeing) &&
2781 	    range_tree_is_empty(msp->ms_checkpointing) &&
2782 	    !(msp->ms_loaded && msp->ms_condense_wanted))
2783 		return;
2784 
2785 
2786 	VERIFY(txg <= spa_final_dirty_txg(spa));
2787 
2788 	/*
2789 	 * The only state that can actually be changing concurrently
2790 	 * with metaslab_sync() is the metaslab's ms_allocatable. No
2791 	 * other thread can be modifying this txg's alloc, freeing,
2792 	 * freed, or space_map_phys_t.  We drop ms_lock whenever we
2793 	 * could call into the DMU, because the DMU can call down to
2794 	 * us (e.g. via zio_free()) at any time.
2795 	 *
2796 	 * The spa_vdev_remove_thread() can be reading metaslab state
2797 	 * concurrently, and it is locked out by the ms_sync_lock.
2798 	 * Note that the ms_lock is insufficient for this, because it
2799 	 * is dropped by space_map_write().
2800 	 */
2801 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2802 
2803 	if (msp->ms_sm == NULL) {
2804 		uint64_t new_object;
2805 
2806 		new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
2807 		VERIFY3U(new_object, !=, 0);
2808 
2809 		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2810 		    msp->ms_start, msp->ms_size, vd->vdev_ashift));
2811 
2812 		ASSERT(msp->ms_sm != NULL);
2813 		ASSERT0(metaslab_allocated_space(msp));
2814 	}
2815 
2816 	if (!range_tree_is_empty(msp->ms_checkpointing) &&
2817 	    vd->vdev_checkpoint_sm == NULL) {
2818 		ASSERT(spa_has_checkpoint(spa));
2819 
2820 		uint64_t new_object = space_map_alloc(mos,
2821 		    vdev_standard_sm_blksz, tx);
2822 		VERIFY3U(new_object, !=, 0);
2823 
2824 		VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
2825 		    mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
2826 		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2827 
2828 		/*
2829 		 * We save the space map object as an entry in vdev_top_zap
2830 		 * so it can be retrieved when the pool is reopened after an
2831 		 * export or through zdb.
2832 		 */
2833 		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
2834 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
2835 		    sizeof (new_object), 1, &new_object, tx));
2836 	}
2837 
2838 	mutex_enter(&msp->ms_sync_lock);
2839 	mutex_enter(&msp->ms_lock);
2840 
2841 	/*
2842 	 * Note: metaslab_condense() clears the space map's histogram.
2843 	 * Therefore we must verify and remove this histogram before
2844 	 * condensing.
2845 	 */
2846 	metaslab_group_histogram_verify(mg);
2847 	metaslab_class_histogram_verify(mg->mg_class);
2848 	metaslab_group_histogram_remove(mg, msp);
2849 
2850 	if (msp->ms_loaded && metaslab_should_condense(msp)) {
2851 		metaslab_condense(msp, txg, tx);
2852 	} else {
2853 		mutex_exit(&msp->ms_lock);
2854 		space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
2855 		    SM_NO_VDEVID, tx);
2856 		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
2857 		    SM_NO_VDEVID, tx);
2858 		mutex_enter(&msp->ms_lock);
2859 	}
2860 
2861 	msp->ms_allocated_space += range_tree_space(alloctree);
2862 	ASSERT3U(msp->ms_allocated_space, >=,
2863 	    range_tree_space(msp->ms_freeing));
2864 	msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
2865 
2866 	if (!range_tree_is_empty(msp->ms_checkpointing)) {
2867 		ASSERT(spa_has_checkpoint(spa));
2868 		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2869 
2870 		/*
2871 		 * Since we are doing writes to disk and the ms_checkpointing
2872 		 * tree won't be changing during that time, we drop the
2873 		 * ms_lock while writing to the checkpoint space map.
2874 		 */
2875 		mutex_exit(&msp->ms_lock);
2876 		space_map_write(vd->vdev_checkpoint_sm,
2877 		    msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
2878 		mutex_enter(&msp->ms_lock);
2879 
2880 		spa->spa_checkpoint_info.sci_dspace +=
2881 		    range_tree_space(msp->ms_checkpointing);
2882 		vd->vdev_stat.vs_checkpoint_space +=
2883 		    range_tree_space(msp->ms_checkpointing);
2884 		ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
2885 		    -space_map_allocated(vd->vdev_checkpoint_sm));
2886 
2887 		range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
2888 	}
2889 
2890 	if (msp->ms_loaded) {
2891 		/*
2892 		 * When the space map is loaded, we have an accurate
2893 		 * histogram in the range tree. This gives us an opportunity
2894 		 * to bring the space map's histogram up-to-date so we clear
2895 		 * it first before updating it.
2896 		 */
2897 		space_map_histogram_clear(msp->ms_sm);
2898 		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
2899 
2900 		/*
2901 		 * Since we've cleared the histogram we need to add back
2902 		 * any free space that has already been processed, plus
2903 		 * any deferred space. This allows the on-disk histogram
2904 		 * to accurately reflect all free space even if some space
2905 		 * is not yet available for allocation (i.e. deferred).
2906 		 */
2907 		space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
2908 
2909 		/*
2910 		 * Add back any deferred free space that has not been
2911 		 * added back into the in-core free tree yet. This will
2912 		 * ensure that we don't end up with a space map histogram
2913 		 * that is completely empty unless the metaslab is fully
2914 		 * allocated.
2915 		 */
2916 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2917 			space_map_histogram_add(msp->ms_sm,
2918 			    msp->ms_defer[t], tx);
2919 		}
2920 	}
2921 
2922 	/*
2923 	 * Always add the free space from this sync pass to the space
2924 	 * map histogram. We want to make sure that the on-disk histogram
2925 	 * accounts for all free space. If the space map is not loaded,
2926 	 * then we will lose some accuracy but will correct it the next
2927 	 * time we load the space map.
2928 	 */
2929 	space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
2930 	metaslab_aux_histograms_update(msp);
2931 
2932 	metaslab_group_histogram_add(mg, msp);
2933 	metaslab_group_histogram_verify(mg);
2934 	metaslab_class_histogram_verify(mg->mg_class);
2935 
2936 	/*
2937 	 * For sync pass 1, we avoid traversing this txg's free range tree
2938 	 * and instead will just swap the pointers for freeing and freed.
2939 	 * We can safely do this since the freed_tree is guaranteed to be
2940 	 * empty on the initial pass.
2941 	 */
2942 	if (spa_sync_pass(spa) == 1) {
2943 		range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
2944 		ASSERT0(msp->ms_allocated_this_txg);
2945 	} else {
2946 		range_tree_vacate(msp->ms_freeing,
2947 		    range_tree_add, msp->ms_freed);
2948 	}
2949 	msp->ms_allocated_this_txg += range_tree_space(alloctree);
2950 	range_tree_vacate(alloctree, NULL, NULL);
2951 
2952 	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2953 	ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
2954 	    & TXG_MASK]));
2955 	ASSERT0(range_tree_space(msp->ms_freeing));
2956 	ASSERT0(range_tree_space(msp->ms_checkpointing));
2957 
2958 	mutex_exit(&msp->ms_lock);
2959 
2960 	if (object != space_map_object(msp->ms_sm)) {
2961 		object = space_map_object(msp->ms_sm);
2962 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2963 		    msp->ms_id, sizeof (uint64_t), &object, tx);
2964 	}
2965 	mutex_exit(&msp->ms_sync_lock);
2966 	dmu_tx_commit(tx);
2967 }
2968 
2969 /*
2970  * Called after a transaction group has completely synced to mark
2971  * all of the metaslab's free space as usable.
2972  */
2973 void
2974 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2975 {
2976 	metaslab_group_t *mg = msp->ms_group;
2977 	vdev_t *vd = mg->mg_vd;
2978 	spa_t *spa = vd->vdev_spa;
2979 	range_tree_t **defer_tree;
2980 	int64_t alloc_delta, defer_delta;
2981 	boolean_t defer_allowed = B_TRUE;
2982 
2983 	ASSERT(!vd->vdev_ishole);
2984 
2985 	mutex_enter(&msp->ms_lock);
2986 
2987 	/*
2988 	 * If this metaslab is just becoming available, initialize its
2989 	 * range trees and add its capacity to the vdev.
2990 	 */
2991 	if (msp->ms_freed == NULL) {
2992 		for (int t = 0; t < TXG_SIZE; t++) {
2993 			ASSERT(msp->ms_allocating[t] == NULL);
2994 
2995 			msp->ms_allocating[t] = range_tree_create(NULL, NULL);
2996 		}
2997 
2998 		ASSERT3P(msp->ms_freeing, ==, NULL);
2999 		msp->ms_freeing = range_tree_create(NULL, NULL);
3000 
3001 		ASSERT3P(msp->ms_freed, ==, NULL);
3002 		msp->ms_freed = range_tree_create(NULL, NULL);
3003 
3004 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3005 			ASSERT(msp->ms_defer[t] == NULL);
3006 
3007 			msp->ms_defer[t] = range_tree_create(NULL, NULL);
3008 		}
3009 
3010 		ASSERT3P(msp->ms_checkpointing, ==, NULL);
3011 		msp->ms_checkpointing = range_tree_create(NULL, NULL);
3012 
3013 		metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
3014 	}
3015 	ASSERT0(range_tree_space(msp->ms_freeing));
3016 	ASSERT0(range_tree_space(msp->ms_checkpointing));
3017 
3018 	defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
3019 
3020 	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
3021 	    metaslab_class_get_alloc(spa_normal_class(spa));
3022 	if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
3023 		defer_allowed = B_FALSE;
3024 	}
3025 
3026 	defer_delta = 0;
3027 	alloc_delta = msp->ms_allocated_this_txg -
3028 	    range_tree_space(msp->ms_freed);
3029 	if (defer_allowed) {
3030 		defer_delta = range_tree_space(msp->ms_freed) -
3031 		    range_tree_space(*defer_tree);
3032 	} else {
3033 		defer_delta -= range_tree_space(*defer_tree);
3034 	}
3035 
3036 	metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
3037 	    defer_delta, 0);
3038 
3039 	/*
3040 	 * If there's a metaslab_load() in progress, wait for it to complete
3041 	 * so that we have a consistent view of the in-core space map.
3042 	 */
3043 	metaslab_load_wait(msp);
3044 
3045 	/*
3046 	 * Move the frees from the defer_tree back to the free
3047 	 * range tree (if it's loaded). Swap the freed_tree and
3048 	 * the defer_tree -- this is safe to do because we've
3049 	 * just emptied out the defer_tree.
3050 	 */
3051 	range_tree_vacate(*defer_tree,
3052 	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
3053 	if (defer_allowed) {
3054 		range_tree_swap(&msp->ms_freed, defer_tree);
3055 	} else {
3056 		range_tree_vacate(msp->ms_freed,
3057 		    msp->ms_loaded ? range_tree_add : NULL,
3058 		    msp->ms_allocatable);
3059 	}
3060 
3061 	msp->ms_synced_length = space_map_length(msp->ms_sm);
3062 
3063 	msp->ms_deferspace += defer_delta;
3064 	ASSERT3S(msp->ms_deferspace, >=, 0);
3065 	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
3066 	if (msp->ms_deferspace != 0) {
3067 		/*
3068 		 * Keep syncing this metaslab until all deferred frees
3069 		 * are back in circulation.
3070 		 */
3071 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
3072 	}
3073 	metaslab_aux_histograms_update_done(msp, defer_allowed);
3074 
3075 	if (msp->ms_new) {
3076 		msp->ms_new = B_FALSE;
3077 		mutex_enter(&mg->mg_lock);
3078 		mg->mg_ms_ready++;
3079 		mutex_exit(&mg->mg_lock);
3080 	}
3081 
3082 	/*
3083 	 * Re-sort metaslab within its group now that we've adjusted
3084 	 * its allocatable space.
3085 	 */
3086 	metaslab_recalculate_weight_and_sort(msp);
3087 
3088 	/*
3089 	 * If the metaslab is loaded and we've not tried to load or allocate
3090 	 * from it in 'metaslab_unload_delay' txgs, then unload it.
3091 	 */
3092 	if (msp->ms_loaded &&
3093 	    msp->ms_initializing == 0 &&
3094 	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
3095 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
3096 			VERIFY0(range_tree_space(
3097 			    msp->ms_allocating[(txg + t) & TXG_MASK]));
3098 		}
3099 		if (msp->ms_allocator != -1) {
3100 			metaslab_passivate(msp, msp->ms_weight &
3101 			    ~METASLAB_ACTIVE_MASK);
3102 		}
3103 
3104 		if (!metaslab_debug_unload)
3105 			metaslab_unload(msp);
3106 	}
3107 
3108 	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
3109 	ASSERT0(range_tree_space(msp->ms_freeing));
3110 	ASSERT0(range_tree_space(msp->ms_freed));
3111 	ASSERT0(range_tree_space(msp->ms_checkpointing));
3112 
3113 	msp->ms_allocated_this_txg = 0;
3114 	mutex_exit(&msp->ms_lock);
3115 }
3116 
3117 void
3118 metaslab_sync_reassess(metaslab_group_t *mg)
3119 {
3120 	spa_t *spa = mg->mg_class->mc_spa;
3121 
3122 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3123 	metaslab_group_alloc_update(mg);
3124 	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
3125 
3126 	/*
3127 	 * Preload the next potential metaslabs but only on active
3128 	 * metaslab groups. We can get into a state where the metaslab
3129 	 * is no longer active since we dirty metaslabs as we remove a
3130 	 * a device, thus potentially making the metaslab group eligible
3131 	 * for preloading.
3132 	 */
3133 	if (mg->mg_activation_count > 0) {
3134 		metaslab_group_preload(mg);
3135 	}
3136 	spa_config_exit(spa, SCL_ALLOC, FTAG);
3137 }
3138 
3139 /*
3140  * When writing a ditto block (i.e. more than one DVA for a given BP) on
3141  * the same vdev as an existing DVA of this BP, then try to allocate it
3142  * on a different metaslab than existing DVAs (i.e. a unique metaslab).
3143  */
3144 static boolean_t
3145 metaslab_is_unique(metaslab_t *msp, dva_t *dva)
3146 {
3147 	uint64_t dva_ms_id;
3148 
3149 	if (DVA_GET_ASIZE(dva) == 0)
3150 		return (B_TRUE);
3151 
3152 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
3153 		return (B_TRUE);
3154 
3155 	dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
3156 
3157 	return (msp->ms_id != dva_ms_id);
3158 }
3159 
3160 /*
3161  * ==========================================================================
3162  * Metaslab allocation tracing facility
3163  * ==========================================================================
3164  */
3165 kstat_t *metaslab_trace_ksp;
3166 kstat_named_t metaslab_trace_over_limit;
3167 
3168 void
3169 metaslab_alloc_trace_init(void)
3170 {
3171 	ASSERT(metaslab_alloc_trace_cache == NULL);
3172 	metaslab_alloc_trace_cache = kmem_cache_create(
3173 	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
3174 	    0, NULL, NULL, NULL, NULL, NULL, 0);
3175 	metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
3176 	    "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
3177 	if (metaslab_trace_ksp != NULL) {
3178 		metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
3179 		kstat_named_init(&metaslab_trace_over_limit,
3180 		    "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
3181 		kstat_install(metaslab_trace_ksp);
3182 	}
3183 }
3184 
3185 void
3186 metaslab_alloc_trace_fini(void)
3187 {
3188 	if (metaslab_trace_ksp != NULL) {
3189 		kstat_delete(metaslab_trace_ksp);
3190 		metaslab_trace_ksp = NULL;
3191 	}
3192 	kmem_cache_destroy(metaslab_alloc_trace_cache);
3193 	metaslab_alloc_trace_cache = NULL;
3194 }
3195 
3196 /*
3197  * Add an allocation trace element to the allocation tracing list.
3198  */
3199 static void
3200 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
3201     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
3202     int allocator)
3203 {
3204 	if (!metaslab_trace_enabled)
3205 		return;
3206 
3207 	/*
3208 	 * When the tracing list reaches its maximum we remove
3209 	 * the second element in the list before adding a new one.
3210 	 * By removing the second element we preserve the original
3211 	 * entry as a clue to what allocations steps have already been
3212 	 * performed.
3213 	 */
3214 	if (zal->zal_size == metaslab_trace_max_entries) {
3215 		metaslab_alloc_trace_t *mat_next;
3216 #ifdef DEBUG
3217 		panic("too many entries in allocation list");
3218 #endif
3219 		atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
3220 		zal->zal_size--;
3221 		mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
3222 		list_remove(&zal->zal_list, mat_next);
3223 		kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
3224 	}
3225 
3226 	metaslab_alloc_trace_t *mat =
3227 	    kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
3228 	list_link_init(&mat->mat_list_node);
3229 	mat->mat_mg = mg;
3230 	mat->mat_msp = msp;
3231 	mat->mat_size = psize;
3232 	mat->mat_dva_id = dva_id;
3233 	mat->mat_offset = offset;
3234 	mat->mat_weight = 0;
3235 	mat->mat_allocator = allocator;
3236 
3237 	if (msp != NULL)
3238 		mat->mat_weight = msp->ms_weight;
3239 
3240 	/*
3241 	 * The list is part of the zio so locking is not required. Only
3242 	 * a single thread will perform allocations for a given zio.
3243 	 */
3244 	list_insert_tail(&zal->zal_list, mat);
3245 	zal->zal_size++;
3246 
3247 	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
3248 }
3249 
3250 void
3251 metaslab_trace_init(zio_alloc_list_t *zal)
3252 {
3253 	list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
3254 	    offsetof(metaslab_alloc_trace_t, mat_list_node));
3255 	zal->zal_size = 0;
3256 }
3257 
3258 void
3259 metaslab_trace_fini(zio_alloc_list_t *zal)
3260 {
3261 	metaslab_alloc_trace_t *mat;
3262 
3263 	while ((mat = list_remove_head(&zal->zal_list)) != NULL)
3264 		kmem_cache_free(metaslab_alloc_trace_cache, mat);
3265 	list_destroy(&zal->zal_list);
3266 	zal->zal_size = 0;
3267 }
3268 
3269 /*
3270  * ==========================================================================
3271  * Metaslab block operations
3272  * ==========================================================================
3273  */
3274 
3275 static void
3276 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
3277     int allocator)
3278 {
3279 	if (!(flags & METASLAB_ASYNC_ALLOC) ||
3280 	    (flags & METASLAB_DONT_THROTTLE))
3281 		return;
3282 
3283 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
3284 	if (!mg->mg_class->mc_alloc_throttle_enabled)
3285 		return;
3286 
3287 	(void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
3288 }
3289 
3290 static void
3291 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
3292 {
3293 	uint64_t max = mg->mg_max_alloc_queue_depth;
3294 	uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
3295 	while (cur < max) {
3296 		if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
3297 		    cur, cur + 1) == cur) {
3298 			atomic_inc_64(
3299 			    &mg->mg_class->mc_alloc_max_slots[allocator]);
3300 			return;
3301 		}
3302 		cur = mg->mg_cur_max_alloc_queue_depth[allocator];
3303 	}
3304 }
3305 
3306 void
3307 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
3308     int allocator, boolean_t io_complete)
3309 {
3310 	if (!(flags & METASLAB_ASYNC_ALLOC) ||
3311 	    (flags & METASLAB_DONT_THROTTLE))
3312 		return;
3313 
3314 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
3315 	if (!mg->mg_class->mc_alloc_throttle_enabled)
3316 		return;
3317 
3318 	(void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
3319 	if (io_complete)
3320 		metaslab_group_increment_qdepth(mg, allocator);
3321 }
3322 
3323 void
3324 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
3325     int allocator)
3326 {
3327 #ifdef ZFS_DEBUG
3328 	const dva_t *dva = bp->blk_dva;
3329 	int ndvas = BP_GET_NDVAS(bp);
3330 
3331 	for (int d = 0; d < ndvas; d++) {
3332 		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
3333 		metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
3334 		VERIFY(zfs_refcount_not_held(
3335 		    &mg->mg_alloc_queue_depth[allocator], tag));
3336 	}
3337 #endif
3338 }
3339 
3340 static uint64_t
3341 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
3342 {
3343 	uint64_t start;
3344 	range_tree_t *rt = msp->ms_allocatable;
3345 	metaslab_class_t *mc = msp->ms_group->mg_class;
3346 
3347 	VERIFY(!msp->ms_condensing);
3348 	VERIFY0(msp->ms_initializing);
3349 
3350 	start = mc->mc_ops->msop_alloc(msp, size);
3351 	if (start != -1ULL) {
3352 		metaslab_group_t *mg = msp->ms_group;
3353 		vdev_t *vd = mg->mg_vd;
3354 
3355 		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
3356 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3357 		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
3358 		range_tree_remove(rt, start, size);
3359 
3360 		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
3361 			vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
3362 
3363 		range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
3364 
3365 		/* Track the last successful allocation */
3366 		msp->ms_alloc_txg = txg;
3367 		metaslab_verify_space(msp, txg);
3368 	}
3369 
3370 	/*
3371 	 * Now that we've attempted the allocation we need to update the
3372 	 * metaslab's maximum block size since it may have changed.
3373 	 */
3374 	msp->ms_max_size = metaslab_block_maxsize(msp);
3375 	return (start);
3376 }
3377 
3378 /*
3379  * Find the metaslab with the highest weight that is less than what we've
3380  * already tried.  In the common case, this means that we will examine each
3381  * metaslab at most once. Note that concurrent callers could reorder metaslabs
3382  * by activation/passivation once we have dropped the mg_lock. If a metaslab is
3383  * activated by another thread, and we fail to allocate from the metaslab we
3384  * have selected, we may not try the newly-activated metaslab, and instead
3385  * activate another metaslab.  This is not optimal, but generally does not cause
3386  * any problems (a possible exception being if every metaslab is completely full
3387  * except for the the newly-activated metaslab which we fail to examine).
3388  */
3389 static metaslab_t *
3390 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
3391     dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
3392     zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
3393 {
3394 	avl_index_t idx;
3395 	avl_tree_t *t = &mg->mg_metaslab_tree;
3396 	metaslab_t *msp = avl_find(t, search, &idx);
3397 	if (msp == NULL)
3398 		msp = avl_nearest(t, idx, AVL_AFTER);
3399 
3400 	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
3401 		int i;
3402 		if (!metaslab_should_allocate(msp, asize)) {
3403 			metaslab_trace_add(zal, mg, msp, asize, d,
3404 			    TRACE_TOO_SMALL, allocator);
3405 			continue;
3406 		}
3407 
3408 		/*
3409 		 * If the selected metaslab is condensing or being
3410 		 * initialized, skip it.
3411 		 */
3412 		if (msp->ms_condensing || msp->ms_initializing > 0)
3413 			continue;
3414 
3415 		*was_active = msp->ms_allocator != -1;
3416 		/*
3417 		 * If we're activating as primary, this is our first allocation
3418 		 * from this disk, so we don't need to check how close we are.
3419 		 * If the metaslab under consideration was already active,
3420 		 * we're getting desperate enough to steal another allocator's
3421 		 * metaslab, so we still don't care about distances.
3422 		 */
3423 		if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
3424 			break;
3425 
3426 		for (i = 0; i < d; i++) {
3427 			if (want_unique &&
3428 			    !metaslab_is_unique(msp, &dva[i]))
3429 				break;  /* try another metaslab */
3430 		}
3431 		if (i == d)
3432 			break;
3433 	}
3434 
3435 	if (msp != NULL) {
3436 		search->ms_weight = msp->ms_weight;
3437 		search->ms_start = msp->ms_start + 1;
3438 		search->ms_allocator = msp->ms_allocator;
3439 		search->ms_primary = msp->ms_primary;
3440 	}
3441 	return (msp);
3442 }
3443 
3444 /* ARGSUSED */
3445 static uint64_t
3446 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
3447     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
3448     int d, int allocator)
3449 {
3450 	metaslab_t *msp = NULL;
3451 	uint64_t offset = -1ULL;
3452 	uint64_t activation_weight;
3453 
3454 	activation_weight = METASLAB_WEIGHT_PRIMARY;
3455 	for (int i = 0; i < d; i++) {
3456 		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
3457 		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
3458 			activation_weight = METASLAB_WEIGHT_SECONDARY;
3459 		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
3460 		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
3461 			activation_weight = METASLAB_WEIGHT_CLAIM;
3462 			break;
3463 		}
3464 	}
3465 
3466 	/*
3467 	 * If we don't have enough metaslabs active to fill the entire array, we
3468 	 * just use the 0th slot.
3469 	 */
3470 	if (mg->mg_ms_ready < mg->mg_allocators * 3)
3471 		allocator = 0;
3472 
3473 	ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
3474 
3475 	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
3476 	search->ms_weight = UINT64_MAX;
3477 	search->ms_start = 0;
3478 	/*
3479 	 * At the end of the metaslab tree are the already-active metaslabs,
3480 	 * first the primaries, then the secondaries. When we resume searching
3481 	 * through the tree, we need to consider ms_allocator and ms_primary so
3482 	 * we start in the location right after where we left off, and don't
3483 	 * accidentally loop forever considering the same metaslabs.
3484 	 */
3485 	search->ms_allocator = -1;
3486 	search->ms_primary = B_TRUE;
3487 	for (;;) {
3488 		boolean_t was_active = B_FALSE;
3489 
3490 		mutex_enter(&mg->mg_lock);
3491 
3492 		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
3493 		    mg->mg_primaries[allocator] != NULL) {
3494 			msp = mg->mg_primaries[allocator];
3495 			was_active = B_TRUE;
3496 		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
3497 		    mg->mg_secondaries[allocator] != NULL) {
3498 			msp = mg->mg_secondaries[allocator];
3499 			was_active = B_TRUE;
3500 		} else {
3501 			msp = find_valid_metaslab(mg, activation_weight, dva, d,
3502 			    want_unique, asize, allocator, zal, search,
3503 			    &was_active);
3504 		}
3505 
3506 		mutex_exit(&mg->mg_lock);
3507 		if (msp == NULL) {
3508 			kmem_free(search, sizeof (*search));
3509 			return (-1ULL);
3510 		}
3511 
3512 		mutex_enter(&msp->ms_lock);
3513 		/*
3514 		 * Ensure that the metaslab we have selected is still
3515 		 * capable of handling our request. It's possible that
3516 		 * another thread may have changed the weight while we
3517 		 * were blocked on the metaslab lock. We check the
3518 		 * active status first to see if we need to reselect
3519 		 * a new metaslab.
3520 		 */
3521 		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
3522 			mutex_exit(&msp->ms_lock);
3523 			continue;
3524 		}
3525 
3526 		/*
3527 		 * If the metaslab is freshly activated for an allocator that
3528 		 * isn't the one we're allocating from, or if it's a primary and
3529 		 * we're seeking a secondary (or vice versa), we go back and
3530 		 * select a new metaslab.
3531 		 */
3532 		if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
3533 		    (msp->ms_allocator != -1) &&
3534 		    (msp->ms_allocator != allocator || ((activation_weight ==
3535 		    METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
3536 			mutex_exit(&msp->ms_lock);
3537 			continue;
3538 		}
3539 
3540 		if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
3541 		    activation_weight != METASLAB_WEIGHT_CLAIM) {
3542 			metaslab_passivate(msp, msp->ms_weight &
3543 			    ~METASLAB_WEIGHT_CLAIM);
3544 			mutex_exit(&msp->ms_lock);
3545 			continue;
3546 		}
3547 
3548 		if (metaslab_activate(msp, allocator, activation_weight) != 0) {
3549 			mutex_exit(&msp->ms_lock);
3550 			continue;
3551 		}
3552 
3553 		msp->ms_selected_txg = txg;
3554 
3555 		/*
3556 		 * Now that we have the lock, recheck to see if we should
3557 		 * continue to use this metaslab for this allocation. The
3558 		 * the metaslab is now loaded so metaslab_should_allocate() can
3559 		 * accurately determine if the allocation attempt should
3560 		 * proceed.
3561 		 */
3562 		if (!metaslab_should_allocate(msp, asize)) {
3563 			/* Passivate this metaslab and select a new one. */
3564 			metaslab_trace_add(zal, mg, msp, asize, d,
3565 			    TRACE_TOO_SMALL, allocator);
3566 			goto next;
3567 		}
3568 
3569 		/*
3570 		 * If this metaslab is currently condensing then pick again as
3571 		 * we can't manipulate this metaslab until it's committed
3572 		 * to disk. If this metaslab is being initialized, we shouldn't
3573 		 * allocate from it since the allocated region might be
3574 		 * overwritten after allocation.
3575 		 */
3576 		if (msp->ms_condensing) {
3577 			metaslab_trace_add(zal, mg, msp, asize, d,
3578 			    TRACE_CONDENSING, allocator);
3579 			metaslab_passivate(msp, msp->ms_weight &
3580 			    ~METASLAB_ACTIVE_MASK);
3581 			mutex_exit(&msp->ms_lock);
3582 			continue;
3583 		} else if (msp->ms_initializing > 0) {
3584 			metaslab_trace_add(zal, mg, msp, asize, d,
3585 			    TRACE_INITIALIZING, allocator);
3586 			metaslab_passivate(msp, msp->ms_weight &
3587 			    ~METASLAB_ACTIVE_MASK);
3588 			mutex_exit(&msp->ms_lock);
3589 			continue;
3590 		}
3591 
3592 		offset = metaslab_block_alloc(msp, asize, txg);
3593 		metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
3594 
3595 		if (offset != -1ULL) {
3596 			/* Proactively passivate the metaslab, if needed */
3597 			metaslab_segment_may_passivate(msp);
3598 			break;
3599 		}
3600 next:
3601 		ASSERT(msp->ms_loaded);
3602 
3603 		/*
3604 		 * We were unable to allocate from this metaslab so determine
3605 		 * a new weight for this metaslab. Now that we have loaded
3606 		 * the metaslab we can provide a better hint to the metaslab
3607 		 * selector.
3608 		 *
3609 		 * For space-based metaslabs, we use the maximum block size.
3610 		 * This information is only available when the metaslab
3611 		 * is loaded and is more accurate than the generic free
3612 		 * space weight that was calculated by metaslab_weight().
3613 		 * This information allows us to quickly compare the maximum
3614 		 * available allocation in the metaslab to the allocation
3615 		 * size being requested.
3616 		 *
3617 		 * For segment-based metaslabs, determine the new weight
3618 		 * based on the highest bucket in the range tree. We
3619 		 * explicitly use the loaded segment weight (i.e. the range
3620 		 * tree histogram) since it contains the space that is
3621 		 * currently available for allocation and is accurate
3622 		 * even within a sync pass.
3623 		 */
3624 		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
3625 			uint64_t weight = metaslab_block_maxsize(msp);
3626 			WEIGHT_SET_SPACEBASED(weight);
3627 			metaslab_passivate(msp, weight);
3628 		} else {
3629 			metaslab_passivate(msp,
3630 			    metaslab_weight_from_range_tree(msp));
3631 		}
3632 
3633 		/*
3634 		 * We have just failed an allocation attempt, check
3635 		 * that metaslab_should_allocate() agrees. Otherwise,
3636 		 * we may end up in an infinite loop retrying the same
3637 		 * metaslab.
3638 		 */
3639 		ASSERT(!metaslab_should_allocate(msp, asize));
3640 
3641 		mutex_exit(&msp->ms_lock);
3642 	}
3643 	mutex_exit(&msp->ms_lock);
3644 	kmem_free(search, sizeof (*search));
3645 	return (offset);
3646 }
3647 
3648 static uint64_t
3649 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
3650     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
3651     int d, int allocator)
3652 {
3653 	uint64_t offset;
3654 	ASSERT(mg->mg_initialized);
3655 
3656 	offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
3657 	    dva, d, allocator);
3658 
3659 	mutex_enter(&mg->mg_lock);
3660 	if (offset == -1ULL) {
3661 		mg->mg_failed_allocations++;
3662 		metaslab_trace_add(zal, mg, NULL, asize, d,
3663 		    TRACE_GROUP_FAILURE, allocator);
3664 		if (asize == SPA_GANGBLOCKSIZE) {
3665 			/*
3666 			 * This metaslab group was unable to allocate
3667 			 * the minimum gang block size so it must be out of
3668 			 * space. We must notify the allocation throttle
3669 			 * to start skipping allocation attempts to this
3670 			 * metaslab group until more space becomes available.
3671 			 * Note: this failure cannot be caused by the
3672 			 * allocation throttle since the allocation throttle
3673 			 * is only responsible for skipping devices and
3674 			 * not failing block allocations.
3675 			 */
3676 			mg->mg_no_free_space = B_TRUE;
3677 		}
3678 	}
3679 	mg->mg_allocations++;
3680 	mutex_exit(&mg->mg_lock);
3681 	return (offset);
3682 }
3683 
3684 /*
3685  * Allocate a block for the specified i/o.
3686  */
3687 int
3688 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
3689     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
3690     zio_alloc_list_t *zal, int allocator)
3691 {
3692 	metaslab_group_t *mg, *rotor;
3693 	vdev_t *vd;
3694 	boolean_t try_hard = B_FALSE;
3695 
3696 	ASSERT(!DVA_IS_VALID(&dva[d]));
3697 
3698 	/*
3699 	 * For testing, make some blocks above a certain size be gang blocks.
3700 	 * This will also test spilling from special to normal.
3701 	 */
3702 	if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
3703 		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
3704 		    allocator);
3705 		return (SET_ERROR(ENOSPC));
3706 	}
3707 
3708 	/*
3709 	 * Start at the rotor and loop through all mgs until we find something.
3710 	 * Note that there's no locking on mc_rotor or mc_aliquot because
3711 	 * nothing actually breaks if we miss a few updates -- we just won't
3712 	 * allocate quite as evenly.  It all balances out over time.
3713 	 *
3714 	 * If we are doing ditto or log blocks, try to spread them across
3715 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
3716 	 * allocated all of our ditto blocks, then try and spread them out on
3717 	 * that vdev as much as possible.  If it turns out to not be possible,
3718 	 * gradually lower our standards until anything becomes acceptable.
3719 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
3720 	 * gives us hope of containing our fault domains to something we're
3721 	 * able to reason about.  Otherwise, any two top-level vdev failures
3722 	 * will guarantee the loss of data.  With consecutive allocation,
3723 	 * only two adjacent top-level vdev failures will result in data loss.
3724 	 *
3725 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
3726 	 * ourselves on the same vdev as our gang block header.  That
3727 	 * way, we can hope for locality in vdev_cache, plus it makes our
3728 	 * fault domains something tractable.
3729 	 */
3730 	if (hintdva) {
3731 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
3732 
3733 		/*
3734 		 * It's possible the vdev we're using as the hint no
3735 		 * longer exists or its mg has been closed (e.g. by
3736 		 * device removal).  Consult the rotor when
3737 		 * all else fails.
3738 		 */
3739 		if (vd != NULL && vd->vdev_mg != NULL) {
3740 			mg = vd->vdev_mg;
3741 
3742 			if (flags & METASLAB_HINTBP_AVOID &&
3743 			    mg->mg_next != NULL)
3744 				mg = mg->mg_next;
3745 		} else {
3746 			mg = mc->mc_rotor;
3747 		}
3748 	} else if (d != 0) {
3749 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
3750 		mg = vd->vdev_mg->mg_next;
3751 	} else {
3752 		ASSERT(mc->mc_rotor != NULL);
3753 		mg = mc->mc_rotor;
3754 	}
3755 
3756 	/*
3757 	 * If the hint put us into the wrong metaslab class, or into a
3758 	 * metaslab group that has been passivated, just follow the rotor.
3759 	 */
3760 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
3761 		mg = mc->mc_rotor;
3762 
3763 	rotor = mg;
3764 top:
3765 	do {
3766 		boolean_t allocatable;
3767 
3768 		ASSERT(mg->mg_activation_count == 1);
3769 		vd = mg->mg_vd;
3770 
3771 		/*
3772 		 * Don't allocate from faulted devices.
3773 		 */
3774 		if (try_hard) {
3775 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
3776 			allocatable = vdev_allocatable(vd);
3777 			spa_config_exit(spa, SCL_ZIO, FTAG);
3778 		} else {
3779 			allocatable = vdev_allocatable(vd);
3780 		}
3781 
3782 		/*
3783 		 * Determine if the selected metaslab group is eligible
3784 		 * for allocations. If we're ganging then don't allow
3785 		 * this metaslab group to skip allocations since that would
3786 		 * inadvertently return ENOSPC and suspend the pool
3787 		 * even though space is still available.
3788 		 */
3789 		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3790 			allocatable = metaslab_group_allocatable(mg, rotor,
3791 			    psize, allocator);
3792 		}
3793 
3794 		if (!allocatable) {
3795 			metaslab_trace_add(zal, mg, NULL, psize, d,
3796 			    TRACE_NOT_ALLOCATABLE, allocator);
3797 			goto next;
3798 		}
3799 
3800 		ASSERT(mg->mg_initialized);
3801 
3802 		/*
3803 		 * Avoid writing single-copy data to a failing,
3804 		 * non-redundant vdev, unless we've already tried all
3805 		 * other vdevs.
3806 		 */
3807 		if ((vd->vdev_stat.vs_write_errors > 0 ||
3808 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
3809 		    d == 0 && !try_hard && vd->vdev_children == 0) {
3810 			metaslab_trace_add(zal, mg, NULL, psize, d,
3811 			    TRACE_VDEV_ERROR, allocator);
3812 			goto next;
3813 		}
3814 
3815 		ASSERT(mg->mg_class == mc);
3816 
3817 		uint64_t asize = vdev_psize_to_asize(vd, psize);
3818 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3819 
3820 		/*
3821 		 * If we don't need to try hard, then require that the
3822 		 * block be on an different metaslab from any other DVAs
3823 		 * in this BP (unique=true).  If we are trying hard, then
3824 		 * allow any metaslab to be used (unique=false).
3825 		 */
3826 		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3827 		    !try_hard, dva, d, allocator);
3828 
3829 		if (offset != -1ULL) {
3830 			/*
3831 			 * If we've just selected this metaslab group,
3832 			 * figure out whether the corresponding vdev is
3833 			 * over- or under-used relative to the pool,
3834 			 * and set an allocation bias to even it out.
3835 			 */
3836 			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
3837 				vdev_stat_t *vs = &vd->vdev_stat;
3838 				int64_t vu, cu;
3839 
3840 				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
3841 				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
3842 
3843 				/*
3844 				 * Calculate how much more or less we should
3845 				 * try to allocate from this device during
3846 				 * this iteration around the rotor.
3847 				 * For example, if a device is 80% full
3848 				 * and the pool is 20% full then we should
3849 				 * reduce allocations by 60% on this device.
3850 				 *
3851 				 * mg_bias = (20 - 80) * 512K / 100 = -307K
3852 				 *
3853 				 * This reduces allocations by 307K for this
3854 				 * iteration.
3855 				 */
3856 				mg->mg_bias = ((cu - vu) *
3857 				    (int64_t)mg->mg_aliquot) / 100;
3858 			} else if (!metaslab_bias_enabled) {
3859 				mg->mg_bias = 0;
3860 			}
3861 
3862 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
3863 			    mg->mg_aliquot + mg->mg_bias) {
3864 				mc->mc_rotor = mg->mg_next;
3865 				mc->mc_aliquot = 0;
3866 			}
3867 
3868 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
3869 			DVA_SET_OFFSET(&dva[d], offset);
3870 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
3871 			DVA_SET_ASIZE(&dva[d], asize);
3872 
3873 			return (0);
3874 		}
3875 next:
3876 		mc->mc_rotor = mg->mg_next;
3877 		mc->mc_aliquot = 0;
3878 	} while ((mg = mg->mg_next) != rotor);
3879 
3880 	/*
3881 	 * If we haven't tried hard, do so now.
3882 	 */
3883 	if (!try_hard) {
3884 		try_hard = B_TRUE;
3885 		goto top;
3886 	}
3887 
3888 	bzero(&dva[d], sizeof (dva_t));
3889 
3890 	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
3891 	return (SET_ERROR(ENOSPC));
3892 }
3893 
3894 void
3895 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
3896     boolean_t checkpoint)
3897 {
3898 	metaslab_t *msp;
3899 	spa_t *spa = vd->vdev_spa;
3900 
3901 	ASSERT(vdev_is_concrete(vd));
3902 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3903 	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
3904 
3905 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3906 
3907 	VERIFY(!msp->ms_condensing);
3908 	VERIFY3U(offset, >=, msp->ms_start);
3909 	VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
3910 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3911 	VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
3912 
3913 	metaslab_check_free_impl(vd, offset, asize);
3914 
3915 	mutex_enter(&msp->ms_lock);
3916 	if (range_tree_is_empty(msp->ms_freeing) &&
3917 	    range_tree_is_empty(msp->ms_checkpointing)) {
3918 		vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
3919 	}
3920 
3921 	if (checkpoint) {
3922 		ASSERT(spa_has_checkpoint(spa));
3923 		range_tree_add(msp->ms_checkpointing, offset, asize);
3924 	} else {
3925 		range_tree_add(msp->ms_freeing, offset, asize);
3926 	}
3927 	mutex_exit(&msp->ms_lock);
3928 }
3929 
3930 /* ARGSUSED */
3931 void
3932 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3933     uint64_t size, void *arg)
3934 {
3935 	boolean_t *checkpoint = arg;
3936 
3937 	ASSERT3P(checkpoint, !=, NULL);
3938 
3939 	if (vd->vdev_ops->vdev_op_remap != NULL)
3940 		vdev_indirect_mark_obsolete(vd, offset, size);
3941 	else
3942 		metaslab_free_impl(vd, offset, size, *checkpoint);
3943 }
3944 
3945 static void
3946 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
3947     boolean_t checkpoint)
3948 {
3949 	spa_t *spa = vd->vdev_spa;
3950 
3951 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3952 
3953 	if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
3954 		return;
3955 
3956 	if (spa->spa_vdev_removal != NULL &&
3957 	    spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
3958 	    vdev_is_concrete(vd)) {
3959 		/*
3960 		 * Note: we check if the vdev is concrete because when
3961 		 * we complete the removal, we first change the vdev to be
3962 		 * an indirect vdev (in open context), and then (in syncing
3963 		 * context) clear spa_vdev_removal.
3964 		 */
3965 		free_from_removing_vdev(vd, offset, size);
3966 	} else if (vd->vdev_ops->vdev_op_remap != NULL) {
3967 		vdev_indirect_mark_obsolete(vd, offset, size);
3968 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
3969 		    metaslab_free_impl_cb, &checkpoint);
3970 	} else {
3971 		metaslab_free_concrete(vd, offset, size, checkpoint);
3972 	}
3973 }
3974 
3975 typedef struct remap_blkptr_cb_arg {
3976 	blkptr_t *rbca_bp;
3977 	spa_remap_cb_t rbca_cb;
3978 	vdev_t *rbca_remap_vd;
3979 	uint64_t rbca_remap_offset;
3980 	void *rbca_cb_arg;
3981 } remap_blkptr_cb_arg_t;
3982 
3983 void
3984 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3985     uint64_t size, void *arg)
3986 {
3987 	remap_blkptr_cb_arg_t *rbca = arg;
3988 	blkptr_t *bp = rbca->rbca_bp;
3989 
3990 	/* We can not remap split blocks. */
3991 	if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
3992 		return;
3993 	ASSERT0(inner_offset);
3994 
3995 	if (rbca->rbca_cb != NULL) {
3996 		/*
3997 		 * At this point we know that we are not handling split
3998 		 * blocks and we invoke the callback on the previous
3999 		 * vdev which must be indirect.
4000 		 */
4001 		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
4002 
4003 		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
4004 		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
4005 
4006 		/* set up remap_blkptr_cb_arg for the next call */
4007 		rbca->rbca_remap_vd = vd;
4008 		rbca->rbca_remap_offset = offset;
4009 	}
4010 
4011 	/*
4012 	 * The phys birth time is that of dva[0].  This ensures that we know
4013 	 * when each dva was written, so that resilver can determine which
4014 	 * blocks need to be scrubbed (i.e. those written during the time
4015 	 * the vdev was offline).  It also ensures that the key used in
4016 	 * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
4017 	 * we didn't change the phys_birth, a lookup in the ARC for a
4018 	 * remapped BP could find the data that was previously stored at
4019 	 * this vdev + offset.
4020 	 */
4021 	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
4022 	    DVA_GET_VDEV(&bp->blk_dva[0]));
4023 	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
4024 	bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
4025 	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
4026 
4027 	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
4028 	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
4029 }
4030 
4031 /*
4032  * If the block pointer contains any indirect DVAs, modify them to refer to
4033  * concrete DVAs.  Note that this will sometimes not be possible, leaving
4034  * the indirect DVA in place.  This happens if the indirect DVA spans multiple
4035  * segments in the mapping (i.e. it is a "split block").
4036  *
4037  * If the BP was remapped, calls the callback on the original dva (note the
4038  * callback can be called multiple times if the original indirect DVA refers
4039  * to another indirect DVA, etc).
4040  *
4041  * Returns TRUE if the BP was remapped.
4042  */
4043 boolean_t
4044 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
4045 {
4046 	remap_blkptr_cb_arg_t rbca;
4047 
4048 	if (!zfs_remap_blkptr_enable)
4049 		return (B_FALSE);
4050 
4051 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
4052 		return (B_FALSE);
4053 
4054 	/*
4055 	 * Dedup BP's can not be remapped, because ddt_phys_select() depends
4056 	 * on DVA[0] being the same in the BP as in the DDT (dedup table).
4057 	 */
4058 	if (BP_GET_DEDUP(bp))
4059 		return (B_FALSE);
4060 
4061 	/*
4062 	 * Gang blocks can not be remapped, because
4063 	 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
4064 	 * the BP used to read the gang block header (GBH) being the same
4065 	 * as the DVA[0] that we allocated for the GBH.
4066 	 */
4067 	if (BP_IS_GANG(bp))
4068 		return (B_FALSE);
4069 
4070 	/*
4071 	 * Embedded BP's have no DVA to remap.
4072 	 */
4073 	if (BP_GET_NDVAS(bp) < 1)
4074 		return (B_FALSE);
4075 
4076 	/*
4077 	 * Note: we only remap dva[0].  If we remapped other dvas, we
4078 	 * would no longer know what their phys birth txg is.
4079 	 */
4080 	dva_t *dva = &bp->blk_dva[0];
4081 
4082 	uint64_t offset = DVA_GET_OFFSET(dva);
4083 	uint64_t size = DVA_GET_ASIZE(dva);
4084 	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
4085 
4086 	if (vd->vdev_ops->vdev_op_remap == NULL)
4087 		return (B_FALSE);
4088 
4089 	rbca.rbca_bp = bp;
4090 	rbca.rbca_cb = callback;
4091 	rbca.rbca_remap_vd = vd;
4092 	rbca.rbca_remap_offset = offset;
4093 	rbca.rbca_cb_arg = arg;
4094 
4095 	/*
4096 	 * remap_blkptr_cb() will be called in order for each level of
4097 	 * indirection, until a concrete vdev is reached or a split block is
4098 	 * encountered. old_vd and old_offset are updated within the callback
4099 	 * as we go from the one indirect vdev to the next one (either concrete
4100 	 * or indirect again) in that order.
4101 	 */
4102 	vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
4103 
4104 	/* Check if the DVA wasn't remapped because it is a split block */
4105 	if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
4106 		return (B_FALSE);
4107 
4108 	return (B_TRUE);
4109 }
4110 
4111 /*
4112  * Undo the allocation of a DVA which happened in the given transaction group.
4113  */
4114 void
4115 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
4116 {
4117 	metaslab_t *msp;
4118 	vdev_t *vd;
4119 	uint64_t vdev = DVA_GET_VDEV(dva);
4120 	uint64_t offset = DVA_GET_OFFSET(dva);
4121 	uint64_t size = DVA_GET_ASIZE(dva);
4122 
4123 	ASSERT(DVA_IS_VALID(dva));
4124 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
4125 
4126 	if (txg > spa_freeze_txg(spa))
4127 		return;
4128 
4129 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
4130 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
4131 		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
4132 		    (u_longlong_t)vdev, (u_longlong_t)offset);
4133 		ASSERT(0);
4134 		return;
4135 	}
4136 
4137 	ASSERT(!vd->vdev_removing);
4138 	ASSERT(vdev_is_concrete(vd));
4139 	ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
4140 	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
4141 
4142 	if (DVA_GET_GANG(dva))
4143 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4144 
4145 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4146 
4147 	mutex_enter(&msp->ms_lock);
4148 	range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
4149 	    offset, size);
4150 
4151 	VERIFY(!msp->ms_condensing);
4152 	VERIFY3U(offset, >=, msp->ms_start);
4153 	VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
4154 	VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
4155 	    msp->ms_size);
4156 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
4157 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
4158 	range_tree_add(msp->ms_allocatable, offset, size);
4159 	mutex_exit(&msp->ms_lock);
4160 }
4161 
4162 /*
4163  * Free the block represented by the given DVA.
4164  */
4165 void
4166 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
4167 {
4168 	uint64_t vdev = DVA_GET_VDEV(dva);
4169 	uint64_t offset = DVA_GET_OFFSET(dva);
4170 	uint64_t size = DVA_GET_ASIZE(dva);
4171 	vdev_t *vd = vdev_lookup_top(spa, vdev);
4172 
4173 	ASSERT(DVA_IS_VALID(dva));
4174 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
4175 
4176 	if (DVA_GET_GANG(dva)) {
4177 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4178 	}
4179 
4180 	metaslab_free_impl(vd, offset, size, checkpoint);
4181 }
4182 
4183 /*
4184  * Reserve some allocation slots. The reservation system must be called
4185  * before we call into the allocator. If there aren't any available slots
4186  * then the I/O will be throttled until an I/O completes and its slots are
4187  * freed up. The function returns true if it was successful in placing
4188  * the reservation.
4189  */
4190 boolean_t
4191 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
4192     zio_t *zio, int flags)
4193 {
4194 	uint64_t available_slots = 0;
4195 	boolean_t slot_reserved = B_FALSE;
4196 	uint64_t max = mc->mc_alloc_max_slots[allocator];
4197 
4198 	ASSERT(mc->mc_alloc_throttle_enabled);
4199 	mutex_enter(&mc->mc_lock);
4200 
4201 	uint64_t reserved_slots =
4202 	    zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
4203 	if (reserved_slots < max)
4204 		available_slots = max - reserved_slots;
4205 
4206 	if (slots <= available_slots || GANG_ALLOCATION(flags) ||
4207 	    flags & METASLAB_MUST_RESERVE) {
4208 		/*
4209 		 * We reserve the slots individually so that we can unreserve
4210 		 * them individually when an I/O completes.
4211 		 */
4212 		for (int d = 0; d < slots; d++) {
4213 			reserved_slots =
4214 			    zfs_refcount_add(&mc->mc_alloc_slots[allocator],
4215 			    zio);
4216 		}
4217 		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
4218 		slot_reserved = B_TRUE;
4219 	}
4220 
4221 	mutex_exit(&mc->mc_lock);
4222 	return (slot_reserved);
4223 }
4224 
4225 void
4226 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
4227     int allocator, zio_t *zio)
4228 {
4229 	ASSERT(mc->mc_alloc_throttle_enabled);
4230 	mutex_enter(&mc->mc_lock);
4231 	for (int d = 0; d < slots; d++) {
4232 		(void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator],
4233 		    zio);
4234 	}
4235 	mutex_exit(&mc->mc_lock);
4236 }
4237 
4238 static int
4239 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
4240     uint64_t txg)
4241 {
4242 	metaslab_t *msp;
4243 	spa_t *spa = vd->vdev_spa;
4244 	int error = 0;
4245 
4246 	if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
4247 		return (ENXIO);
4248 
4249 	ASSERT3P(vd->vdev_ms, !=, NULL);
4250 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4251 
4252 	mutex_enter(&msp->ms_lock);
4253 
4254 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
4255 		error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
4256 	/*
4257 	 * No need to fail in that case; someone else has activated the
4258 	 * metaslab, but that doesn't preclude us from using it.
4259 	 */
4260 	if (error == EBUSY)
4261 		error = 0;
4262 
4263 	if (error == 0 &&
4264 	    !range_tree_contains(msp->ms_allocatable, offset, size))
4265 		error = SET_ERROR(ENOENT);
4266 
4267 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
4268 		mutex_exit(&msp->ms_lock);
4269 		return (error);
4270 	}
4271 
4272 	VERIFY(!msp->ms_condensing);
4273 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
4274 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
4275 	VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
4276 	    msp->ms_size);
4277 	range_tree_remove(msp->ms_allocatable, offset, size);
4278 
4279 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
4280 		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
4281 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
4282 		range_tree_add(msp->ms_allocating[txg & TXG_MASK],
4283 		    offset, size);
4284 	}
4285 
4286 	mutex_exit(&msp->ms_lock);
4287 
4288 	return (0);
4289 }
4290 
4291 typedef struct metaslab_claim_cb_arg_t {
4292 	uint64_t	mcca_txg;
4293 	int		mcca_error;
4294 } metaslab_claim_cb_arg_t;
4295 
4296 /* ARGSUSED */
4297 static void
4298 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
4299     uint64_t size, void *arg)
4300 {
4301 	metaslab_claim_cb_arg_t *mcca_arg = arg;
4302 
4303 	if (mcca_arg->mcca_error == 0) {
4304 		mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
4305 		    size, mcca_arg->mcca_txg);
4306 	}
4307 }
4308 
4309 int
4310 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
4311 {
4312 	if (vd->vdev_ops->vdev_op_remap != NULL) {
4313 		metaslab_claim_cb_arg_t arg;
4314 
4315 		/*
4316 		 * Only zdb(1M) can claim on indirect vdevs.  This is used
4317 		 * to detect leaks of mapped space (that are not accounted
4318 		 * for in the obsolete counts, spacemap, or bpobj).
4319 		 */
4320 		ASSERT(!spa_writeable(vd->vdev_spa));
4321 		arg.mcca_error = 0;
4322 		arg.mcca_txg = txg;
4323 
4324 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
4325 		    metaslab_claim_impl_cb, &arg);
4326 
4327 		if (arg.mcca_error == 0) {
4328 			arg.mcca_error = metaslab_claim_concrete(vd,
4329 			    offset, size, txg);
4330 		}
4331 		return (arg.mcca_error);
4332 	} else {
4333 		return (metaslab_claim_concrete(vd, offset, size, txg));
4334 	}
4335 }
4336 
4337 /*
4338  * Intent log support: upon opening the pool after a crash, notify the SPA
4339  * of blocks that the intent log has allocated for immediate write, but
4340  * which are still considered free by the SPA because the last transaction
4341  * group didn't commit yet.
4342  */
4343 static int
4344 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
4345 {
4346 	uint64_t vdev = DVA_GET_VDEV(dva);
4347 	uint64_t offset = DVA_GET_OFFSET(dva);
4348 	uint64_t size = DVA_GET_ASIZE(dva);
4349 	vdev_t *vd;
4350 
4351 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
4352 		return (SET_ERROR(ENXIO));
4353 	}
4354 
4355 	ASSERT(DVA_IS_VALID(dva));
4356 
4357 	if (DVA_GET_GANG(dva))
4358 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4359 
4360 	return (metaslab_claim_impl(vd, offset, size, txg));
4361 }
4362 
4363 int
4364 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
4365     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
4366     zio_alloc_list_t *zal, zio_t *zio, int allocator)
4367 {
4368 	dva_t *dva = bp->blk_dva;
4369 	dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
4370 	int error = 0;
4371 
4372 	ASSERT(bp->blk_birth == 0);
4373 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
4374 
4375 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
4376 
4377 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
4378 		spa_config_exit(spa, SCL_ALLOC, FTAG);
4379 		return (SET_ERROR(ENOSPC));
4380 	}
4381 
4382 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
4383 	ASSERT(BP_GET_NDVAS(bp) == 0);
4384 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
4385 	ASSERT3P(zal, !=, NULL);
4386 
4387 	for (int d = 0; d < ndvas; d++) {
4388 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
4389 		    txg, flags, zal, allocator);
4390 		if (error != 0) {
4391 			for (d--; d >= 0; d--) {
4392 				metaslab_unalloc_dva(spa, &dva[d], txg);
4393 				metaslab_group_alloc_decrement(spa,
4394 				    DVA_GET_VDEV(&dva[d]), zio, flags,
4395 				    allocator, B_FALSE);
4396 				bzero(&dva[d], sizeof (dva_t));
4397 			}
4398 			spa_config_exit(spa, SCL_ALLOC, FTAG);
4399 			return (error);
4400 		} else {
4401 			/*
4402 			 * Update the metaslab group's queue depth
4403 			 * based on the newly allocated dva.
4404 			 */
4405 			metaslab_group_alloc_increment(spa,
4406 			    DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
4407 		}
4408 
4409 	}
4410 	ASSERT(error == 0);
4411 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
4412 
4413 	spa_config_exit(spa, SCL_ALLOC, FTAG);
4414 
4415 	BP_SET_BIRTH(bp, txg, txg);
4416 
4417 	return (0);
4418 }
4419 
4420 void
4421 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
4422 {
4423 	const dva_t *dva = bp->blk_dva;
4424 	int ndvas = BP_GET_NDVAS(bp);
4425 
4426 	ASSERT(!BP_IS_HOLE(bp));
4427 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
4428 
4429 	/*
4430 	 * If we have a checkpoint for the pool we need to make sure that
4431 	 * the blocks that we free that are part of the checkpoint won't be
4432 	 * reused until the checkpoint is discarded or we revert to it.
4433 	 *
4434 	 * The checkpoint flag is passed down the metaslab_free code path
4435 	 * and is set whenever we want to add a block to the checkpoint's
4436 	 * accounting. That is, we "checkpoint" blocks that existed at the
4437 	 * time the checkpoint was created and are therefore referenced by
4438 	 * the checkpointed uberblock.
4439 	 *
4440 	 * Note that, we don't checkpoint any blocks if the current
4441 	 * syncing txg <= spa_checkpoint_txg. We want these frees to sync
4442 	 * normally as they will be referenced by the checkpointed uberblock.
4443 	 */
4444 	boolean_t checkpoint = B_FALSE;
4445 	if (bp->blk_birth <= spa->spa_checkpoint_txg &&
4446 	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
4447 		/*
4448 		 * At this point, if the block is part of the checkpoint
4449 		 * there is no way it was created in the current txg.
4450 		 */
4451 		ASSERT(!now);
4452 		ASSERT3U(spa_syncing_txg(spa), ==, txg);
4453 		checkpoint = B_TRUE;
4454 	}
4455 
4456 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
4457 
4458 	for (int d = 0; d < ndvas; d++) {
4459 		if (now) {
4460 			metaslab_unalloc_dva(spa, &dva[d], txg);
4461 		} else {
4462 			ASSERT3U(txg, ==, spa_syncing_txg(spa));
4463 			metaslab_free_dva(spa, &dva[d], checkpoint);
4464 		}
4465 	}
4466 
4467 	spa_config_exit(spa, SCL_FREE, FTAG);
4468 }
4469 
4470 int
4471 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
4472 {
4473 	const dva_t *dva = bp->blk_dva;
4474 	int ndvas = BP_GET_NDVAS(bp);
4475 	int error = 0;
4476 
4477 	ASSERT(!BP_IS_HOLE(bp));
4478 
4479 	if (txg != 0) {
4480 		/*
4481 		 * First do a dry run to make sure all DVAs are claimable,
4482 		 * so we don't have to unwind from partial failures below.
4483 		 */
4484 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
4485 			return (error);
4486 	}
4487 
4488 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
4489 
4490 	for (int d = 0; d < ndvas; d++) {
4491 		error = metaslab_claim_dva(spa, &dva[d], txg);
4492 		if (error != 0)
4493 			break;
4494 	}
4495 
4496 	spa_config_exit(spa, SCL_ALLOC, FTAG);
4497 
4498 	ASSERT(error == 0 || txg == 0);
4499 
4500 	return (error);
4501 }
4502 
4503 /* ARGSUSED */
4504 static void
4505 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
4506     uint64_t size, void *arg)
4507 {
4508 	if (vd->vdev_ops == &vdev_indirect_ops)
4509 		return;
4510 
4511 	metaslab_check_free_impl(vd, offset, size);
4512 }
4513 
4514 static void
4515 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
4516 {
4517 	metaslab_t *msp;
4518 	spa_t *spa = vd->vdev_spa;
4519 
4520 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
4521 		return;
4522 
4523 	if (vd->vdev_ops->vdev_op_remap != NULL) {
4524 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
4525 		    metaslab_check_free_impl_cb, NULL);
4526 		return;
4527 	}
4528 
4529 	ASSERT(vdev_is_concrete(vd));
4530 	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
4531 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
4532 
4533 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4534 
4535 	mutex_enter(&msp->ms_lock);
4536 	if (msp->ms_loaded) {
4537 		range_tree_verify_not_present(msp->ms_allocatable,
4538 		    offset, size);
4539 	}
4540 
4541 	range_tree_verify_not_present(msp->ms_freeing, offset, size);
4542 	range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
4543 	range_tree_verify_not_present(msp->ms_freed, offset, size);
4544 	for (int j = 0; j < TXG_DEFER_SIZE; j++)
4545 		range_tree_verify_not_present(msp->ms_defer[j], offset, size);
4546 	mutex_exit(&msp->ms_lock);
4547 }
4548 
4549 void
4550 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
4551 {
4552 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
4553 		return;
4554 
4555 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
4556 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
4557 		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
4558 		vdev_t *vd = vdev_lookup_top(spa, vdev);
4559 		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
4560 		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
4561 
4562 		if (DVA_GET_GANG(&bp->blk_dva[i]))
4563 			size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4564 
4565 		ASSERT3P(vd, !=, NULL);
4566 
4567 		metaslab_check_free_impl(vd, offset, size);
4568 	}
4569 	spa_config_exit(spa, SCL_VDEV, FTAG);
4570 }
4571