xref: /illumos-gate/usr/src/uts/common/fs/zfs/metaslab.c (revision 6f459ff5b49a8482416f3eab8866c784121ecae3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25  * Copyright (c) 2014 Integros [integros.com]
26  */
27 
28 #include <sys/zfs_context.h>
29 #include <sys/dmu.h>
30 #include <sys/dmu_tx.h>
31 #include <sys/space_map.h>
32 #include <sys/metaslab_impl.h>
33 #include <sys/vdev_impl.h>
34 #include <sys/zio.h>
35 #include <sys/spa_impl.h>
36 #include <sys/zfeature.h>
37 #include <sys/vdev_indirect_mapping.h>
38 
39 #define	GANG_ALLOCATION(flags) \
40 	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
41 
42 uint64_t metaslab_aliquot = 512ULL << 10;
43 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
44 
45 /*
46  * The in-core space map representation is more compact than its on-disk form.
47  * The zfs_condense_pct determines how much more compact the in-core
48  * space map representation must be before we compact it on-disk.
49  * Values should be greater than or equal to 100.
50  */
51 int zfs_condense_pct = 200;
52 
53 /*
54  * Condensing a metaslab is not guaranteed to actually reduce the amount of
55  * space used on disk. In particular, a space map uses data in increments of
56  * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
57  * same number of blocks after condensing. Since the goal of condensing is to
58  * reduce the number of IOPs required to read the space map, we only want to
59  * condense when we can be sure we will reduce the number of blocks used by the
60  * space map. Unfortunately, we cannot precisely compute whether or not this is
61  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
62  * we apply the following heuristic: do not condense a spacemap unless the
63  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
64  * blocks.
65  */
66 int zfs_metaslab_condense_block_threshold = 4;
67 
68 /*
69  * The zfs_mg_noalloc_threshold defines which metaslab groups should
70  * be eligible for allocation. The value is defined as a percentage of
71  * free space. Metaslab groups that have more free space than
72  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
73  * a metaslab group's free space is less than or equal to the
74  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
75  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
76  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
77  * groups are allowed to accept allocations. Gang blocks are always
78  * eligible to allocate on any metaslab group. The default value of 0 means
79  * no metaslab group will be excluded based on this criterion.
80  */
81 int zfs_mg_noalloc_threshold = 0;
82 
83 /*
84  * Metaslab groups are considered eligible for allocations if their
85  * fragmenation metric (measured as a percentage) is less than or equal to
86  * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
87  * then it will be skipped unless all metaslab groups within the metaslab
88  * class have also crossed this threshold.
89  */
90 int zfs_mg_fragmentation_threshold = 85;
91 
92 /*
93  * Allow metaslabs to keep their active state as long as their fragmentation
94  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
95  * active metaslab that exceeds this threshold will no longer keep its active
96  * status allowing better metaslabs to be selected.
97  */
98 int zfs_metaslab_fragmentation_threshold = 70;
99 
100 /*
101  * When set will load all metaslabs when pool is first opened.
102  */
103 int metaslab_debug_load = 0;
104 
105 /*
106  * When set will prevent metaslabs from being unloaded.
107  */
108 int metaslab_debug_unload = 0;
109 
110 /*
111  * Minimum size which forces the dynamic allocator to change
112  * it's allocation strategy.  Once the space map cannot satisfy
113  * an allocation of this size then it switches to using more
114  * aggressive strategy (i.e search by size rather than offset).
115  */
116 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
117 
118 /*
119  * The minimum free space, in percent, which must be available
120  * in a space map to continue allocations in a first-fit fashion.
121  * Once the space map's free space drops below this level we dynamically
122  * switch to using best-fit allocations.
123  */
124 int metaslab_df_free_pct = 4;
125 
126 /*
127  * A metaslab is considered "free" if it contains a contiguous
128  * segment which is greater than metaslab_min_alloc_size.
129  */
130 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
131 
132 /*
133  * Percentage of all cpus that can be used by the metaslab taskq.
134  */
135 int metaslab_load_pct = 50;
136 
137 /*
138  * Determines how many txgs a metaslab may remain loaded without having any
139  * allocations from it. As long as a metaslab continues to be used we will
140  * keep it loaded.
141  */
142 int metaslab_unload_delay = TXG_SIZE * 2;
143 
144 /*
145  * Max number of metaslabs per group to preload.
146  */
147 int metaslab_preload_limit = SPA_DVAS_PER_BP;
148 
149 /*
150  * Enable/disable preloading of metaslab.
151  */
152 boolean_t metaslab_preload_enabled = B_TRUE;
153 
154 /*
155  * Enable/disable fragmentation weighting on metaslabs.
156  */
157 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
158 
159 /*
160  * Enable/disable lba weighting (i.e. outer tracks are given preference).
161  */
162 boolean_t metaslab_lba_weighting_enabled = B_TRUE;
163 
164 /*
165  * Enable/disable metaslab group biasing.
166  */
167 boolean_t metaslab_bias_enabled = B_TRUE;
168 
169 /*
170  * Enable/disable remapping of indirect DVAs to their concrete vdevs.
171  */
172 boolean_t zfs_remap_blkptr_enable = B_TRUE;
173 
174 /*
175  * Enable/disable segment-based metaslab selection.
176  */
177 boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
178 
179 /*
180  * When using segment-based metaslab selection, we will continue
181  * allocating from the active metaslab until we have exhausted
182  * zfs_metaslab_switch_threshold of its buckets.
183  */
184 int zfs_metaslab_switch_threshold = 2;
185 
186 /*
187  * Internal switch to enable/disable the metaslab allocation tracing
188  * facility.
189  */
190 boolean_t metaslab_trace_enabled = B_TRUE;
191 
192 /*
193  * Maximum entries that the metaslab allocation tracing facility will keep
194  * in a given list when running in non-debug mode. We limit the number
195  * of entries in non-debug mode to prevent us from using up too much memory.
196  * The limit should be sufficiently large that we don't expect any allocation
197  * to every exceed this value. In debug mode, the system will panic if this
198  * limit is ever reached allowing for further investigation.
199  */
200 uint64_t metaslab_trace_max_entries = 5000;
201 
202 static uint64_t metaslab_weight(metaslab_t *);
203 static void metaslab_set_fragmentation(metaslab_t *);
204 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
205 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
206 
207 kmem_cache_t *metaslab_alloc_trace_cache;
208 
209 /*
210  * ==========================================================================
211  * Metaslab classes
212  * ==========================================================================
213  */
214 metaslab_class_t *
215 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
216 {
217 	metaslab_class_t *mc;
218 
219 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
220 
221 	mc->mc_spa = spa;
222 	mc->mc_rotor = NULL;
223 	mc->mc_ops = ops;
224 	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
225 	refcount_create_tracked(&mc->mc_alloc_slots);
226 
227 	return (mc);
228 }
229 
230 void
231 metaslab_class_destroy(metaslab_class_t *mc)
232 {
233 	ASSERT(mc->mc_rotor == NULL);
234 	ASSERT(mc->mc_alloc == 0);
235 	ASSERT(mc->mc_deferred == 0);
236 	ASSERT(mc->mc_space == 0);
237 	ASSERT(mc->mc_dspace == 0);
238 
239 	refcount_destroy(&mc->mc_alloc_slots);
240 	mutex_destroy(&mc->mc_lock);
241 	kmem_free(mc, sizeof (metaslab_class_t));
242 }
243 
244 int
245 metaslab_class_validate(metaslab_class_t *mc)
246 {
247 	metaslab_group_t *mg;
248 	vdev_t *vd;
249 
250 	/*
251 	 * Must hold one of the spa_config locks.
252 	 */
253 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
254 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
255 
256 	if ((mg = mc->mc_rotor) == NULL)
257 		return (0);
258 
259 	do {
260 		vd = mg->mg_vd;
261 		ASSERT(vd->vdev_mg != NULL);
262 		ASSERT3P(vd->vdev_top, ==, vd);
263 		ASSERT3P(mg->mg_class, ==, mc);
264 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
265 	} while ((mg = mg->mg_next) != mc->mc_rotor);
266 
267 	return (0);
268 }
269 
270 void
271 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
272     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
273 {
274 	atomic_add_64(&mc->mc_alloc, alloc_delta);
275 	atomic_add_64(&mc->mc_deferred, defer_delta);
276 	atomic_add_64(&mc->mc_space, space_delta);
277 	atomic_add_64(&mc->mc_dspace, dspace_delta);
278 }
279 
280 uint64_t
281 metaslab_class_get_alloc(metaslab_class_t *mc)
282 {
283 	return (mc->mc_alloc);
284 }
285 
286 uint64_t
287 metaslab_class_get_deferred(metaslab_class_t *mc)
288 {
289 	return (mc->mc_deferred);
290 }
291 
292 uint64_t
293 metaslab_class_get_space(metaslab_class_t *mc)
294 {
295 	return (mc->mc_space);
296 }
297 
298 uint64_t
299 metaslab_class_get_dspace(metaslab_class_t *mc)
300 {
301 	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
302 }
303 
304 void
305 metaslab_class_histogram_verify(metaslab_class_t *mc)
306 {
307 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
308 	uint64_t *mc_hist;
309 	int i;
310 
311 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
312 		return;
313 
314 	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
315 	    KM_SLEEP);
316 
317 	for (int c = 0; c < rvd->vdev_children; c++) {
318 		vdev_t *tvd = rvd->vdev_child[c];
319 		metaslab_group_t *mg = tvd->vdev_mg;
320 
321 		/*
322 		 * Skip any holes, uninitialized top-levels, or
323 		 * vdevs that are not in this metalab class.
324 		 */
325 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
326 		    mg->mg_class != mc) {
327 			continue;
328 		}
329 
330 		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
331 			mc_hist[i] += mg->mg_histogram[i];
332 	}
333 
334 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
335 		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
336 
337 	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
338 }
339 
340 /*
341  * Calculate the metaslab class's fragmentation metric. The metric
342  * is weighted based on the space contribution of each metaslab group.
343  * The return value will be a number between 0 and 100 (inclusive), or
344  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
345  * zfs_frag_table for more information about the metric.
346  */
347 uint64_t
348 metaslab_class_fragmentation(metaslab_class_t *mc)
349 {
350 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
351 	uint64_t fragmentation = 0;
352 
353 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
354 
355 	for (int c = 0; c < rvd->vdev_children; c++) {
356 		vdev_t *tvd = rvd->vdev_child[c];
357 		metaslab_group_t *mg = tvd->vdev_mg;
358 
359 		/*
360 		 * Skip any holes, uninitialized top-levels,
361 		 * or vdevs that are not in this metalab class.
362 		 */
363 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
364 		    mg->mg_class != mc) {
365 			continue;
366 		}
367 
368 		/*
369 		 * If a metaslab group does not contain a fragmentation
370 		 * metric then just bail out.
371 		 */
372 		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
373 			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
374 			return (ZFS_FRAG_INVALID);
375 		}
376 
377 		/*
378 		 * Determine how much this metaslab_group is contributing
379 		 * to the overall pool fragmentation metric.
380 		 */
381 		fragmentation += mg->mg_fragmentation *
382 		    metaslab_group_get_space(mg);
383 	}
384 	fragmentation /= metaslab_class_get_space(mc);
385 
386 	ASSERT3U(fragmentation, <=, 100);
387 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
388 	return (fragmentation);
389 }
390 
391 /*
392  * Calculate the amount of expandable space that is available in
393  * this metaslab class. If a device is expanded then its expandable
394  * space will be the amount of allocatable space that is currently not
395  * part of this metaslab class.
396  */
397 uint64_t
398 metaslab_class_expandable_space(metaslab_class_t *mc)
399 {
400 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
401 	uint64_t space = 0;
402 
403 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
404 	for (int c = 0; c < rvd->vdev_children; c++) {
405 		uint64_t tspace;
406 		vdev_t *tvd = rvd->vdev_child[c];
407 		metaslab_group_t *mg = tvd->vdev_mg;
408 
409 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
410 		    mg->mg_class != mc) {
411 			continue;
412 		}
413 
414 		/*
415 		 * Calculate if we have enough space to add additional
416 		 * metaslabs. We report the expandable space in terms
417 		 * of the metaslab size since that's the unit of expansion.
418 		 * Adjust by efi system partition size.
419 		 */
420 		tspace = tvd->vdev_max_asize - tvd->vdev_asize;
421 		if (tspace > mc->mc_spa->spa_bootsize) {
422 			tspace -= mc->mc_spa->spa_bootsize;
423 		}
424 		space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
425 	}
426 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
427 	return (space);
428 }
429 
430 static int
431 metaslab_compare(const void *x1, const void *x2)
432 {
433 	const metaslab_t *m1 = x1;
434 	const metaslab_t *m2 = x2;
435 
436 	if (m1->ms_weight < m2->ms_weight)
437 		return (1);
438 	if (m1->ms_weight > m2->ms_weight)
439 		return (-1);
440 
441 	/*
442 	 * If the weights are identical, use the offset to force uniqueness.
443 	 */
444 	if (m1->ms_start < m2->ms_start)
445 		return (-1);
446 	if (m1->ms_start > m2->ms_start)
447 		return (1);
448 
449 	ASSERT3P(m1, ==, m2);
450 
451 	return (0);
452 }
453 
454 /*
455  * Verify that the space accounting on disk matches the in-core range_trees.
456  */
457 void
458 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
459 {
460 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
461 	uint64_t allocated = 0;
462 	uint64_t sm_free_space, msp_free_space;
463 
464 	ASSERT(MUTEX_HELD(&msp->ms_lock));
465 
466 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
467 		return;
468 
469 	/*
470 	 * We can only verify the metaslab space when we're called
471 	 * from syncing context with a loaded metaslab that has an allocated
472 	 * space map. Calling this in non-syncing context does not
473 	 * provide a consistent view of the metaslab since we're performing
474 	 * allocations in the future.
475 	 */
476 	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
477 	    !msp->ms_loaded)
478 		return;
479 
480 	sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
481 	    space_map_alloc_delta(msp->ms_sm);
482 
483 	/*
484 	 * Account for future allocations since we would have already
485 	 * deducted that space from the ms_freetree.
486 	 */
487 	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
488 		allocated +=
489 		    range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
490 	}
491 
492 	msp_free_space = range_tree_space(msp->ms_tree) + allocated +
493 	    msp->ms_deferspace + range_tree_space(msp->ms_freedtree);
494 
495 	VERIFY3U(sm_free_space, ==, msp_free_space);
496 }
497 
498 /*
499  * ==========================================================================
500  * Metaslab groups
501  * ==========================================================================
502  */
503 /*
504  * Update the allocatable flag and the metaslab group's capacity.
505  * The allocatable flag is set to true if the capacity is below
506  * the zfs_mg_noalloc_threshold or has a fragmentation value that is
507  * greater than zfs_mg_fragmentation_threshold. If a metaslab group
508  * transitions from allocatable to non-allocatable or vice versa then the
509  * metaslab group's class is updated to reflect the transition.
510  */
511 static void
512 metaslab_group_alloc_update(metaslab_group_t *mg)
513 {
514 	vdev_t *vd = mg->mg_vd;
515 	metaslab_class_t *mc = mg->mg_class;
516 	vdev_stat_t *vs = &vd->vdev_stat;
517 	boolean_t was_allocatable;
518 	boolean_t was_initialized;
519 
520 	ASSERT(vd == vd->vdev_top);
521 	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
522 	    SCL_ALLOC);
523 
524 	mutex_enter(&mg->mg_lock);
525 	was_allocatable = mg->mg_allocatable;
526 	was_initialized = mg->mg_initialized;
527 
528 	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
529 	    (vs->vs_space + 1);
530 
531 	mutex_enter(&mc->mc_lock);
532 
533 	/*
534 	 * If the metaslab group was just added then it won't
535 	 * have any space until we finish syncing out this txg.
536 	 * At that point we will consider it initialized and available
537 	 * for allocations.  We also don't consider non-activated
538 	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
539 	 * to be initialized, because they can't be used for allocation.
540 	 */
541 	mg->mg_initialized = metaslab_group_initialized(mg);
542 	if (!was_initialized && mg->mg_initialized) {
543 		mc->mc_groups++;
544 	} else if (was_initialized && !mg->mg_initialized) {
545 		ASSERT3U(mc->mc_groups, >, 0);
546 		mc->mc_groups--;
547 	}
548 	if (mg->mg_initialized)
549 		mg->mg_no_free_space = B_FALSE;
550 
551 	/*
552 	 * A metaslab group is considered allocatable if it has plenty
553 	 * of free space or is not heavily fragmented. We only take
554 	 * fragmentation into account if the metaslab group has a valid
555 	 * fragmentation metric (i.e. a value between 0 and 100).
556 	 */
557 	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
558 	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
559 	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
560 	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
561 
562 	/*
563 	 * The mc_alloc_groups maintains a count of the number of
564 	 * groups in this metaslab class that are still above the
565 	 * zfs_mg_noalloc_threshold. This is used by the allocating
566 	 * threads to determine if they should avoid allocations to
567 	 * a given group. The allocator will avoid allocations to a group
568 	 * if that group has reached or is below the zfs_mg_noalloc_threshold
569 	 * and there are still other groups that are above the threshold.
570 	 * When a group transitions from allocatable to non-allocatable or
571 	 * vice versa we update the metaslab class to reflect that change.
572 	 * When the mc_alloc_groups value drops to 0 that means that all
573 	 * groups have reached the zfs_mg_noalloc_threshold making all groups
574 	 * eligible for allocations. This effectively means that all devices
575 	 * are balanced again.
576 	 */
577 	if (was_allocatable && !mg->mg_allocatable)
578 		mc->mc_alloc_groups--;
579 	else if (!was_allocatable && mg->mg_allocatable)
580 		mc->mc_alloc_groups++;
581 	mutex_exit(&mc->mc_lock);
582 
583 	mutex_exit(&mg->mg_lock);
584 }
585 
586 metaslab_group_t *
587 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
588 {
589 	metaslab_group_t *mg;
590 
591 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
592 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
593 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
594 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
595 	mg->mg_vd = vd;
596 	mg->mg_class = mc;
597 	mg->mg_activation_count = 0;
598 	mg->mg_initialized = B_FALSE;
599 	mg->mg_no_free_space = B_TRUE;
600 	refcount_create_tracked(&mg->mg_alloc_queue_depth);
601 
602 	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
603 	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
604 
605 	return (mg);
606 }
607 
608 void
609 metaslab_group_destroy(metaslab_group_t *mg)
610 {
611 	ASSERT(mg->mg_prev == NULL);
612 	ASSERT(mg->mg_next == NULL);
613 	/*
614 	 * We may have gone below zero with the activation count
615 	 * either because we never activated in the first place or
616 	 * because we're done, and possibly removing the vdev.
617 	 */
618 	ASSERT(mg->mg_activation_count <= 0);
619 
620 	taskq_destroy(mg->mg_taskq);
621 	avl_destroy(&mg->mg_metaslab_tree);
622 	mutex_destroy(&mg->mg_lock);
623 	refcount_destroy(&mg->mg_alloc_queue_depth);
624 	kmem_free(mg, sizeof (metaslab_group_t));
625 }
626 
627 void
628 metaslab_group_activate(metaslab_group_t *mg)
629 {
630 	metaslab_class_t *mc = mg->mg_class;
631 	metaslab_group_t *mgprev, *mgnext;
632 
633 	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
634 
635 	ASSERT(mc->mc_rotor != mg);
636 	ASSERT(mg->mg_prev == NULL);
637 	ASSERT(mg->mg_next == NULL);
638 	ASSERT(mg->mg_activation_count <= 0);
639 
640 	if (++mg->mg_activation_count <= 0)
641 		return;
642 
643 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
644 	metaslab_group_alloc_update(mg);
645 
646 	if ((mgprev = mc->mc_rotor) == NULL) {
647 		mg->mg_prev = mg;
648 		mg->mg_next = mg;
649 	} else {
650 		mgnext = mgprev->mg_next;
651 		mg->mg_prev = mgprev;
652 		mg->mg_next = mgnext;
653 		mgprev->mg_next = mg;
654 		mgnext->mg_prev = mg;
655 	}
656 	mc->mc_rotor = mg;
657 }
658 
659 /*
660  * Passivate a metaslab group and remove it from the allocation rotor.
661  * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
662  * a metaslab group. This function will momentarily drop spa_config_locks
663  * that are lower than the SCL_ALLOC lock (see comment below).
664  */
665 void
666 metaslab_group_passivate(metaslab_group_t *mg)
667 {
668 	metaslab_class_t *mc = mg->mg_class;
669 	spa_t *spa = mc->mc_spa;
670 	metaslab_group_t *mgprev, *mgnext;
671 	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
672 
673 	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
674 	    (SCL_ALLOC | SCL_ZIO));
675 
676 	if (--mg->mg_activation_count != 0) {
677 		ASSERT(mc->mc_rotor != mg);
678 		ASSERT(mg->mg_prev == NULL);
679 		ASSERT(mg->mg_next == NULL);
680 		ASSERT(mg->mg_activation_count < 0);
681 		return;
682 	}
683 
684 	/*
685 	 * The spa_config_lock is an array of rwlocks, ordered as
686 	 * follows (from highest to lowest):
687 	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
688 	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
689 	 * (For more information about the spa_config_lock see spa_misc.c)
690 	 * The higher the lock, the broader its coverage. When we passivate
691 	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
692 	 * config locks. However, the metaslab group's taskq might be trying
693 	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
694 	 * lower locks to allow the I/O to complete. At a minimum,
695 	 * we continue to hold the SCL_ALLOC lock, which prevents any future
696 	 * allocations from taking place and any changes to the vdev tree.
697 	 */
698 	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
699 	taskq_wait(mg->mg_taskq);
700 	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
701 	metaslab_group_alloc_update(mg);
702 
703 	mgprev = mg->mg_prev;
704 	mgnext = mg->mg_next;
705 
706 	if (mg == mgnext) {
707 		mc->mc_rotor = NULL;
708 	} else {
709 		mc->mc_rotor = mgnext;
710 		mgprev->mg_next = mgnext;
711 		mgnext->mg_prev = mgprev;
712 	}
713 
714 	mg->mg_prev = NULL;
715 	mg->mg_next = NULL;
716 }
717 
718 boolean_t
719 metaslab_group_initialized(metaslab_group_t *mg)
720 {
721 	vdev_t *vd = mg->mg_vd;
722 	vdev_stat_t *vs = &vd->vdev_stat;
723 
724 	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
725 }
726 
727 uint64_t
728 metaslab_group_get_space(metaslab_group_t *mg)
729 {
730 	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
731 }
732 
733 void
734 metaslab_group_histogram_verify(metaslab_group_t *mg)
735 {
736 	uint64_t *mg_hist;
737 	vdev_t *vd = mg->mg_vd;
738 	uint64_t ashift = vd->vdev_ashift;
739 	int i;
740 
741 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
742 		return;
743 
744 	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
745 	    KM_SLEEP);
746 
747 	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
748 	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
749 
750 	for (int m = 0; m < vd->vdev_ms_count; m++) {
751 		metaslab_t *msp = vd->vdev_ms[m];
752 
753 		if (msp->ms_sm == NULL)
754 			continue;
755 
756 		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
757 			mg_hist[i + ashift] +=
758 			    msp->ms_sm->sm_phys->smp_histogram[i];
759 	}
760 
761 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
762 		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
763 
764 	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
765 }
766 
767 static void
768 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
769 {
770 	metaslab_class_t *mc = mg->mg_class;
771 	uint64_t ashift = mg->mg_vd->vdev_ashift;
772 
773 	ASSERT(MUTEX_HELD(&msp->ms_lock));
774 	if (msp->ms_sm == NULL)
775 		return;
776 
777 	mutex_enter(&mg->mg_lock);
778 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
779 		mg->mg_histogram[i + ashift] +=
780 		    msp->ms_sm->sm_phys->smp_histogram[i];
781 		mc->mc_histogram[i + ashift] +=
782 		    msp->ms_sm->sm_phys->smp_histogram[i];
783 	}
784 	mutex_exit(&mg->mg_lock);
785 }
786 
787 void
788 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
789 {
790 	metaslab_class_t *mc = mg->mg_class;
791 	uint64_t ashift = mg->mg_vd->vdev_ashift;
792 
793 	ASSERT(MUTEX_HELD(&msp->ms_lock));
794 	if (msp->ms_sm == NULL)
795 		return;
796 
797 	mutex_enter(&mg->mg_lock);
798 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
799 		ASSERT3U(mg->mg_histogram[i + ashift], >=,
800 		    msp->ms_sm->sm_phys->smp_histogram[i]);
801 		ASSERT3U(mc->mc_histogram[i + ashift], >=,
802 		    msp->ms_sm->sm_phys->smp_histogram[i]);
803 
804 		mg->mg_histogram[i + ashift] -=
805 		    msp->ms_sm->sm_phys->smp_histogram[i];
806 		mc->mc_histogram[i + ashift] -=
807 		    msp->ms_sm->sm_phys->smp_histogram[i];
808 	}
809 	mutex_exit(&mg->mg_lock);
810 }
811 
812 static void
813 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
814 {
815 	ASSERT(msp->ms_group == NULL);
816 	mutex_enter(&mg->mg_lock);
817 	msp->ms_group = mg;
818 	msp->ms_weight = 0;
819 	avl_add(&mg->mg_metaslab_tree, msp);
820 	mutex_exit(&mg->mg_lock);
821 
822 	mutex_enter(&msp->ms_lock);
823 	metaslab_group_histogram_add(mg, msp);
824 	mutex_exit(&msp->ms_lock);
825 }
826 
827 static void
828 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
829 {
830 	mutex_enter(&msp->ms_lock);
831 	metaslab_group_histogram_remove(mg, msp);
832 	mutex_exit(&msp->ms_lock);
833 
834 	mutex_enter(&mg->mg_lock);
835 	ASSERT(msp->ms_group == mg);
836 	avl_remove(&mg->mg_metaslab_tree, msp);
837 	msp->ms_group = NULL;
838 	mutex_exit(&mg->mg_lock);
839 }
840 
841 static void
842 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
843 {
844 	/*
845 	 * Although in principle the weight can be any value, in
846 	 * practice we do not use values in the range [1, 511].
847 	 */
848 	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
849 	ASSERT(MUTEX_HELD(&msp->ms_lock));
850 
851 	mutex_enter(&mg->mg_lock);
852 	ASSERT(msp->ms_group == mg);
853 	avl_remove(&mg->mg_metaslab_tree, msp);
854 	msp->ms_weight = weight;
855 	avl_add(&mg->mg_metaslab_tree, msp);
856 	mutex_exit(&mg->mg_lock);
857 }
858 
859 /*
860  * Calculate the fragmentation for a given metaslab group. We can use
861  * a simple average here since all metaslabs within the group must have
862  * the same size. The return value will be a value between 0 and 100
863  * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
864  * group have a fragmentation metric.
865  */
866 uint64_t
867 metaslab_group_fragmentation(metaslab_group_t *mg)
868 {
869 	vdev_t *vd = mg->mg_vd;
870 	uint64_t fragmentation = 0;
871 	uint64_t valid_ms = 0;
872 
873 	for (int m = 0; m < vd->vdev_ms_count; m++) {
874 		metaslab_t *msp = vd->vdev_ms[m];
875 
876 		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
877 			continue;
878 
879 		valid_ms++;
880 		fragmentation += msp->ms_fragmentation;
881 	}
882 
883 	if (valid_ms <= vd->vdev_ms_count / 2)
884 		return (ZFS_FRAG_INVALID);
885 
886 	fragmentation /= valid_ms;
887 	ASSERT3U(fragmentation, <=, 100);
888 	return (fragmentation);
889 }
890 
891 /*
892  * Determine if a given metaslab group should skip allocations. A metaslab
893  * group should avoid allocations if its free capacity is less than the
894  * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
895  * zfs_mg_fragmentation_threshold and there is at least one metaslab group
896  * that can still handle allocations. If the allocation throttle is enabled
897  * then we skip allocations to devices that have reached their maximum
898  * allocation queue depth unless the selected metaslab group is the only
899  * eligible group remaining.
900  */
901 static boolean_t
902 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
903     uint64_t psize)
904 {
905 	spa_t *spa = mg->mg_vd->vdev_spa;
906 	metaslab_class_t *mc = mg->mg_class;
907 
908 	/*
909 	 * We can only consider skipping this metaslab group if it's
910 	 * in the normal metaslab class and there are other metaslab
911 	 * groups to select from. Otherwise, we always consider it eligible
912 	 * for allocations.
913 	 */
914 	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
915 		return (B_TRUE);
916 
917 	/*
918 	 * If the metaslab group's mg_allocatable flag is set (see comments
919 	 * in metaslab_group_alloc_update() for more information) and
920 	 * the allocation throttle is disabled then allow allocations to this
921 	 * device. However, if the allocation throttle is enabled then
922 	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
923 	 * to determine if we should allow allocations to this metaslab group.
924 	 * If all metaslab groups are no longer considered allocatable
925 	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
926 	 * gang block size then we allow allocations on this metaslab group
927 	 * regardless of the mg_allocatable or throttle settings.
928 	 */
929 	if (mg->mg_allocatable) {
930 		metaslab_group_t *mgp;
931 		int64_t qdepth;
932 		uint64_t qmax = mg->mg_max_alloc_queue_depth;
933 
934 		if (!mc->mc_alloc_throttle_enabled)
935 			return (B_TRUE);
936 
937 		/*
938 		 * If this metaslab group does not have any free space, then
939 		 * there is no point in looking further.
940 		 */
941 		if (mg->mg_no_free_space)
942 			return (B_FALSE);
943 
944 		qdepth = refcount_count(&mg->mg_alloc_queue_depth);
945 
946 		/*
947 		 * If this metaslab group is below its qmax or it's
948 		 * the only allocatable metasable group, then attempt
949 		 * to allocate from it.
950 		 */
951 		if (qdepth < qmax || mc->mc_alloc_groups == 1)
952 			return (B_TRUE);
953 		ASSERT3U(mc->mc_alloc_groups, >, 1);
954 
955 		/*
956 		 * Since this metaslab group is at or over its qmax, we
957 		 * need to determine if there are metaslab groups after this
958 		 * one that might be able to handle this allocation. This is
959 		 * racy since we can't hold the locks for all metaslab
960 		 * groups at the same time when we make this check.
961 		 */
962 		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
963 			qmax = mgp->mg_max_alloc_queue_depth;
964 
965 			qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
966 
967 			/*
968 			 * If there is another metaslab group that
969 			 * might be able to handle the allocation, then
970 			 * we return false so that we skip this group.
971 			 */
972 			if (qdepth < qmax && !mgp->mg_no_free_space)
973 				return (B_FALSE);
974 		}
975 
976 		/*
977 		 * We didn't find another group to handle the allocation
978 		 * so we can't skip this metaslab group even though
979 		 * we are at or over our qmax.
980 		 */
981 		return (B_TRUE);
982 
983 	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
984 		return (B_TRUE);
985 	}
986 	return (B_FALSE);
987 }
988 
989 /*
990  * ==========================================================================
991  * Range tree callbacks
992  * ==========================================================================
993  */
994 
995 /*
996  * Comparison function for the private size-ordered tree. Tree is sorted
997  * by size, larger sizes at the end of the tree.
998  */
999 static int
1000 metaslab_rangesize_compare(const void *x1, const void *x2)
1001 {
1002 	const range_seg_t *r1 = x1;
1003 	const range_seg_t *r2 = x2;
1004 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1005 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1006 
1007 	if (rs_size1 < rs_size2)
1008 		return (-1);
1009 	if (rs_size1 > rs_size2)
1010 		return (1);
1011 
1012 	if (r1->rs_start < r2->rs_start)
1013 		return (-1);
1014 
1015 	if (r1->rs_start > r2->rs_start)
1016 		return (1);
1017 
1018 	return (0);
1019 }
1020 
1021 /*
1022  * Create any block allocator specific components. The current allocators
1023  * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
1024  */
1025 static void
1026 metaslab_rt_create(range_tree_t *rt, void *arg)
1027 {
1028 	metaslab_t *msp = arg;
1029 
1030 	ASSERT3P(rt->rt_arg, ==, msp);
1031 	ASSERT(msp->ms_tree == NULL);
1032 
1033 	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
1034 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1035 }
1036 
1037 /*
1038  * Destroy the block allocator specific components.
1039  */
1040 static void
1041 metaslab_rt_destroy(range_tree_t *rt, void *arg)
1042 {
1043 	metaslab_t *msp = arg;
1044 
1045 	ASSERT3P(rt->rt_arg, ==, msp);
1046 	ASSERT3P(msp->ms_tree, ==, rt);
1047 	ASSERT0(avl_numnodes(&msp->ms_size_tree));
1048 
1049 	avl_destroy(&msp->ms_size_tree);
1050 }
1051 
1052 static void
1053 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
1054 {
1055 	metaslab_t *msp = arg;
1056 
1057 	ASSERT3P(rt->rt_arg, ==, msp);
1058 	ASSERT3P(msp->ms_tree, ==, rt);
1059 	VERIFY(!msp->ms_condensing);
1060 	avl_add(&msp->ms_size_tree, rs);
1061 }
1062 
1063 static void
1064 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
1065 {
1066 	metaslab_t *msp = arg;
1067 
1068 	ASSERT3P(rt->rt_arg, ==, msp);
1069 	ASSERT3P(msp->ms_tree, ==, rt);
1070 	VERIFY(!msp->ms_condensing);
1071 	avl_remove(&msp->ms_size_tree, rs);
1072 }
1073 
1074 static void
1075 metaslab_rt_vacate(range_tree_t *rt, void *arg)
1076 {
1077 	metaslab_t *msp = arg;
1078 
1079 	ASSERT3P(rt->rt_arg, ==, msp);
1080 	ASSERT3P(msp->ms_tree, ==, rt);
1081 
1082 	/*
1083 	 * Normally one would walk the tree freeing nodes along the way.
1084 	 * Since the nodes are shared with the range trees we can avoid
1085 	 * walking all nodes and just reinitialize the avl tree. The nodes
1086 	 * will be freed by the range tree, so we don't want to free them here.
1087 	 */
1088 	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
1089 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1090 }
1091 
1092 static range_tree_ops_t metaslab_rt_ops = {
1093 	metaslab_rt_create,
1094 	metaslab_rt_destroy,
1095 	metaslab_rt_add,
1096 	metaslab_rt_remove,
1097 	metaslab_rt_vacate
1098 };
1099 
1100 /*
1101  * ==========================================================================
1102  * Common allocator routines
1103  * ==========================================================================
1104  */
1105 
1106 /*
1107  * Return the maximum contiguous segment within the metaslab.
1108  */
1109 uint64_t
1110 metaslab_block_maxsize(metaslab_t *msp)
1111 {
1112 	avl_tree_t *t = &msp->ms_size_tree;
1113 	range_seg_t *rs;
1114 
1115 	if (t == NULL || (rs = avl_last(t)) == NULL)
1116 		return (0ULL);
1117 
1118 	return (rs->rs_end - rs->rs_start);
1119 }
1120 
1121 static range_seg_t *
1122 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
1123 {
1124 	range_seg_t *rs, rsearch;
1125 	avl_index_t where;
1126 
1127 	rsearch.rs_start = start;
1128 	rsearch.rs_end = start + size;
1129 
1130 	rs = avl_find(t, &rsearch, &where);
1131 	if (rs == NULL) {
1132 		rs = avl_nearest(t, where, AVL_AFTER);
1133 	}
1134 
1135 	return (rs);
1136 }
1137 
1138 /*
1139  * This is a helper function that can be used by the allocator to find
1140  * a suitable block to allocate. This will search the specified AVL
1141  * tree looking for a block that matches the specified criteria.
1142  */
1143 static uint64_t
1144 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
1145     uint64_t align)
1146 {
1147 	range_seg_t *rs = metaslab_block_find(t, *cursor, size);
1148 
1149 	while (rs != NULL) {
1150 		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
1151 
1152 		if (offset + size <= rs->rs_end) {
1153 			*cursor = offset + size;
1154 			return (offset);
1155 		}
1156 		rs = AVL_NEXT(t, rs);
1157 	}
1158 
1159 	/*
1160 	 * If we know we've searched the whole map (*cursor == 0), give up.
1161 	 * Otherwise, reset the cursor to the beginning and try again.
1162 	 */
1163 	if (*cursor == 0)
1164 		return (-1ULL);
1165 
1166 	*cursor = 0;
1167 	return (metaslab_block_picker(t, cursor, size, align));
1168 }
1169 
1170 /*
1171  * ==========================================================================
1172  * The first-fit block allocator
1173  * ==========================================================================
1174  */
1175 static uint64_t
1176 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
1177 {
1178 	/*
1179 	 * Find the largest power of 2 block size that evenly divides the
1180 	 * requested size. This is used to try to allocate blocks with similar
1181 	 * alignment from the same area of the metaslab (i.e. same cursor
1182 	 * bucket) but it does not guarantee that other allocations sizes
1183 	 * may exist in the same region.
1184 	 */
1185 	uint64_t align = size & -size;
1186 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1187 	avl_tree_t *t = &msp->ms_tree->rt_root;
1188 
1189 	return (metaslab_block_picker(t, cursor, size, align));
1190 }
1191 
1192 static metaslab_ops_t metaslab_ff_ops = {
1193 	metaslab_ff_alloc
1194 };
1195 
1196 /*
1197  * ==========================================================================
1198  * Dynamic block allocator -
1199  * Uses the first fit allocation scheme until space get low and then
1200  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1201  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
1202  * ==========================================================================
1203  */
1204 static uint64_t
1205 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1206 {
1207 	/*
1208 	 * Find the largest power of 2 block size that evenly divides the
1209 	 * requested size. This is used to try to allocate blocks with similar
1210 	 * alignment from the same area of the metaslab (i.e. same cursor
1211 	 * bucket) but it does not guarantee that other allocations sizes
1212 	 * may exist in the same region.
1213 	 */
1214 	uint64_t align = size & -size;
1215 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1216 	range_tree_t *rt = msp->ms_tree;
1217 	avl_tree_t *t = &rt->rt_root;
1218 	uint64_t max_size = metaslab_block_maxsize(msp);
1219 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1220 
1221 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1222 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1223 
1224 	if (max_size < size)
1225 		return (-1ULL);
1226 
1227 	/*
1228 	 * If we're running low on space switch to using the size
1229 	 * sorted AVL tree (best-fit).
1230 	 */
1231 	if (max_size < metaslab_df_alloc_threshold ||
1232 	    free_pct < metaslab_df_free_pct) {
1233 		t = &msp->ms_size_tree;
1234 		*cursor = 0;
1235 	}
1236 
1237 	return (metaslab_block_picker(t, cursor, size, 1ULL));
1238 }
1239 
1240 static metaslab_ops_t metaslab_df_ops = {
1241 	metaslab_df_alloc
1242 };
1243 
1244 /*
1245  * ==========================================================================
1246  * Cursor fit block allocator -
1247  * Select the largest region in the metaslab, set the cursor to the beginning
1248  * of the range and the cursor_end to the end of the range. As allocations
1249  * are made advance the cursor. Continue allocating from the cursor until
1250  * the range is exhausted and then find a new range.
1251  * ==========================================================================
1252  */
1253 static uint64_t
1254 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1255 {
1256 	range_tree_t *rt = msp->ms_tree;
1257 	avl_tree_t *t = &msp->ms_size_tree;
1258 	uint64_t *cursor = &msp->ms_lbas[0];
1259 	uint64_t *cursor_end = &msp->ms_lbas[1];
1260 	uint64_t offset = 0;
1261 
1262 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1263 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1264 
1265 	ASSERT3U(*cursor_end, >=, *cursor);
1266 
1267 	if ((*cursor + size) > *cursor_end) {
1268 		range_seg_t *rs;
1269 
1270 		rs = avl_last(&msp->ms_size_tree);
1271 		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1272 			return (-1ULL);
1273 
1274 		*cursor = rs->rs_start;
1275 		*cursor_end = rs->rs_end;
1276 	}
1277 
1278 	offset = *cursor;
1279 	*cursor += size;
1280 
1281 	return (offset);
1282 }
1283 
1284 static metaslab_ops_t metaslab_cf_ops = {
1285 	metaslab_cf_alloc
1286 };
1287 
1288 /*
1289  * ==========================================================================
1290  * New dynamic fit allocator -
1291  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1292  * contiguous blocks. If no region is found then just use the largest segment
1293  * that remains.
1294  * ==========================================================================
1295  */
1296 
1297 /*
1298  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1299  * to request from the allocator.
1300  */
1301 uint64_t metaslab_ndf_clump_shift = 4;
1302 
1303 static uint64_t
1304 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1305 {
1306 	avl_tree_t *t = &msp->ms_tree->rt_root;
1307 	avl_index_t where;
1308 	range_seg_t *rs, rsearch;
1309 	uint64_t hbit = highbit64(size);
1310 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1311 	uint64_t max_size = metaslab_block_maxsize(msp);
1312 
1313 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1314 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1315 
1316 	if (max_size < size)
1317 		return (-1ULL);
1318 
1319 	rsearch.rs_start = *cursor;
1320 	rsearch.rs_end = *cursor + size;
1321 
1322 	rs = avl_find(t, &rsearch, &where);
1323 	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
1324 		t = &msp->ms_size_tree;
1325 
1326 		rsearch.rs_start = 0;
1327 		rsearch.rs_end = MIN(max_size,
1328 		    1ULL << (hbit + metaslab_ndf_clump_shift));
1329 		rs = avl_find(t, &rsearch, &where);
1330 		if (rs == NULL)
1331 			rs = avl_nearest(t, where, AVL_AFTER);
1332 		ASSERT(rs != NULL);
1333 	}
1334 
1335 	if ((rs->rs_end - rs->rs_start) >= size) {
1336 		*cursor = rs->rs_start + size;
1337 		return (rs->rs_start);
1338 	}
1339 	return (-1ULL);
1340 }
1341 
1342 static metaslab_ops_t metaslab_ndf_ops = {
1343 	metaslab_ndf_alloc
1344 };
1345 
1346 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1347 
1348 /*
1349  * ==========================================================================
1350  * Metaslabs
1351  * ==========================================================================
1352  */
1353 
1354 /*
1355  * Wait for any in-progress metaslab loads to complete.
1356  */
1357 void
1358 metaslab_load_wait(metaslab_t *msp)
1359 {
1360 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1361 
1362 	while (msp->ms_loading) {
1363 		ASSERT(!msp->ms_loaded);
1364 		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1365 	}
1366 }
1367 
1368 int
1369 metaslab_load(metaslab_t *msp)
1370 {
1371 	int error = 0;
1372 	boolean_t success = B_FALSE;
1373 
1374 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1375 	ASSERT(!msp->ms_loaded);
1376 	ASSERT(!msp->ms_loading);
1377 
1378 	msp->ms_loading = B_TRUE;
1379 	/*
1380 	 * Nobody else can manipulate a loading metaslab, so it's now safe
1381 	 * to drop the lock.  This way we don't have to hold the lock while
1382 	 * reading the spacemap from disk.
1383 	 */
1384 	mutex_exit(&msp->ms_lock);
1385 
1386 	/*
1387 	 * If the space map has not been allocated yet, then treat
1388 	 * all the space in the metaslab as free and add it to the
1389 	 * ms_tree.
1390 	 */
1391 	if (msp->ms_sm != NULL)
1392 		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
1393 	else
1394 		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
1395 
1396 	success = (error == 0);
1397 
1398 	mutex_enter(&msp->ms_lock);
1399 	msp->ms_loading = B_FALSE;
1400 
1401 	if (success) {
1402 		ASSERT3P(msp->ms_group, !=, NULL);
1403 		msp->ms_loaded = B_TRUE;
1404 
1405 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1406 			range_tree_walk(msp->ms_defertree[t],
1407 			    range_tree_remove, msp->ms_tree);
1408 		}
1409 		msp->ms_max_size = metaslab_block_maxsize(msp);
1410 	}
1411 	cv_broadcast(&msp->ms_load_cv);
1412 	return (error);
1413 }
1414 
1415 void
1416 metaslab_unload(metaslab_t *msp)
1417 {
1418 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1419 	range_tree_vacate(msp->ms_tree, NULL, NULL);
1420 	msp->ms_loaded = B_FALSE;
1421 	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1422 	msp->ms_max_size = 0;
1423 }
1424 
1425 int
1426 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1427     metaslab_t **msp)
1428 {
1429 	vdev_t *vd = mg->mg_vd;
1430 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
1431 	metaslab_t *ms;
1432 	int error;
1433 
1434 	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1435 	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1436 	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
1437 	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1438 	ms->ms_id = id;
1439 	ms->ms_start = id << vd->vdev_ms_shift;
1440 	ms->ms_size = 1ULL << vd->vdev_ms_shift;
1441 
1442 	/*
1443 	 * We only open space map objects that already exist. All others
1444 	 * will be opened when we finally allocate an object for it.
1445 	 */
1446 	if (object != 0) {
1447 		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1448 		    ms->ms_size, vd->vdev_ashift);
1449 
1450 		if (error != 0) {
1451 			kmem_free(ms, sizeof (metaslab_t));
1452 			return (error);
1453 		}
1454 
1455 		ASSERT(ms->ms_sm != NULL);
1456 	}
1457 
1458 	/*
1459 	 * We create the main range tree here, but we don't create the
1460 	 * other range trees until metaslab_sync_done().  This serves
1461 	 * two purposes: it allows metaslab_sync_done() to detect the
1462 	 * addition of new space; and for debugging, it ensures that we'd
1463 	 * data fault on any attempt to use this metaslab before it's ready.
1464 	 */
1465 	ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms);
1466 	metaslab_group_add(mg, ms);
1467 
1468 	metaslab_set_fragmentation(ms);
1469 
1470 	/*
1471 	 * If we're opening an existing pool (txg == 0) or creating
1472 	 * a new one (txg == TXG_INITIAL), all space is available now.
1473 	 * If we're adding space to an existing pool, the new space
1474 	 * does not become available until after this txg has synced.
1475 	 * The metaslab's weight will also be initialized when we sync
1476 	 * out this txg. This ensures that we don't attempt to allocate
1477 	 * from it before we have initialized it completely.
1478 	 */
1479 	if (txg <= TXG_INITIAL)
1480 		metaslab_sync_done(ms, 0);
1481 
1482 	/*
1483 	 * If metaslab_debug_load is set and we're initializing a metaslab
1484 	 * that has an allocated space map object then load the its space
1485 	 * map so that can verify frees.
1486 	 */
1487 	if (metaslab_debug_load && ms->ms_sm != NULL) {
1488 		mutex_enter(&ms->ms_lock);
1489 		VERIFY0(metaslab_load(ms));
1490 		mutex_exit(&ms->ms_lock);
1491 	}
1492 
1493 	if (txg != 0) {
1494 		vdev_dirty(vd, 0, NULL, txg);
1495 		vdev_dirty(vd, VDD_METASLAB, ms, txg);
1496 	}
1497 
1498 	*msp = ms;
1499 
1500 	return (0);
1501 }
1502 
1503 void
1504 metaslab_fini(metaslab_t *msp)
1505 {
1506 	metaslab_group_t *mg = msp->ms_group;
1507 
1508 	metaslab_group_remove(mg, msp);
1509 
1510 	mutex_enter(&msp->ms_lock);
1511 	VERIFY(msp->ms_group == NULL);
1512 	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1513 	    0, -msp->ms_size);
1514 	space_map_close(msp->ms_sm);
1515 
1516 	metaslab_unload(msp);
1517 	range_tree_destroy(msp->ms_tree);
1518 	range_tree_destroy(msp->ms_freeingtree);
1519 	range_tree_destroy(msp->ms_freedtree);
1520 
1521 	for (int t = 0; t < TXG_SIZE; t++) {
1522 		range_tree_destroy(msp->ms_alloctree[t]);
1523 	}
1524 
1525 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1526 		range_tree_destroy(msp->ms_defertree[t]);
1527 	}
1528 
1529 	ASSERT0(msp->ms_deferspace);
1530 
1531 	mutex_exit(&msp->ms_lock);
1532 	cv_destroy(&msp->ms_load_cv);
1533 	mutex_destroy(&msp->ms_lock);
1534 	mutex_destroy(&msp->ms_sync_lock);
1535 
1536 	kmem_free(msp, sizeof (metaslab_t));
1537 }
1538 
1539 #define	FRAGMENTATION_TABLE_SIZE	17
1540 
1541 /*
1542  * This table defines a segment size based fragmentation metric that will
1543  * allow each metaslab to derive its own fragmentation value. This is done
1544  * by calculating the space in each bucket of the spacemap histogram and
1545  * multiplying that by the fragmetation metric in this table. Doing
1546  * this for all buckets and dividing it by the total amount of free
1547  * space in this metaslab (i.e. the total free space in all buckets) gives
1548  * us the fragmentation metric. This means that a high fragmentation metric
1549  * equates to most of the free space being comprised of small segments.
1550  * Conversely, if the metric is low, then most of the free space is in
1551  * large segments. A 10% change in fragmentation equates to approximately
1552  * double the number of segments.
1553  *
1554  * This table defines 0% fragmented space using 16MB segments. Testing has
1555  * shown that segments that are greater than or equal to 16MB do not suffer
1556  * from drastic performance problems. Using this value, we derive the rest
1557  * of the table. Since the fragmentation value is never stored on disk, it
1558  * is possible to change these calculations in the future.
1559  */
1560 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1561 	100,	/* 512B	*/
1562 	100,	/* 1K	*/
1563 	98,	/* 2K	*/
1564 	95,	/* 4K	*/
1565 	90,	/* 8K	*/
1566 	80,	/* 16K	*/
1567 	70,	/* 32K	*/
1568 	60,	/* 64K	*/
1569 	50,	/* 128K	*/
1570 	40,	/* 256K	*/
1571 	30,	/* 512K	*/
1572 	20,	/* 1M	*/
1573 	15,	/* 2M	*/
1574 	10,	/* 4M	*/
1575 	5,	/* 8M	*/
1576 	0	/* 16M	*/
1577 };
1578 
1579 /*
1580  * Calclate the metaslab's fragmentation metric. A return value
1581  * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
1582  * not support this metric. Otherwise, the return value should be in the
1583  * range [0, 100].
1584  */
1585 static void
1586 metaslab_set_fragmentation(metaslab_t *msp)
1587 {
1588 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1589 	uint64_t fragmentation = 0;
1590 	uint64_t total = 0;
1591 	boolean_t feature_enabled = spa_feature_is_enabled(spa,
1592 	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
1593 
1594 	if (!feature_enabled) {
1595 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
1596 		return;
1597 	}
1598 
1599 	/*
1600 	 * A null space map means that the entire metaslab is free
1601 	 * and thus is not fragmented.
1602 	 */
1603 	if (msp->ms_sm == NULL) {
1604 		msp->ms_fragmentation = 0;
1605 		return;
1606 	}
1607 
1608 	/*
1609 	 * If this metaslab's space map has not been upgraded, flag it
1610 	 * so that we upgrade next time we encounter it.
1611 	 */
1612 	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1613 		uint64_t txg = spa_syncing_txg(spa);
1614 		vdev_t *vd = msp->ms_group->mg_vd;
1615 
1616 		/*
1617 		 * If we've reached the final dirty txg, then we must
1618 		 * be shutting down the pool. We don't want to dirty
1619 		 * any data past this point so skip setting the condense
1620 		 * flag. We can retry this action the next time the pool
1621 		 * is imported.
1622 		 */
1623 		if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
1624 			msp->ms_condense_wanted = B_TRUE;
1625 			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1626 			spa_dbgmsg(spa, "txg %llu, requesting force condense: "
1627 			    "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
1628 			    vd->vdev_id);
1629 		}
1630 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
1631 		return;
1632 	}
1633 
1634 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1635 		uint64_t space = 0;
1636 		uint8_t shift = msp->ms_sm->sm_shift;
1637 
1638 		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1639 		    FRAGMENTATION_TABLE_SIZE - 1);
1640 
1641 		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1642 			continue;
1643 
1644 		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1645 		total += space;
1646 
1647 		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1648 		fragmentation += space * zfs_frag_table[idx];
1649 	}
1650 
1651 	if (total > 0)
1652 		fragmentation /= total;
1653 	ASSERT3U(fragmentation, <=, 100);
1654 
1655 	msp->ms_fragmentation = fragmentation;
1656 }
1657 
1658 /*
1659  * Compute a weight -- a selection preference value -- for the given metaslab.
1660  * This is based on the amount of free space, the level of fragmentation,
1661  * the LBA range, and whether the metaslab is loaded.
1662  */
1663 static uint64_t
1664 metaslab_space_weight(metaslab_t *msp)
1665 {
1666 	metaslab_group_t *mg = msp->ms_group;
1667 	vdev_t *vd = mg->mg_vd;
1668 	uint64_t weight, space;
1669 
1670 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1671 	ASSERT(!vd->vdev_removing);
1672 
1673 	/*
1674 	 * The baseline weight is the metaslab's free space.
1675 	 */
1676 	space = msp->ms_size - space_map_allocated(msp->ms_sm);
1677 
1678 	if (metaslab_fragmentation_factor_enabled &&
1679 	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1680 		/*
1681 		 * Use the fragmentation information to inversely scale
1682 		 * down the baseline weight. We need to ensure that we
1683 		 * don't exclude this metaslab completely when it's 100%
1684 		 * fragmented. To avoid this we reduce the fragmented value
1685 		 * by 1.
1686 		 */
1687 		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1688 
1689 		/*
1690 		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1691 		 * this metaslab again. The fragmentation metric may have
1692 		 * decreased the space to something smaller than
1693 		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1694 		 * so that we can consume any remaining space.
1695 		 */
1696 		if (space > 0 && space < SPA_MINBLOCKSIZE)
1697 			space = SPA_MINBLOCKSIZE;
1698 	}
1699 	weight = space;
1700 
1701 	/*
1702 	 * Modern disks have uniform bit density and constant angular velocity.
1703 	 * Therefore, the outer recording zones are faster (higher bandwidth)
1704 	 * than the inner zones by the ratio of outer to inner track diameter,
1705 	 * which is typically around 2:1.  We account for this by assigning
1706 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1707 	 * In effect, this means that we'll select the metaslab with the most
1708 	 * free bandwidth rather than simply the one with the most free space.
1709 	 */
1710 	if (metaslab_lba_weighting_enabled) {
1711 		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1712 		ASSERT(weight >= space && weight <= 2 * space);
1713 	}
1714 
1715 	/*
1716 	 * If this metaslab is one we're actively using, adjust its
1717 	 * weight to make it preferable to any inactive metaslab so
1718 	 * we'll polish it off. If the fragmentation on this metaslab
1719 	 * has exceed our threshold, then don't mark it active.
1720 	 */
1721 	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1722 	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1723 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1724 	}
1725 
1726 	WEIGHT_SET_SPACEBASED(weight);
1727 	return (weight);
1728 }
1729 
1730 /*
1731  * Return the weight of the specified metaslab, according to the segment-based
1732  * weighting algorithm. The metaslab must be loaded. This function can
1733  * be called within a sync pass since it relies only on the metaslab's
1734  * range tree which is always accurate when the metaslab is loaded.
1735  */
1736 static uint64_t
1737 metaslab_weight_from_range_tree(metaslab_t *msp)
1738 {
1739 	uint64_t weight = 0;
1740 	uint32_t segments = 0;
1741 
1742 	ASSERT(msp->ms_loaded);
1743 
1744 	for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
1745 	    i--) {
1746 		uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
1747 		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1748 
1749 		segments <<= 1;
1750 		segments += msp->ms_tree->rt_histogram[i];
1751 
1752 		/*
1753 		 * The range tree provides more precision than the space map
1754 		 * and must be downgraded so that all values fit within the
1755 		 * space map's histogram. This allows us to compare loaded
1756 		 * vs. unloaded metaslabs to determine which metaslab is
1757 		 * considered "best".
1758 		 */
1759 		if (i > max_idx)
1760 			continue;
1761 
1762 		if (segments != 0) {
1763 			WEIGHT_SET_COUNT(weight, segments);
1764 			WEIGHT_SET_INDEX(weight, i);
1765 			WEIGHT_SET_ACTIVE(weight, 0);
1766 			break;
1767 		}
1768 	}
1769 	return (weight);
1770 }
1771 
1772 /*
1773  * Calculate the weight based on the on-disk histogram. This should only
1774  * be called after a sync pass has completely finished since the on-disk
1775  * information is updated in metaslab_sync().
1776  */
1777 static uint64_t
1778 metaslab_weight_from_spacemap(metaslab_t *msp)
1779 {
1780 	uint64_t weight = 0;
1781 
1782 	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
1783 		if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
1784 			WEIGHT_SET_COUNT(weight,
1785 			    msp->ms_sm->sm_phys->smp_histogram[i]);
1786 			WEIGHT_SET_INDEX(weight, i +
1787 			    msp->ms_sm->sm_shift);
1788 			WEIGHT_SET_ACTIVE(weight, 0);
1789 			break;
1790 		}
1791 	}
1792 	return (weight);
1793 }
1794 
1795 /*
1796  * Compute a segment-based weight for the specified metaslab. The weight
1797  * is determined by highest bucket in the histogram. The information
1798  * for the highest bucket is encoded into the weight value.
1799  */
1800 static uint64_t
1801 metaslab_segment_weight(metaslab_t *msp)
1802 {
1803 	metaslab_group_t *mg = msp->ms_group;
1804 	uint64_t weight = 0;
1805 	uint8_t shift = mg->mg_vd->vdev_ashift;
1806 
1807 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1808 
1809 	/*
1810 	 * The metaslab is completely free.
1811 	 */
1812 	if (space_map_allocated(msp->ms_sm) == 0) {
1813 		int idx = highbit64(msp->ms_size) - 1;
1814 		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1815 
1816 		if (idx < max_idx) {
1817 			WEIGHT_SET_COUNT(weight, 1ULL);
1818 			WEIGHT_SET_INDEX(weight, idx);
1819 		} else {
1820 			WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
1821 			WEIGHT_SET_INDEX(weight, max_idx);
1822 		}
1823 		WEIGHT_SET_ACTIVE(weight, 0);
1824 		ASSERT(!WEIGHT_IS_SPACEBASED(weight));
1825 
1826 		return (weight);
1827 	}
1828 
1829 	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
1830 
1831 	/*
1832 	 * If the metaslab is fully allocated then just make the weight 0.
1833 	 */
1834 	if (space_map_allocated(msp->ms_sm) == msp->ms_size)
1835 		return (0);
1836 	/*
1837 	 * If the metaslab is already loaded, then use the range tree to
1838 	 * determine the weight. Otherwise, we rely on the space map information
1839 	 * to generate the weight.
1840 	 */
1841 	if (msp->ms_loaded) {
1842 		weight = metaslab_weight_from_range_tree(msp);
1843 	} else {
1844 		weight = metaslab_weight_from_spacemap(msp);
1845 	}
1846 
1847 	/*
1848 	 * If the metaslab was active the last time we calculated its weight
1849 	 * then keep it active. We want to consume the entire region that
1850 	 * is associated with this weight.
1851 	 */
1852 	if (msp->ms_activation_weight != 0 && weight != 0)
1853 		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
1854 	return (weight);
1855 }
1856 
1857 /*
1858  * Determine if we should attempt to allocate from this metaslab. If the
1859  * metaslab has a maximum size then we can quickly determine if the desired
1860  * allocation size can be satisfied. Otherwise, if we're using segment-based
1861  * weighting then we can determine the maximum allocation that this metaslab
1862  * can accommodate based on the index encoded in the weight. If we're using
1863  * space-based weights then rely on the entire weight (excluding the weight
1864  * type bit).
1865  */
1866 boolean_t
1867 metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
1868 {
1869 	boolean_t should_allocate;
1870 
1871 	if (msp->ms_max_size != 0)
1872 		return (msp->ms_max_size >= asize);
1873 
1874 	if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
1875 		/*
1876 		 * The metaslab segment weight indicates segments in the
1877 		 * range [2^i, 2^(i+1)), where i is the index in the weight.
1878 		 * Since the asize might be in the middle of the range, we
1879 		 * should attempt the allocation if asize < 2^(i+1).
1880 		 */
1881 		should_allocate = (asize <
1882 		    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
1883 	} else {
1884 		should_allocate = (asize <=
1885 		    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
1886 	}
1887 	return (should_allocate);
1888 }
1889 
1890 static uint64_t
1891 metaslab_weight(metaslab_t *msp)
1892 {
1893 	vdev_t *vd = msp->ms_group->mg_vd;
1894 	spa_t *spa = vd->vdev_spa;
1895 	uint64_t weight;
1896 
1897 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1898 
1899 	/*
1900 	 * If this vdev is in the process of being removed, there is nothing
1901 	 * for us to do here.
1902 	 */
1903 	if (vd->vdev_removing)
1904 		return (0);
1905 
1906 	metaslab_set_fragmentation(msp);
1907 
1908 	/*
1909 	 * Update the maximum size if the metaslab is loaded. This will
1910 	 * ensure that we get an accurate maximum size if newly freed space
1911 	 * has been added back into the free tree.
1912 	 */
1913 	if (msp->ms_loaded)
1914 		msp->ms_max_size = metaslab_block_maxsize(msp);
1915 
1916 	/*
1917 	 * Segment-based weighting requires space map histogram support.
1918 	 */
1919 	if (zfs_metaslab_segment_weight_enabled &&
1920 	    spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
1921 	    (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
1922 	    sizeof (space_map_phys_t))) {
1923 		weight = metaslab_segment_weight(msp);
1924 	} else {
1925 		weight = metaslab_space_weight(msp);
1926 	}
1927 	return (weight);
1928 }
1929 
1930 static int
1931 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1932 {
1933 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1934 
1935 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1936 		metaslab_load_wait(msp);
1937 		if (!msp->ms_loaded) {
1938 			int error = metaslab_load(msp);
1939 			if (error) {
1940 				metaslab_group_sort(msp->ms_group, msp, 0);
1941 				return (error);
1942 			}
1943 		}
1944 
1945 		msp->ms_activation_weight = msp->ms_weight;
1946 		metaslab_group_sort(msp->ms_group, msp,
1947 		    msp->ms_weight | activation_weight);
1948 	}
1949 	ASSERT(msp->ms_loaded);
1950 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1951 
1952 	return (0);
1953 }
1954 
1955 static void
1956 metaslab_passivate(metaslab_t *msp, uint64_t weight)
1957 {
1958 	uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
1959 
1960 	/*
1961 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1962 	 * this metaslab again.  In that case, it had better be empty,
1963 	 * or we would be leaving space on the table.
1964 	 */
1965 	ASSERT(size >= SPA_MINBLOCKSIZE ||
1966 	    range_tree_space(msp->ms_tree) == 0);
1967 	ASSERT0(weight & METASLAB_ACTIVE_MASK);
1968 
1969 	msp->ms_activation_weight = 0;
1970 	metaslab_group_sort(msp->ms_group, msp, weight);
1971 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1972 }
1973 
1974 /*
1975  * Segment-based metaslabs are activated once and remain active until
1976  * we either fail an allocation attempt (similar to space-based metaslabs)
1977  * or have exhausted the free space in zfs_metaslab_switch_threshold
1978  * buckets since the metaslab was activated. This function checks to see
1979  * if we've exhaused the zfs_metaslab_switch_threshold buckets in the
1980  * metaslab and passivates it proactively. This will allow us to select a
1981  * metaslabs with larger contiguous region if any remaining within this
1982  * metaslab group. If we're in sync pass > 1, then we continue using this
1983  * metaslab so that we don't dirty more block and cause more sync passes.
1984  */
1985 void
1986 metaslab_segment_may_passivate(metaslab_t *msp)
1987 {
1988 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1989 
1990 	if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
1991 		return;
1992 
1993 	/*
1994 	 * Since we are in the middle of a sync pass, the most accurate
1995 	 * information that is accessible to us is the in-core range tree
1996 	 * histogram; calculate the new weight based on that information.
1997 	 */
1998 	uint64_t weight = metaslab_weight_from_range_tree(msp);
1999 	int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
2000 	int current_idx = WEIGHT_GET_INDEX(weight);
2001 
2002 	if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
2003 		metaslab_passivate(msp, weight);
2004 }
2005 
2006 static void
2007 metaslab_preload(void *arg)
2008 {
2009 	metaslab_t *msp = arg;
2010 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2011 
2012 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
2013 
2014 	mutex_enter(&msp->ms_lock);
2015 	metaslab_load_wait(msp);
2016 	if (!msp->ms_loaded)
2017 		(void) metaslab_load(msp);
2018 	msp->ms_selected_txg = spa_syncing_txg(spa);
2019 	mutex_exit(&msp->ms_lock);
2020 }
2021 
2022 static void
2023 metaslab_group_preload(metaslab_group_t *mg)
2024 {
2025 	spa_t *spa = mg->mg_vd->vdev_spa;
2026 	metaslab_t *msp;
2027 	avl_tree_t *t = &mg->mg_metaslab_tree;
2028 	int m = 0;
2029 
2030 	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
2031 		taskq_wait(mg->mg_taskq);
2032 		return;
2033 	}
2034 
2035 	mutex_enter(&mg->mg_lock);
2036 
2037 	/*
2038 	 * Load the next potential metaslabs
2039 	 */
2040 	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
2041 		ASSERT3P(msp->ms_group, ==, mg);
2042 
2043 		/*
2044 		 * We preload only the maximum number of metaslabs specified
2045 		 * by metaslab_preload_limit. If a metaslab is being forced
2046 		 * to condense then we preload it too. This will ensure
2047 		 * that force condensing happens in the next txg.
2048 		 */
2049 		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
2050 			continue;
2051 		}
2052 
2053 		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
2054 		    msp, TQ_SLEEP) != NULL);
2055 	}
2056 	mutex_exit(&mg->mg_lock);
2057 }
2058 
2059 /*
2060  * Determine if the space map's on-disk footprint is past our tolerance
2061  * for inefficiency. We would like to use the following criteria to make
2062  * our decision:
2063  *
2064  * 1. The size of the space map object should not dramatically increase as a
2065  * result of writing out the free space range tree.
2066  *
2067  * 2. The minimal on-disk space map representation is zfs_condense_pct/100
2068  * times the size than the free space range tree representation
2069  * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
2070  *
2071  * 3. The on-disk size of the space map should actually decrease.
2072  *
2073  * Checking the first condition is tricky since we don't want to walk
2074  * the entire AVL tree calculating the estimated on-disk size. Instead we
2075  * use the size-ordered range tree in the metaslab and calculate the
2076  * size required to write out the largest segment in our free tree. If the
2077  * size required to represent that segment on disk is larger than the space
2078  * map object then we avoid condensing this map.
2079  *
2080  * To determine the second criterion we use a best-case estimate and assume
2081  * each segment can be represented on-disk as a single 64-bit entry. We refer
2082  * to this best-case estimate as the space map's minimal form.
2083  *
2084  * Unfortunately, we cannot compute the on-disk size of the space map in this
2085  * context because we cannot accurately compute the effects of compression, etc.
2086  * Instead, we apply the heuristic described in the block comment for
2087  * zfs_metaslab_condense_block_threshold - we only condense if the space used
2088  * is greater than a threshold number of blocks.
2089  */
2090 static boolean_t
2091 metaslab_should_condense(metaslab_t *msp)
2092 {
2093 	space_map_t *sm = msp->ms_sm;
2094 	range_seg_t *rs;
2095 	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
2096 	dmu_object_info_t doi;
2097 	uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
2098 
2099 	ASSERT(MUTEX_HELD(&msp->ms_lock));
2100 	ASSERT(msp->ms_loaded);
2101 
2102 	/*
2103 	 * Use the ms_size_tree range tree, which is ordered by size, to
2104 	 * obtain the largest segment in the free tree. We always condense
2105 	 * metaslabs that are empty and metaslabs for which a condense
2106 	 * request has been made.
2107 	 */
2108 	rs = avl_last(&msp->ms_size_tree);
2109 	if (rs == NULL || msp->ms_condense_wanted)
2110 		return (B_TRUE);
2111 
2112 	/*
2113 	 * Calculate the number of 64-bit entries this segment would
2114 	 * require when written to disk. If this single segment would be
2115 	 * larger on-disk than the entire current on-disk structure, then
2116 	 * clearly condensing will increase the on-disk structure size.
2117 	 */
2118 	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
2119 	entries = size / (MIN(size, SM_RUN_MAX));
2120 	segsz = entries * sizeof (uint64_t);
2121 
2122 	optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
2123 	object_size = space_map_length(msp->ms_sm);
2124 
2125 	dmu_object_info_from_db(sm->sm_dbuf, &doi);
2126 	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
2127 
2128 	return (segsz <= object_size &&
2129 	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
2130 	    object_size > zfs_metaslab_condense_block_threshold * record_size);
2131 }
2132 
2133 /*
2134  * Condense the on-disk space map representation to its minimized form.
2135  * The minimized form consists of a small number of allocations followed by
2136  * the entries of the free range tree.
2137  */
2138 static void
2139 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
2140 {
2141 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2142 	range_tree_t *condense_tree;
2143 	space_map_t *sm = msp->ms_sm;
2144 
2145 	ASSERT(MUTEX_HELD(&msp->ms_lock));
2146 	ASSERT3U(spa_sync_pass(spa), ==, 1);
2147 	ASSERT(msp->ms_loaded);
2148 
2149 
2150 	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
2151 	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
2152 	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
2153 	    msp->ms_group->mg_vd->vdev_spa->spa_name,
2154 	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
2155 	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
2156 
2157 	msp->ms_condense_wanted = B_FALSE;
2158 
2159 	/*
2160 	 * Create an range tree that is 100% allocated. We remove segments
2161 	 * that have been freed in this txg, any deferred frees that exist,
2162 	 * and any allocation in the future. Removing segments should be
2163 	 * a relatively inexpensive operation since we expect these trees to
2164 	 * have a small number of nodes.
2165 	 */
2166 	condense_tree = range_tree_create(NULL, NULL);
2167 	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
2168 
2169 	/*
2170 	 * Remove what's been freed in this txg from the condense_tree.
2171 	 * Since we're in sync_pass 1, we know that all the frees from
2172 	 * this txg are in the freeingtree.
2173 	 */
2174 	range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);
2175 
2176 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2177 		range_tree_walk(msp->ms_defertree[t],
2178 		    range_tree_remove, condense_tree);
2179 	}
2180 
2181 	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2182 		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
2183 		    range_tree_remove, condense_tree);
2184 	}
2185 
2186 	/*
2187 	 * We're about to drop the metaslab's lock thus allowing
2188 	 * other consumers to change it's content. Set the
2189 	 * metaslab's ms_condensing flag to ensure that
2190 	 * allocations on this metaslab do not occur while we're
2191 	 * in the middle of committing it to disk. This is only critical
2192 	 * for the ms_tree as all other range trees use per txg
2193 	 * views of their content.
2194 	 */
2195 	msp->ms_condensing = B_TRUE;
2196 
2197 	mutex_exit(&msp->ms_lock);
2198 	space_map_truncate(sm, tx);
2199 
2200 	/*
2201 	 * While we would ideally like to create a space map representation
2202 	 * that consists only of allocation records, doing so can be
2203 	 * prohibitively expensive because the in-core free tree can be
2204 	 * large, and therefore computationally expensive to subtract
2205 	 * from the condense_tree. Instead we sync out two trees, a cheap
2206 	 * allocation only tree followed by the in-core free tree. While not
2207 	 * optimal, this is typically close to optimal, and much cheaper to
2208 	 * compute.
2209 	 */
2210 	space_map_write(sm, condense_tree, SM_ALLOC, tx);
2211 	range_tree_vacate(condense_tree, NULL, NULL);
2212 	range_tree_destroy(condense_tree);
2213 
2214 	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
2215 	mutex_enter(&msp->ms_lock);
2216 	msp->ms_condensing = B_FALSE;
2217 }
2218 
2219 /*
2220  * Write a metaslab to disk in the context of the specified transaction group.
2221  */
2222 void
2223 metaslab_sync(metaslab_t *msp, uint64_t txg)
2224 {
2225 	metaslab_group_t *mg = msp->ms_group;
2226 	vdev_t *vd = mg->mg_vd;
2227 	spa_t *spa = vd->vdev_spa;
2228 	objset_t *mos = spa_meta_objset(spa);
2229 	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
2230 	dmu_tx_t *tx;
2231 	uint64_t object = space_map_object(msp->ms_sm);
2232 
2233 	ASSERT(!vd->vdev_ishole);
2234 
2235 	/*
2236 	 * This metaslab has just been added so there's no work to do now.
2237 	 */
2238 	if (msp->ms_freeingtree == NULL) {
2239 		ASSERT3P(alloctree, ==, NULL);
2240 		return;
2241 	}
2242 
2243 	ASSERT3P(alloctree, !=, NULL);
2244 	ASSERT3P(msp->ms_freeingtree, !=, NULL);
2245 	ASSERT3P(msp->ms_freedtree, !=, NULL);
2246 
2247 	/*
2248 	 * Normally, we don't want to process a metaslab if there
2249 	 * are no allocations or frees to perform. However, if the metaslab
2250 	 * is being forced to condense and it's loaded, we need to let it
2251 	 * through.
2252 	 */
2253 	if (range_tree_space(alloctree) == 0 &&
2254 	    range_tree_space(msp->ms_freeingtree) == 0 &&
2255 	    !(msp->ms_loaded && msp->ms_condense_wanted))
2256 		return;
2257 
2258 
2259 	VERIFY(txg <= spa_final_dirty_txg(spa));
2260 
2261 	/*
2262 	 * The only state that can actually be changing concurrently with
2263 	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
2264 	 * be modifying this txg's alloctree, freeingtree, freedtree, or
2265 	 * space_map_phys_t.  We drop ms_lock whenever we could call
2266 	 * into the DMU, because the DMU can call down to us
2267 	 * (e.g. via zio_free()) at any time.
2268 	 *
2269 	 * The spa_vdev_remove_thread() can be reading metaslab state
2270 	 * concurrently, and it is locked out by the ms_sync_lock.  Note
2271 	 * that the ms_lock is insufficient for this, because it is dropped
2272 	 * by space_map_write().
2273 	 */
2274 
2275 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2276 
2277 	if (msp->ms_sm == NULL) {
2278 		uint64_t new_object;
2279 
2280 		new_object = space_map_alloc(mos, tx);
2281 		VERIFY3U(new_object, !=, 0);
2282 
2283 		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2284 		    msp->ms_start, msp->ms_size, vd->vdev_ashift));
2285 		ASSERT(msp->ms_sm != NULL);
2286 	}
2287 
2288 	mutex_enter(&msp->ms_sync_lock);
2289 	mutex_enter(&msp->ms_lock);
2290 
2291 	/*
2292 	 * Note: metaslab_condense() clears the space map's histogram.
2293 	 * Therefore we must verify and remove this histogram before
2294 	 * condensing.
2295 	 */
2296 	metaslab_group_histogram_verify(mg);
2297 	metaslab_class_histogram_verify(mg->mg_class);
2298 	metaslab_group_histogram_remove(mg, msp);
2299 
2300 	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
2301 	    metaslab_should_condense(msp)) {
2302 		metaslab_condense(msp, txg, tx);
2303 	} else {
2304 		mutex_exit(&msp->ms_lock);
2305 		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
2306 		space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
2307 		mutex_enter(&msp->ms_lock);
2308 	}
2309 
2310 	if (msp->ms_loaded) {
2311 		/*
2312 		 * When the space map is loaded, we have an accurate
2313 		 * histogram in the range tree. This gives us an opportunity
2314 		 * to bring the space map's histogram up-to-date so we clear
2315 		 * it first before updating it.
2316 		 */
2317 		space_map_histogram_clear(msp->ms_sm);
2318 		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
2319 
2320 		/*
2321 		 * Since we've cleared the histogram we need to add back
2322 		 * any free space that has already been processed, plus
2323 		 * any deferred space. This allows the on-disk histogram
2324 		 * to accurately reflect all free space even if some space
2325 		 * is not yet available for allocation (i.e. deferred).
2326 		 */
2327 		space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);
2328 
2329 		/*
2330 		 * Add back any deferred free space that has not been
2331 		 * added back into the in-core free tree yet. This will
2332 		 * ensure that we don't end up with a space map histogram
2333 		 * that is completely empty unless the metaslab is fully
2334 		 * allocated.
2335 		 */
2336 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2337 			space_map_histogram_add(msp->ms_sm,
2338 			    msp->ms_defertree[t], tx);
2339 		}
2340 	}
2341 
2342 	/*
2343 	 * Always add the free space from this sync pass to the space
2344 	 * map histogram. We want to make sure that the on-disk histogram
2345 	 * accounts for all free space. If the space map is not loaded,
2346 	 * then we will lose some accuracy but will correct it the next
2347 	 * time we load the space map.
2348 	 */
2349 	space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);
2350 
2351 	metaslab_group_histogram_add(mg, msp);
2352 	metaslab_group_histogram_verify(mg);
2353 	metaslab_class_histogram_verify(mg->mg_class);
2354 
2355 	/*
2356 	 * For sync pass 1, we avoid traversing this txg's free range tree
2357 	 * and instead will just swap the pointers for freeingtree and
2358 	 * freedtree. We can safely do this since the freed_tree is
2359 	 * guaranteed to be empty on the initial pass.
2360 	 */
2361 	if (spa_sync_pass(spa) == 1) {
2362 		range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
2363 	} else {
2364 		range_tree_vacate(msp->ms_freeingtree,
2365 		    range_tree_add, msp->ms_freedtree);
2366 	}
2367 	range_tree_vacate(alloctree, NULL, NULL);
2368 
2369 	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2370 	ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
2371 	ASSERT0(range_tree_space(msp->ms_freeingtree));
2372 
2373 	mutex_exit(&msp->ms_lock);
2374 
2375 	if (object != space_map_object(msp->ms_sm)) {
2376 		object = space_map_object(msp->ms_sm);
2377 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2378 		    msp->ms_id, sizeof (uint64_t), &object, tx);
2379 	}
2380 	mutex_exit(&msp->ms_sync_lock);
2381 	dmu_tx_commit(tx);
2382 }
2383 
2384 /*
2385  * Called after a transaction group has completely synced to mark
2386  * all of the metaslab's free space as usable.
2387  */
2388 void
2389 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2390 {
2391 	metaslab_group_t *mg = msp->ms_group;
2392 	vdev_t *vd = mg->mg_vd;
2393 	spa_t *spa = vd->vdev_spa;
2394 	range_tree_t **defer_tree;
2395 	int64_t alloc_delta, defer_delta;
2396 	boolean_t defer_allowed = B_TRUE;
2397 
2398 	ASSERT(!vd->vdev_ishole);
2399 
2400 	mutex_enter(&msp->ms_lock);
2401 
2402 	/*
2403 	 * If this metaslab is just becoming available, initialize its
2404 	 * range trees and add its capacity to the vdev.
2405 	 */
2406 	if (msp->ms_freedtree == NULL) {
2407 		for (int t = 0; t < TXG_SIZE; t++) {
2408 			ASSERT(msp->ms_alloctree[t] == NULL);
2409 
2410 			msp->ms_alloctree[t] = range_tree_create(NULL, NULL);
2411 		}
2412 
2413 		ASSERT3P(msp->ms_freeingtree, ==, NULL);
2414 		msp->ms_freeingtree = range_tree_create(NULL, NULL);
2415 
2416 		ASSERT3P(msp->ms_freedtree, ==, NULL);
2417 		msp->ms_freedtree = range_tree_create(NULL, NULL);
2418 
2419 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2420 			ASSERT(msp->ms_defertree[t] == NULL);
2421 
2422 			msp->ms_defertree[t] = range_tree_create(NULL, NULL);
2423 		}
2424 
2425 		vdev_space_update(vd, 0, 0, msp->ms_size);
2426 	}
2427 
2428 	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
2429 
2430 	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
2431 	    metaslab_class_get_alloc(spa_normal_class(spa));
2432 	if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
2433 		defer_allowed = B_FALSE;
2434 	}
2435 
2436 	defer_delta = 0;
2437 	alloc_delta = space_map_alloc_delta(msp->ms_sm);
2438 	if (defer_allowed) {
2439 		defer_delta = range_tree_space(msp->ms_freedtree) -
2440 		    range_tree_space(*defer_tree);
2441 	} else {
2442 		defer_delta -= range_tree_space(*defer_tree);
2443 	}
2444 
2445 	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2446 
2447 	/*
2448 	 * If there's a metaslab_load() in progress, wait for it to complete
2449 	 * so that we have a consistent view of the in-core space map.
2450 	 */
2451 	metaslab_load_wait(msp);
2452 
2453 	/*
2454 	 * Move the frees from the defer_tree back to the free
2455 	 * range tree (if it's loaded). Swap the freed_tree and the
2456 	 * defer_tree -- this is safe to do because we've just emptied out
2457 	 * the defer_tree.
2458 	 */
2459 	range_tree_vacate(*defer_tree,
2460 	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2461 	if (defer_allowed) {
2462 		range_tree_swap(&msp->ms_freedtree, defer_tree);
2463 	} else {
2464 		range_tree_vacate(msp->ms_freedtree,
2465 		    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2466 	}
2467 
2468 	space_map_update(msp->ms_sm);
2469 
2470 	msp->ms_deferspace += defer_delta;
2471 	ASSERT3S(msp->ms_deferspace, >=, 0);
2472 	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2473 	if (msp->ms_deferspace != 0) {
2474 		/*
2475 		 * Keep syncing this metaslab until all deferred frees
2476 		 * are back in circulation.
2477 		 */
2478 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2479 	}
2480 
2481 	/*
2482 	 * Calculate the new weights before unloading any metaslabs.
2483 	 * This will give us the most accurate weighting.
2484 	 */
2485 	metaslab_group_sort(mg, msp, metaslab_weight(msp));
2486 
2487 	/*
2488 	 * If the metaslab is loaded and we've not tried to load or allocate
2489 	 * from it in 'metaslab_unload_delay' txgs, then unload it.
2490 	 */
2491 	if (msp->ms_loaded &&
2492 	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
2493 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2494 			VERIFY0(range_tree_space(
2495 			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
2496 		}
2497 
2498 		if (!metaslab_debug_unload)
2499 			metaslab_unload(msp);
2500 	}
2501 
2502 	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2503 	ASSERT0(range_tree_space(msp->ms_freeingtree));
2504 	ASSERT0(range_tree_space(msp->ms_freedtree));
2505 
2506 	mutex_exit(&msp->ms_lock);
2507 }
2508 
2509 void
2510 metaslab_sync_reassess(metaslab_group_t *mg)
2511 {
2512 	spa_t *spa = mg->mg_class->mc_spa;
2513 
2514 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2515 	metaslab_group_alloc_update(mg);
2516 	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2517 
2518 	/*
2519 	 * Preload the next potential metaslabs but only on active
2520 	 * metaslab groups. We can get into a state where the metaslab
2521 	 * is no longer active since we dirty metaslabs as we remove a
2522 	 * a device, thus potentially making the metaslab group eligible
2523 	 * for preloading.
2524 	 */
2525 	if (mg->mg_activation_count > 0) {
2526 		metaslab_group_preload(mg);
2527 	}
2528 	spa_config_exit(spa, SCL_ALLOC, FTAG);
2529 }
2530 
2531 static uint64_t
2532 metaslab_distance(metaslab_t *msp, dva_t *dva)
2533 {
2534 	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2535 	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2536 	uint64_t start = msp->ms_id;
2537 
2538 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2539 		return (1ULL << 63);
2540 
2541 	if (offset < start)
2542 		return ((start - offset) << ms_shift);
2543 	if (offset > start)
2544 		return ((offset - start) << ms_shift);
2545 	return (0);
2546 }
2547 
2548 /*
2549  * ==========================================================================
2550  * Metaslab allocation tracing facility
2551  * ==========================================================================
2552  */
2553 kstat_t *metaslab_trace_ksp;
2554 kstat_named_t metaslab_trace_over_limit;
2555 
2556 void
2557 metaslab_alloc_trace_init(void)
2558 {
2559 	ASSERT(metaslab_alloc_trace_cache == NULL);
2560 	metaslab_alloc_trace_cache = kmem_cache_create(
2561 	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
2562 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2563 	metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
2564 	    "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
2565 	if (metaslab_trace_ksp != NULL) {
2566 		metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
2567 		kstat_named_init(&metaslab_trace_over_limit,
2568 		    "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
2569 		kstat_install(metaslab_trace_ksp);
2570 	}
2571 }
2572 
2573 void
2574 metaslab_alloc_trace_fini(void)
2575 {
2576 	if (metaslab_trace_ksp != NULL) {
2577 		kstat_delete(metaslab_trace_ksp);
2578 		metaslab_trace_ksp = NULL;
2579 	}
2580 	kmem_cache_destroy(metaslab_alloc_trace_cache);
2581 	metaslab_alloc_trace_cache = NULL;
2582 }
2583 
2584 /*
2585  * Add an allocation trace element to the allocation tracing list.
2586  */
2587 static void
2588 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2589     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
2590 {
2591 	if (!metaslab_trace_enabled)
2592 		return;
2593 
2594 	/*
2595 	 * When the tracing list reaches its maximum we remove
2596 	 * the second element in the list before adding a new one.
2597 	 * By removing the second element we preserve the original
2598 	 * entry as a clue to what allocations steps have already been
2599 	 * performed.
2600 	 */
2601 	if (zal->zal_size == metaslab_trace_max_entries) {
2602 		metaslab_alloc_trace_t *mat_next;
2603 #ifdef DEBUG
2604 		panic("too many entries in allocation list");
2605 #endif
2606 		atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
2607 		zal->zal_size--;
2608 		mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
2609 		list_remove(&zal->zal_list, mat_next);
2610 		kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
2611 	}
2612 
2613 	metaslab_alloc_trace_t *mat =
2614 	    kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
2615 	list_link_init(&mat->mat_list_node);
2616 	mat->mat_mg = mg;
2617 	mat->mat_msp = msp;
2618 	mat->mat_size = psize;
2619 	mat->mat_dva_id = dva_id;
2620 	mat->mat_offset = offset;
2621 	mat->mat_weight = 0;
2622 
2623 	if (msp != NULL)
2624 		mat->mat_weight = msp->ms_weight;
2625 
2626 	/*
2627 	 * The list is part of the zio so locking is not required. Only
2628 	 * a single thread will perform allocations for a given zio.
2629 	 */
2630 	list_insert_tail(&zal->zal_list, mat);
2631 	zal->zal_size++;
2632 
2633 	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
2634 }
2635 
2636 void
2637 metaslab_trace_init(zio_alloc_list_t *zal)
2638 {
2639 	list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
2640 	    offsetof(metaslab_alloc_trace_t, mat_list_node));
2641 	zal->zal_size = 0;
2642 }
2643 
2644 void
2645 metaslab_trace_fini(zio_alloc_list_t *zal)
2646 {
2647 	metaslab_alloc_trace_t *mat;
2648 
2649 	while ((mat = list_remove_head(&zal->zal_list)) != NULL)
2650 		kmem_cache_free(metaslab_alloc_trace_cache, mat);
2651 	list_destroy(&zal->zal_list);
2652 	zal->zal_size = 0;
2653 }
2654 
2655 /*
2656  * ==========================================================================
2657  * Metaslab block operations
2658  * ==========================================================================
2659  */
2660 
2661 static void
2662 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2663 {
2664 	if (!(flags & METASLAB_ASYNC_ALLOC) ||
2665 	    flags & METASLAB_DONT_THROTTLE)
2666 		return;
2667 
2668 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2669 	if (!mg->mg_class->mc_alloc_throttle_enabled)
2670 		return;
2671 
2672 	(void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2673 }
2674 
2675 void
2676 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2677 {
2678 	if (!(flags & METASLAB_ASYNC_ALLOC) ||
2679 	    flags & METASLAB_DONT_THROTTLE)
2680 		return;
2681 
2682 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2683 	if (!mg->mg_class->mc_alloc_throttle_enabled)
2684 		return;
2685 
2686 	(void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2687 }
2688 
2689 void
2690 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
2691 {
2692 #ifdef ZFS_DEBUG
2693 	const dva_t *dva = bp->blk_dva;
2694 	int ndvas = BP_GET_NDVAS(bp);
2695 
2696 	for (int d = 0; d < ndvas; d++) {
2697 		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2698 		metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2699 		VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
2700 	}
2701 #endif
2702 }
2703 
2704 static uint64_t
2705 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
2706 {
2707 	uint64_t start;
2708 	range_tree_t *rt = msp->ms_tree;
2709 	metaslab_class_t *mc = msp->ms_group->mg_class;
2710 
2711 	VERIFY(!msp->ms_condensing);
2712 
2713 	start = mc->mc_ops->msop_alloc(msp, size);
2714 	if (start != -1ULL) {
2715 		metaslab_group_t *mg = msp->ms_group;
2716 		vdev_t *vd = mg->mg_vd;
2717 
2718 		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
2719 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2720 		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
2721 		range_tree_remove(rt, start, size);
2722 
2723 		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2724 			vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2725 
2726 		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
2727 
2728 		/* Track the last successful allocation */
2729 		msp->ms_alloc_txg = txg;
2730 		metaslab_verify_space(msp, txg);
2731 	}
2732 
2733 	/*
2734 	 * Now that we've attempted the allocation we need to update the
2735 	 * metaslab's maximum block size since it may have changed.
2736 	 */
2737 	msp->ms_max_size = metaslab_block_maxsize(msp);
2738 	return (start);
2739 }
2740 
2741 static uint64_t
2742 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
2743     uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2744 {
2745 	metaslab_t *msp = NULL;
2746 	uint64_t offset = -1ULL;
2747 	uint64_t activation_weight;
2748 	uint64_t target_distance;
2749 	int i;
2750 
2751 	activation_weight = METASLAB_WEIGHT_PRIMARY;
2752 	for (i = 0; i < d; i++) {
2753 		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2754 			activation_weight = METASLAB_WEIGHT_SECONDARY;
2755 			break;
2756 		}
2757 	}
2758 
2759 	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
2760 	search->ms_weight = UINT64_MAX;
2761 	search->ms_start = 0;
2762 	for (;;) {
2763 		boolean_t was_active;
2764 		avl_tree_t *t = &mg->mg_metaslab_tree;
2765 		avl_index_t idx;
2766 
2767 		mutex_enter(&mg->mg_lock);
2768 
2769 		/*
2770 		 * Find the metaslab with the highest weight that is less
2771 		 * than what we've already tried.  In the common case, this
2772 		 * means that we will examine each metaslab at most once.
2773 		 * Note that concurrent callers could reorder metaslabs
2774 		 * by activation/passivation once we have dropped the mg_lock.
2775 		 * If a metaslab is activated by another thread, and we fail
2776 		 * to allocate from the metaslab we have selected, we may
2777 		 * not try the newly-activated metaslab, and instead activate
2778 		 * another metaslab.  This is not optimal, but generally
2779 		 * does not cause any problems (a possible exception being
2780 		 * if every metaslab is completely full except for the
2781 		 * the newly-activated metaslab which we fail to examine).
2782 		 */
2783 		msp = avl_find(t, search, &idx);
2784 		if (msp == NULL)
2785 			msp = avl_nearest(t, idx, AVL_AFTER);
2786 		for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
2787 
2788 			if (!metaslab_should_allocate(msp, asize)) {
2789 				metaslab_trace_add(zal, mg, msp, asize, d,
2790 				    TRACE_TOO_SMALL);
2791 				continue;
2792 			}
2793 
2794 			/*
2795 			 * If the selected metaslab is condensing, skip it.
2796 			 */
2797 			if (msp->ms_condensing)
2798 				continue;
2799 
2800 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2801 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2802 				break;
2803 
2804 			target_distance = min_distance +
2805 			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
2806 			    min_distance >> 1);
2807 
2808 			for (i = 0; i < d; i++) {
2809 				if (metaslab_distance(msp, &dva[i]) <
2810 				    target_distance)
2811 					break;
2812 			}
2813 			if (i == d)
2814 				break;
2815 		}
2816 		mutex_exit(&mg->mg_lock);
2817 		if (msp == NULL) {
2818 			kmem_free(search, sizeof (*search));
2819 			return (-1ULL);
2820 		}
2821 		search->ms_weight = msp->ms_weight;
2822 		search->ms_start = msp->ms_start + 1;
2823 
2824 		mutex_enter(&msp->ms_lock);
2825 
2826 		/*
2827 		 * Ensure that the metaslab we have selected is still
2828 		 * capable of handling our request. It's possible that
2829 		 * another thread may have changed the weight while we
2830 		 * were blocked on the metaslab lock. We check the
2831 		 * active status first to see if we need to reselect
2832 		 * a new metaslab.
2833 		 */
2834 		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
2835 			mutex_exit(&msp->ms_lock);
2836 			continue;
2837 		}
2838 
2839 		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2840 		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
2841 			metaslab_passivate(msp,
2842 			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
2843 			mutex_exit(&msp->ms_lock);
2844 			continue;
2845 		}
2846 
2847 		if (metaslab_activate(msp, activation_weight) != 0) {
2848 			mutex_exit(&msp->ms_lock);
2849 			continue;
2850 		}
2851 		msp->ms_selected_txg = txg;
2852 
2853 		/*
2854 		 * Now that we have the lock, recheck to see if we should
2855 		 * continue to use this metaslab for this allocation. The
2856 		 * the metaslab is now loaded so metaslab_should_allocate() can
2857 		 * accurately determine if the allocation attempt should
2858 		 * proceed.
2859 		 */
2860 		if (!metaslab_should_allocate(msp, asize)) {
2861 			/* Passivate this metaslab and select a new one. */
2862 			metaslab_trace_add(zal, mg, msp, asize, d,
2863 			    TRACE_TOO_SMALL);
2864 			goto next;
2865 		}
2866 
2867 		/*
2868 		 * If this metaslab is currently condensing then pick again as
2869 		 * we can't manipulate this metaslab until it's committed
2870 		 * to disk.
2871 		 */
2872 		if (msp->ms_condensing) {
2873 			metaslab_trace_add(zal, mg, msp, asize, d,
2874 			    TRACE_CONDENSING);
2875 			mutex_exit(&msp->ms_lock);
2876 			continue;
2877 		}
2878 
2879 		offset = metaslab_block_alloc(msp, asize, txg);
2880 		metaslab_trace_add(zal, mg, msp, asize, d, offset);
2881 
2882 		if (offset != -1ULL) {
2883 			/* Proactively passivate the metaslab, if needed */
2884 			metaslab_segment_may_passivate(msp);
2885 			break;
2886 		}
2887 next:
2888 		ASSERT(msp->ms_loaded);
2889 
2890 		/*
2891 		 * We were unable to allocate from this metaslab so determine
2892 		 * a new weight for this metaslab. Now that we have loaded
2893 		 * the metaslab we can provide a better hint to the metaslab
2894 		 * selector.
2895 		 *
2896 		 * For space-based metaslabs, we use the maximum block size.
2897 		 * This information is only available when the metaslab
2898 		 * is loaded and is more accurate than the generic free
2899 		 * space weight that was calculated by metaslab_weight().
2900 		 * This information allows us to quickly compare the maximum
2901 		 * available allocation in the metaslab to the allocation
2902 		 * size being requested.
2903 		 *
2904 		 * For segment-based metaslabs, determine the new weight
2905 		 * based on the highest bucket in the range tree. We
2906 		 * explicitly use the loaded segment weight (i.e. the range
2907 		 * tree histogram) since it contains the space that is
2908 		 * currently available for allocation and is accurate
2909 		 * even within a sync pass.
2910 		 */
2911 		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
2912 			uint64_t weight = metaslab_block_maxsize(msp);
2913 			WEIGHT_SET_SPACEBASED(weight);
2914 			metaslab_passivate(msp, weight);
2915 		} else {
2916 			metaslab_passivate(msp,
2917 			    metaslab_weight_from_range_tree(msp));
2918 		}
2919 
2920 		/*
2921 		 * We have just failed an allocation attempt, check
2922 		 * that metaslab_should_allocate() agrees. Otherwise,
2923 		 * we may end up in an infinite loop retrying the same
2924 		 * metaslab.
2925 		 */
2926 		ASSERT(!metaslab_should_allocate(msp, asize));
2927 		mutex_exit(&msp->ms_lock);
2928 	}
2929 	mutex_exit(&msp->ms_lock);
2930 	kmem_free(search, sizeof (*search));
2931 	return (offset);
2932 }
2933 
2934 static uint64_t
2935 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
2936     uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2937 {
2938 	uint64_t offset;
2939 	ASSERT(mg->mg_initialized);
2940 
2941 	offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
2942 	    min_distance, dva, d);
2943 
2944 	mutex_enter(&mg->mg_lock);
2945 	if (offset == -1ULL) {
2946 		mg->mg_failed_allocations++;
2947 		metaslab_trace_add(zal, mg, NULL, asize, d,
2948 		    TRACE_GROUP_FAILURE);
2949 		if (asize == SPA_GANGBLOCKSIZE) {
2950 			/*
2951 			 * This metaslab group was unable to allocate
2952 			 * the minimum gang block size so it must be out of
2953 			 * space. We must notify the allocation throttle
2954 			 * to start skipping allocation attempts to this
2955 			 * metaslab group until more space becomes available.
2956 			 * Note: this failure cannot be caused by the
2957 			 * allocation throttle since the allocation throttle
2958 			 * is only responsible for skipping devices and
2959 			 * not failing block allocations.
2960 			 */
2961 			mg->mg_no_free_space = B_TRUE;
2962 		}
2963 	}
2964 	mg->mg_allocations++;
2965 	mutex_exit(&mg->mg_lock);
2966 	return (offset);
2967 }
2968 
2969 /*
2970  * If we have to write a ditto block (i.e. more than one DVA for a given BP)
2971  * on the same vdev as an existing DVA of this BP, then try to allocate it
2972  * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
2973  * existing DVAs.
2974  */
2975 int ditto_same_vdev_distance_shift = 3;
2976 
2977 /*
2978  * Allocate a block for the specified i/o.
2979  */
2980 int
2981 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
2982     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
2983     zio_alloc_list_t *zal)
2984 {
2985 	metaslab_group_t *mg, *rotor;
2986 	vdev_t *vd;
2987 	boolean_t try_hard = B_FALSE;
2988 
2989 	ASSERT(!DVA_IS_VALID(&dva[d]));
2990 
2991 	/*
2992 	 * For testing, make some blocks above a certain size be gang blocks.
2993 	 */
2994 	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) {
2995 		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
2996 		return (SET_ERROR(ENOSPC));
2997 	}
2998 
2999 	/*
3000 	 * Start at the rotor and loop through all mgs until we find something.
3001 	 * Note that there's no locking on mc_rotor or mc_aliquot because
3002 	 * nothing actually breaks if we miss a few updates -- we just won't
3003 	 * allocate quite as evenly.  It all balances out over time.
3004 	 *
3005 	 * If we are doing ditto or log blocks, try to spread them across
3006 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
3007 	 * allocated all of our ditto blocks, then try and spread them out on
3008 	 * that vdev as much as possible.  If it turns out to not be possible,
3009 	 * gradually lower our standards until anything becomes acceptable.
3010 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
3011 	 * gives us hope of containing our fault domains to something we're
3012 	 * able to reason about.  Otherwise, any two top-level vdev failures
3013 	 * will guarantee the loss of data.  With consecutive allocation,
3014 	 * only two adjacent top-level vdev failures will result in data loss.
3015 	 *
3016 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
3017 	 * ourselves on the same vdev as our gang block header.  That
3018 	 * way, we can hope for locality in vdev_cache, plus it makes our
3019 	 * fault domains something tractable.
3020 	 */
3021 	if (hintdva) {
3022 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
3023 
3024 		/*
3025 		 * It's possible the vdev we're using as the hint no
3026 		 * longer exists or its mg has been closed (e.g. by
3027 		 * device removal).  Consult the rotor when
3028 		 * all else fails.
3029 		 */
3030 		if (vd != NULL && vd->vdev_mg != NULL) {
3031 			mg = vd->vdev_mg;
3032 
3033 			if (flags & METASLAB_HINTBP_AVOID &&
3034 			    mg->mg_next != NULL)
3035 				mg = mg->mg_next;
3036 		} else {
3037 			mg = mc->mc_rotor;
3038 		}
3039 	} else if (d != 0) {
3040 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
3041 		mg = vd->vdev_mg->mg_next;
3042 	} else {
3043 		mg = mc->mc_rotor;
3044 	}
3045 
3046 	/*
3047 	 * If the hint put us into the wrong metaslab class, or into a
3048 	 * metaslab group that has been passivated, just follow the rotor.
3049 	 */
3050 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
3051 		mg = mc->mc_rotor;
3052 
3053 	rotor = mg;
3054 top:
3055 	do {
3056 		boolean_t allocatable;
3057 
3058 		ASSERT(mg->mg_activation_count == 1);
3059 		vd = mg->mg_vd;
3060 
3061 		/*
3062 		 * Don't allocate from faulted devices.
3063 		 */
3064 		if (try_hard) {
3065 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
3066 			allocatable = vdev_allocatable(vd);
3067 			spa_config_exit(spa, SCL_ZIO, FTAG);
3068 		} else {
3069 			allocatable = vdev_allocatable(vd);
3070 		}
3071 
3072 		/*
3073 		 * Determine if the selected metaslab group is eligible
3074 		 * for allocations. If we're ganging then don't allow
3075 		 * this metaslab group to skip allocations since that would
3076 		 * inadvertently return ENOSPC and suspend the pool
3077 		 * even though space is still available.
3078 		 */
3079 		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3080 			allocatable = metaslab_group_allocatable(mg, rotor,
3081 			    psize);
3082 		}
3083 
3084 		if (!allocatable) {
3085 			metaslab_trace_add(zal, mg, NULL, psize, d,
3086 			    TRACE_NOT_ALLOCATABLE);
3087 			goto next;
3088 		}
3089 
3090 		ASSERT(mg->mg_initialized);
3091 
3092 		/*
3093 		 * Avoid writing single-copy data to a failing,
3094 		 * non-redundant vdev, unless we've already tried all
3095 		 * other vdevs.
3096 		 */
3097 		if ((vd->vdev_stat.vs_write_errors > 0 ||
3098 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
3099 		    d == 0 && !try_hard && vd->vdev_children == 0) {
3100 			metaslab_trace_add(zal, mg, NULL, psize, d,
3101 			    TRACE_VDEV_ERROR);
3102 			goto next;
3103 		}
3104 
3105 		ASSERT(mg->mg_class == mc);
3106 
3107 		/*
3108 		 * If we don't need to try hard, then require that the
3109 		 * block be 1/8th of the device away from any other DVAs
3110 		 * in this BP.  If we are trying hard, allow any offset
3111 		 * to be used (distance=0).
3112 		 */
3113 		uint64_t distance = 0;
3114 		if (!try_hard) {
3115 			distance = vd->vdev_asize >>
3116 			    ditto_same_vdev_distance_shift;
3117 			if (distance <= (1ULL << vd->vdev_ms_shift))
3118 				distance = 0;
3119 		}
3120 
3121 		uint64_t asize = vdev_psize_to_asize(vd, psize);
3122 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3123 
3124 		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3125 		    distance, dva, d);
3126 
3127 		if (offset != -1ULL) {
3128 			/*
3129 			 * If we've just selected this metaslab group,
3130 			 * figure out whether the corresponding vdev is
3131 			 * over- or under-used relative to the pool,
3132 			 * and set an allocation bias to even it out.
3133 			 */
3134 			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
3135 				vdev_stat_t *vs = &vd->vdev_stat;
3136 				int64_t vu, cu;
3137 
3138 				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
3139 				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
3140 
3141 				/*
3142 				 * Calculate how much more or less we should
3143 				 * try to allocate from this device during
3144 				 * this iteration around the rotor.
3145 				 * For example, if a device is 80% full
3146 				 * and the pool is 20% full then we should
3147 				 * reduce allocations by 60% on this device.
3148 				 *
3149 				 * mg_bias = (20 - 80) * 512K / 100 = -307K
3150 				 *
3151 				 * This reduces allocations by 307K for this
3152 				 * iteration.
3153 				 */
3154 				mg->mg_bias = ((cu - vu) *
3155 				    (int64_t)mg->mg_aliquot) / 100;
3156 			} else if (!metaslab_bias_enabled) {
3157 				mg->mg_bias = 0;
3158 			}
3159 
3160 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
3161 			    mg->mg_aliquot + mg->mg_bias) {
3162 				mc->mc_rotor = mg->mg_next;
3163 				mc->mc_aliquot = 0;
3164 			}
3165 
3166 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
3167 			DVA_SET_OFFSET(&dva[d], offset);
3168 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
3169 			DVA_SET_ASIZE(&dva[d], asize);
3170 
3171 			return (0);
3172 		}
3173 next:
3174 		mc->mc_rotor = mg->mg_next;
3175 		mc->mc_aliquot = 0;
3176 	} while ((mg = mg->mg_next) != rotor);
3177 
3178 	/*
3179 	 * If we haven't tried hard, do so now.
3180 	 */
3181 	if (!try_hard) {
3182 		try_hard = B_TRUE;
3183 		goto top;
3184 	}
3185 
3186 	bzero(&dva[d], sizeof (dva_t));
3187 
3188 	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
3189 	return (SET_ERROR(ENOSPC));
3190 }
3191 
3192 void
3193 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
3194     uint64_t txg)
3195 {
3196 	metaslab_t *msp;
3197 	spa_t *spa = vd->vdev_spa;
3198 
3199 	ASSERT3U(txg, ==, spa->spa_syncing_txg);
3200 	ASSERT(vdev_is_concrete(vd));
3201 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3202 	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
3203 
3204 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3205 
3206 	VERIFY(!msp->ms_condensing);
3207 	VERIFY3U(offset, >=, msp->ms_start);
3208 	VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
3209 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3210 	VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
3211 
3212 	metaslab_check_free_impl(vd, offset, asize);
3213 	mutex_enter(&msp->ms_lock);
3214 	if (range_tree_space(msp->ms_freeingtree) == 0) {
3215 		vdev_dirty(vd, VDD_METASLAB, msp, txg);
3216 	}
3217 	range_tree_add(msp->ms_freeingtree, offset, asize);
3218 	mutex_exit(&msp->ms_lock);
3219 }
3220 
3221 /* ARGSUSED */
3222 void
3223 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3224     uint64_t size, void *arg)
3225 {
3226 	uint64_t *txgp = arg;
3227 
3228 	if (vd->vdev_ops->vdev_op_remap != NULL)
3229 		vdev_indirect_mark_obsolete(vd, offset, size, *txgp);
3230 	else
3231 		metaslab_free_impl(vd, offset, size, *txgp);
3232 }
3233 
3234 static void
3235 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
3236     uint64_t txg)
3237 {
3238 	spa_t *spa = vd->vdev_spa;
3239 
3240 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3241 
3242 	if (txg > spa_freeze_txg(spa))
3243 		return;
3244 
3245 	if (spa->spa_vdev_removal != NULL &&
3246 	    spa->spa_vdev_removal->svr_vdev == vd &&
3247 	    vdev_is_concrete(vd)) {
3248 		/*
3249 		 * Note: we check if the vdev is concrete because when
3250 		 * we complete the removal, we first change the vdev to be
3251 		 * an indirect vdev (in open context), and then (in syncing
3252 		 * context) clear spa_vdev_removal.
3253 		 */
3254 		free_from_removing_vdev(vd, offset, size, txg);
3255 	} else if (vd->vdev_ops->vdev_op_remap != NULL) {
3256 		vdev_indirect_mark_obsolete(vd, offset, size, txg);
3257 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
3258 		    metaslab_free_impl_cb, &txg);
3259 	} else {
3260 		metaslab_free_concrete(vd, offset, size, txg);
3261 	}
3262 }
3263 
3264 typedef struct remap_blkptr_cb_arg {
3265 	blkptr_t *rbca_bp;
3266 	spa_remap_cb_t rbca_cb;
3267 	vdev_t *rbca_remap_vd;
3268 	uint64_t rbca_remap_offset;
3269 	void *rbca_cb_arg;
3270 } remap_blkptr_cb_arg_t;
3271 
3272 void
3273 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3274     uint64_t size, void *arg)
3275 {
3276 	remap_blkptr_cb_arg_t *rbca = arg;
3277 	blkptr_t *bp = rbca->rbca_bp;
3278 
3279 	/* We can not remap split blocks. */
3280 	if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
3281 		return;
3282 	ASSERT0(inner_offset);
3283 
3284 	if (rbca->rbca_cb != NULL) {
3285 		/*
3286 		 * At this point we know that we are not handling split
3287 		 * blocks and we invoke the callback on the previous
3288 		 * vdev which must be indirect.
3289 		 */
3290 		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
3291 
3292 		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
3293 		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
3294 
3295 		/* set up remap_blkptr_cb_arg for the next call */
3296 		rbca->rbca_remap_vd = vd;
3297 		rbca->rbca_remap_offset = offset;
3298 	}
3299 
3300 	/*
3301 	 * The phys birth time is that of dva[0].  This ensures that we know
3302 	 * when each dva was written, so that resilver can determine which
3303 	 * blocks need to be scrubbed (i.e. those written during the time
3304 	 * the vdev was offline).  It also ensures that the key used in
3305 	 * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
3306 	 * we didn't change the phys_birth, a lookup in the ARC for a
3307 	 * remapped BP could find the data that was previously stored at
3308 	 * this vdev + offset.
3309 	 */
3310 	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
3311 	    DVA_GET_VDEV(&bp->blk_dva[0]));
3312 	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
3313 	bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
3314 	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
3315 
3316 	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
3317 	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
3318 }
3319 
3320 /*
3321  * If the block pointer contains any indirect DVAs, modify them to refer to
3322  * concrete DVAs.  Note that this will sometimes not be possible, leaving
3323  * the indirect DVA in place.  This happens if the indirect DVA spans multiple
3324  * segments in the mapping (i.e. it is a "split block").
3325  *
3326  * If the BP was remapped, calls the callback on the original dva (note the
3327  * callback can be called multiple times if the original indirect DVA refers
3328  * to another indirect DVA, etc).
3329  *
3330  * Returns TRUE if the BP was remapped.
3331  */
3332 boolean_t
3333 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
3334 {
3335 	remap_blkptr_cb_arg_t rbca;
3336 
3337 	if (!zfs_remap_blkptr_enable)
3338 		return (B_FALSE);
3339 
3340 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
3341 		return (B_FALSE);
3342 
3343 	/*
3344 	 * Dedup BP's can not be remapped, because ddt_phys_select() depends
3345 	 * on DVA[0] being the same in the BP as in the DDT (dedup table).
3346 	 */
3347 	if (BP_GET_DEDUP(bp))
3348 		return (B_FALSE);
3349 
3350 	/*
3351 	 * Gang blocks can not be remapped, because
3352 	 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
3353 	 * the BP used to read the gang block header (GBH) being the same
3354 	 * as the DVA[0] that we allocated for the GBH.
3355 	 */
3356 	if (BP_IS_GANG(bp))
3357 		return (B_FALSE);
3358 
3359 	/*
3360 	 * Embedded BP's have no DVA to remap.
3361 	 */
3362 	if (BP_GET_NDVAS(bp) < 1)
3363 		return (B_FALSE);
3364 
3365 	/*
3366 	 * Note: we only remap dva[0].  If we remapped other dvas, we
3367 	 * would no longer know what their phys birth txg is.
3368 	 */
3369 	dva_t *dva = &bp->blk_dva[0];
3370 
3371 	uint64_t offset = DVA_GET_OFFSET(dva);
3372 	uint64_t size = DVA_GET_ASIZE(dva);
3373 	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
3374 
3375 	if (vd->vdev_ops->vdev_op_remap == NULL)
3376 		return (B_FALSE);
3377 
3378 	rbca.rbca_bp = bp;
3379 	rbca.rbca_cb = callback;
3380 	rbca.rbca_remap_vd = vd;
3381 	rbca.rbca_remap_offset = offset;
3382 	rbca.rbca_cb_arg = arg;
3383 
3384 	/*
3385 	 * remap_blkptr_cb() will be called in order for each level of
3386 	 * indirection, until a concrete vdev is reached or a split block is
3387 	 * encountered. old_vd and old_offset are updated within the callback
3388 	 * as we go from the one indirect vdev to the next one (either concrete
3389 	 * or indirect again) in that order.
3390 	 */
3391 	vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
3392 
3393 	/* Check if the DVA wasn't remapped because it is a split block */
3394 	if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
3395 		return (B_FALSE);
3396 
3397 	return (B_TRUE);
3398 }
3399 
3400 /*
3401  * Undo the allocation of a DVA which happened in the given transaction group.
3402  */
3403 void
3404 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3405 {
3406 	metaslab_t *msp;
3407 	vdev_t *vd;
3408 	uint64_t vdev = DVA_GET_VDEV(dva);
3409 	uint64_t offset = DVA_GET_OFFSET(dva);
3410 	uint64_t size = DVA_GET_ASIZE(dva);
3411 
3412 	ASSERT(DVA_IS_VALID(dva));
3413 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3414 
3415 	if (txg > spa_freeze_txg(spa))
3416 		return;
3417 
3418 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3419 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
3420 		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
3421 		    (u_longlong_t)vdev, (u_longlong_t)offset);
3422 		ASSERT(0);
3423 		return;
3424 	}
3425 
3426 	ASSERT(!vd->vdev_removing);
3427 	ASSERT(vdev_is_concrete(vd));
3428 	ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
3429 	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
3430 
3431 	if (DVA_GET_GANG(dva))
3432 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3433 
3434 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3435 
3436 	mutex_enter(&msp->ms_lock);
3437 	range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
3438 	    offset, size);
3439 
3440 	VERIFY(!msp->ms_condensing);
3441 	VERIFY3U(offset, >=, msp->ms_start);
3442 	VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
3443 	VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
3444 	    msp->ms_size);
3445 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3446 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3447 	range_tree_add(msp->ms_tree, offset, size);
3448 	mutex_exit(&msp->ms_lock);
3449 }
3450 
3451 /*
3452  * Free the block represented by DVA in the context of the specified
3453  * transaction group.
3454  */
3455 void
3456 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3457 {
3458 	uint64_t vdev = DVA_GET_VDEV(dva);
3459 	uint64_t offset = DVA_GET_OFFSET(dva);
3460 	uint64_t size = DVA_GET_ASIZE(dva);
3461 	vdev_t *vd = vdev_lookup_top(spa, vdev);
3462 
3463 	ASSERT(DVA_IS_VALID(dva));
3464 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3465 
3466 	if (DVA_GET_GANG(dva)) {
3467 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3468 	}
3469 
3470 	metaslab_free_impl(vd, offset, size, txg);
3471 }
3472 
3473 /*
3474  * Reserve some allocation slots. The reservation system must be called
3475  * before we call into the allocator. If there aren't any available slots
3476  * then the I/O will be throttled until an I/O completes and its slots are
3477  * freed up. The function returns true if it was successful in placing
3478  * the reservation.
3479  */
3480 boolean_t
3481 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
3482     int flags)
3483 {
3484 	uint64_t available_slots = 0;
3485 	boolean_t slot_reserved = B_FALSE;
3486 
3487 	ASSERT(mc->mc_alloc_throttle_enabled);
3488 	mutex_enter(&mc->mc_lock);
3489 
3490 	uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
3491 	if (reserved_slots < mc->mc_alloc_max_slots)
3492 		available_slots = mc->mc_alloc_max_slots - reserved_slots;
3493 
3494 	if (slots <= available_slots || GANG_ALLOCATION(flags)) {
3495 		/*
3496 		 * We reserve the slots individually so that we can unreserve
3497 		 * them individually when an I/O completes.
3498 		 */
3499 		for (int d = 0; d < slots; d++) {
3500 			reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
3501 		}
3502 		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
3503 		slot_reserved = B_TRUE;
3504 	}
3505 
3506 	mutex_exit(&mc->mc_lock);
3507 	return (slot_reserved);
3508 }
3509 
3510 void
3511 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
3512 {
3513 	ASSERT(mc->mc_alloc_throttle_enabled);
3514 	mutex_enter(&mc->mc_lock);
3515 	for (int d = 0; d < slots; d++) {
3516 		(void) refcount_remove(&mc->mc_alloc_slots, zio);
3517 	}
3518 	mutex_exit(&mc->mc_lock);
3519 }
3520 
3521 static int
3522 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
3523     uint64_t txg)
3524 {
3525 	metaslab_t *msp;
3526 	spa_t *spa = vd->vdev_spa;
3527 	int error = 0;
3528 
3529 	if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
3530 		return (ENXIO);
3531 
3532 	ASSERT3P(vd->vdev_ms, !=, NULL);
3533 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3534 
3535 	mutex_enter(&msp->ms_lock);
3536 
3537 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
3538 		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
3539 
3540 	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
3541 		error = SET_ERROR(ENOENT);
3542 
3543 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
3544 		mutex_exit(&msp->ms_lock);
3545 		return (error);
3546 	}
3547 
3548 	VERIFY(!msp->ms_condensing);
3549 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3550 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3551 	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
3552 	range_tree_remove(msp->ms_tree, offset, size);
3553 
3554 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
3555 		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
3556 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
3557 		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
3558 	}
3559 
3560 	mutex_exit(&msp->ms_lock);
3561 
3562 	return (0);
3563 }
3564 
3565 typedef struct metaslab_claim_cb_arg_t {
3566 	uint64_t	mcca_txg;
3567 	int		mcca_error;
3568 } metaslab_claim_cb_arg_t;
3569 
3570 /* ARGSUSED */
3571 static void
3572 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3573     uint64_t size, void *arg)
3574 {
3575 	metaslab_claim_cb_arg_t *mcca_arg = arg;
3576 
3577 	if (mcca_arg->mcca_error == 0) {
3578 		mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
3579 		    size, mcca_arg->mcca_txg);
3580 	}
3581 }
3582 
3583 int
3584 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
3585 {
3586 	if (vd->vdev_ops->vdev_op_remap != NULL) {
3587 		metaslab_claim_cb_arg_t arg;
3588 
3589 		/*
3590 		 * Only zdb(1M) can claim on indirect vdevs.  This is used
3591 		 * to detect leaks of mapped space (that are not accounted
3592 		 * for in the obsolete counts, spacemap, or bpobj).
3593 		 */
3594 		ASSERT(!spa_writeable(vd->vdev_spa));
3595 		arg.mcca_error = 0;
3596 		arg.mcca_txg = txg;
3597 
3598 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
3599 		    metaslab_claim_impl_cb, &arg);
3600 
3601 		if (arg.mcca_error == 0) {
3602 			arg.mcca_error = metaslab_claim_concrete(vd,
3603 			    offset, size, txg);
3604 		}
3605 		return (arg.mcca_error);
3606 	} else {
3607 		return (metaslab_claim_concrete(vd, offset, size, txg));
3608 	}
3609 }
3610 
3611 /*
3612  * Intent log support: upon opening the pool after a crash, notify the SPA
3613  * of blocks that the intent log has allocated for immediate write, but
3614  * which are still considered free by the SPA because the last transaction
3615  * group didn't commit yet.
3616  */
3617 static int
3618 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3619 {
3620 	uint64_t vdev = DVA_GET_VDEV(dva);
3621 	uint64_t offset = DVA_GET_OFFSET(dva);
3622 	uint64_t size = DVA_GET_ASIZE(dva);
3623 	vdev_t *vd;
3624 
3625 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
3626 		return (SET_ERROR(ENXIO));
3627 	}
3628 
3629 	ASSERT(DVA_IS_VALID(dva));
3630 
3631 	if (DVA_GET_GANG(dva))
3632 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3633 
3634 	return (metaslab_claim_impl(vd, offset, size, txg));
3635 }
3636 
3637 int
3638 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
3639     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
3640     zio_alloc_list_t *zal, zio_t *zio)
3641 {
3642 	dva_t *dva = bp->blk_dva;
3643 	dva_t *hintdva = hintbp->blk_dva;
3644 	int error = 0;
3645 
3646 	ASSERT(bp->blk_birth == 0);
3647 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
3648 
3649 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3650 
3651 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
3652 		spa_config_exit(spa, SCL_ALLOC, FTAG);
3653 		return (SET_ERROR(ENOSPC));
3654 	}
3655 
3656 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
3657 	ASSERT(BP_GET_NDVAS(bp) == 0);
3658 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
3659 	ASSERT3P(zal, !=, NULL);
3660 
3661 	for (int d = 0; d < ndvas; d++) {
3662 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
3663 		    txg, flags, zal);
3664 		if (error != 0) {
3665 			for (d--; d >= 0; d--) {
3666 				metaslab_unalloc_dva(spa, &dva[d], txg);
3667 				metaslab_group_alloc_decrement(spa,
3668 				    DVA_GET_VDEV(&dva[d]), zio, flags);
3669 				bzero(&dva[d], sizeof (dva_t));
3670 			}
3671 			spa_config_exit(spa, SCL_ALLOC, FTAG);
3672 			return (error);
3673 		} else {
3674 			/*
3675 			 * Update the metaslab group's queue depth
3676 			 * based on the newly allocated dva.
3677 			 */
3678 			metaslab_group_alloc_increment(spa,
3679 			    DVA_GET_VDEV(&dva[d]), zio, flags);
3680 		}
3681 
3682 	}
3683 	ASSERT(error == 0);
3684 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
3685 
3686 	spa_config_exit(spa, SCL_ALLOC, FTAG);
3687 
3688 	BP_SET_BIRTH(bp, txg, txg);
3689 
3690 	return (0);
3691 }
3692 
3693 void
3694 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
3695 {
3696 	const dva_t *dva = bp->blk_dva;
3697 	int ndvas = BP_GET_NDVAS(bp);
3698 
3699 	ASSERT(!BP_IS_HOLE(bp));
3700 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
3701 
3702 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
3703 
3704 	for (int d = 0; d < ndvas; d++) {
3705 		if (now) {
3706 			metaslab_unalloc_dva(spa, &dva[d], txg);
3707 		} else {
3708 			metaslab_free_dva(spa, &dva[d], txg);
3709 		}
3710 	}
3711 
3712 	spa_config_exit(spa, SCL_FREE, FTAG);
3713 }
3714 
3715 int
3716 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
3717 {
3718 	const dva_t *dva = bp->blk_dva;
3719 	int ndvas = BP_GET_NDVAS(bp);
3720 	int error = 0;
3721 
3722 	ASSERT(!BP_IS_HOLE(bp));
3723 
3724 	if (txg != 0) {
3725 		/*
3726 		 * First do a dry run to make sure all DVAs are claimable,
3727 		 * so we don't have to unwind from partial failures below.
3728 		 */
3729 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
3730 			return (error);
3731 	}
3732 
3733 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3734 
3735 	for (int d = 0; d < ndvas; d++)
3736 		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
3737 			break;
3738 
3739 	spa_config_exit(spa, SCL_ALLOC, FTAG);
3740 
3741 	ASSERT(error == 0 || txg == 0);
3742 
3743 	return (error);
3744 }
3745 
3746 /* ARGSUSED */
3747 static void
3748 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
3749     uint64_t size, void *arg)
3750 {
3751 	if (vd->vdev_ops == &vdev_indirect_ops)
3752 		return;
3753 
3754 	metaslab_check_free_impl(vd, offset, size);
3755 }
3756 
3757 static void
3758 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
3759 {
3760 	metaslab_t *msp;
3761 	spa_t *spa = vd->vdev_spa;
3762 
3763 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
3764 		return;
3765 
3766 	if (vd->vdev_ops->vdev_op_remap != NULL) {
3767 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
3768 		    metaslab_check_free_impl_cb, NULL);
3769 		return;
3770 	}
3771 
3772 	ASSERT(vdev_is_concrete(vd));
3773 	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
3774 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3775 
3776 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3777 
3778 	mutex_enter(&msp->ms_lock);
3779 	if (msp->ms_loaded)
3780 		range_tree_verify(msp->ms_tree, offset, size);
3781 
3782 	range_tree_verify(msp->ms_freeingtree, offset, size);
3783 	range_tree_verify(msp->ms_freedtree, offset, size);
3784 	for (int j = 0; j < TXG_DEFER_SIZE; j++)
3785 		range_tree_verify(msp->ms_defertree[j], offset, size);
3786 	mutex_exit(&msp->ms_lock);
3787 }
3788 
3789 void
3790 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
3791 {
3792 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
3793 		return;
3794 
3795 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3796 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
3797 		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
3798 		vdev_t *vd = vdev_lookup_top(spa, vdev);
3799 		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
3800 		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
3801 
3802 		if (DVA_GET_GANG(&bp->blk_dva[i]))
3803 			size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3804 
3805 		ASSERT3P(vd, !=, NULL);
3806 
3807 		metaslab_check_free_impl(vd, offset, size);
3808 	}
3809 	spa_config_exit(spa, SCL_VDEV, FTAG);
3810 }
3811