xref: /titanic_44/usr/src/uts/common/fs/zfs/metaslab.c (revision edf345e6b8342e8627ec20ce821a977a62cee19d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/dmu.h>
29 #include <sys/dmu_tx.h>
30 #include <sys/space_map.h>
31 #include <sys/metaslab_impl.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/zio.h>
34 #include <sys/spa_impl.h>
35 
36 /*
37  * Allow allocations to switch to gang blocks quickly. We do this to
38  * avoid having to load lots of space_maps in a given txg. There are,
39  * however, some cases where we want to avoid "fast" ganging and instead
40  * we want to do an exhaustive search of all metaslabs on this device.
41  * Currently we don't allow any gang, slog, or dump device related allocations
42  * to "fast" gang.
43  */
44 #define	CAN_FASTGANG(flags) \
45 	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
46 	METASLAB_GANG_AVOID)))
47 
48 #define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
49 #define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
50 #define	METASLAB_ACTIVE_MASK		\
51 	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
52 
53 uint64_t metaslab_aliquot = 512ULL << 10;
54 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
55 
56 /*
57  * The in-core space map representation is more compact than its on-disk form.
58  * The zfs_condense_pct determines how much more compact the in-core
59  * space_map representation must be before we compact it on-disk.
60  * Values should be greater than or equal to 100.
61  */
62 int zfs_condense_pct = 200;
63 
64 /*
65  * The zfs_mg_noalloc_threshold defines which metaslab groups should
66  * be eligible for allocation. The value is defined as a percentage of
67  * a free space. Metaslab groups that have more free space than
68  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
69  * a metaslab group's free space is less than or equal to the
70  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
71  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
72  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
73  * groups are allowed to accept allocations. Gang blocks are always
74  * eligible to allocate on any metaslab group. The default value of 0 means
75  * no metaslab group will be excluded based on this criterion.
76  */
77 int zfs_mg_noalloc_threshold = 0;
78 
79 /*
80  * When set will load all metaslabs when pool is first opened.
81  */
82 int metaslab_debug_load = 0;
83 
84 /*
85  * When set will prevent metaslabs from being unloaded.
86  */
87 int metaslab_debug_unload = 0;
88 
89 /*
90  * Minimum size which forces the dynamic allocator to change
91  * it's allocation strategy.  Once the space map cannot satisfy
92  * an allocation of this size then it switches to using more
93  * aggressive strategy (i.e search by size rather than offset).
94  */
95 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
96 
97 /*
98  * The minimum free space, in percent, which must be available
99  * in a space map to continue allocations in a first-fit fashion.
100  * Once the space_map's free space drops below this level we dynamically
101  * switch to using best-fit allocations.
102  */
103 int metaslab_df_free_pct = 4;
104 
105 /*
106  * A metaslab is considered "free" if it contains a contiguous
107  * segment which is greater than metaslab_min_alloc_size.
108  */
109 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
110 
111 /*
112  * Percentage of all cpus that can be used by the metaslab taskq.
113  */
114 int metaslab_load_pct = 50;
115 
116 /*
117  * Determines how many txgs a metaslab may remain loaded without having any
118  * allocations from it. As long as a metaslab continues to be used we will
119  * keep it loaded.
120  */
121 int metaslab_unload_delay = TXG_SIZE * 2;
122 
123 /*
124  * Should we be willing to write data to degraded vdevs?
125  */
126 boolean_t zfs_write_to_degraded = B_FALSE;
127 
128 /*
129  * Max number of metaslabs per group to preload.
130  */
131 int metaslab_preload_limit = SPA_DVAS_PER_BP;
132 
133 /*
134  * Enable/disable preloading of metaslab.
135  */
136 boolean_t metaslab_preload_enabled = B_TRUE;
137 
138 /*
139  * Enable/disable additional weight factor for each metaslab.
140  */
141 boolean_t metaslab_weight_factor_enable = B_FALSE;
142 
143 
144 /*
145  * ==========================================================================
146  * Metaslab classes
147  * ==========================================================================
148  */
149 metaslab_class_t *
150 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
151 {
152 	metaslab_class_t *mc;
153 
154 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
155 
156 	mc->mc_spa = spa;
157 	mc->mc_rotor = NULL;
158 	mc->mc_ops = ops;
159 
160 	return (mc);
161 }
162 
163 void
164 metaslab_class_destroy(metaslab_class_t *mc)
165 {
166 	ASSERT(mc->mc_rotor == NULL);
167 	ASSERT(mc->mc_alloc == 0);
168 	ASSERT(mc->mc_deferred == 0);
169 	ASSERT(mc->mc_space == 0);
170 	ASSERT(mc->mc_dspace == 0);
171 
172 	kmem_free(mc, sizeof (metaslab_class_t));
173 }
174 
175 int
176 metaslab_class_validate(metaslab_class_t *mc)
177 {
178 	metaslab_group_t *mg;
179 	vdev_t *vd;
180 
181 	/*
182 	 * Must hold one of the spa_config locks.
183 	 */
184 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
185 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
186 
187 	if ((mg = mc->mc_rotor) == NULL)
188 		return (0);
189 
190 	do {
191 		vd = mg->mg_vd;
192 		ASSERT(vd->vdev_mg != NULL);
193 		ASSERT3P(vd->vdev_top, ==, vd);
194 		ASSERT3P(mg->mg_class, ==, mc);
195 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
196 	} while ((mg = mg->mg_next) != mc->mc_rotor);
197 
198 	return (0);
199 }
200 
201 void
202 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
203     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
204 {
205 	atomic_add_64(&mc->mc_alloc, alloc_delta);
206 	atomic_add_64(&mc->mc_deferred, defer_delta);
207 	atomic_add_64(&mc->mc_space, space_delta);
208 	atomic_add_64(&mc->mc_dspace, dspace_delta);
209 }
210 
211 uint64_t
212 metaslab_class_get_alloc(metaslab_class_t *mc)
213 {
214 	return (mc->mc_alloc);
215 }
216 
217 uint64_t
218 metaslab_class_get_deferred(metaslab_class_t *mc)
219 {
220 	return (mc->mc_deferred);
221 }
222 
223 uint64_t
224 metaslab_class_get_space(metaslab_class_t *mc)
225 {
226 	return (mc->mc_space);
227 }
228 
229 uint64_t
230 metaslab_class_get_dspace(metaslab_class_t *mc)
231 {
232 	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
233 }
234 
235 /*
236  * ==========================================================================
237  * Metaslab groups
238  * ==========================================================================
239  */
240 static int
241 metaslab_compare(const void *x1, const void *x2)
242 {
243 	const metaslab_t *m1 = x1;
244 	const metaslab_t *m2 = x2;
245 
246 	if (m1->ms_weight < m2->ms_weight)
247 		return (1);
248 	if (m1->ms_weight > m2->ms_weight)
249 		return (-1);
250 
251 	/*
252 	 * If the weights are identical, use the offset to force uniqueness.
253 	 */
254 	if (m1->ms_start < m2->ms_start)
255 		return (-1);
256 	if (m1->ms_start > m2->ms_start)
257 		return (1);
258 
259 	ASSERT3P(m1, ==, m2);
260 
261 	return (0);
262 }
263 
264 /*
265  * Update the allocatable flag and the metaslab group's capacity.
266  * The allocatable flag is set to true if the capacity is below
267  * the zfs_mg_noalloc_threshold. If a metaslab group transitions
268  * from allocatable to non-allocatable or vice versa then the metaslab
269  * group's class is updated to reflect the transition.
270  */
271 static void
272 metaslab_group_alloc_update(metaslab_group_t *mg)
273 {
274 	vdev_t *vd = mg->mg_vd;
275 	metaslab_class_t *mc = mg->mg_class;
276 	vdev_stat_t *vs = &vd->vdev_stat;
277 	boolean_t was_allocatable;
278 
279 	ASSERT(vd == vd->vdev_top);
280 
281 	mutex_enter(&mg->mg_lock);
282 	was_allocatable = mg->mg_allocatable;
283 
284 	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
285 	    (vs->vs_space + 1);
286 
287 	mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
288 
289 	/*
290 	 * The mc_alloc_groups maintains a count of the number of
291 	 * groups in this metaslab class that are still above the
292 	 * zfs_mg_noalloc_threshold. This is used by the allocating
293 	 * threads to determine if they should avoid allocations to
294 	 * a given group. The allocator will avoid allocations to a group
295 	 * if that group has reached or is below the zfs_mg_noalloc_threshold
296 	 * and there are still other groups that are above the threshold.
297 	 * When a group transitions from allocatable to non-allocatable or
298 	 * vice versa we update the metaslab class to reflect that change.
299 	 * When the mc_alloc_groups value drops to 0 that means that all
300 	 * groups have reached the zfs_mg_noalloc_threshold making all groups
301 	 * eligible for allocations. This effectively means that all devices
302 	 * are balanced again.
303 	 */
304 	if (was_allocatable && !mg->mg_allocatable)
305 		mc->mc_alloc_groups--;
306 	else if (!was_allocatable && mg->mg_allocatable)
307 		mc->mc_alloc_groups++;
308 	mutex_exit(&mg->mg_lock);
309 }
310 
311 metaslab_group_t *
312 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
313 {
314 	metaslab_group_t *mg;
315 
316 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
317 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
318 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
319 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
320 	mg->mg_vd = vd;
321 	mg->mg_class = mc;
322 	mg->mg_activation_count = 0;
323 
324 	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
325 	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
326 
327 	return (mg);
328 }
329 
330 void
331 metaslab_group_destroy(metaslab_group_t *mg)
332 {
333 	ASSERT(mg->mg_prev == NULL);
334 	ASSERT(mg->mg_next == NULL);
335 	/*
336 	 * We may have gone below zero with the activation count
337 	 * either because we never activated in the first place or
338 	 * because we're done, and possibly removing the vdev.
339 	 */
340 	ASSERT(mg->mg_activation_count <= 0);
341 
342 	taskq_destroy(mg->mg_taskq);
343 	avl_destroy(&mg->mg_metaslab_tree);
344 	mutex_destroy(&mg->mg_lock);
345 	kmem_free(mg, sizeof (metaslab_group_t));
346 }
347 
348 void
349 metaslab_group_activate(metaslab_group_t *mg)
350 {
351 	metaslab_class_t *mc = mg->mg_class;
352 	metaslab_group_t *mgprev, *mgnext;
353 
354 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
355 
356 	ASSERT(mc->mc_rotor != mg);
357 	ASSERT(mg->mg_prev == NULL);
358 	ASSERT(mg->mg_next == NULL);
359 	ASSERT(mg->mg_activation_count <= 0);
360 
361 	if (++mg->mg_activation_count <= 0)
362 		return;
363 
364 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
365 	metaslab_group_alloc_update(mg);
366 
367 	if ((mgprev = mc->mc_rotor) == NULL) {
368 		mg->mg_prev = mg;
369 		mg->mg_next = mg;
370 	} else {
371 		mgnext = mgprev->mg_next;
372 		mg->mg_prev = mgprev;
373 		mg->mg_next = mgnext;
374 		mgprev->mg_next = mg;
375 		mgnext->mg_prev = mg;
376 	}
377 	mc->mc_rotor = mg;
378 }
379 
380 void
381 metaslab_group_passivate(metaslab_group_t *mg)
382 {
383 	metaslab_class_t *mc = mg->mg_class;
384 	metaslab_group_t *mgprev, *mgnext;
385 
386 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
387 
388 	if (--mg->mg_activation_count != 0) {
389 		ASSERT(mc->mc_rotor != mg);
390 		ASSERT(mg->mg_prev == NULL);
391 		ASSERT(mg->mg_next == NULL);
392 		ASSERT(mg->mg_activation_count < 0);
393 		return;
394 	}
395 
396 	taskq_wait(mg->mg_taskq);
397 
398 	mgprev = mg->mg_prev;
399 	mgnext = mg->mg_next;
400 
401 	if (mg == mgnext) {
402 		mc->mc_rotor = NULL;
403 	} else {
404 		mc->mc_rotor = mgnext;
405 		mgprev->mg_next = mgnext;
406 		mgnext->mg_prev = mgprev;
407 	}
408 
409 	mg->mg_prev = NULL;
410 	mg->mg_next = NULL;
411 }
412 
413 static void
414 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
415 {
416 	mutex_enter(&mg->mg_lock);
417 	ASSERT(msp->ms_group == NULL);
418 	msp->ms_group = mg;
419 	msp->ms_weight = 0;
420 	avl_add(&mg->mg_metaslab_tree, msp);
421 	mutex_exit(&mg->mg_lock);
422 }
423 
424 static void
425 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
426 {
427 	mutex_enter(&mg->mg_lock);
428 	ASSERT(msp->ms_group == mg);
429 	avl_remove(&mg->mg_metaslab_tree, msp);
430 	msp->ms_group = NULL;
431 	mutex_exit(&mg->mg_lock);
432 }
433 
434 static void
435 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
436 {
437 	/*
438 	 * Although in principle the weight can be any value, in
439 	 * practice we do not use values in the range [1, 510].
440 	 */
441 	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
442 	ASSERT(MUTEX_HELD(&msp->ms_lock));
443 
444 	mutex_enter(&mg->mg_lock);
445 	ASSERT(msp->ms_group == mg);
446 	avl_remove(&mg->mg_metaslab_tree, msp);
447 	msp->ms_weight = weight;
448 	avl_add(&mg->mg_metaslab_tree, msp);
449 	mutex_exit(&mg->mg_lock);
450 }
451 
452 /*
453  * Determine if a given metaslab group should skip allocations. A metaslab
454  * group should avoid allocations if its used capacity has crossed the
455  * zfs_mg_noalloc_threshold and there is at least one metaslab group
456  * that can still handle allocations.
457  */
458 static boolean_t
459 metaslab_group_allocatable(metaslab_group_t *mg)
460 {
461 	vdev_t *vd = mg->mg_vd;
462 	spa_t *spa = vd->vdev_spa;
463 	metaslab_class_t *mc = mg->mg_class;
464 
465 	/*
466 	 * A metaslab group is considered allocatable if its free capacity
467 	 * is greater than the set value of zfs_mg_noalloc_threshold, it's
468 	 * associated with a slog, or there are no other metaslab groups
469 	 * with free capacity greater than zfs_mg_noalloc_threshold.
470 	 */
471 	return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
472 	    mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
473 }
474 
475 /*
476  * ==========================================================================
477  * Range tree callbacks
478  * ==========================================================================
479  */
480 
481 /*
482  * Comparison function for the private size-ordered tree. Tree is sorted
483  * by size, larger sizes at the end of the tree.
484  */
485 static int
486 metaslab_rangesize_compare(const void *x1, const void *x2)
487 {
488 	const range_seg_t *r1 = x1;
489 	const range_seg_t *r2 = x2;
490 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
491 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
492 
493 	if (rs_size1 < rs_size2)
494 		return (-1);
495 	if (rs_size1 > rs_size2)
496 		return (1);
497 
498 	if (r1->rs_start < r2->rs_start)
499 		return (-1);
500 
501 	if (r1->rs_start > r2->rs_start)
502 		return (1);
503 
504 	return (0);
505 }
506 
507 /*
508  * Create any block allocator specific components. The current allocators
509  * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
510  */
511 static void
512 metaslab_rt_create(range_tree_t *rt, void *arg)
513 {
514 	metaslab_t *msp = arg;
515 
516 	ASSERT3P(rt->rt_arg, ==, msp);
517 	ASSERT(msp->ms_tree == NULL);
518 
519 	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
520 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
521 }
522 
523 /*
524  * Destroy the block allocator specific components.
525  */
526 static void
527 metaslab_rt_destroy(range_tree_t *rt, void *arg)
528 {
529 	metaslab_t *msp = arg;
530 
531 	ASSERT3P(rt->rt_arg, ==, msp);
532 	ASSERT3P(msp->ms_tree, ==, rt);
533 	ASSERT0(avl_numnodes(&msp->ms_size_tree));
534 
535 	avl_destroy(&msp->ms_size_tree);
536 }
537 
538 static void
539 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
540 {
541 	metaslab_t *msp = arg;
542 
543 	ASSERT3P(rt->rt_arg, ==, msp);
544 	ASSERT3P(msp->ms_tree, ==, rt);
545 	VERIFY(!msp->ms_condensing);
546 	avl_add(&msp->ms_size_tree, rs);
547 }
548 
549 static void
550 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
551 {
552 	metaslab_t *msp = arg;
553 
554 	ASSERT3P(rt->rt_arg, ==, msp);
555 	ASSERT3P(msp->ms_tree, ==, rt);
556 	VERIFY(!msp->ms_condensing);
557 	avl_remove(&msp->ms_size_tree, rs);
558 }
559 
560 static void
561 metaslab_rt_vacate(range_tree_t *rt, void *arg)
562 {
563 	metaslab_t *msp = arg;
564 
565 	ASSERT3P(rt->rt_arg, ==, msp);
566 	ASSERT3P(msp->ms_tree, ==, rt);
567 
568 	/*
569 	 * Normally one would walk the tree freeing nodes along the way.
570 	 * Since the nodes are shared with the range trees we can avoid
571 	 * walking all nodes and just reinitialize the avl tree. The nodes
572 	 * will be freed by the range tree, so we don't want to free them here.
573 	 */
574 	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
575 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
576 }
577 
578 static range_tree_ops_t metaslab_rt_ops = {
579 	metaslab_rt_create,
580 	metaslab_rt_destroy,
581 	metaslab_rt_add,
582 	metaslab_rt_remove,
583 	metaslab_rt_vacate
584 };
585 
586 /*
587  * ==========================================================================
588  * Metaslab block operations
589  * ==========================================================================
590  */
591 
592 /*
593  * Return the maximum contiguous segment within the metaslab.
594  */
595 uint64_t
596 metaslab_block_maxsize(metaslab_t *msp)
597 {
598 	avl_tree_t *t = &msp->ms_size_tree;
599 	range_seg_t *rs;
600 
601 	if (t == NULL || (rs = avl_last(t)) == NULL)
602 		return (0ULL);
603 
604 	return (rs->rs_end - rs->rs_start);
605 }
606 
607 uint64_t
608 metaslab_block_alloc(metaslab_t *msp, uint64_t size)
609 {
610 	uint64_t start;
611 	range_tree_t *rt = msp->ms_tree;
612 
613 	VERIFY(!msp->ms_condensing);
614 
615 	start = msp->ms_ops->msop_alloc(msp, size);
616 	if (start != -1ULL) {
617 		vdev_t *vd = msp->ms_group->mg_vd;
618 
619 		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
620 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
621 		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
622 		range_tree_remove(rt, start, size);
623 	}
624 	return (start);
625 }
626 
627 /*
628  * ==========================================================================
629  * Common allocator routines
630  * ==========================================================================
631  */
632 
633 /*
634  * This is a helper function that can be used by the allocator to find
635  * a suitable block to allocate. This will search the specified AVL
636  * tree looking for a block that matches the specified criteria.
637  */
638 static uint64_t
639 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
640     uint64_t align)
641 {
642 	range_seg_t *rs, rsearch;
643 	avl_index_t where;
644 
645 	rsearch.rs_start = *cursor;
646 	rsearch.rs_end = *cursor + size;
647 
648 	rs = avl_find(t, &rsearch, &where);
649 	if (rs == NULL)
650 		rs = avl_nearest(t, where, AVL_AFTER);
651 
652 	while (rs != NULL) {
653 		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
654 
655 		if (offset + size <= rs->rs_end) {
656 			*cursor = offset + size;
657 			return (offset);
658 		}
659 		rs = AVL_NEXT(t, rs);
660 	}
661 
662 	/*
663 	 * If we know we've searched the whole map (*cursor == 0), give up.
664 	 * Otherwise, reset the cursor to the beginning and try again.
665 	 */
666 	if (*cursor == 0)
667 		return (-1ULL);
668 
669 	*cursor = 0;
670 	return (metaslab_block_picker(t, cursor, size, align));
671 }
672 
673 /*
674  * ==========================================================================
675  * The first-fit block allocator
676  * ==========================================================================
677  */
678 static uint64_t
679 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
680 {
681 	/*
682 	 * Find the largest power of 2 block size that evenly divides the
683 	 * requested size. This is used to try to allocate blocks with similar
684 	 * alignment from the same area of the metaslab (i.e. same cursor
685 	 * bucket) but it does not guarantee that other allocations sizes
686 	 * may exist in the same region.
687 	 */
688 	uint64_t align = size & -size;
689 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
690 	avl_tree_t *t = &msp->ms_tree->rt_root;
691 
692 	return (metaslab_block_picker(t, cursor, size, align));
693 }
694 
695 /* ARGSUSED */
696 static boolean_t
697 metaslab_ff_fragmented(metaslab_t *msp)
698 {
699 	return (B_TRUE);
700 }
701 
702 static metaslab_ops_t metaslab_ff_ops = {
703 	metaslab_ff_alloc,
704 	metaslab_ff_fragmented
705 };
706 
707 /*
708  * ==========================================================================
709  * Dynamic block allocator -
710  * Uses the first fit allocation scheme until space get low and then
711  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
712  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
713  * ==========================================================================
714  */
715 static uint64_t
716 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
717 {
718 	/*
719 	 * Find the largest power of 2 block size that evenly divides the
720 	 * requested size. This is used to try to allocate blocks with similar
721 	 * alignment from the same area of the metaslab (i.e. same cursor
722 	 * bucket) but it does not guarantee that other allocations sizes
723 	 * may exist in the same region.
724 	 */
725 	uint64_t align = size & -size;
726 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
727 	range_tree_t *rt = msp->ms_tree;
728 	avl_tree_t *t = &rt->rt_root;
729 	uint64_t max_size = metaslab_block_maxsize(msp);
730 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
731 
732 	ASSERT(MUTEX_HELD(&msp->ms_lock));
733 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
734 
735 	if (max_size < size)
736 		return (-1ULL);
737 
738 	/*
739 	 * If we're running low on space switch to using the size
740 	 * sorted AVL tree (best-fit).
741 	 */
742 	if (max_size < metaslab_df_alloc_threshold ||
743 	    free_pct < metaslab_df_free_pct) {
744 		t = &msp->ms_size_tree;
745 		*cursor = 0;
746 	}
747 
748 	return (metaslab_block_picker(t, cursor, size, 1ULL));
749 }
750 
751 static boolean_t
752 metaslab_df_fragmented(metaslab_t *msp)
753 {
754 	range_tree_t *rt = msp->ms_tree;
755 	uint64_t max_size = metaslab_block_maxsize(msp);
756 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
757 
758 	if (max_size >= metaslab_df_alloc_threshold &&
759 	    free_pct >= metaslab_df_free_pct)
760 		return (B_FALSE);
761 
762 	return (B_TRUE);
763 }
764 
765 static metaslab_ops_t metaslab_df_ops = {
766 	metaslab_df_alloc,
767 	metaslab_df_fragmented
768 };
769 
770 /*
771  * ==========================================================================
772  * Cursor fit block allocator -
773  * Select the largest region in the metaslab, set the cursor to the beginning
774  * of the range and the cursor_end to the end of the range. As allocations
775  * are made advance the cursor. Continue allocating from the cursor until
776  * the range is exhausted and then find a new range.
777  * ==========================================================================
778  */
779 static uint64_t
780 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
781 {
782 	range_tree_t *rt = msp->ms_tree;
783 	avl_tree_t *t = &msp->ms_size_tree;
784 	uint64_t *cursor = &msp->ms_lbas[0];
785 	uint64_t *cursor_end = &msp->ms_lbas[1];
786 	uint64_t offset = 0;
787 
788 	ASSERT(MUTEX_HELD(&msp->ms_lock));
789 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
790 
791 	ASSERT3U(*cursor_end, >=, *cursor);
792 
793 	if ((*cursor + size) > *cursor_end) {
794 		range_seg_t *rs;
795 
796 		rs = avl_last(&msp->ms_size_tree);
797 		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
798 			return (-1ULL);
799 
800 		*cursor = rs->rs_start;
801 		*cursor_end = rs->rs_end;
802 	}
803 
804 	offset = *cursor;
805 	*cursor += size;
806 
807 	return (offset);
808 }
809 
810 static boolean_t
811 metaslab_cf_fragmented(metaslab_t *msp)
812 {
813 	return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
814 }
815 
816 static metaslab_ops_t metaslab_cf_ops = {
817 	metaslab_cf_alloc,
818 	metaslab_cf_fragmented
819 };
820 
821 /*
822  * ==========================================================================
823  * New dynamic fit allocator -
824  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
825  * contiguous blocks. If no region is found then just use the largest segment
826  * that remains.
827  * ==========================================================================
828  */
829 
830 /*
831  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
832  * to request from the allocator.
833  */
834 uint64_t metaslab_ndf_clump_shift = 4;
835 
836 static uint64_t
837 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
838 {
839 	avl_tree_t *t = &msp->ms_tree->rt_root;
840 	avl_index_t where;
841 	range_seg_t *rs, rsearch;
842 	uint64_t hbit = highbit64(size);
843 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
844 	uint64_t max_size = metaslab_block_maxsize(msp);
845 
846 	ASSERT(MUTEX_HELD(&msp->ms_lock));
847 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
848 
849 	if (max_size < size)
850 		return (-1ULL);
851 
852 	rsearch.rs_start = *cursor;
853 	rsearch.rs_end = *cursor + size;
854 
855 	rs = avl_find(t, &rsearch, &where);
856 	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
857 		t = &msp->ms_size_tree;
858 
859 		rsearch.rs_start = 0;
860 		rsearch.rs_end = MIN(max_size,
861 		    1ULL << (hbit + metaslab_ndf_clump_shift));
862 		rs = avl_find(t, &rsearch, &where);
863 		if (rs == NULL)
864 			rs = avl_nearest(t, where, AVL_AFTER);
865 		ASSERT(rs != NULL);
866 	}
867 
868 	if ((rs->rs_end - rs->rs_start) >= size) {
869 		*cursor = rs->rs_start + size;
870 		return (rs->rs_start);
871 	}
872 	return (-1ULL);
873 }
874 
875 static boolean_t
876 metaslab_ndf_fragmented(metaslab_t *msp)
877 {
878 	return (metaslab_block_maxsize(msp) <=
879 	    (metaslab_min_alloc_size << metaslab_ndf_clump_shift));
880 }
881 
882 static metaslab_ops_t metaslab_ndf_ops = {
883 	metaslab_ndf_alloc,
884 	metaslab_ndf_fragmented
885 };
886 
887 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
888 
889 /*
890  * ==========================================================================
891  * Metaslabs
892  * ==========================================================================
893  */
894 
895 /*
896  * Wait for any in-progress metaslab loads to complete.
897  */
898 void
899 metaslab_load_wait(metaslab_t *msp)
900 {
901 	ASSERT(MUTEX_HELD(&msp->ms_lock));
902 
903 	while (msp->ms_loading) {
904 		ASSERT(!msp->ms_loaded);
905 		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
906 	}
907 }
908 
909 int
910 metaslab_load(metaslab_t *msp)
911 {
912 	int error = 0;
913 
914 	ASSERT(MUTEX_HELD(&msp->ms_lock));
915 	ASSERT(!msp->ms_loaded);
916 	ASSERT(!msp->ms_loading);
917 
918 	msp->ms_loading = B_TRUE;
919 
920 	/*
921 	 * If the space map has not been allocated yet, then treat
922 	 * all the space in the metaslab as free and add it to the
923 	 * ms_tree.
924 	 */
925 	if (msp->ms_sm != NULL)
926 		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
927 	else
928 		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
929 
930 	msp->ms_loaded = (error == 0);
931 	msp->ms_loading = B_FALSE;
932 
933 	if (msp->ms_loaded) {
934 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
935 			range_tree_walk(msp->ms_defertree[t],
936 			    range_tree_remove, msp->ms_tree);
937 		}
938 	}
939 	cv_broadcast(&msp->ms_load_cv);
940 	return (error);
941 }
942 
943 void
944 metaslab_unload(metaslab_t *msp)
945 {
946 	ASSERT(MUTEX_HELD(&msp->ms_lock));
947 	range_tree_vacate(msp->ms_tree, NULL, NULL);
948 	msp->ms_loaded = B_FALSE;
949 	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
950 }
951 
952 metaslab_t *
953 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
954 {
955 	vdev_t *vd = mg->mg_vd;
956 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
957 	metaslab_t *msp;
958 
959 	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
960 	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
961 	cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL);
962 	msp->ms_id = id;
963 	msp->ms_start = id << vd->vdev_ms_shift;
964 	msp->ms_size = 1ULL << vd->vdev_ms_shift;
965 
966 	/*
967 	 * We only open space map objects that already exist. All others
968 	 * will be opened when we finally allocate an object for it.
969 	 */
970 	if (object != 0) {
971 		VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start,
972 		    msp->ms_size, vd->vdev_ashift, &msp->ms_lock));
973 		ASSERT(msp->ms_sm != NULL);
974 	}
975 
976 	/*
977 	 * We create the main range tree here, but we don't create the
978 	 * alloctree and freetree until metaslab_sync_done().  This serves
979 	 * two purposes: it allows metaslab_sync_done() to detect the
980 	 * addition of new space; and for debugging, it ensures that we'd
981 	 * data fault on any attempt to use this metaslab before it's ready.
982 	 */
983 	msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
984 	metaslab_group_add(mg, msp);
985 
986 	msp->ms_ops = mg->mg_class->mc_ops;
987 
988 	/*
989 	 * If we're opening an existing pool (txg == 0) or creating
990 	 * a new one (txg == TXG_INITIAL), all space is available now.
991 	 * If we're adding space to an existing pool, the new space
992 	 * does not become available until after this txg has synced.
993 	 */
994 	if (txg <= TXG_INITIAL)
995 		metaslab_sync_done(msp, 0);
996 
997 	/*
998 	 * If metaslab_debug_load is set and we're initializing a metaslab
999 	 * that has an allocated space_map object then load the its space
1000 	 * map so that can verify frees.
1001 	 */
1002 	if (metaslab_debug_load && msp->ms_sm != NULL) {
1003 		mutex_enter(&msp->ms_lock);
1004 		VERIFY0(metaslab_load(msp));
1005 		mutex_exit(&msp->ms_lock);
1006 	}
1007 
1008 	if (txg != 0) {
1009 		vdev_dirty(vd, 0, NULL, txg);
1010 		vdev_dirty(vd, VDD_METASLAB, msp, txg);
1011 	}
1012 
1013 	return (msp);
1014 }
1015 
1016 void
1017 metaslab_fini(metaslab_t *msp)
1018 {
1019 	metaslab_group_t *mg = msp->ms_group;
1020 
1021 	metaslab_group_remove(mg, msp);
1022 
1023 	mutex_enter(&msp->ms_lock);
1024 
1025 	VERIFY(msp->ms_group == NULL);
1026 	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1027 	    0, -msp->ms_size);
1028 	space_map_close(msp->ms_sm);
1029 
1030 	metaslab_unload(msp);
1031 	range_tree_destroy(msp->ms_tree);
1032 
1033 	for (int t = 0; t < TXG_SIZE; t++) {
1034 		range_tree_destroy(msp->ms_alloctree[t]);
1035 		range_tree_destroy(msp->ms_freetree[t]);
1036 	}
1037 
1038 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1039 		range_tree_destroy(msp->ms_defertree[t]);
1040 	}
1041 
1042 	ASSERT0(msp->ms_deferspace);
1043 
1044 	mutex_exit(&msp->ms_lock);
1045 	cv_destroy(&msp->ms_load_cv);
1046 	mutex_destroy(&msp->ms_lock);
1047 
1048 	kmem_free(msp, sizeof (metaslab_t));
1049 }
1050 
1051 /*
1052  * Apply a weighting factor based on the histogram information for this
1053  * metaslab. The current weighting factor is somewhat arbitrary and requires
1054  * additional investigation. The implementation provides a measure of
1055  * "weighted" free space and gives a higher weighting for larger contiguous
1056  * regions. The weighting factor is determined by counting the number of
1057  * sm_shift sectors that exist in each region represented by the histogram.
1058  * That value is then multiplied by the power of 2 exponent and the sm_shift
1059  * value.
1060  *
1061  * For example, assume the 2^21 histogram bucket has 4 2MB regions and the
1062  * metaslab has an sm_shift value of 9 (512B):
1063  *
1064  * 1) calculate the number of sm_shift sectors in the region:
1065  *	2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
1066  * 2) multiply by the power of 2 exponent and the sm_shift value:
1067  *	16384 * 21 * 9 = 3096576
1068  * This value will be added to the weighting of the metaslab.
1069  */
1070 static uint64_t
1071 metaslab_weight_factor(metaslab_t *msp)
1072 {
1073 	uint64_t factor = 0;
1074 	uint64_t sectors;
1075 	int i;
1076 
1077 	/*
1078 	 * A null space map means that the entire metaslab is free,
1079 	 * calculate a weight factor that spans the entire size of the
1080 	 * metaslab.
1081 	 */
1082 	if (msp->ms_sm == NULL) {
1083 		vdev_t *vd = msp->ms_group->mg_vd;
1084 
1085 		i = highbit64(msp->ms_size) - 1;
1086 		sectors = msp->ms_size >> vd->vdev_ashift;
1087 		return (sectors * i * vd->vdev_ashift);
1088 	}
1089 
1090 	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
1091 		return (0);
1092 
1093 	for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
1094 		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1095 			continue;
1096 
1097 		/*
1098 		 * Determine the number of sm_shift sectors in the region
1099 		 * indicated by the histogram. For example, given an
1100 		 * sm_shift value of 9 (512 bytes) and i = 4 then we know
1101 		 * that we're looking at an 8K region in the histogram
1102 		 * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
1103 		 * number of sm_shift sectors (512 bytes in this example),
1104 		 * we would take 8192 / 512 = 16. Since the histogram
1105 		 * is offset by sm_shift we can simply use the value of
1106 		 * of i to calculate this (i.e. 2^i = 16 where i = 4).
1107 		 */
1108 		sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i;
1109 		factor += (i + msp->ms_sm->sm_shift) * sectors;
1110 	}
1111 	return (factor * msp->ms_sm->sm_shift);
1112 }
1113 
1114 static uint64_t
1115 metaslab_weight(metaslab_t *msp)
1116 {
1117 	metaslab_group_t *mg = msp->ms_group;
1118 	vdev_t *vd = mg->mg_vd;
1119 	uint64_t weight, space;
1120 
1121 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1122 
1123 	/*
1124 	 * This vdev is in the process of being removed so there is nothing
1125 	 * for us to do here.
1126 	 */
1127 	if (vd->vdev_removing) {
1128 		ASSERT0(space_map_allocated(msp->ms_sm));
1129 		ASSERT0(vd->vdev_ms_shift);
1130 		return (0);
1131 	}
1132 
1133 	/*
1134 	 * The baseline weight is the metaslab's free space.
1135 	 */
1136 	space = msp->ms_size - space_map_allocated(msp->ms_sm);
1137 	weight = space;
1138 
1139 	/*
1140 	 * Modern disks have uniform bit density and constant angular velocity.
1141 	 * Therefore, the outer recording zones are faster (higher bandwidth)
1142 	 * than the inner zones by the ratio of outer to inner track diameter,
1143 	 * which is typically around 2:1.  We account for this by assigning
1144 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1145 	 * In effect, this means that we'll select the metaslab with the most
1146 	 * free bandwidth rather than simply the one with the most free space.
1147 	 */
1148 	weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1149 	ASSERT(weight >= space && weight <= 2 * space);
1150 
1151 	msp->ms_factor = metaslab_weight_factor(msp);
1152 	if (metaslab_weight_factor_enable)
1153 		weight += msp->ms_factor;
1154 
1155 	if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) {
1156 		/*
1157 		 * If this metaslab is one we're actively using, adjust its
1158 		 * weight to make it preferable to any inactive metaslab so
1159 		 * we'll polish it off.
1160 		 */
1161 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1162 	}
1163 
1164 	return (weight);
1165 }
1166 
1167 static int
1168 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1169 {
1170 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1171 
1172 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1173 		metaslab_load_wait(msp);
1174 		if (!msp->ms_loaded) {
1175 			int error = metaslab_load(msp);
1176 			if (error) {
1177 				metaslab_group_sort(msp->ms_group, msp, 0);
1178 				return (error);
1179 			}
1180 		}
1181 
1182 		metaslab_group_sort(msp->ms_group, msp,
1183 		    msp->ms_weight | activation_weight);
1184 	}
1185 	ASSERT(msp->ms_loaded);
1186 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1187 
1188 	return (0);
1189 }
1190 
1191 static void
1192 metaslab_passivate(metaslab_t *msp, uint64_t size)
1193 {
1194 	/*
1195 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1196 	 * this metaslab again.  In that case, it had better be empty,
1197 	 * or we would be leaving space on the table.
1198 	 */
1199 	ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
1200 	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1201 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1202 }
1203 
1204 static void
1205 metaslab_preload(void *arg)
1206 {
1207 	metaslab_t *msp = arg;
1208 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1209 
1210 	mutex_enter(&msp->ms_lock);
1211 	metaslab_load_wait(msp);
1212 	if (!msp->ms_loaded)
1213 		(void) metaslab_load(msp);
1214 
1215 	/*
1216 	 * Set the ms_access_txg value so that we don't unload it right away.
1217 	 */
1218 	msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
1219 	mutex_exit(&msp->ms_lock);
1220 }
1221 
1222 static void
1223 metaslab_group_preload(metaslab_group_t *mg)
1224 {
1225 	spa_t *spa = mg->mg_vd->vdev_spa;
1226 	metaslab_t *msp;
1227 	avl_tree_t *t = &mg->mg_metaslab_tree;
1228 	int m = 0;
1229 
1230 	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1231 		taskq_wait(mg->mg_taskq);
1232 		return;
1233 	}
1234 	mutex_enter(&mg->mg_lock);
1235 
1236 	/*
1237 	 * Prefetch the next potential metaslabs
1238 	 */
1239 	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
1240 
1241 		/* If we have reached our preload limit then we're done */
1242 		if (++m > metaslab_preload_limit)
1243 			break;
1244 
1245 		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1246 		    msp, TQ_SLEEP) != NULL);
1247 	}
1248 	mutex_exit(&mg->mg_lock);
1249 }
1250 
1251 /*
1252  * Determine if the space map's on-disk footprint is past our tolerance
1253  * for inefficiency. We would like to use the following criteria to make
1254  * our decision:
1255  *
1256  * 1. The size of the space map object should not dramatically increase as a
1257  * result of writing out the free space range tree.
1258  *
1259  * 2. The minimal on-disk space map representation is zfs_condense_pct/100
1260  * times the size than the free space range tree representation
1261  * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB).
1262  *
1263  * Checking the first condition is tricky since we don't want to walk
1264  * the entire AVL tree calculating the estimated on-disk size. Instead we
1265  * use the size-ordered range tree in the metaslab and calculate the
1266  * size required to write out the largest segment in our free tree. If the
1267  * size required to represent that segment on disk is larger than the space
1268  * map object then we avoid condensing this map.
1269  *
1270  * To determine the second criterion we use a best-case estimate and assume
1271  * each segment can be represented on-disk as a single 64-bit entry. We refer
1272  * to this best-case estimate as the space map's minimal form.
1273  */
1274 static boolean_t
1275 metaslab_should_condense(metaslab_t *msp)
1276 {
1277 	space_map_t *sm = msp->ms_sm;
1278 	range_seg_t *rs;
1279 	uint64_t size, entries, segsz;
1280 
1281 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1282 	ASSERT(msp->ms_loaded);
1283 
1284 	/*
1285 	 * Use the ms_size_tree range tree, which is ordered by size, to
1286 	 * obtain the largest segment in the free tree. If the tree is empty
1287 	 * then we should condense the map.
1288 	 */
1289 	rs = avl_last(&msp->ms_size_tree);
1290 	if (rs == NULL)
1291 		return (B_TRUE);
1292 
1293 	/*
1294 	 * Calculate the number of 64-bit entries this segment would
1295 	 * require when written to disk. If this single segment would be
1296 	 * larger on-disk than the entire current on-disk structure, then
1297 	 * clearly condensing will increase the on-disk structure size.
1298 	 */
1299 	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1300 	entries = size / (MIN(size, SM_RUN_MAX));
1301 	segsz = entries * sizeof (uint64_t);
1302 
1303 	return (segsz <= space_map_length(msp->ms_sm) &&
1304 	    space_map_length(msp->ms_sm) >= (zfs_condense_pct *
1305 	    sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
1306 }
1307 
1308 /*
1309  * Condense the on-disk space map representation to its minimized form.
1310  * The minimized form consists of a small number of allocations followed by
1311  * the entries of the free range tree.
1312  */
1313 static void
1314 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1315 {
1316 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1317 	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1318 	range_tree_t *condense_tree;
1319 	space_map_t *sm = msp->ms_sm;
1320 
1321 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1322 	ASSERT3U(spa_sync_pass(spa), ==, 1);
1323 	ASSERT(msp->ms_loaded);
1324 
1325 	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1326 	    "smp size %llu, segments %lu", txg, msp->ms_id, msp,
1327 	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root));
1328 
1329 	/*
1330 	 * Create an range tree that is 100% allocated. We remove segments
1331 	 * that have been freed in this txg, any deferred frees that exist,
1332 	 * and any allocation in the future. Removing segments should be
1333 	 * a relatively inexpensive operation since we expect these trees to
1334 	 * have a small number of nodes.
1335 	 */
1336 	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1337 	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1338 
1339 	/*
1340 	 * Remove what's been freed in this txg from the condense_tree.
1341 	 * Since we're in sync_pass 1, we know that all the frees from
1342 	 * this txg are in the freetree.
1343 	 */
1344 	range_tree_walk(freetree, range_tree_remove, condense_tree);
1345 
1346 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1347 		range_tree_walk(msp->ms_defertree[t],
1348 		    range_tree_remove, condense_tree);
1349 	}
1350 
1351 	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1352 		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1353 		    range_tree_remove, condense_tree);
1354 	}
1355 
1356 	/*
1357 	 * We're about to drop the metaslab's lock thus allowing
1358 	 * other consumers to change it's content. Set the
1359 	 * metaslab's ms_condensing flag to ensure that
1360 	 * allocations on this metaslab do not occur while we're
1361 	 * in the middle of committing it to disk. This is only critical
1362 	 * for the ms_tree as all other range trees use per txg
1363 	 * views of their content.
1364 	 */
1365 	msp->ms_condensing = B_TRUE;
1366 
1367 	mutex_exit(&msp->ms_lock);
1368 	space_map_truncate(sm, tx);
1369 	mutex_enter(&msp->ms_lock);
1370 
1371 	/*
1372 	 * While we would ideally like to create a space_map representation
1373 	 * that consists only of allocation records, doing so can be
1374 	 * prohibitively expensive because the in-core free tree can be
1375 	 * large, and therefore computationally expensive to subtract
1376 	 * from the condense_tree. Instead we sync out two trees, a cheap
1377 	 * allocation only tree followed by the in-core free tree. While not
1378 	 * optimal, this is typically close to optimal, and much cheaper to
1379 	 * compute.
1380 	 */
1381 	space_map_write(sm, condense_tree, SM_ALLOC, tx);
1382 	range_tree_vacate(condense_tree, NULL, NULL);
1383 	range_tree_destroy(condense_tree);
1384 
1385 	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
1386 	msp->ms_condensing = B_FALSE;
1387 }
1388 
1389 /*
1390  * Write a metaslab to disk in the context of the specified transaction group.
1391  */
1392 void
1393 metaslab_sync(metaslab_t *msp, uint64_t txg)
1394 {
1395 	metaslab_group_t *mg = msp->ms_group;
1396 	vdev_t *vd = mg->mg_vd;
1397 	spa_t *spa = vd->vdev_spa;
1398 	objset_t *mos = spa_meta_objset(spa);
1399 	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
1400 	range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
1401 	range_tree_t **freed_tree =
1402 	    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1403 	dmu_tx_t *tx;
1404 	uint64_t object = space_map_object(msp->ms_sm);
1405 
1406 	ASSERT(!vd->vdev_ishole);
1407 
1408 	/*
1409 	 * This metaslab has just been added so there's no work to do now.
1410 	 */
1411 	if (*freetree == NULL) {
1412 		ASSERT3P(alloctree, ==, NULL);
1413 		return;
1414 	}
1415 
1416 	ASSERT3P(alloctree, !=, NULL);
1417 	ASSERT3P(*freetree, !=, NULL);
1418 	ASSERT3P(*freed_tree, !=, NULL);
1419 
1420 	if (range_tree_space(alloctree) == 0 &&
1421 	    range_tree_space(*freetree) == 0)
1422 		return;
1423 
1424 	/*
1425 	 * The only state that can actually be changing concurrently with
1426 	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
1427 	 * be modifying this txg's alloctree, freetree, freed_tree, or
1428 	 * space_map_phys_t. Therefore, we only hold ms_lock to satify
1429 	 * space_map ASSERTs. We drop it whenever we call into the DMU,
1430 	 * because the DMU can call down to us (e.g. via zio_free()) at
1431 	 * any time.
1432 	 */
1433 
1434 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1435 
1436 	if (msp->ms_sm == NULL) {
1437 		uint64_t new_object;
1438 
1439 		new_object = space_map_alloc(mos, tx);
1440 		VERIFY3U(new_object, !=, 0);
1441 
1442 		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
1443 		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
1444 		    &msp->ms_lock));
1445 		ASSERT(msp->ms_sm != NULL);
1446 	}
1447 
1448 	mutex_enter(&msp->ms_lock);
1449 
1450 	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1451 	    metaslab_should_condense(msp)) {
1452 		metaslab_condense(msp, txg, tx);
1453 	} else {
1454 		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1455 		space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1456 	}
1457 
1458 	range_tree_vacate(alloctree, NULL, NULL);
1459 
1460 	if (msp->ms_loaded) {
1461 		/*
1462 		 * When the space map is loaded, we have an accruate
1463 		 * histogram in the range tree. This gives us an opportunity
1464 		 * to bring the space map's histogram up-to-date so we clear
1465 		 * it first before updating it.
1466 		 */
1467 		space_map_histogram_clear(msp->ms_sm);
1468 		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1469 	} else {
1470 		/*
1471 		 * Since the space map is not loaded we simply update the
1472 		 * exisiting histogram with what was freed in this txg. This
1473 		 * means that the on-disk histogram may not have an accurate
1474 		 * view of the free space but it's close enough to allow
1475 		 * us to make allocation decisions.
1476 		 */
1477 		space_map_histogram_add(msp->ms_sm, *freetree, tx);
1478 	}
1479 
1480 	/*
1481 	 * For sync pass 1, we avoid traversing this txg's free range tree
1482 	 * and instead will just swap the pointers for freetree and
1483 	 * freed_tree. We can safely do this since the freed_tree is
1484 	 * guaranteed to be empty on the initial pass.
1485 	 */
1486 	if (spa_sync_pass(spa) == 1) {
1487 		range_tree_swap(freetree, freed_tree);
1488 	} else {
1489 		range_tree_vacate(*freetree, range_tree_add, *freed_tree);
1490 	}
1491 
1492 	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1493 	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1494 
1495 	mutex_exit(&msp->ms_lock);
1496 
1497 	if (object != space_map_object(msp->ms_sm)) {
1498 		object = space_map_object(msp->ms_sm);
1499 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
1500 		    msp->ms_id, sizeof (uint64_t), &object, tx);
1501 	}
1502 	dmu_tx_commit(tx);
1503 }
1504 
1505 /*
1506  * Called after a transaction group has completely synced to mark
1507  * all of the metaslab's free space as usable.
1508  */
1509 void
1510 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1511 {
1512 	metaslab_group_t *mg = msp->ms_group;
1513 	vdev_t *vd = mg->mg_vd;
1514 	range_tree_t **freed_tree;
1515 	range_tree_t **defer_tree;
1516 	int64_t alloc_delta, defer_delta;
1517 
1518 	ASSERT(!vd->vdev_ishole);
1519 
1520 	mutex_enter(&msp->ms_lock);
1521 
1522 	/*
1523 	 * If this metaslab is just becoming available, initialize its
1524 	 * alloctrees, freetrees, and defertree and add its capacity to
1525 	 * the vdev.
1526 	 */
1527 	if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
1528 		for (int t = 0; t < TXG_SIZE; t++) {
1529 			ASSERT(msp->ms_alloctree[t] == NULL);
1530 			ASSERT(msp->ms_freetree[t] == NULL);
1531 
1532 			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
1533 			    &msp->ms_lock);
1534 			msp->ms_freetree[t] = range_tree_create(NULL, msp,
1535 			    &msp->ms_lock);
1536 		}
1537 
1538 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1539 			ASSERT(msp->ms_defertree[t] == NULL);
1540 
1541 			msp->ms_defertree[t] = range_tree_create(NULL, msp,
1542 			    &msp->ms_lock);
1543 		}
1544 
1545 		vdev_space_update(vd, 0, 0, msp->ms_size);
1546 	}
1547 
1548 	freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1549 	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
1550 
1551 	alloc_delta = space_map_alloc_delta(msp->ms_sm);
1552 	defer_delta = range_tree_space(*freed_tree) -
1553 	    range_tree_space(*defer_tree);
1554 
1555 	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1556 
1557 	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1558 	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1559 
1560 	/*
1561 	 * If there's a metaslab_load() in progress, wait for it to complete
1562 	 * so that we have a consistent view of the in-core space map.
1563 	 */
1564 	metaslab_load_wait(msp);
1565 
1566 	/*
1567 	 * Move the frees from the defer_tree back to the free
1568 	 * range tree (if it's loaded). Swap the freed_tree and the
1569 	 * defer_tree -- this is safe to do because we've just emptied out
1570 	 * the defer_tree.
1571 	 */
1572 	range_tree_vacate(*defer_tree,
1573 	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
1574 	range_tree_swap(freed_tree, defer_tree);
1575 
1576 	space_map_update(msp->ms_sm);
1577 
1578 	msp->ms_deferspace += defer_delta;
1579 	ASSERT3S(msp->ms_deferspace, >=, 0);
1580 	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
1581 	if (msp->ms_deferspace != 0) {
1582 		/*
1583 		 * Keep syncing this metaslab until all deferred frees
1584 		 * are back in circulation.
1585 		 */
1586 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1587 	}
1588 
1589 	if (msp->ms_loaded && msp->ms_access_txg < txg) {
1590 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1591 			VERIFY0(range_tree_space(
1592 			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
1593 		}
1594 
1595 		if (!metaslab_debug_unload)
1596 			metaslab_unload(msp);
1597 	}
1598 
1599 	metaslab_group_sort(mg, msp, metaslab_weight(msp));
1600 	mutex_exit(&msp->ms_lock);
1601 
1602 }
1603 
1604 void
1605 metaslab_sync_reassess(metaslab_group_t *mg)
1606 {
1607 	metaslab_group_alloc_update(mg);
1608 
1609 	/*
1610 	 * Preload the next potential metaslabs
1611 	 */
1612 	metaslab_group_preload(mg);
1613 }
1614 
1615 static uint64_t
1616 metaslab_distance(metaslab_t *msp, dva_t *dva)
1617 {
1618 	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1619 	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1620 	uint64_t start = msp->ms_id;
1621 
1622 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1623 		return (1ULL << 63);
1624 
1625 	if (offset < start)
1626 		return ((start - offset) << ms_shift);
1627 	if (offset > start)
1628 		return ((offset - start) << ms_shift);
1629 	return (0);
1630 }
1631 
1632 static uint64_t
1633 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1634     uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
1635 {
1636 	spa_t *spa = mg->mg_vd->vdev_spa;
1637 	metaslab_t *msp = NULL;
1638 	uint64_t offset = -1ULL;
1639 	avl_tree_t *t = &mg->mg_metaslab_tree;
1640 	uint64_t activation_weight;
1641 	uint64_t target_distance;
1642 	int i;
1643 
1644 	activation_weight = METASLAB_WEIGHT_PRIMARY;
1645 	for (i = 0; i < d; i++) {
1646 		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1647 			activation_weight = METASLAB_WEIGHT_SECONDARY;
1648 			break;
1649 		}
1650 	}
1651 
1652 	for (;;) {
1653 		boolean_t was_active;
1654 
1655 		mutex_enter(&mg->mg_lock);
1656 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1657 			if (msp->ms_weight < asize) {
1658 				spa_dbgmsg(spa, "%s: failed to meet weight "
1659 				    "requirement: vdev %llu, txg %llu, mg %p, "
1660 				    "msp %p, psize %llu, asize %llu, "
1661 				    "weight %llu", spa_name(spa),
1662 				    mg->mg_vd->vdev_id, txg,
1663 				    mg, msp, psize, asize, msp->ms_weight);
1664 				mutex_exit(&mg->mg_lock);
1665 				return (-1ULL);
1666 			}
1667 
1668 			/*
1669 			 * If the selected metaslab is condensing, skip it.
1670 			 */
1671 			if (msp->ms_condensing)
1672 				continue;
1673 
1674 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1675 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1676 				break;
1677 
1678 			target_distance = min_distance +
1679 			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
1680 			    min_distance >> 1);
1681 
1682 			for (i = 0; i < d; i++)
1683 				if (metaslab_distance(msp, &dva[i]) <
1684 				    target_distance)
1685 					break;
1686 			if (i == d)
1687 				break;
1688 		}
1689 		mutex_exit(&mg->mg_lock);
1690 		if (msp == NULL)
1691 			return (-1ULL);
1692 
1693 		mutex_enter(&msp->ms_lock);
1694 
1695 		/*
1696 		 * Ensure that the metaslab we have selected is still
1697 		 * capable of handling our request. It's possible that
1698 		 * another thread may have changed the weight while we
1699 		 * were blocked on the metaslab lock.
1700 		 */
1701 		if (msp->ms_weight < asize || (was_active &&
1702 		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1703 		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1704 			mutex_exit(&msp->ms_lock);
1705 			continue;
1706 		}
1707 
1708 		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1709 		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
1710 			metaslab_passivate(msp,
1711 			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1712 			mutex_exit(&msp->ms_lock);
1713 			continue;
1714 		}
1715 
1716 		if (metaslab_activate(msp, activation_weight) != 0) {
1717 			mutex_exit(&msp->ms_lock);
1718 			continue;
1719 		}
1720 
1721 		/*
1722 		 * If this metaslab is currently condensing then pick again as
1723 		 * we can't manipulate this metaslab until it's committed
1724 		 * to disk.
1725 		 */
1726 		if (msp->ms_condensing) {
1727 			mutex_exit(&msp->ms_lock);
1728 			continue;
1729 		}
1730 
1731 		if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
1732 			break;
1733 
1734 		metaslab_passivate(msp, metaslab_block_maxsize(msp));
1735 		mutex_exit(&msp->ms_lock);
1736 	}
1737 
1738 	if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
1739 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1740 
1741 	range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
1742 	msp->ms_access_txg = txg + metaslab_unload_delay;
1743 
1744 	mutex_exit(&msp->ms_lock);
1745 
1746 	return (offset);
1747 }
1748 
1749 /*
1750  * Allocate a block for the specified i/o.
1751  */
1752 static int
1753 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1754     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1755 {
1756 	metaslab_group_t *mg, *rotor;
1757 	vdev_t *vd;
1758 	int dshift = 3;
1759 	int all_zero;
1760 	int zio_lock = B_FALSE;
1761 	boolean_t allocatable;
1762 	uint64_t offset = -1ULL;
1763 	uint64_t asize;
1764 	uint64_t distance;
1765 
1766 	ASSERT(!DVA_IS_VALID(&dva[d]));
1767 
1768 	/*
1769 	 * For testing, make some blocks above a certain size be gang blocks.
1770 	 */
1771 	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1772 		return (SET_ERROR(ENOSPC));
1773 
1774 	/*
1775 	 * Start at the rotor and loop through all mgs until we find something.
1776 	 * Note that there's no locking on mc_rotor or mc_aliquot because
1777 	 * nothing actually breaks if we miss a few updates -- we just won't
1778 	 * allocate quite as evenly.  It all balances out over time.
1779 	 *
1780 	 * If we are doing ditto or log blocks, try to spread them across
1781 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
1782 	 * allocated all of our ditto blocks, then try and spread them out on
1783 	 * that vdev as much as possible.  If it turns out to not be possible,
1784 	 * gradually lower our standards until anything becomes acceptable.
1785 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1786 	 * gives us hope of containing our fault domains to something we're
1787 	 * able to reason about.  Otherwise, any two top-level vdev failures
1788 	 * will guarantee the loss of data.  With consecutive allocation,
1789 	 * only two adjacent top-level vdev failures will result in data loss.
1790 	 *
1791 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
1792 	 * ourselves on the same vdev as our gang block header.  That
1793 	 * way, we can hope for locality in vdev_cache, plus it makes our
1794 	 * fault domains something tractable.
1795 	 */
1796 	if (hintdva) {
1797 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1798 
1799 		/*
1800 		 * It's possible the vdev we're using as the hint no
1801 		 * longer exists (i.e. removed). Consult the rotor when
1802 		 * all else fails.
1803 		 */
1804 		if (vd != NULL) {
1805 			mg = vd->vdev_mg;
1806 
1807 			if (flags & METASLAB_HINTBP_AVOID &&
1808 			    mg->mg_next != NULL)
1809 				mg = mg->mg_next;
1810 		} else {
1811 			mg = mc->mc_rotor;
1812 		}
1813 	} else if (d != 0) {
1814 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1815 		mg = vd->vdev_mg->mg_next;
1816 	} else {
1817 		mg = mc->mc_rotor;
1818 	}
1819 
1820 	/*
1821 	 * If the hint put us into the wrong metaslab class, or into a
1822 	 * metaslab group that has been passivated, just follow the rotor.
1823 	 */
1824 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1825 		mg = mc->mc_rotor;
1826 
1827 	rotor = mg;
1828 top:
1829 	all_zero = B_TRUE;
1830 	do {
1831 		ASSERT(mg->mg_activation_count == 1);
1832 
1833 		vd = mg->mg_vd;
1834 
1835 		/*
1836 		 * Don't allocate from faulted devices.
1837 		 */
1838 		if (zio_lock) {
1839 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1840 			allocatable = vdev_allocatable(vd);
1841 			spa_config_exit(spa, SCL_ZIO, FTAG);
1842 		} else {
1843 			allocatable = vdev_allocatable(vd);
1844 		}
1845 
1846 		/*
1847 		 * Determine if the selected metaslab group is eligible
1848 		 * for allocations. If we're ganging or have requested
1849 		 * an allocation for the smallest gang block size
1850 		 * then we don't want to avoid allocating to the this
1851 		 * metaslab group. If we're in this condition we should
1852 		 * try to allocate from any device possible so that we
1853 		 * don't inadvertently return ENOSPC and suspend the pool
1854 		 * even though space is still available.
1855 		 */
1856 		if (allocatable && CAN_FASTGANG(flags) &&
1857 		    psize > SPA_GANGBLOCKSIZE)
1858 			allocatable = metaslab_group_allocatable(mg);
1859 
1860 		if (!allocatable)
1861 			goto next;
1862 
1863 		/*
1864 		 * Avoid writing single-copy data to a failing vdev
1865 		 * unless the user instructs us that it is okay.
1866 		 */
1867 		if ((vd->vdev_stat.vs_write_errors > 0 ||
1868 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
1869 		    d == 0 && dshift == 3 &&
1870 		    !(zfs_write_to_degraded && vd->vdev_state ==
1871 		    VDEV_STATE_DEGRADED)) {
1872 			all_zero = B_FALSE;
1873 			goto next;
1874 		}
1875 
1876 		ASSERT(mg->mg_class == mc);
1877 
1878 		distance = vd->vdev_asize >> dshift;
1879 		if (distance <= (1ULL << vd->vdev_ms_shift))
1880 			distance = 0;
1881 		else
1882 			all_zero = B_FALSE;
1883 
1884 		asize = vdev_psize_to_asize(vd, psize);
1885 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1886 
1887 		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
1888 		    dva, d);
1889 		if (offset != -1ULL) {
1890 			/*
1891 			 * If we've just selected this metaslab group,
1892 			 * figure out whether the corresponding vdev is
1893 			 * over- or under-used relative to the pool,
1894 			 * and set an allocation bias to even it out.
1895 			 */
1896 			if (mc->mc_aliquot == 0) {
1897 				vdev_stat_t *vs = &vd->vdev_stat;
1898 				int64_t vu, cu;
1899 
1900 				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
1901 				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
1902 
1903 				/*
1904 				 * Calculate how much more or less we should
1905 				 * try to allocate from this device during
1906 				 * this iteration around the rotor.
1907 				 * For example, if a device is 80% full
1908 				 * and the pool is 20% full then we should
1909 				 * reduce allocations by 60% on this device.
1910 				 *
1911 				 * mg_bias = (20 - 80) * 512K / 100 = -307K
1912 				 *
1913 				 * This reduces allocations by 307K for this
1914 				 * iteration.
1915 				 */
1916 				mg->mg_bias = ((cu - vu) *
1917 				    (int64_t)mg->mg_aliquot) / 100;
1918 			}
1919 
1920 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1921 			    mg->mg_aliquot + mg->mg_bias) {
1922 				mc->mc_rotor = mg->mg_next;
1923 				mc->mc_aliquot = 0;
1924 			}
1925 
1926 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
1927 			DVA_SET_OFFSET(&dva[d], offset);
1928 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
1929 			DVA_SET_ASIZE(&dva[d], asize);
1930 
1931 			return (0);
1932 		}
1933 next:
1934 		mc->mc_rotor = mg->mg_next;
1935 		mc->mc_aliquot = 0;
1936 	} while ((mg = mg->mg_next) != rotor);
1937 
1938 	if (!all_zero) {
1939 		dshift++;
1940 		ASSERT(dshift < 64);
1941 		goto top;
1942 	}
1943 
1944 	if (!allocatable && !zio_lock) {
1945 		dshift = 3;
1946 		zio_lock = B_TRUE;
1947 		goto top;
1948 	}
1949 
1950 	bzero(&dva[d], sizeof (dva_t));
1951 
1952 	return (SET_ERROR(ENOSPC));
1953 }
1954 
1955 /*
1956  * Free the block represented by DVA in the context of the specified
1957  * transaction group.
1958  */
1959 static void
1960 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
1961 {
1962 	uint64_t vdev = DVA_GET_VDEV(dva);
1963 	uint64_t offset = DVA_GET_OFFSET(dva);
1964 	uint64_t size = DVA_GET_ASIZE(dva);
1965 	vdev_t *vd;
1966 	metaslab_t *msp;
1967 
1968 	ASSERT(DVA_IS_VALID(dva));
1969 
1970 	if (txg > spa_freeze_txg(spa))
1971 		return;
1972 
1973 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1974 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
1975 		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
1976 		    (u_longlong_t)vdev, (u_longlong_t)offset);
1977 		ASSERT(0);
1978 		return;
1979 	}
1980 
1981 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1982 
1983 	if (DVA_GET_GANG(dva))
1984 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1985 
1986 	mutex_enter(&msp->ms_lock);
1987 
1988 	if (now) {
1989 		range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
1990 		    offset, size);
1991 
1992 		VERIFY(!msp->ms_condensing);
1993 		VERIFY3U(offset, >=, msp->ms_start);
1994 		VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
1995 		VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
1996 		    msp->ms_size);
1997 		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
1998 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
1999 		range_tree_add(msp->ms_tree, offset, size);
2000 	} else {
2001 		if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
2002 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2003 		range_tree_add(msp->ms_freetree[txg & TXG_MASK],
2004 		    offset, size);
2005 	}
2006 
2007 	mutex_exit(&msp->ms_lock);
2008 }
2009 
2010 /*
2011  * Intent log support: upon opening the pool after a crash, notify the SPA
2012  * of blocks that the intent log has allocated for immediate write, but
2013  * which are still considered free by the SPA because the last transaction
2014  * group didn't commit yet.
2015  */
2016 static int
2017 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
2018 {
2019 	uint64_t vdev = DVA_GET_VDEV(dva);
2020 	uint64_t offset = DVA_GET_OFFSET(dva);
2021 	uint64_t size = DVA_GET_ASIZE(dva);
2022 	vdev_t *vd;
2023 	metaslab_t *msp;
2024 	int error = 0;
2025 
2026 	ASSERT(DVA_IS_VALID(dva));
2027 
2028 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2029 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
2030 		return (SET_ERROR(ENXIO));
2031 
2032 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2033 
2034 	if (DVA_GET_GANG(dva))
2035 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2036 
2037 	mutex_enter(&msp->ms_lock);
2038 
2039 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
2040 		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
2041 
2042 	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
2043 		error = SET_ERROR(ENOENT);
2044 
2045 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
2046 		mutex_exit(&msp->ms_lock);
2047 		return (error);
2048 	}
2049 
2050 	VERIFY(!msp->ms_condensing);
2051 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2052 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2053 	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
2054 	range_tree_remove(msp->ms_tree, offset, size);
2055 
2056 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
2057 		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2058 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2059 		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2060 	}
2061 
2062 	mutex_exit(&msp->ms_lock);
2063 
2064 	return (0);
2065 }
2066 
2067 int
2068 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2069     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
2070 {
2071 	dva_t *dva = bp->blk_dva;
2072 	dva_t *hintdva = hintbp->blk_dva;
2073 	int error = 0;
2074 
2075 	ASSERT(bp->blk_birth == 0);
2076 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2077 
2078 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2079 
2080 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
2081 		spa_config_exit(spa, SCL_ALLOC, FTAG);
2082 		return (SET_ERROR(ENOSPC));
2083 	}
2084 
2085 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
2086 	ASSERT(BP_GET_NDVAS(bp) == 0);
2087 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2088 
2089 	for (int d = 0; d < ndvas; d++) {
2090 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2091 		    txg, flags);
2092 		if (error != 0) {
2093 			for (d--; d >= 0; d--) {
2094 				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2095 				bzero(&dva[d], sizeof (dva_t));
2096 			}
2097 			spa_config_exit(spa, SCL_ALLOC, FTAG);
2098 			return (error);
2099 		}
2100 	}
2101 	ASSERT(error == 0);
2102 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
2103 
2104 	spa_config_exit(spa, SCL_ALLOC, FTAG);
2105 
2106 	BP_SET_BIRTH(bp, txg, txg);
2107 
2108 	return (0);
2109 }
2110 
2111 void
2112 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
2113 {
2114 	const dva_t *dva = bp->blk_dva;
2115 	int ndvas = BP_GET_NDVAS(bp);
2116 
2117 	ASSERT(!BP_IS_HOLE(bp));
2118 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
2119 
2120 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
2121 
2122 	for (int d = 0; d < ndvas; d++)
2123 		metaslab_free_dva(spa, &dva[d], txg, now);
2124 
2125 	spa_config_exit(spa, SCL_FREE, FTAG);
2126 }
2127 
2128 int
2129 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
2130 {
2131 	const dva_t *dva = bp->blk_dva;
2132 	int ndvas = BP_GET_NDVAS(bp);
2133 	int error = 0;
2134 
2135 	ASSERT(!BP_IS_HOLE(bp));
2136 
2137 	if (txg != 0) {
2138 		/*
2139 		 * First do a dry run to make sure all DVAs are claimable,
2140 		 * so we don't have to unwind from partial failures below.
2141 		 */
2142 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
2143 			return (error);
2144 	}
2145 
2146 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2147 
2148 	for (int d = 0; d < ndvas; d++)
2149 		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
2150 			break;
2151 
2152 	spa_config_exit(spa, SCL_ALLOC, FTAG);
2153 
2154 	ASSERT(error == 0 || txg == 0);
2155 
2156 	return (error);
2157 }
2158 
2159 void
2160 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
2161 {
2162 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
2163 		return;
2164 
2165 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2166 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
2167 		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
2168 		vdev_t *vd = vdev_lookup_top(spa, vdev);
2169 		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
2170 		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2171 		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2172 
2173 		if (msp->ms_loaded)
2174 			range_tree_verify(msp->ms_tree, offset, size);
2175 
2176 		for (int j = 0; j < TXG_SIZE; j++)
2177 			range_tree_verify(msp->ms_freetree[j], offset, size);
2178 		for (int j = 0; j < TXG_DEFER_SIZE; j++)
2179 			range_tree_verify(msp->ms_defertree[j], offset, size);
2180 	}
2181 	spa_config_exit(spa, SCL_VDEV, FTAG);
2182 }
2183