xref: /titanic_50/usr/src/uts/common/fs/zfs/metaslab.c (revision 33392354a001a7e6e504717bd3f249456d36cc00)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/dmu.h>
29 #include <sys/dmu_tx.h>
30 #include <sys/space_map.h>
31 #include <sys/metaslab_impl.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/zio.h>
34 #include <sys/spa_impl.h>
35 
36 /*
37  * Allow allocations to switch to gang blocks quickly. We do this to
38  * avoid having to load lots of space_maps in a given txg. There are,
39  * however, some cases where we want to avoid "fast" ganging and instead
40  * we want to do an exhaustive search of all metaslabs on this device.
41  * Currently we don't allow any gang, slog, or dump device related allocations
42  * to "fast" gang.
43  */
44 #define	CAN_FASTGANG(flags) \
45 	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
46 	METASLAB_GANG_AVOID)))
47 
48 #define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
49 #define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
50 #define	METASLAB_ACTIVE_MASK		\
51 	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
52 
53 uint64_t metaslab_aliquot = 512ULL << 10;
54 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
55 
56 /*
57  * The in-core space map representation is more compact than its on-disk form.
58  * The zfs_condense_pct determines how much more compact the in-core
59  * space_map representation must be before we compact it on-disk.
60  * Values should be greater than or equal to 100.
61  */
62 int zfs_condense_pct = 200;
63 
64 /*
65  * The zfs_mg_noalloc_threshold defines which metaslab groups should
66  * be eligible for allocation. The value is defined as a percentage of
67  * a free space. Metaslab groups that have more free space than
68  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
69  * a metaslab group's free space is less than or equal to the
70  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
71  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
72  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
73  * groups are allowed to accept allocations. Gang blocks are always
74  * eligible to allocate on any metaslab group. The default value of 0 means
75  * no metaslab group will be excluded based on this criterion.
76  */
77 int zfs_mg_noalloc_threshold = 0;
78 
79 /*
80  * When set will load all metaslabs when pool is first opened.
81  */
82 int metaslab_debug_load = 0;
83 
84 /*
85  * When set will prevent metaslabs from being unloaded.
86  */
87 int metaslab_debug_unload = 0;
88 
89 /*
90  * Minimum size which forces the dynamic allocator to change
91  * it's allocation strategy.  Once the space map cannot satisfy
92  * an allocation of this size then it switches to using more
93  * aggressive strategy (i.e search by size rather than offset).
94  */
95 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
96 
97 /*
98  * The minimum free space, in percent, which must be available
99  * in a space map to continue allocations in a first-fit fashion.
100  * Once the space_map's free space drops below this level we dynamically
101  * switch to using best-fit allocations.
102  */
103 int metaslab_df_free_pct = 4;
104 
105 /*
106  * A metaslab is considered "free" if it contains a contiguous
107  * segment which is greater than metaslab_min_alloc_size.
108  */
109 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
110 
111 /*
112  * Percentage of all cpus that can be used by the metaslab taskq.
113  */
114 int metaslab_load_pct = 50;
115 
116 /*
117  * Determines how many txgs a metaslab may remain loaded without having any
118  * allocations from it. As long as a metaslab continues to be used we will
119  * keep it loaded.
120  */
121 int metaslab_unload_delay = TXG_SIZE * 2;
122 
123 /*
124  * Should we be willing to write data to degraded vdevs?
125  */
126 boolean_t zfs_write_to_degraded = B_FALSE;
127 
128 /*
129  * Max number of metaslabs per group to preload.
130  */
131 int metaslab_preload_limit = SPA_DVAS_PER_BP;
132 
133 /*
134  * Enable/disable preloading of metaslab.
135  */
136 boolean_t metaslab_preload_enabled = B_TRUE;
137 
138 /*
139  * Enable/disable additional weight factor for each metaslab.
140  */
141 boolean_t metaslab_weight_factor_enable = B_FALSE;
142 
143 
144 /*
145  * ==========================================================================
146  * Metaslab classes
147  * ==========================================================================
148  */
149 metaslab_class_t *
150 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
151 {
152 	metaslab_class_t *mc;
153 
154 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
155 
156 	mc->mc_spa = spa;
157 	mc->mc_rotor = NULL;
158 	mc->mc_ops = ops;
159 
160 	return (mc);
161 }
162 
163 void
164 metaslab_class_destroy(metaslab_class_t *mc)
165 {
166 	ASSERT(mc->mc_rotor == NULL);
167 	ASSERT(mc->mc_alloc == 0);
168 	ASSERT(mc->mc_deferred == 0);
169 	ASSERT(mc->mc_space == 0);
170 	ASSERT(mc->mc_dspace == 0);
171 
172 	kmem_free(mc, sizeof (metaslab_class_t));
173 }
174 
175 int
176 metaslab_class_validate(metaslab_class_t *mc)
177 {
178 	metaslab_group_t *mg;
179 	vdev_t *vd;
180 
181 	/*
182 	 * Must hold one of the spa_config locks.
183 	 */
184 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
185 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
186 
187 	if ((mg = mc->mc_rotor) == NULL)
188 		return (0);
189 
190 	do {
191 		vd = mg->mg_vd;
192 		ASSERT(vd->vdev_mg != NULL);
193 		ASSERT3P(vd->vdev_top, ==, vd);
194 		ASSERT3P(mg->mg_class, ==, mc);
195 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
196 	} while ((mg = mg->mg_next) != mc->mc_rotor);
197 
198 	return (0);
199 }
200 
201 void
202 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
203     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
204 {
205 	atomic_add_64(&mc->mc_alloc, alloc_delta);
206 	atomic_add_64(&mc->mc_deferred, defer_delta);
207 	atomic_add_64(&mc->mc_space, space_delta);
208 	atomic_add_64(&mc->mc_dspace, dspace_delta);
209 }
210 
211 uint64_t
212 metaslab_class_get_alloc(metaslab_class_t *mc)
213 {
214 	return (mc->mc_alloc);
215 }
216 
217 uint64_t
218 metaslab_class_get_deferred(metaslab_class_t *mc)
219 {
220 	return (mc->mc_deferred);
221 }
222 
223 uint64_t
224 metaslab_class_get_space(metaslab_class_t *mc)
225 {
226 	return (mc->mc_space);
227 }
228 
229 uint64_t
230 metaslab_class_get_dspace(metaslab_class_t *mc)
231 {
232 	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
233 }
234 
235 /*
236  * ==========================================================================
237  * Metaslab groups
238  * ==========================================================================
239  */
240 static int
241 metaslab_compare(const void *x1, const void *x2)
242 {
243 	const metaslab_t *m1 = x1;
244 	const metaslab_t *m2 = x2;
245 
246 	if (m1->ms_weight < m2->ms_weight)
247 		return (1);
248 	if (m1->ms_weight > m2->ms_weight)
249 		return (-1);
250 
251 	/*
252 	 * If the weights are identical, use the offset to force uniqueness.
253 	 */
254 	if (m1->ms_start < m2->ms_start)
255 		return (-1);
256 	if (m1->ms_start > m2->ms_start)
257 		return (1);
258 
259 	ASSERT3P(m1, ==, m2);
260 
261 	return (0);
262 }
263 
264 /*
265  * Update the allocatable flag and the metaslab group's capacity.
266  * The allocatable flag is set to true if the capacity is below
267  * the zfs_mg_noalloc_threshold. If a metaslab group transitions
268  * from allocatable to non-allocatable or vice versa then the metaslab
269  * group's class is updated to reflect the transition.
270  */
271 static void
272 metaslab_group_alloc_update(metaslab_group_t *mg)
273 {
274 	vdev_t *vd = mg->mg_vd;
275 	metaslab_class_t *mc = mg->mg_class;
276 	vdev_stat_t *vs = &vd->vdev_stat;
277 	boolean_t was_allocatable;
278 
279 	ASSERT(vd == vd->vdev_top);
280 
281 	mutex_enter(&mg->mg_lock);
282 	was_allocatable = mg->mg_allocatable;
283 
284 	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
285 	    (vs->vs_space + 1);
286 
287 	mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
288 
289 	/*
290 	 * The mc_alloc_groups maintains a count of the number of
291 	 * groups in this metaslab class that are still above the
292 	 * zfs_mg_noalloc_threshold. This is used by the allocating
293 	 * threads to determine if they should avoid allocations to
294 	 * a given group. The allocator will avoid allocations to a group
295 	 * if that group has reached or is below the zfs_mg_noalloc_threshold
296 	 * and there are still other groups that are above the threshold.
297 	 * When a group transitions from allocatable to non-allocatable or
298 	 * vice versa we update the metaslab class to reflect that change.
299 	 * When the mc_alloc_groups value drops to 0 that means that all
300 	 * groups have reached the zfs_mg_noalloc_threshold making all groups
301 	 * eligible for allocations. This effectively means that all devices
302 	 * are balanced again.
303 	 */
304 	if (was_allocatable && !mg->mg_allocatable)
305 		mc->mc_alloc_groups--;
306 	else if (!was_allocatable && mg->mg_allocatable)
307 		mc->mc_alloc_groups++;
308 	mutex_exit(&mg->mg_lock);
309 }
310 
311 metaslab_group_t *
312 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
313 {
314 	metaslab_group_t *mg;
315 
316 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
317 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
318 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
319 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
320 	mg->mg_vd = vd;
321 	mg->mg_class = mc;
322 	mg->mg_activation_count = 0;
323 
324 	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
325 	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
326 
327 	return (mg);
328 }
329 
330 void
331 metaslab_group_destroy(metaslab_group_t *mg)
332 {
333 	ASSERT(mg->mg_prev == NULL);
334 	ASSERT(mg->mg_next == NULL);
335 	/*
336 	 * We may have gone below zero with the activation count
337 	 * either because we never activated in the first place or
338 	 * because we're done, and possibly removing the vdev.
339 	 */
340 	ASSERT(mg->mg_activation_count <= 0);
341 
342 	taskq_destroy(mg->mg_taskq);
343 	avl_destroy(&mg->mg_metaslab_tree);
344 	mutex_destroy(&mg->mg_lock);
345 	kmem_free(mg, sizeof (metaslab_group_t));
346 }
347 
348 void
349 metaslab_group_activate(metaslab_group_t *mg)
350 {
351 	metaslab_class_t *mc = mg->mg_class;
352 	metaslab_group_t *mgprev, *mgnext;
353 
354 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
355 
356 	ASSERT(mc->mc_rotor != mg);
357 	ASSERT(mg->mg_prev == NULL);
358 	ASSERT(mg->mg_next == NULL);
359 	ASSERT(mg->mg_activation_count <= 0);
360 
361 	if (++mg->mg_activation_count <= 0)
362 		return;
363 
364 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
365 	metaslab_group_alloc_update(mg);
366 
367 	if ((mgprev = mc->mc_rotor) == NULL) {
368 		mg->mg_prev = mg;
369 		mg->mg_next = mg;
370 	} else {
371 		mgnext = mgprev->mg_next;
372 		mg->mg_prev = mgprev;
373 		mg->mg_next = mgnext;
374 		mgprev->mg_next = mg;
375 		mgnext->mg_prev = mg;
376 	}
377 	mc->mc_rotor = mg;
378 }
379 
380 void
381 metaslab_group_passivate(metaslab_group_t *mg)
382 {
383 	metaslab_class_t *mc = mg->mg_class;
384 	metaslab_group_t *mgprev, *mgnext;
385 
386 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
387 
388 	if (--mg->mg_activation_count != 0) {
389 		ASSERT(mc->mc_rotor != mg);
390 		ASSERT(mg->mg_prev == NULL);
391 		ASSERT(mg->mg_next == NULL);
392 		ASSERT(mg->mg_activation_count < 0);
393 		return;
394 	}
395 
396 	taskq_wait(mg->mg_taskq);
397 
398 	mgprev = mg->mg_prev;
399 	mgnext = mg->mg_next;
400 
401 	if (mg == mgnext) {
402 		mc->mc_rotor = NULL;
403 	} else {
404 		mc->mc_rotor = mgnext;
405 		mgprev->mg_next = mgnext;
406 		mgnext->mg_prev = mgprev;
407 	}
408 
409 	mg->mg_prev = NULL;
410 	mg->mg_next = NULL;
411 }
412 
413 static void
414 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
415 {
416 	mutex_enter(&mg->mg_lock);
417 	ASSERT(msp->ms_group == NULL);
418 	msp->ms_group = mg;
419 	msp->ms_weight = 0;
420 	avl_add(&mg->mg_metaslab_tree, msp);
421 	mutex_exit(&mg->mg_lock);
422 }
423 
424 static void
425 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
426 {
427 	mutex_enter(&mg->mg_lock);
428 	ASSERT(msp->ms_group == mg);
429 	avl_remove(&mg->mg_metaslab_tree, msp);
430 	msp->ms_group = NULL;
431 	mutex_exit(&mg->mg_lock);
432 }
433 
434 static void
435 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
436 {
437 	/*
438 	 * Although in principle the weight can be any value, in
439 	 * practice we do not use values in the range [1, 510].
440 	 */
441 	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
442 	ASSERT(MUTEX_HELD(&msp->ms_lock));
443 
444 	mutex_enter(&mg->mg_lock);
445 	ASSERT(msp->ms_group == mg);
446 	avl_remove(&mg->mg_metaslab_tree, msp);
447 	msp->ms_weight = weight;
448 	avl_add(&mg->mg_metaslab_tree, msp);
449 	mutex_exit(&mg->mg_lock);
450 }
451 
452 /*
453  * Determine if a given metaslab group should skip allocations. A metaslab
454  * group should avoid allocations if its used capacity has crossed the
455  * zfs_mg_noalloc_threshold and there is at least one metaslab group
456  * that can still handle allocations.
457  */
458 static boolean_t
459 metaslab_group_allocatable(metaslab_group_t *mg)
460 {
461 	vdev_t *vd = mg->mg_vd;
462 	spa_t *spa = vd->vdev_spa;
463 	metaslab_class_t *mc = mg->mg_class;
464 
465 	/*
466 	 * A metaslab group is considered allocatable if its free capacity
467 	 * is greater than the set value of zfs_mg_noalloc_threshold, it's
468 	 * associated with a slog, or there are no other metaslab groups
469 	 * with free capacity greater than zfs_mg_noalloc_threshold.
470 	 */
471 	return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
472 	    mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
473 }
474 
475 /*
476  * ==========================================================================
477  * Range tree callbacks
478  * ==========================================================================
479  */
480 
481 /*
482  * Comparison function for the private size-ordered tree. Tree is sorted
483  * by size, larger sizes at the end of the tree.
484  */
485 static int
486 metaslab_rangesize_compare(const void *x1, const void *x2)
487 {
488 	const range_seg_t *r1 = x1;
489 	const range_seg_t *r2 = x2;
490 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
491 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
492 
493 	if (rs_size1 < rs_size2)
494 		return (-1);
495 	if (rs_size1 > rs_size2)
496 		return (1);
497 
498 	if (r1->rs_start < r2->rs_start)
499 		return (-1);
500 
501 	if (r1->rs_start > r2->rs_start)
502 		return (1);
503 
504 	return (0);
505 }
506 
507 /*
508  * Create any block allocator specific components. The current allocators
509  * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
510  */
511 static void
512 metaslab_rt_create(range_tree_t *rt, void *arg)
513 {
514 	metaslab_t *msp = arg;
515 
516 	ASSERT3P(rt->rt_arg, ==, msp);
517 	ASSERT(msp->ms_tree == NULL);
518 
519 	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
520 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
521 }
522 
523 /*
524  * Destroy the block allocator specific components.
525  */
526 static void
527 metaslab_rt_destroy(range_tree_t *rt, void *arg)
528 {
529 	metaslab_t *msp = arg;
530 
531 	ASSERT3P(rt->rt_arg, ==, msp);
532 	ASSERT3P(msp->ms_tree, ==, rt);
533 	ASSERT0(avl_numnodes(&msp->ms_size_tree));
534 
535 	avl_destroy(&msp->ms_size_tree);
536 }
537 
538 static void
539 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
540 {
541 	metaslab_t *msp = arg;
542 
543 	ASSERT3P(rt->rt_arg, ==, msp);
544 	ASSERT3P(msp->ms_tree, ==, rt);
545 	VERIFY(!msp->ms_condensing);
546 	avl_add(&msp->ms_size_tree, rs);
547 }
548 
549 static void
550 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
551 {
552 	metaslab_t *msp = arg;
553 
554 	ASSERT3P(rt->rt_arg, ==, msp);
555 	ASSERT3P(msp->ms_tree, ==, rt);
556 	VERIFY(!msp->ms_condensing);
557 	avl_remove(&msp->ms_size_tree, rs);
558 }
559 
560 static void
561 metaslab_rt_vacate(range_tree_t *rt, void *arg)
562 {
563 	metaslab_t *msp = arg;
564 
565 	ASSERT3P(rt->rt_arg, ==, msp);
566 	ASSERT3P(msp->ms_tree, ==, rt);
567 
568 	/*
569 	 * Normally one would walk the tree freeing nodes along the way.
570 	 * Since the nodes are shared with the range trees we can avoid
571 	 * walking all nodes and just reinitialize the avl tree. The nodes
572 	 * will be freed by the range tree, so we don't want to free them here.
573 	 */
574 	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
575 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
576 }
577 
578 static range_tree_ops_t metaslab_rt_ops = {
579 	metaslab_rt_create,
580 	metaslab_rt_destroy,
581 	metaslab_rt_add,
582 	metaslab_rt_remove,
583 	metaslab_rt_vacate
584 };
585 
586 /*
587  * ==========================================================================
588  * Metaslab block operations
589  * ==========================================================================
590  */
591 
592 /*
593  * Return the maximum contiguous segment within the metaslab.
594  */
595 uint64_t
596 metaslab_block_maxsize(metaslab_t *msp)
597 {
598 	avl_tree_t *t = &msp->ms_size_tree;
599 	range_seg_t *rs;
600 
601 	if (t == NULL || (rs = avl_last(t)) == NULL)
602 		return (0ULL);
603 
604 	return (rs->rs_end - rs->rs_start);
605 }
606 
607 uint64_t
608 metaslab_block_alloc(metaslab_t *msp, uint64_t size)
609 {
610 	uint64_t start;
611 	range_tree_t *rt = msp->ms_tree;
612 
613 	VERIFY(!msp->ms_condensing);
614 
615 	start = msp->ms_ops->msop_alloc(msp, size);
616 	if (start != -1ULL) {
617 		vdev_t *vd = msp->ms_group->mg_vd;
618 
619 		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
620 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
621 		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
622 		range_tree_remove(rt, start, size);
623 	}
624 	return (start);
625 }
626 
627 /*
628  * ==========================================================================
629  * Common allocator routines
630  * ==========================================================================
631  */
632 
633 /*
634  * This is a helper function that can be used by the allocator to find
635  * a suitable block to allocate. This will search the specified AVL
636  * tree looking for a block that matches the specified criteria.
637  */
638 static uint64_t
639 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
640     uint64_t align)
641 {
642 	range_seg_t *rs, rsearch;
643 	avl_index_t where;
644 
645 	rsearch.rs_start = *cursor;
646 	rsearch.rs_end = *cursor + size;
647 
648 	rs = avl_find(t, &rsearch, &where);
649 	if (rs == NULL)
650 		rs = avl_nearest(t, where, AVL_AFTER);
651 
652 	while (rs != NULL) {
653 		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
654 
655 		if (offset + size <= rs->rs_end) {
656 			*cursor = offset + size;
657 			return (offset);
658 		}
659 		rs = AVL_NEXT(t, rs);
660 	}
661 
662 	/*
663 	 * If we know we've searched the whole map (*cursor == 0), give up.
664 	 * Otherwise, reset the cursor to the beginning and try again.
665 	 */
666 	if (*cursor == 0)
667 		return (-1ULL);
668 
669 	*cursor = 0;
670 	return (metaslab_block_picker(t, cursor, size, align));
671 }
672 
673 /*
674  * ==========================================================================
675  * The first-fit block allocator
676  * ==========================================================================
677  */
678 static uint64_t
679 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
680 {
681 	/*
682 	 * Find the largest power of 2 block size that evenly divides the
683 	 * requested size. This is used to try to allocate blocks with similar
684 	 * alignment from the same area of the metaslab (i.e. same cursor
685 	 * bucket) but it does not guarantee that other allocations sizes
686 	 * may exist in the same region.
687 	 */
688 	uint64_t align = size & -size;
689 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
690 	avl_tree_t *t = &msp->ms_tree->rt_root;
691 
692 	return (metaslab_block_picker(t, cursor, size, align));
693 }
694 
695 /* ARGSUSED */
696 static boolean_t
697 metaslab_ff_fragmented(metaslab_t *msp)
698 {
699 	return (B_TRUE);
700 }
701 
702 static metaslab_ops_t metaslab_ff_ops = {
703 	metaslab_ff_alloc,
704 	metaslab_ff_fragmented
705 };
706 
707 /*
708  * ==========================================================================
709  * Dynamic block allocator -
710  * Uses the first fit allocation scheme until space get low and then
711  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
712  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
713  * ==========================================================================
714  */
715 static uint64_t
716 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
717 {
718 	/*
719 	 * Find the largest power of 2 block size that evenly divides the
720 	 * requested size. This is used to try to allocate blocks with similar
721 	 * alignment from the same area of the metaslab (i.e. same cursor
722 	 * bucket) but it does not guarantee that other allocations sizes
723 	 * may exist in the same region.
724 	 */
725 	uint64_t align = size & -size;
726 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
727 	range_tree_t *rt = msp->ms_tree;
728 	avl_tree_t *t = &rt->rt_root;
729 	uint64_t max_size = metaslab_block_maxsize(msp);
730 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
731 
732 	ASSERT(MUTEX_HELD(&msp->ms_lock));
733 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
734 
735 	if (max_size < size)
736 		return (-1ULL);
737 
738 	/*
739 	 * If we're running low on space switch to using the size
740 	 * sorted AVL tree (best-fit).
741 	 */
742 	if (max_size < metaslab_df_alloc_threshold ||
743 	    free_pct < metaslab_df_free_pct) {
744 		t = &msp->ms_size_tree;
745 		*cursor = 0;
746 	}
747 
748 	return (metaslab_block_picker(t, cursor, size, 1ULL));
749 }
750 
751 static boolean_t
752 metaslab_df_fragmented(metaslab_t *msp)
753 {
754 	range_tree_t *rt = msp->ms_tree;
755 	uint64_t max_size = metaslab_block_maxsize(msp);
756 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
757 
758 	if (max_size >= metaslab_df_alloc_threshold &&
759 	    free_pct >= metaslab_df_free_pct)
760 		return (B_FALSE);
761 
762 	return (B_TRUE);
763 }
764 
765 static metaslab_ops_t metaslab_df_ops = {
766 	metaslab_df_alloc,
767 	metaslab_df_fragmented
768 };
769 
770 /*
771  * ==========================================================================
772  * Cursor fit block allocator -
773  * Select the largest region in the metaslab, set the cursor to the beginning
774  * of the range and the cursor_end to the end of the range. As allocations
775  * are made advance the cursor. Continue allocating from the cursor until
776  * the range is exhausted and then find a new range.
777  * ==========================================================================
778  */
779 static uint64_t
780 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
781 {
782 	range_tree_t *rt = msp->ms_tree;
783 	avl_tree_t *t = &msp->ms_size_tree;
784 	uint64_t *cursor = &msp->ms_lbas[0];
785 	uint64_t *cursor_end = &msp->ms_lbas[1];
786 	uint64_t offset = 0;
787 
788 	ASSERT(MUTEX_HELD(&msp->ms_lock));
789 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
790 
791 	ASSERT3U(*cursor_end, >=, *cursor);
792 
793 	if ((*cursor + size) > *cursor_end) {
794 		range_seg_t *rs;
795 
796 		rs = avl_last(&msp->ms_size_tree);
797 		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
798 			return (-1ULL);
799 
800 		*cursor = rs->rs_start;
801 		*cursor_end = rs->rs_end;
802 	}
803 
804 	offset = *cursor;
805 	*cursor += size;
806 
807 	return (offset);
808 }
809 
810 static boolean_t
811 metaslab_cf_fragmented(metaslab_t *msp)
812 {
813 	return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
814 }
815 
816 static metaslab_ops_t metaslab_cf_ops = {
817 	metaslab_cf_alloc,
818 	metaslab_cf_fragmented
819 };
820 
821 /*
822  * ==========================================================================
823  * New dynamic fit allocator -
824  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
825  * contiguous blocks. If no region is found then just use the largest segment
826  * that remains.
827  * ==========================================================================
828  */
829 
830 /*
831  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
832  * to request from the allocator.
833  */
834 uint64_t metaslab_ndf_clump_shift = 4;
835 
836 static uint64_t
837 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
838 {
839 	avl_tree_t *t = &msp->ms_tree->rt_root;
840 	avl_index_t where;
841 	range_seg_t *rs, rsearch;
842 	uint64_t hbit = highbit64(size);
843 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
844 	uint64_t max_size = metaslab_block_maxsize(msp);
845 
846 	ASSERT(MUTEX_HELD(&msp->ms_lock));
847 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
848 
849 	if (max_size < size)
850 		return (-1ULL);
851 
852 	rsearch.rs_start = *cursor;
853 	rsearch.rs_end = *cursor + size;
854 
855 	rs = avl_find(t, &rsearch, &where);
856 	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
857 		t = &msp->ms_size_tree;
858 
859 		rsearch.rs_start = 0;
860 		rsearch.rs_end = MIN(max_size,
861 		    1ULL << (hbit + metaslab_ndf_clump_shift));
862 		rs = avl_find(t, &rsearch, &where);
863 		if (rs == NULL)
864 			rs = avl_nearest(t, where, AVL_AFTER);
865 		ASSERT(rs != NULL);
866 	}
867 
868 	if ((rs->rs_end - rs->rs_start) >= size) {
869 		*cursor = rs->rs_start + size;
870 		return (rs->rs_start);
871 	}
872 	return (-1ULL);
873 }
874 
875 static boolean_t
876 metaslab_ndf_fragmented(metaslab_t *msp)
877 {
878 	return (metaslab_block_maxsize(msp) <=
879 	    (metaslab_min_alloc_size << metaslab_ndf_clump_shift));
880 }
881 
882 static metaslab_ops_t metaslab_ndf_ops = {
883 	metaslab_ndf_alloc,
884 	metaslab_ndf_fragmented
885 };
886 
887 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
888 
889 /*
890  * ==========================================================================
891  * Metaslabs
892  * ==========================================================================
893  */
894 
895 /*
896  * Wait for any in-progress metaslab loads to complete.
897  */
898 void
899 metaslab_load_wait(metaslab_t *msp)
900 {
901 	ASSERT(MUTEX_HELD(&msp->ms_lock));
902 
903 	while (msp->ms_loading) {
904 		ASSERT(!msp->ms_loaded);
905 		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
906 	}
907 }
908 
909 int
910 metaslab_load(metaslab_t *msp)
911 {
912 	int error = 0;
913 
914 	ASSERT(MUTEX_HELD(&msp->ms_lock));
915 	ASSERT(!msp->ms_loaded);
916 	ASSERT(!msp->ms_loading);
917 
918 	msp->ms_loading = B_TRUE;
919 
920 	/*
921 	 * If the space map has not been allocated yet, then treat
922 	 * all the space in the metaslab as free and add it to the
923 	 * ms_tree.
924 	 */
925 	if (msp->ms_sm != NULL)
926 		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
927 	else
928 		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
929 
930 	msp->ms_loaded = (error == 0);
931 	msp->ms_loading = B_FALSE;
932 
933 	if (msp->ms_loaded) {
934 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
935 			range_tree_walk(msp->ms_defertree[t],
936 			    range_tree_remove, msp->ms_tree);
937 		}
938 	}
939 	cv_broadcast(&msp->ms_load_cv);
940 	return (error);
941 }
942 
943 void
944 metaslab_unload(metaslab_t *msp)
945 {
946 	ASSERT(MUTEX_HELD(&msp->ms_lock));
947 	range_tree_vacate(msp->ms_tree, NULL, NULL);
948 	msp->ms_loaded = B_FALSE;
949 	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
950 }
951 
952 metaslab_t *
953 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
954 {
955 	vdev_t *vd = mg->mg_vd;
956 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
957 	metaslab_t *msp;
958 
959 	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
960 	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
961 	cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL);
962 	msp->ms_id = id;
963 	msp->ms_start = id << vd->vdev_ms_shift;
964 	msp->ms_size = 1ULL << vd->vdev_ms_shift;
965 
966 	/*
967 	 * We only open space map objects that already exist. All others
968 	 * will be opened when we finally allocate an object for it.
969 	 */
970 	if (object != 0) {
971 		VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start,
972 		    msp->ms_size, vd->vdev_ashift, &msp->ms_lock));
973 		ASSERT(msp->ms_sm != NULL);
974 	}
975 
976 	/*
977 	 * We create the main range tree here, but we don't create the
978 	 * alloctree and freetree until metaslab_sync_done().  This serves
979 	 * two purposes: it allows metaslab_sync_done() to detect the
980 	 * addition of new space; and for debugging, it ensures that we'd
981 	 * data fault on any attempt to use this metaslab before it's ready.
982 	 */
983 	msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
984 	metaslab_group_add(mg, msp);
985 
986 	msp->ms_ops = mg->mg_class->mc_ops;
987 
988 	/*
989 	 * If we're opening an existing pool (txg == 0) or creating
990 	 * a new one (txg == TXG_INITIAL), all space is available now.
991 	 * If we're adding space to an existing pool, the new space
992 	 * does not become available until after this txg has synced.
993 	 */
994 	if (txg <= TXG_INITIAL)
995 		metaslab_sync_done(msp, 0);
996 
997 	/*
998 	 * If metaslab_debug_load is set and we're initializing a metaslab
999 	 * that has an allocated space_map object then load the its space
1000 	 * map so that can verify frees.
1001 	 */
1002 	if (metaslab_debug_load && msp->ms_sm != NULL) {
1003 		mutex_enter(&msp->ms_lock);
1004 		VERIFY0(metaslab_load(msp));
1005 		mutex_exit(&msp->ms_lock);
1006 	}
1007 
1008 	if (txg != 0) {
1009 		vdev_dirty(vd, 0, NULL, txg);
1010 		vdev_dirty(vd, VDD_METASLAB, msp, txg);
1011 	}
1012 
1013 	return (msp);
1014 }
1015 
1016 void
1017 metaslab_fini(metaslab_t *msp)
1018 {
1019 	metaslab_group_t *mg = msp->ms_group;
1020 
1021 	metaslab_group_remove(mg, msp);
1022 
1023 	mutex_enter(&msp->ms_lock);
1024 
1025 	VERIFY(msp->ms_group == NULL);
1026 	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1027 	    0, -msp->ms_size);
1028 	space_map_close(msp->ms_sm);
1029 
1030 	metaslab_unload(msp);
1031 	range_tree_destroy(msp->ms_tree);
1032 
1033 	for (int t = 0; t < TXG_SIZE; t++) {
1034 		range_tree_destroy(msp->ms_alloctree[t]);
1035 		range_tree_destroy(msp->ms_freetree[t]);
1036 	}
1037 
1038 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1039 		range_tree_destroy(msp->ms_defertree[t]);
1040 	}
1041 
1042 	ASSERT0(msp->ms_deferspace);
1043 
1044 	mutex_exit(&msp->ms_lock);
1045 	cv_destroy(&msp->ms_load_cv);
1046 	mutex_destroy(&msp->ms_lock);
1047 
1048 	kmem_free(msp, sizeof (metaslab_t));
1049 }
1050 
1051 /*
1052  * Apply a weighting factor based on the histogram information for this
1053  * metaslab. The current weighting factor is somewhat arbitrary and requires
1054  * additional investigation. The implementation provides a measure of
1055  * "weighted" free space and gives a higher weighting for larger contiguous
1056  * regions. The weighting factor is determined by counting the number of
1057  * sm_shift sectors that exist in each region represented by the histogram.
1058  * That value is then multiplied by the power of 2 exponent and the sm_shift
1059  * value.
1060  *
1061  * For example, assume the 2^21 histogram bucket has 4 2MB regions and the
1062  * metaslab has an sm_shift value of 9 (512B):
1063  *
1064  * 1) calculate the number of sm_shift sectors in the region:
1065  *	2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
1066  * 2) multiply by the power of 2 exponent and the sm_shift value:
1067  *	16384 * 21 * 9 = 3096576
1068  * This value will be added to the weighting of the metaslab.
1069  */
1070 static uint64_t
1071 metaslab_weight_factor(metaslab_t *msp)
1072 {
1073 	uint64_t factor = 0;
1074 	uint64_t sectors;
1075 	int i;
1076 
1077 	/*
1078 	 * A null space map means that the entire metaslab is free,
1079 	 * calculate a weight factor that spans the entire size of the
1080 	 * metaslab.
1081 	 */
1082 	if (msp->ms_sm == NULL) {
1083 		vdev_t *vd = msp->ms_group->mg_vd;
1084 
1085 		i = highbit64(msp->ms_size) - 1;
1086 		sectors = msp->ms_size >> vd->vdev_ashift;
1087 		return (sectors * i * vd->vdev_ashift);
1088 	}
1089 
1090 	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
1091 		return (0);
1092 
1093 	for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
1094 		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1095 			continue;
1096 
1097 		/*
1098 		 * Determine the number of sm_shift sectors in the region
1099 		 * indicated by the histogram. For example, given an
1100 		 * sm_shift value of 9 (512 bytes) and i = 4 then we know
1101 		 * that we're looking at an 8K region in the histogram
1102 		 * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
1103 		 * number of sm_shift sectors (512 bytes in this example),
1104 		 * we would take 8192 / 512 = 16. Since the histogram
1105 		 * is offset by sm_shift we can simply use the value of
1106 		 * of i to calculate this (i.e. 2^i = 16 where i = 4).
1107 		 */
1108 		sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i;
1109 		factor += (i + msp->ms_sm->sm_shift) * sectors;
1110 	}
1111 	return (factor * msp->ms_sm->sm_shift);
1112 }
1113 
1114 static uint64_t
1115 metaslab_weight(metaslab_t *msp)
1116 {
1117 	metaslab_group_t *mg = msp->ms_group;
1118 	vdev_t *vd = mg->mg_vd;
1119 	uint64_t weight, space;
1120 
1121 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1122 
1123 	/*
1124 	 * This vdev is in the process of being removed so there is nothing
1125 	 * for us to do here.
1126 	 */
1127 	if (vd->vdev_removing) {
1128 		ASSERT0(space_map_allocated(msp->ms_sm));
1129 		ASSERT0(vd->vdev_ms_shift);
1130 		return (0);
1131 	}
1132 
1133 	/*
1134 	 * The baseline weight is the metaslab's free space.
1135 	 */
1136 	space = msp->ms_size - space_map_allocated(msp->ms_sm);
1137 	weight = space;
1138 
1139 	/*
1140 	 * Modern disks have uniform bit density and constant angular velocity.
1141 	 * Therefore, the outer recording zones are faster (higher bandwidth)
1142 	 * than the inner zones by the ratio of outer to inner track diameter,
1143 	 * which is typically around 2:1.  We account for this by assigning
1144 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1145 	 * In effect, this means that we'll select the metaslab with the most
1146 	 * free bandwidth rather than simply the one with the most free space.
1147 	 */
1148 	weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1149 	ASSERT(weight >= space && weight <= 2 * space);
1150 
1151 	msp->ms_factor = metaslab_weight_factor(msp);
1152 	if (metaslab_weight_factor_enable)
1153 		weight += msp->ms_factor;
1154 
1155 	if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) {
1156 		/*
1157 		 * If this metaslab is one we're actively using, adjust its
1158 		 * weight to make it preferable to any inactive metaslab so
1159 		 * we'll polish it off.
1160 		 */
1161 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1162 	}
1163 
1164 	return (weight);
1165 }
1166 
1167 static int
1168 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1169 {
1170 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1171 
1172 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1173 		metaslab_load_wait(msp);
1174 		if (!msp->ms_loaded) {
1175 			int error = metaslab_load(msp);
1176 			if (error) {
1177 				metaslab_group_sort(msp->ms_group, msp, 0);
1178 				return (error);
1179 			}
1180 		}
1181 
1182 		metaslab_group_sort(msp->ms_group, msp,
1183 		    msp->ms_weight | activation_weight);
1184 	}
1185 	ASSERT(msp->ms_loaded);
1186 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1187 
1188 	return (0);
1189 }
1190 
1191 static void
1192 metaslab_passivate(metaslab_t *msp, uint64_t size)
1193 {
1194 	/*
1195 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1196 	 * this metaslab again.  In that case, it had better be empty,
1197 	 * or we would be leaving space on the table.
1198 	 */
1199 	ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
1200 	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1201 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1202 }
1203 
1204 static void
1205 metaslab_preload(void *arg)
1206 {
1207 	metaslab_t *msp = arg;
1208 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1209 
1210 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
1211 
1212 	mutex_enter(&msp->ms_lock);
1213 	metaslab_load_wait(msp);
1214 	if (!msp->ms_loaded)
1215 		(void) metaslab_load(msp);
1216 
1217 	/*
1218 	 * Set the ms_access_txg value so that we don't unload it right away.
1219 	 */
1220 	msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
1221 	mutex_exit(&msp->ms_lock);
1222 }
1223 
1224 static void
1225 metaslab_group_preload(metaslab_group_t *mg)
1226 {
1227 	spa_t *spa = mg->mg_vd->vdev_spa;
1228 	metaslab_t *msp;
1229 	avl_tree_t *t = &mg->mg_metaslab_tree;
1230 	int m = 0;
1231 
1232 	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1233 		taskq_wait(mg->mg_taskq);
1234 		return;
1235 	}
1236 
1237 	mutex_enter(&mg->mg_lock);
1238 	/*
1239 	 * Load the next potential metaslabs
1240 	 */
1241 	msp = avl_first(t);
1242 	while (msp != NULL) {
1243 		metaslab_t *msp_next = AVL_NEXT(t, msp);
1244 
1245 		/* If we have reached our preload limit then we're done */
1246 		if (++m > metaslab_preload_limit)
1247 			break;
1248 
1249 		/*
1250 		 * We must drop the metaslab group lock here to preserve
1251 		 * lock ordering with the ms_lock (when grabbing both
1252 		 * the mg_lock and the ms_lock, the ms_lock must be taken
1253 		 * first).  As a result, it is possible that the ordering
1254 		 * of the metaslabs within the avl tree may change before
1255 		 * we reacquire the lock. The metaslab cannot be removed from
1256 		 * the tree while we're in syncing context so it is safe to
1257 		 * drop the mg_lock here. If the metaslabs are reordered
1258 		 * nothing will break -- we just may end up loading a
1259 		 * less than optimal one.
1260 		 */
1261 		mutex_exit(&mg->mg_lock);
1262 		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1263 		    msp, TQ_SLEEP) != NULL);
1264 		mutex_enter(&mg->mg_lock);
1265 		msp = msp_next;
1266 	}
1267 	mutex_exit(&mg->mg_lock);
1268 }
1269 
1270 /*
1271  * Determine if the space map's on-disk footprint is past our tolerance
1272  * for inefficiency. We would like to use the following criteria to make
1273  * our decision:
1274  *
1275  * 1. The size of the space map object should not dramatically increase as a
1276  * result of writing out the free space range tree.
1277  *
1278  * 2. The minimal on-disk space map representation is zfs_condense_pct/100
1279  * times the size than the free space range tree representation
1280  * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB).
1281  *
1282  * Checking the first condition is tricky since we don't want to walk
1283  * the entire AVL tree calculating the estimated on-disk size. Instead we
1284  * use the size-ordered range tree in the metaslab and calculate the
1285  * size required to write out the largest segment in our free tree. If the
1286  * size required to represent that segment on disk is larger than the space
1287  * map object then we avoid condensing this map.
1288  *
1289  * To determine the second criterion we use a best-case estimate and assume
1290  * each segment can be represented on-disk as a single 64-bit entry. We refer
1291  * to this best-case estimate as the space map's minimal form.
1292  */
1293 static boolean_t
1294 metaslab_should_condense(metaslab_t *msp)
1295 {
1296 	space_map_t *sm = msp->ms_sm;
1297 	range_seg_t *rs;
1298 	uint64_t size, entries, segsz;
1299 
1300 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1301 	ASSERT(msp->ms_loaded);
1302 
1303 	/*
1304 	 * Use the ms_size_tree range tree, which is ordered by size, to
1305 	 * obtain the largest segment in the free tree. If the tree is empty
1306 	 * then we should condense the map.
1307 	 */
1308 	rs = avl_last(&msp->ms_size_tree);
1309 	if (rs == NULL)
1310 		return (B_TRUE);
1311 
1312 	/*
1313 	 * Calculate the number of 64-bit entries this segment would
1314 	 * require when written to disk. If this single segment would be
1315 	 * larger on-disk than the entire current on-disk structure, then
1316 	 * clearly condensing will increase the on-disk structure size.
1317 	 */
1318 	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1319 	entries = size / (MIN(size, SM_RUN_MAX));
1320 	segsz = entries * sizeof (uint64_t);
1321 
1322 	return (segsz <= space_map_length(msp->ms_sm) &&
1323 	    space_map_length(msp->ms_sm) >= (zfs_condense_pct *
1324 	    sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
1325 }
1326 
1327 /*
1328  * Condense the on-disk space map representation to its minimized form.
1329  * The minimized form consists of a small number of allocations followed by
1330  * the entries of the free range tree.
1331  */
1332 static void
1333 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1334 {
1335 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1336 	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1337 	range_tree_t *condense_tree;
1338 	space_map_t *sm = msp->ms_sm;
1339 
1340 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1341 	ASSERT3U(spa_sync_pass(spa), ==, 1);
1342 	ASSERT(msp->ms_loaded);
1343 
1344 	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1345 	    "smp size %llu, segments %lu", txg, msp->ms_id, msp,
1346 	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root));
1347 
1348 	/*
1349 	 * Create an range tree that is 100% allocated. We remove segments
1350 	 * that have been freed in this txg, any deferred frees that exist,
1351 	 * and any allocation in the future. Removing segments should be
1352 	 * a relatively inexpensive operation since we expect these trees to
1353 	 * have a small number of nodes.
1354 	 */
1355 	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1356 	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1357 
1358 	/*
1359 	 * Remove what's been freed in this txg from the condense_tree.
1360 	 * Since we're in sync_pass 1, we know that all the frees from
1361 	 * this txg are in the freetree.
1362 	 */
1363 	range_tree_walk(freetree, range_tree_remove, condense_tree);
1364 
1365 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1366 		range_tree_walk(msp->ms_defertree[t],
1367 		    range_tree_remove, condense_tree);
1368 	}
1369 
1370 	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1371 		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1372 		    range_tree_remove, condense_tree);
1373 	}
1374 
1375 	/*
1376 	 * We're about to drop the metaslab's lock thus allowing
1377 	 * other consumers to change it's content. Set the
1378 	 * metaslab's ms_condensing flag to ensure that
1379 	 * allocations on this metaslab do not occur while we're
1380 	 * in the middle of committing it to disk. This is only critical
1381 	 * for the ms_tree as all other range trees use per txg
1382 	 * views of their content.
1383 	 */
1384 	msp->ms_condensing = B_TRUE;
1385 
1386 	mutex_exit(&msp->ms_lock);
1387 	space_map_truncate(sm, tx);
1388 	mutex_enter(&msp->ms_lock);
1389 
1390 	/*
1391 	 * While we would ideally like to create a space_map representation
1392 	 * that consists only of allocation records, doing so can be
1393 	 * prohibitively expensive because the in-core free tree can be
1394 	 * large, and therefore computationally expensive to subtract
1395 	 * from the condense_tree. Instead we sync out two trees, a cheap
1396 	 * allocation only tree followed by the in-core free tree. While not
1397 	 * optimal, this is typically close to optimal, and much cheaper to
1398 	 * compute.
1399 	 */
1400 	space_map_write(sm, condense_tree, SM_ALLOC, tx);
1401 	range_tree_vacate(condense_tree, NULL, NULL);
1402 	range_tree_destroy(condense_tree);
1403 
1404 	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
1405 	msp->ms_condensing = B_FALSE;
1406 }
1407 
1408 /*
1409  * Write a metaslab to disk in the context of the specified transaction group.
1410  */
1411 void
1412 metaslab_sync(metaslab_t *msp, uint64_t txg)
1413 {
1414 	metaslab_group_t *mg = msp->ms_group;
1415 	vdev_t *vd = mg->mg_vd;
1416 	spa_t *spa = vd->vdev_spa;
1417 	objset_t *mos = spa_meta_objset(spa);
1418 	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
1419 	range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
1420 	range_tree_t **freed_tree =
1421 	    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1422 	dmu_tx_t *tx;
1423 	uint64_t object = space_map_object(msp->ms_sm);
1424 
1425 	ASSERT(!vd->vdev_ishole);
1426 
1427 	/*
1428 	 * This metaslab has just been added so there's no work to do now.
1429 	 */
1430 	if (*freetree == NULL) {
1431 		ASSERT3P(alloctree, ==, NULL);
1432 		return;
1433 	}
1434 
1435 	ASSERT3P(alloctree, !=, NULL);
1436 	ASSERT3P(*freetree, !=, NULL);
1437 	ASSERT3P(*freed_tree, !=, NULL);
1438 
1439 	if (range_tree_space(alloctree) == 0 &&
1440 	    range_tree_space(*freetree) == 0)
1441 		return;
1442 
1443 	/*
1444 	 * The only state that can actually be changing concurrently with
1445 	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
1446 	 * be modifying this txg's alloctree, freetree, freed_tree, or
1447 	 * space_map_phys_t. Therefore, we only hold ms_lock to satify
1448 	 * space_map ASSERTs. We drop it whenever we call into the DMU,
1449 	 * because the DMU can call down to us (e.g. via zio_free()) at
1450 	 * any time.
1451 	 */
1452 
1453 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1454 
1455 	if (msp->ms_sm == NULL) {
1456 		uint64_t new_object;
1457 
1458 		new_object = space_map_alloc(mos, tx);
1459 		VERIFY3U(new_object, !=, 0);
1460 
1461 		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
1462 		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
1463 		    &msp->ms_lock));
1464 		ASSERT(msp->ms_sm != NULL);
1465 	}
1466 
1467 	mutex_enter(&msp->ms_lock);
1468 
1469 	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1470 	    metaslab_should_condense(msp)) {
1471 		metaslab_condense(msp, txg, tx);
1472 	} else {
1473 		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1474 		space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1475 	}
1476 
1477 	range_tree_vacate(alloctree, NULL, NULL);
1478 
1479 	if (msp->ms_loaded) {
1480 		/*
1481 		 * When the space map is loaded, we have an accruate
1482 		 * histogram in the range tree. This gives us an opportunity
1483 		 * to bring the space map's histogram up-to-date so we clear
1484 		 * it first before updating it.
1485 		 */
1486 		space_map_histogram_clear(msp->ms_sm);
1487 		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1488 	} else {
1489 		/*
1490 		 * Since the space map is not loaded we simply update the
1491 		 * exisiting histogram with what was freed in this txg. This
1492 		 * means that the on-disk histogram may not have an accurate
1493 		 * view of the free space but it's close enough to allow
1494 		 * us to make allocation decisions.
1495 		 */
1496 		space_map_histogram_add(msp->ms_sm, *freetree, tx);
1497 	}
1498 
1499 	/*
1500 	 * For sync pass 1, we avoid traversing this txg's free range tree
1501 	 * and instead will just swap the pointers for freetree and
1502 	 * freed_tree. We can safely do this since the freed_tree is
1503 	 * guaranteed to be empty on the initial pass.
1504 	 */
1505 	if (spa_sync_pass(spa) == 1) {
1506 		range_tree_swap(freetree, freed_tree);
1507 	} else {
1508 		range_tree_vacate(*freetree, range_tree_add, *freed_tree);
1509 	}
1510 
1511 	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1512 	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1513 
1514 	mutex_exit(&msp->ms_lock);
1515 
1516 	if (object != space_map_object(msp->ms_sm)) {
1517 		object = space_map_object(msp->ms_sm);
1518 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
1519 		    msp->ms_id, sizeof (uint64_t), &object, tx);
1520 	}
1521 	dmu_tx_commit(tx);
1522 }
1523 
1524 /*
1525  * Called after a transaction group has completely synced to mark
1526  * all of the metaslab's free space as usable.
1527  */
1528 void
1529 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1530 {
1531 	metaslab_group_t *mg = msp->ms_group;
1532 	vdev_t *vd = mg->mg_vd;
1533 	range_tree_t **freed_tree;
1534 	range_tree_t **defer_tree;
1535 	int64_t alloc_delta, defer_delta;
1536 
1537 	ASSERT(!vd->vdev_ishole);
1538 
1539 	mutex_enter(&msp->ms_lock);
1540 
1541 	/*
1542 	 * If this metaslab is just becoming available, initialize its
1543 	 * alloctrees, freetrees, and defertree and add its capacity to
1544 	 * the vdev.
1545 	 */
1546 	if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
1547 		for (int t = 0; t < TXG_SIZE; t++) {
1548 			ASSERT(msp->ms_alloctree[t] == NULL);
1549 			ASSERT(msp->ms_freetree[t] == NULL);
1550 
1551 			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
1552 			    &msp->ms_lock);
1553 			msp->ms_freetree[t] = range_tree_create(NULL, msp,
1554 			    &msp->ms_lock);
1555 		}
1556 
1557 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1558 			ASSERT(msp->ms_defertree[t] == NULL);
1559 
1560 			msp->ms_defertree[t] = range_tree_create(NULL, msp,
1561 			    &msp->ms_lock);
1562 		}
1563 
1564 		vdev_space_update(vd, 0, 0, msp->ms_size);
1565 	}
1566 
1567 	freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1568 	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
1569 
1570 	alloc_delta = space_map_alloc_delta(msp->ms_sm);
1571 	defer_delta = range_tree_space(*freed_tree) -
1572 	    range_tree_space(*defer_tree);
1573 
1574 	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1575 
1576 	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1577 	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1578 
1579 	/*
1580 	 * If there's a metaslab_load() in progress, wait for it to complete
1581 	 * so that we have a consistent view of the in-core space map.
1582 	 */
1583 	metaslab_load_wait(msp);
1584 
1585 	/*
1586 	 * Move the frees from the defer_tree back to the free
1587 	 * range tree (if it's loaded). Swap the freed_tree and the
1588 	 * defer_tree -- this is safe to do because we've just emptied out
1589 	 * the defer_tree.
1590 	 */
1591 	range_tree_vacate(*defer_tree,
1592 	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
1593 	range_tree_swap(freed_tree, defer_tree);
1594 
1595 	space_map_update(msp->ms_sm);
1596 
1597 	msp->ms_deferspace += defer_delta;
1598 	ASSERT3S(msp->ms_deferspace, >=, 0);
1599 	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
1600 	if (msp->ms_deferspace != 0) {
1601 		/*
1602 		 * Keep syncing this metaslab until all deferred frees
1603 		 * are back in circulation.
1604 		 */
1605 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1606 	}
1607 
1608 	if (msp->ms_loaded && msp->ms_access_txg < txg) {
1609 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1610 			VERIFY0(range_tree_space(
1611 			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
1612 		}
1613 
1614 		if (!metaslab_debug_unload)
1615 			metaslab_unload(msp);
1616 	}
1617 
1618 	metaslab_group_sort(mg, msp, metaslab_weight(msp));
1619 	mutex_exit(&msp->ms_lock);
1620 
1621 }
1622 
1623 void
1624 metaslab_sync_reassess(metaslab_group_t *mg)
1625 {
1626 	metaslab_group_alloc_update(mg);
1627 
1628 	/*
1629 	 * Preload the next potential metaslabs
1630 	 */
1631 	metaslab_group_preload(mg);
1632 }
1633 
1634 static uint64_t
1635 metaslab_distance(metaslab_t *msp, dva_t *dva)
1636 {
1637 	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1638 	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1639 	uint64_t start = msp->ms_id;
1640 
1641 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1642 		return (1ULL << 63);
1643 
1644 	if (offset < start)
1645 		return ((start - offset) << ms_shift);
1646 	if (offset > start)
1647 		return ((offset - start) << ms_shift);
1648 	return (0);
1649 }
1650 
1651 static uint64_t
1652 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1653     uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
1654 {
1655 	spa_t *spa = mg->mg_vd->vdev_spa;
1656 	metaslab_t *msp = NULL;
1657 	uint64_t offset = -1ULL;
1658 	avl_tree_t *t = &mg->mg_metaslab_tree;
1659 	uint64_t activation_weight;
1660 	uint64_t target_distance;
1661 	int i;
1662 
1663 	activation_weight = METASLAB_WEIGHT_PRIMARY;
1664 	for (i = 0; i < d; i++) {
1665 		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1666 			activation_weight = METASLAB_WEIGHT_SECONDARY;
1667 			break;
1668 		}
1669 	}
1670 
1671 	for (;;) {
1672 		boolean_t was_active;
1673 
1674 		mutex_enter(&mg->mg_lock);
1675 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1676 			if (msp->ms_weight < asize) {
1677 				spa_dbgmsg(spa, "%s: failed to meet weight "
1678 				    "requirement: vdev %llu, txg %llu, mg %p, "
1679 				    "msp %p, psize %llu, asize %llu, "
1680 				    "weight %llu", spa_name(spa),
1681 				    mg->mg_vd->vdev_id, txg,
1682 				    mg, msp, psize, asize, msp->ms_weight);
1683 				mutex_exit(&mg->mg_lock);
1684 				return (-1ULL);
1685 			}
1686 
1687 			/*
1688 			 * If the selected metaslab is condensing, skip it.
1689 			 */
1690 			if (msp->ms_condensing)
1691 				continue;
1692 
1693 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1694 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1695 				break;
1696 
1697 			target_distance = min_distance +
1698 			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
1699 			    min_distance >> 1);
1700 
1701 			for (i = 0; i < d; i++)
1702 				if (metaslab_distance(msp, &dva[i]) <
1703 				    target_distance)
1704 					break;
1705 			if (i == d)
1706 				break;
1707 		}
1708 		mutex_exit(&mg->mg_lock);
1709 		if (msp == NULL)
1710 			return (-1ULL);
1711 
1712 		mutex_enter(&msp->ms_lock);
1713 
1714 		/*
1715 		 * Ensure that the metaslab we have selected is still
1716 		 * capable of handling our request. It's possible that
1717 		 * another thread may have changed the weight while we
1718 		 * were blocked on the metaslab lock.
1719 		 */
1720 		if (msp->ms_weight < asize || (was_active &&
1721 		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1722 		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1723 			mutex_exit(&msp->ms_lock);
1724 			continue;
1725 		}
1726 
1727 		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1728 		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
1729 			metaslab_passivate(msp,
1730 			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1731 			mutex_exit(&msp->ms_lock);
1732 			continue;
1733 		}
1734 
1735 		if (metaslab_activate(msp, activation_weight) != 0) {
1736 			mutex_exit(&msp->ms_lock);
1737 			continue;
1738 		}
1739 
1740 		/*
1741 		 * If this metaslab is currently condensing then pick again as
1742 		 * we can't manipulate this metaslab until it's committed
1743 		 * to disk.
1744 		 */
1745 		if (msp->ms_condensing) {
1746 			mutex_exit(&msp->ms_lock);
1747 			continue;
1748 		}
1749 
1750 		if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
1751 			break;
1752 
1753 		metaslab_passivate(msp, metaslab_block_maxsize(msp));
1754 		mutex_exit(&msp->ms_lock);
1755 	}
1756 
1757 	if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
1758 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1759 
1760 	range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
1761 	msp->ms_access_txg = txg + metaslab_unload_delay;
1762 
1763 	mutex_exit(&msp->ms_lock);
1764 
1765 	return (offset);
1766 }
1767 
1768 /*
1769  * Allocate a block for the specified i/o.
1770  */
1771 static int
1772 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1773     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1774 {
1775 	metaslab_group_t *mg, *rotor;
1776 	vdev_t *vd;
1777 	int dshift = 3;
1778 	int all_zero;
1779 	int zio_lock = B_FALSE;
1780 	boolean_t allocatable;
1781 	uint64_t offset = -1ULL;
1782 	uint64_t asize;
1783 	uint64_t distance;
1784 
1785 	ASSERT(!DVA_IS_VALID(&dva[d]));
1786 
1787 	/*
1788 	 * For testing, make some blocks above a certain size be gang blocks.
1789 	 */
1790 	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1791 		return (SET_ERROR(ENOSPC));
1792 
1793 	/*
1794 	 * Start at the rotor and loop through all mgs until we find something.
1795 	 * Note that there's no locking on mc_rotor or mc_aliquot because
1796 	 * nothing actually breaks if we miss a few updates -- we just won't
1797 	 * allocate quite as evenly.  It all balances out over time.
1798 	 *
1799 	 * If we are doing ditto or log blocks, try to spread them across
1800 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
1801 	 * allocated all of our ditto blocks, then try and spread them out on
1802 	 * that vdev as much as possible.  If it turns out to not be possible,
1803 	 * gradually lower our standards until anything becomes acceptable.
1804 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1805 	 * gives us hope of containing our fault domains to something we're
1806 	 * able to reason about.  Otherwise, any two top-level vdev failures
1807 	 * will guarantee the loss of data.  With consecutive allocation,
1808 	 * only two adjacent top-level vdev failures will result in data loss.
1809 	 *
1810 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
1811 	 * ourselves on the same vdev as our gang block header.  That
1812 	 * way, we can hope for locality in vdev_cache, plus it makes our
1813 	 * fault domains something tractable.
1814 	 */
1815 	if (hintdva) {
1816 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1817 
1818 		/*
1819 		 * It's possible the vdev we're using as the hint no
1820 		 * longer exists (i.e. removed). Consult the rotor when
1821 		 * all else fails.
1822 		 */
1823 		if (vd != NULL) {
1824 			mg = vd->vdev_mg;
1825 
1826 			if (flags & METASLAB_HINTBP_AVOID &&
1827 			    mg->mg_next != NULL)
1828 				mg = mg->mg_next;
1829 		} else {
1830 			mg = mc->mc_rotor;
1831 		}
1832 	} else if (d != 0) {
1833 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1834 		mg = vd->vdev_mg->mg_next;
1835 	} else {
1836 		mg = mc->mc_rotor;
1837 	}
1838 
1839 	/*
1840 	 * If the hint put us into the wrong metaslab class, or into a
1841 	 * metaslab group that has been passivated, just follow the rotor.
1842 	 */
1843 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1844 		mg = mc->mc_rotor;
1845 
1846 	rotor = mg;
1847 top:
1848 	all_zero = B_TRUE;
1849 	do {
1850 		ASSERT(mg->mg_activation_count == 1);
1851 
1852 		vd = mg->mg_vd;
1853 
1854 		/*
1855 		 * Don't allocate from faulted devices.
1856 		 */
1857 		if (zio_lock) {
1858 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1859 			allocatable = vdev_allocatable(vd);
1860 			spa_config_exit(spa, SCL_ZIO, FTAG);
1861 		} else {
1862 			allocatable = vdev_allocatable(vd);
1863 		}
1864 
1865 		/*
1866 		 * Determine if the selected metaslab group is eligible
1867 		 * for allocations. If we're ganging or have requested
1868 		 * an allocation for the smallest gang block size
1869 		 * then we don't want to avoid allocating to the this
1870 		 * metaslab group. If we're in this condition we should
1871 		 * try to allocate from any device possible so that we
1872 		 * don't inadvertently return ENOSPC and suspend the pool
1873 		 * even though space is still available.
1874 		 */
1875 		if (allocatable && CAN_FASTGANG(flags) &&
1876 		    psize > SPA_GANGBLOCKSIZE)
1877 			allocatable = metaslab_group_allocatable(mg);
1878 
1879 		if (!allocatable)
1880 			goto next;
1881 
1882 		/*
1883 		 * Avoid writing single-copy data to a failing vdev
1884 		 * unless the user instructs us that it is okay.
1885 		 */
1886 		if ((vd->vdev_stat.vs_write_errors > 0 ||
1887 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
1888 		    d == 0 && dshift == 3 &&
1889 		    !(zfs_write_to_degraded && vd->vdev_state ==
1890 		    VDEV_STATE_DEGRADED)) {
1891 			all_zero = B_FALSE;
1892 			goto next;
1893 		}
1894 
1895 		ASSERT(mg->mg_class == mc);
1896 
1897 		distance = vd->vdev_asize >> dshift;
1898 		if (distance <= (1ULL << vd->vdev_ms_shift))
1899 			distance = 0;
1900 		else
1901 			all_zero = B_FALSE;
1902 
1903 		asize = vdev_psize_to_asize(vd, psize);
1904 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1905 
1906 		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
1907 		    dva, d);
1908 		if (offset != -1ULL) {
1909 			/*
1910 			 * If we've just selected this metaslab group,
1911 			 * figure out whether the corresponding vdev is
1912 			 * over- or under-used relative to the pool,
1913 			 * and set an allocation bias to even it out.
1914 			 */
1915 			if (mc->mc_aliquot == 0) {
1916 				vdev_stat_t *vs = &vd->vdev_stat;
1917 				int64_t vu, cu;
1918 
1919 				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
1920 				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
1921 
1922 				/*
1923 				 * Calculate how much more or less we should
1924 				 * try to allocate from this device during
1925 				 * this iteration around the rotor.
1926 				 * For example, if a device is 80% full
1927 				 * and the pool is 20% full then we should
1928 				 * reduce allocations by 60% on this device.
1929 				 *
1930 				 * mg_bias = (20 - 80) * 512K / 100 = -307K
1931 				 *
1932 				 * This reduces allocations by 307K for this
1933 				 * iteration.
1934 				 */
1935 				mg->mg_bias = ((cu - vu) *
1936 				    (int64_t)mg->mg_aliquot) / 100;
1937 			}
1938 
1939 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1940 			    mg->mg_aliquot + mg->mg_bias) {
1941 				mc->mc_rotor = mg->mg_next;
1942 				mc->mc_aliquot = 0;
1943 			}
1944 
1945 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
1946 			DVA_SET_OFFSET(&dva[d], offset);
1947 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
1948 			DVA_SET_ASIZE(&dva[d], asize);
1949 
1950 			return (0);
1951 		}
1952 next:
1953 		mc->mc_rotor = mg->mg_next;
1954 		mc->mc_aliquot = 0;
1955 	} while ((mg = mg->mg_next) != rotor);
1956 
1957 	if (!all_zero) {
1958 		dshift++;
1959 		ASSERT(dshift < 64);
1960 		goto top;
1961 	}
1962 
1963 	if (!allocatable && !zio_lock) {
1964 		dshift = 3;
1965 		zio_lock = B_TRUE;
1966 		goto top;
1967 	}
1968 
1969 	bzero(&dva[d], sizeof (dva_t));
1970 
1971 	return (SET_ERROR(ENOSPC));
1972 }
1973 
1974 /*
1975  * Free the block represented by DVA in the context of the specified
1976  * transaction group.
1977  */
1978 static void
1979 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
1980 {
1981 	uint64_t vdev = DVA_GET_VDEV(dva);
1982 	uint64_t offset = DVA_GET_OFFSET(dva);
1983 	uint64_t size = DVA_GET_ASIZE(dva);
1984 	vdev_t *vd;
1985 	metaslab_t *msp;
1986 
1987 	ASSERT(DVA_IS_VALID(dva));
1988 
1989 	if (txg > spa_freeze_txg(spa))
1990 		return;
1991 
1992 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1993 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
1994 		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
1995 		    (u_longlong_t)vdev, (u_longlong_t)offset);
1996 		ASSERT(0);
1997 		return;
1998 	}
1999 
2000 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2001 
2002 	if (DVA_GET_GANG(dva))
2003 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2004 
2005 	mutex_enter(&msp->ms_lock);
2006 
2007 	if (now) {
2008 		range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
2009 		    offset, size);
2010 
2011 		VERIFY(!msp->ms_condensing);
2012 		VERIFY3U(offset, >=, msp->ms_start);
2013 		VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
2014 		VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
2015 		    msp->ms_size);
2016 		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2017 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2018 		range_tree_add(msp->ms_tree, offset, size);
2019 	} else {
2020 		if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
2021 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2022 		range_tree_add(msp->ms_freetree[txg & TXG_MASK],
2023 		    offset, size);
2024 	}
2025 
2026 	mutex_exit(&msp->ms_lock);
2027 }
2028 
2029 /*
2030  * Intent log support: upon opening the pool after a crash, notify the SPA
2031  * of blocks that the intent log has allocated for immediate write, but
2032  * which are still considered free by the SPA because the last transaction
2033  * group didn't commit yet.
2034  */
2035 static int
2036 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
2037 {
2038 	uint64_t vdev = DVA_GET_VDEV(dva);
2039 	uint64_t offset = DVA_GET_OFFSET(dva);
2040 	uint64_t size = DVA_GET_ASIZE(dva);
2041 	vdev_t *vd;
2042 	metaslab_t *msp;
2043 	int error = 0;
2044 
2045 	ASSERT(DVA_IS_VALID(dva));
2046 
2047 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2048 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
2049 		return (SET_ERROR(ENXIO));
2050 
2051 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2052 
2053 	if (DVA_GET_GANG(dva))
2054 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2055 
2056 	mutex_enter(&msp->ms_lock);
2057 
2058 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
2059 		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
2060 
2061 	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
2062 		error = SET_ERROR(ENOENT);
2063 
2064 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
2065 		mutex_exit(&msp->ms_lock);
2066 		return (error);
2067 	}
2068 
2069 	VERIFY(!msp->ms_condensing);
2070 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2071 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2072 	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
2073 	range_tree_remove(msp->ms_tree, offset, size);
2074 
2075 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
2076 		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2077 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2078 		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2079 	}
2080 
2081 	mutex_exit(&msp->ms_lock);
2082 
2083 	return (0);
2084 }
2085 
2086 int
2087 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2088     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
2089 {
2090 	dva_t *dva = bp->blk_dva;
2091 	dva_t *hintdva = hintbp->blk_dva;
2092 	int error = 0;
2093 
2094 	ASSERT(bp->blk_birth == 0);
2095 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2096 
2097 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2098 
2099 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
2100 		spa_config_exit(spa, SCL_ALLOC, FTAG);
2101 		return (SET_ERROR(ENOSPC));
2102 	}
2103 
2104 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
2105 	ASSERT(BP_GET_NDVAS(bp) == 0);
2106 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2107 
2108 	for (int d = 0; d < ndvas; d++) {
2109 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2110 		    txg, flags);
2111 		if (error != 0) {
2112 			for (d--; d >= 0; d--) {
2113 				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2114 				bzero(&dva[d], sizeof (dva_t));
2115 			}
2116 			spa_config_exit(spa, SCL_ALLOC, FTAG);
2117 			return (error);
2118 		}
2119 	}
2120 	ASSERT(error == 0);
2121 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
2122 
2123 	spa_config_exit(spa, SCL_ALLOC, FTAG);
2124 
2125 	BP_SET_BIRTH(bp, txg, txg);
2126 
2127 	return (0);
2128 }
2129 
2130 void
2131 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
2132 {
2133 	const dva_t *dva = bp->blk_dva;
2134 	int ndvas = BP_GET_NDVAS(bp);
2135 
2136 	ASSERT(!BP_IS_HOLE(bp));
2137 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
2138 
2139 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
2140 
2141 	for (int d = 0; d < ndvas; d++)
2142 		metaslab_free_dva(spa, &dva[d], txg, now);
2143 
2144 	spa_config_exit(spa, SCL_FREE, FTAG);
2145 }
2146 
2147 int
2148 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
2149 {
2150 	const dva_t *dva = bp->blk_dva;
2151 	int ndvas = BP_GET_NDVAS(bp);
2152 	int error = 0;
2153 
2154 	ASSERT(!BP_IS_HOLE(bp));
2155 
2156 	if (txg != 0) {
2157 		/*
2158 		 * First do a dry run to make sure all DVAs are claimable,
2159 		 * so we don't have to unwind from partial failures below.
2160 		 */
2161 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
2162 			return (error);
2163 	}
2164 
2165 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2166 
2167 	for (int d = 0; d < ndvas; d++)
2168 		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
2169 			break;
2170 
2171 	spa_config_exit(spa, SCL_ALLOC, FTAG);
2172 
2173 	ASSERT(error == 0 || txg == 0);
2174 
2175 	return (error);
2176 }
2177 
2178 void
2179 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
2180 {
2181 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
2182 		return;
2183 
2184 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2185 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
2186 		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
2187 		vdev_t *vd = vdev_lookup_top(spa, vdev);
2188 		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
2189 		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2190 		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2191 
2192 		if (msp->ms_loaded)
2193 			range_tree_verify(msp->ms_tree, offset, size);
2194 
2195 		for (int j = 0; j < TXG_SIZE; j++)
2196 			range_tree_verify(msp->ms_freetree[j], offset, size);
2197 		for (int j = 0; j < TXG_DEFER_SIZE; j++)
2198 			range_tree_verify(msp->ms_defertree[j], offset, size);
2199 	}
2200 	spa_config_exit(spa, SCL_VDEV, FTAG);
2201 }
2202