xref: /titanic_50/usr/src/uts/common/fs/zfs/metaslab.c (revision a7fe1d5bb55904d4c79638b8778bc9dd8ed7fd7b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2013 by Delphix. All rights reserved.
24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/dmu.h>
29 #include <sys/dmu_tx.h>
30 #include <sys/space_map.h>
31 #include <sys/metaslab_impl.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/zio.h>
34 
/*
 * Allow allocations to switch to gang blocks quickly. We do this to
 * avoid having to load lots of space_maps in a given txg. There are,
 * however, some cases where we want to avoid "fast" ganging and instead
 * we want to do an exhaustive search of all metaslabs on this device.
 * Currently we don't allow any gang, zil, or dump device related allocations
 * to "fast" gang.
 *
 * CAN_FASTGANG(flags) evaluates to true only when none of
 * METASLAB_GANG_CHILD, METASLAB_GANG_HEADER or METASLAB_GANG_AVOID is set
 * in the supplied allocation flags.
 */
#define	CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))
46 
/*
 * Base allocation granule in bytes (512K by default).  Each metaslab
 * group's mg_aliquot is derived from this value, scaled by the number of
 * children of the group's vdev, in metaslab_group_activate().
 */
uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space_map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
 */
int zfs_condense_pct = 200;

/*
 * This value defines the number of allowed allocation failures per vdev.
 * If a device reaches this threshold in a given txg then we consider skipping
 * allocations on that device. The value of zfs_mg_alloc_failures is computed
 * in zio_init() unless it has been overridden in /etc/system.
 */
int zfs_mg_alloc_failures = 0;

/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;

/*
 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
 */
static int metaslab_debug = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy.  Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Max number of space_maps to prefetch.
 */
int metaslab_prefetch_limit = SPA_DVAS_PER_BP;

/*
 * Percentage bonus multiplier for metaslabs that are in the bonus area.
 */
int metaslab_smo_bonus_pct = 150;

/*
 * Should we be willing to write data to degraded vdevs?
 */
boolean_t zfs_write_to_degraded = B_FALSE;
122 
123 /*
124  * ==========================================================================
125  * Metaslab classes
126  * ==========================================================================
127  */
128 metaslab_class_t *
129 metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
130 {
131 	metaslab_class_t *mc;
132 
133 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
134 
135 	mc->mc_spa = spa;
136 	mc->mc_rotor = NULL;
137 	mc->mc_ops = ops;
138 
139 	return (mc);
140 }
141 
142 void
143 metaslab_class_destroy(metaslab_class_t *mc)
144 {
145 	ASSERT(mc->mc_rotor == NULL);
146 	ASSERT(mc->mc_alloc == 0);
147 	ASSERT(mc->mc_deferred == 0);
148 	ASSERT(mc->mc_space == 0);
149 	ASSERT(mc->mc_dspace == 0);
150 
151 	kmem_free(mc, sizeof (metaslab_class_t));
152 }
153 
154 int
155 metaslab_class_validate(metaslab_class_t *mc)
156 {
157 	metaslab_group_t *mg;
158 	vdev_t *vd;
159 
160 	/*
161 	 * Must hold one of the spa_config locks.
162 	 */
163 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
164 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
165 
166 	if ((mg = mc->mc_rotor) == NULL)
167 		return (0);
168 
169 	do {
170 		vd = mg->mg_vd;
171 		ASSERT(vd->vdev_mg != NULL);
172 		ASSERT3P(vd->vdev_top, ==, vd);
173 		ASSERT3P(mg->mg_class, ==, mc);
174 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
175 	} while ((mg = mg->mg_next) != mc->mc_rotor);
176 
177 	return (0);
178 }
179 
/*
 * Apply signed deltas to the class-wide space counters (allocated,
 * deferred, total space and deflated space).  Each counter is adjusted
 * with its own atomic add, so the four updates are not applied as a
 * single atomic unit.
 */
void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}
189 
190 uint64_t
191 metaslab_class_get_alloc(metaslab_class_t *mc)
192 {
193 	return (mc->mc_alloc);
194 }
195 
196 uint64_t
197 metaslab_class_get_deferred(metaslab_class_t *mc)
198 {
199 	return (mc->mc_deferred);
200 }
201 
202 uint64_t
203 metaslab_class_get_space(metaslab_class_t *mc)
204 {
205 	return (mc->mc_space);
206 }
207 
208 uint64_t
209 metaslab_class_get_dspace(metaslab_class_t *mc)
210 {
211 	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
212 }
213 
214 /*
215  * ==========================================================================
216  * Metaslab groups
217  * ==========================================================================
218  */
219 static int
220 metaslab_compare(const void *x1, const void *x2)
221 {
222 	const metaslab_t *m1 = x1;
223 	const metaslab_t *m2 = x2;
224 
225 	if (m1->ms_weight < m2->ms_weight)
226 		return (1);
227 	if (m1->ms_weight > m2->ms_weight)
228 		return (-1);
229 
230 	/*
231 	 * If the weights are identical, use the offset to force uniqueness.
232 	 */
233 	if (m1->ms_map->sm_start < m2->ms_map->sm_start)
234 		return (-1);
235 	if (m1->ms_map->sm_start > m2->ms_map->sm_start)
236 		return (1);
237 
238 	ASSERT3P(m1, ==, m2);
239 
240 	return (0);
241 }
242 
243 /*
244  * Update the allocatable flag and the metaslab group's capacity.
245  * The allocatable flag is set to true if the capacity is below
246  * the zfs_mg_noalloc_threshold. If a metaslab group transitions
247  * from allocatable to non-allocatable or vice versa then the metaslab
248  * group's class is updated to reflect the transition.
249  */
250 static void
251 metaslab_group_alloc_update(metaslab_group_t *mg)
252 {
253 	vdev_t *vd = mg->mg_vd;
254 	metaslab_class_t *mc = mg->mg_class;
255 	vdev_stat_t *vs = &vd->vdev_stat;
256 	boolean_t was_allocatable;
257 
258 	ASSERT(vd == vd->vdev_top);
259 
260 	mutex_enter(&mg->mg_lock);
261 	was_allocatable = mg->mg_allocatable;
262 
263 	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
264 	    (vs->vs_space + 1);
265 
266 	mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
267 
268 	/*
269 	 * The mc_alloc_groups maintains a count of the number of
270 	 * groups in this metaslab class that are still above the
271 	 * zfs_mg_noalloc_threshold. This is used by the allocating
272 	 * threads to determine if they should avoid allocations to
273 	 * a given group. The allocator will avoid allocations to a group
274 	 * if that group has reached or is below the zfs_mg_noalloc_threshold
275 	 * and there are still other groups that are above the threshold.
276 	 * When a group transitions from allocatable to non-allocatable or
277 	 * vice versa we update the metaslab class to reflect that change.
278 	 * When the mc_alloc_groups value drops to 0 that means that all
279 	 * groups have reached the zfs_mg_noalloc_threshold making all groups
280 	 * eligible for allocations. This effectively means that all devices
281 	 * are balanced again.
282 	 */
283 	if (was_allocatable && !mg->mg_allocatable)
284 		mc->mc_alloc_groups--;
285 	else if (!was_allocatable && mg->mg_allocatable)
286 		mc->mc_alloc_groups++;
287 	mutex_exit(&mg->mg_lock);
288 }
289 
290 metaslab_group_t *
291 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
292 {
293 	metaslab_group_t *mg;
294 
295 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
296 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
297 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
298 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
299 	mg->mg_vd = vd;
300 	mg->mg_class = mc;
301 	mg->mg_activation_count = 0;
302 
303 	return (mg);
304 }
305 
306 void
307 metaslab_group_destroy(metaslab_group_t *mg)
308 {
309 	ASSERT(mg->mg_prev == NULL);
310 	ASSERT(mg->mg_next == NULL);
311 	/*
312 	 * We may have gone below zero with the activation count
313 	 * either because we never activated in the first place or
314 	 * because we're done, and possibly removing the vdev.
315 	 */
316 	ASSERT(mg->mg_activation_count <= 0);
317 
318 	avl_destroy(&mg->mg_metaslab_tree);
319 	mutex_destroy(&mg->mg_lock);
320 	kmem_free(mg, sizeof (metaslab_group_t));
321 }
322 
323 void
324 metaslab_group_activate(metaslab_group_t *mg)
325 {
326 	metaslab_class_t *mc = mg->mg_class;
327 	metaslab_group_t *mgprev, *mgnext;
328 
329 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
330 
331 	ASSERT(mc->mc_rotor != mg);
332 	ASSERT(mg->mg_prev == NULL);
333 	ASSERT(mg->mg_next == NULL);
334 	ASSERT(mg->mg_activation_count <= 0);
335 
336 	if (++mg->mg_activation_count <= 0)
337 		return;
338 
339 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
340 	metaslab_group_alloc_update(mg);
341 
342 	if ((mgprev = mc->mc_rotor) == NULL) {
343 		mg->mg_prev = mg;
344 		mg->mg_next = mg;
345 	} else {
346 		mgnext = mgprev->mg_next;
347 		mg->mg_prev = mgprev;
348 		mg->mg_next = mgnext;
349 		mgprev->mg_next = mg;
350 		mgnext->mg_prev = mg;
351 	}
352 	mc->mc_rotor = mg;
353 }
354 
355 void
356 metaslab_group_passivate(metaslab_group_t *mg)
357 {
358 	metaslab_class_t *mc = mg->mg_class;
359 	metaslab_group_t *mgprev, *mgnext;
360 
361 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
362 
363 	if (--mg->mg_activation_count != 0) {
364 		ASSERT(mc->mc_rotor != mg);
365 		ASSERT(mg->mg_prev == NULL);
366 		ASSERT(mg->mg_next == NULL);
367 		ASSERT(mg->mg_activation_count < 0);
368 		return;
369 	}
370 
371 	mgprev = mg->mg_prev;
372 	mgnext = mg->mg_next;
373 
374 	if (mg == mgnext) {
375 		mc->mc_rotor = NULL;
376 	} else {
377 		mc->mc_rotor = mgnext;
378 		mgprev->mg_next = mgnext;
379 		mgnext->mg_prev = mgprev;
380 	}
381 
382 	mg->mg_prev = NULL;
383 	mg->mg_next = NULL;
384 }
385 
/*
 * Attach a metaslab to a group: record the owning group, reset the
 * metaslab's weight to 0 and insert it into the group's weight-sorted
 * AVL tree, all under mg_lock.  The metaslab must not already belong to
 * a group.
 */
static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == NULL);
	msp->ms_group = mg;
	/* The weight is a sort key, so it must be set before insertion. */
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}
396 
/*
 * Detach a metaslab from its group: remove it from the group's AVL tree
 * and clear its back pointer, all under mg_lock.  The metaslab must
 * currently belong to the given group.
 */
static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}
406 
/*
 * Change a metaslab's weight and reposition it in its group's AVL tree.
 * The caller must hold the metaslab's ms_lock; mg_lock is taken here.
 */
static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 510].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	/*
	 * The weight is part of the tree's sort key (see
	 * metaslab_compare()), so the node has to be taken out of the
	 * tree before the weight changes and re-inserted afterwards.
	 */
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}
424 
425 /*
426  * Determine if a given metaslab group should skip allocations. A metaslab
427  * group should avoid allocations if its used capacity has crossed the
428  * zfs_mg_noalloc_threshold and there is at least one metaslab group
429  * that can still handle allocations.
430  */
431 static boolean_t
432 metaslab_group_allocatable(metaslab_group_t *mg)
433 {
434 	vdev_t *vd = mg->mg_vd;
435 	spa_t *spa = vd->vdev_spa;
436 	metaslab_class_t *mc = mg->mg_class;
437 
438 	/*
439 	 * A metaslab group is considered allocatable if its free capacity
440 	 * is greater than the set value of zfs_mg_noalloc_threshold, it's
441 	 * associated with a slog, or there are no other metaslab groups
442 	 * with free capacity greater than zfs_mg_noalloc_threshold.
443 	 */
444 	return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
445 	    mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
446 }
447 
448 /*
449  * ==========================================================================
450  * Common allocator routines
451  * ==========================================================================
452  */
453 static int
454 metaslab_segsize_compare(const void *x1, const void *x2)
455 {
456 	const space_seg_t *s1 = x1;
457 	const space_seg_t *s2 = x2;
458 	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
459 	uint64_t ss_size2 = s2->ss_end - s2->ss_start;
460 
461 	if (ss_size1 < ss_size2)
462 		return (-1);
463 	if (ss_size1 > ss_size2)
464 		return (1);
465 
466 	if (s1->ss_start < s2->ss_start)
467 		return (-1);
468 	if (s1->ss_start > s2->ss_start)
469 		return (1);
470 
471 	return (0);
472 }
473 
474 /*
475  * This is a helper function that can be used by the allocator to find
476  * a suitable block to allocate. This will search the specified AVL
477  * tree looking for a block that matches the specified criteria.
478  */
479 static uint64_t
480 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
481     uint64_t align)
482 {
483 	space_seg_t *ss, ssearch;
484 	avl_index_t where;
485 
486 	ssearch.ss_start = *cursor;
487 	ssearch.ss_end = *cursor + size;
488 
489 	ss = avl_find(t, &ssearch, &where);
490 	if (ss == NULL)
491 		ss = avl_nearest(t, where, AVL_AFTER);
492 
493 	while (ss != NULL) {
494 		uint64_t offset = P2ROUNDUP(ss->ss_start, align);
495 
496 		if (offset + size <= ss->ss_end) {
497 			*cursor = offset + size;
498 			return (offset);
499 		}
500 		ss = AVL_NEXT(t, ss);
501 	}
502 
503 	/*
504 	 * If we know we've searched the whole map (*cursor == 0), give up.
505 	 * Otherwise, reset the cursor to the beginning and try again.
506 	 */
507 	if (*cursor == 0)
508 		return (-1ULL);
509 
510 	*cursor = 0;
511 	return (metaslab_block_picker(t, cursor, size, align));
512 }
513 
514 static void
515 metaslab_pp_load(space_map_t *sm)
516 {
517 	space_seg_t *ss;
518 
519 	ASSERT(sm->sm_ppd == NULL);
520 	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
521 
522 	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
523 	avl_create(sm->sm_pp_root, metaslab_segsize_compare,
524 	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
525 
526 	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
527 		avl_add(sm->sm_pp_root, ss);
528 }
529 
530 static void
531 metaslab_pp_unload(space_map_t *sm)
532 {
533 	void *cookie = NULL;
534 
535 	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
536 	sm->sm_ppd = NULL;
537 
538 	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
539 		/* tear down the tree */
540 	}
541 
542 	avl_destroy(sm->sm_pp_root);
543 	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
544 	sm->sm_pp_root = NULL;
545 }
546 
/*
 * Claim hook shared by the allocators in this file.  It is intentionally
 * a no-op: none of the arguments are used.
 */
/* ARGSUSED */
static void
metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}
553 
/*
 * Free hook shared by the allocators in this file.  It is intentionally
 * a no-op: none of the arguments are used.
 */
/* ARGSUSED */
static void
metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}
560 
561 /*
562  * Return the maximum contiguous segment within the metaslab.
563  */
564 uint64_t
565 metaslab_pp_maxsize(space_map_t *sm)
566 {
567 	avl_tree_t *t = sm->sm_pp_root;
568 	space_seg_t *ss;
569 
570 	if (t == NULL || (ss = avl_last(t)) == NULL)
571 		return (0ULL);
572 
573 	return (ss->ss_end - ss->ss_start);
574 }
575 
576 /*
577  * ==========================================================================
578  * The first-fit block allocator
579  * ==========================================================================
580  */
581 static uint64_t
582 metaslab_ff_alloc(space_map_t *sm, uint64_t size)
583 {
584 	avl_tree_t *t = &sm->sm_root;
585 	uint64_t align = size & -size;
586 	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
587 
588 	return (metaslab_block_picker(t, cursor, size, align));
589 }
590 
/*
 * Fragmentation hook for the first-fit allocator.  It unconditionally
 * reports the space map as fragmented; the argument is not examined.
 */
/* ARGSUSED */
boolean_t
metaslab_ff_fragmented(space_map_t *sm)
{
	return (B_TRUE);
}
597 
/*
 * Ops vector for the first-fit allocator.  Entries are positional; the
 * per-slot roles noted below are inferred from the function names and
 * should be confirmed against the space_map_ops_t declaration.
 */
static space_map_ops_t metaslab_ff_ops = {
	metaslab_pp_load,		/* presumably: load */
	metaslab_pp_unload,		/* presumably: unload */
	metaslab_ff_alloc,		/* presumably: alloc */
	metaslab_pp_claim,		/* presumably: claim */
	metaslab_pp_free,		/* presumably: free */
	metaslab_pp_maxsize,		/* presumably: max segment size */
	metaslab_ff_fragmented		/* presumably: fragmented */
};
607 
608 /*
609  * ==========================================================================
610  * Dynamic block allocator -
611  * Uses the first fit allocation scheme until space get low and then
612  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
613  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
614  * ==========================================================================
615  */
616 static uint64_t
617 metaslab_df_alloc(space_map_t *sm, uint64_t size)
618 {
619 	avl_tree_t *t = &sm->sm_root;
620 	uint64_t align = size & -size;
621 	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
622 	uint64_t max_size = metaslab_pp_maxsize(sm);
623 	int free_pct = sm->sm_space * 100 / sm->sm_size;
624 
625 	ASSERT(MUTEX_HELD(sm->sm_lock));
626 	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
627 
628 	if (max_size < size)
629 		return (-1ULL);
630 
631 	/*
632 	 * If we're running low on space switch to using the size
633 	 * sorted AVL tree (best-fit).
634 	 */
635 	if (max_size < metaslab_df_alloc_threshold ||
636 	    free_pct < metaslab_df_free_pct) {
637 		t = sm->sm_pp_root;
638 		*cursor = 0;
639 	}
640 
641 	return (metaslab_block_picker(t, cursor, size, 1ULL));
642 }
643 
644 static boolean_t
645 metaslab_df_fragmented(space_map_t *sm)
646 {
647 	uint64_t max_size = metaslab_pp_maxsize(sm);
648 	int free_pct = sm->sm_space * 100 / sm->sm_size;
649 
650 	if (max_size >= metaslab_df_alloc_threshold &&
651 	    free_pct >= metaslab_df_free_pct)
652 		return (B_FALSE);
653 
654 	return (B_TRUE);
655 }
656 
/*
 * Ops vector for the dynamic-fit allocator.  Entries are positional; the
 * per-slot roles noted below are inferred from the function names and
 * should be confirmed against the space_map_ops_t declaration.
 */
static space_map_ops_t metaslab_df_ops = {
	metaslab_pp_load,		/* presumably: load */
	metaslab_pp_unload,		/* presumably: unload */
	metaslab_df_alloc,		/* presumably: alloc */
	metaslab_pp_claim,		/* presumably: claim */
	metaslab_pp_free,		/* presumably: free */
	metaslab_pp_maxsize,		/* presumably: max segment size */
	metaslab_df_fragmented		/* presumably: fragmented */
};
666 
667 /*
668  * ==========================================================================
669  * Other experimental allocators
670  * ==========================================================================
671  */
672 static uint64_t
673 metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
674 {
675 	avl_tree_t *t = &sm->sm_root;
676 	uint64_t *cursor = (uint64_t *)sm->sm_ppd;
677 	uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
678 	uint64_t max_size = metaslab_pp_maxsize(sm);
679 	uint64_t rsize = size;
680 	uint64_t offset = 0;
681 
682 	ASSERT(MUTEX_HELD(sm->sm_lock));
683 	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
684 
685 	if (max_size < size)
686 		return (-1ULL);
687 
688 	ASSERT3U(*extent_end, >=, *cursor);
689 
690 	/*
691 	 * If we're running low on space switch to using the size
692 	 * sorted AVL tree (best-fit).
693 	 */
694 	if ((*cursor + size) > *extent_end) {
695 
696 		t = sm->sm_pp_root;
697 		*cursor = *extent_end = 0;
698 
699 		if (max_size > 2 * SPA_MAXBLOCKSIZE)
700 			rsize = MIN(metaslab_min_alloc_size, max_size);
701 		offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
702 		if (offset != -1)
703 			*cursor = offset + size;
704 	} else {
705 		offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
706 	}
707 	ASSERT3U(*cursor, <=, *extent_end);
708 	return (offset);
709 }
710 
711 static boolean_t
712 metaslab_cdf_fragmented(space_map_t *sm)
713 {
714 	uint64_t max_size = metaslab_pp_maxsize(sm);
715 
716 	if (max_size > (metaslab_min_alloc_size * 10))
717 		return (B_FALSE);
718 	return (B_TRUE);
719 }
720 
/*
 * Ops vector for the experimental cdf allocator.  Entries are positional;
 * the per-slot roles noted below are inferred from the function names and
 * should be confirmed against the space_map_ops_t declaration.
 */
static space_map_ops_t metaslab_cdf_ops = {
	metaslab_pp_load,		/* presumably: load */
	metaslab_pp_unload,		/* presumably: unload */
	metaslab_cdf_alloc,		/* presumably: alloc */
	metaslab_pp_claim,		/* presumably: claim */
	metaslab_pp_free,		/* presumably: free */
	metaslab_pp_maxsize,		/* presumably: max segment size */
	metaslab_cdf_fragmented		/* presumably: fragmented */
};
730 
/*
 * Shift count used by the ndf allocator.  metaslab_ndf_alloc() adds it to
 * the request's highbit to size the segment it searches for, and
 * metaslab_ndf_fragmented() uses it to scale metaslab_min_alloc_size.
 */
uint64_t metaslab_ndf_clump_shift = 4;
732 
733 static uint64_t
734 metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
735 {
736 	avl_tree_t *t = &sm->sm_root;
737 	avl_index_t where;
738 	space_seg_t *ss, ssearch;
739 	uint64_t hbit = highbit(size);
740 	uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
741 	uint64_t max_size = metaslab_pp_maxsize(sm);
742 
743 	ASSERT(MUTEX_HELD(sm->sm_lock));
744 	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
745 
746 	if (max_size < size)
747 		return (-1ULL);
748 
749 	ssearch.ss_start = *cursor;
750 	ssearch.ss_end = *cursor + size;
751 
752 	ss = avl_find(t, &ssearch, &where);
753 	if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
754 		t = sm->sm_pp_root;
755 
756 		ssearch.ss_start = 0;
757 		ssearch.ss_end = MIN(max_size,
758 		    1ULL << (hbit + metaslab_ndf_clump_shift));
759 		ss = avl_find(t, &ssearch, &where);
760 		if (ss == NULL)
761 			ss = avl_nearest(t, where, AVL_AFTER);
762 		ASSERT(ss != NULL);
763 	}
764 
765 	if (ss != NULL) {
766 		if (ss->ss_start + size <= ss->ss_end) {
767 			*cursor = ss->ss_start + size;
768 			return (ss->ss_start);
769 		}
770 	}
771 	return (-1ULL);
772 }
773 
774 static boolean_t
775 metaslab_ndf_fragmented(space_map_t *sm)
776 {
777 	uint64_t max_size = metaslab_pp_maxsize(sm);
778 
779 	if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
780 		return (B_FALSE);
781 	return (B_TRUE);
782 }
783 
784 
/*
 * Ops vector for the experimental ndf allocator.  Entries are positional;
 * the per-slot roles noted below are inferred from the function names and
 * should be confirmed against the space_map_ops_t declaration.
 */
static space_map_ops_t metaslab_ndf_ops = {
	metaslab_pp_load,		/* presumably: load */
	metaslab_pp_unload,		/* presumably: unload */
	metaslab_ndf_alloc,		/* presumably: alloc */
	metaslab_pp_claim,		/* presumably: claim */
	metaslab_pp_free,		/* presumably: free */
	metaslab_pp_maxsize,		/* presumably: max segment size */
	metaslab_ndf_fragmented		/* presumably: fragmented */
};
794 
/*
 * The allocator ops vector exported from this file; it defaults to the
 * dynamic-fit allocator.  NOTE(review): the consumers of this pointer are
 * not visible here -- confirm where it is handed to
 * metaslab_class_create().
 */
space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
796 
797 /*
798  * ==========================================================================
799  * Metaslabs
800  * ==========================================================================
801  */
/*
 * Create a metaslab covering [start, start + size) on the group's vdev
 * and add it to the group.
 *
 *	mg	group (and thereby vdev) that will own the metaslab
 *	smo	space map object descriptor; copied into ms_smo_syncing
 *	start	starting offset of the metaslab
 *	size	length of the metaslab
 *	txg	0 when opening an existing pool, TXG_INITIAL when creating
 *		one, otherwise the txg in which the space is being added
 *
 * Returns the newly allocated metaslab.
 */
metaslab_t *
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
	uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);

	msp->ms_smo_syncing = *smo;

	/*
	 * We create the main space map here, but we don't create the
	 * allocmaps and freemaps until metaslab_sync_done().  This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
	space_map_create(msp->ms_map, start, size,
	    vd->vdev_ashift, &msp->ms_lock);

	metaslab_group_add(mg, msp);

	/*
	 * With metaslab_debug set, load the space map right away whenever
	 * a space map object already exists for this metaslab.
	 */
	if (metaslab_debug && smo->smo_object != 0) {
		mutex_enter(&msp->ms_lock);
		VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops,
		    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
		mutex_exit(&msp->ms_lock);
	}

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(msp, 0);

	/*
	 * For any nonzero txg, mark the vdev and this metaslab dirty in
	 * that txg.
	 */
	if (txg != 0) {
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, msp, txg);
	}

	return (msp);
}
850 
/*
 * Destroy a metaslab: back its space out of the vdev accounting, remove
 * it from its group, then unload, destroy and free the main space map
 * and every per-txg alloc, free and defer map before freeing the
 * metaslab itself.
 */
void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;

	/* Subtract this metaslab's allocated space and size from the vdev. */
	vdev_space_update(mg->mg_vd,
	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size);

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);

	space_map_unload(msp->ms_map);
	space_map_destroy(msp->ms_map);
	kmem_free(msp->ms_map, sizeof (*msp->ms_map));

	/* One alloc map and one free map exist per in-flight txg slot. */
	for (int t = 0; t < TXG_SIZE; t++) {
		space_map_destroy(msp->ms_allocmap[t]);
		space_map_destroy(msp->ms_freemap[t]);
		kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t]));
		kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t]));
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		space_map_destroy(msp->ms_defermap[t]);
		kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t]));
	}

	/* No deferred space may still be accounted to this metaslab. */
	ASSERT0(msp->ms_deferspace);

	mutex_exit(&msp->ms_lock);
	mutex_destroy(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}
886 
/*
 * The two highest bits of a metaslab's weight flag it as active.
 * metaslab_activate() ORs one of them into ms_weight, and a metaslab
 * with neither bit set is treated as inactive.
 */
#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
891 
892 static uint64_t
893 metaslab_weight(metaslab_t *msp)
894 {
895 	metaslab_group_t *mg = msp->ms_group;
896 	space_map_t *sm = msp->ms_map;
897 	space_map_obj_t *smo = &msp->ms_smo;
898 	vdev_t *vd = mg->mg_vd;
899 	uint64_t weight, space;
900 
901 	ASSERT(MUTEX_HELD(&msp->ms_lock));
902 
903 	/*
904 	 * This vdev is in the process of being removed so there is nothing
905 	 * for us to do here.
906 	 */
907 	if (vd->vdev_removing) {
908 		ASSERT0(smo->smo_alloc);
909 		ASSERT0(vd->vdev_ms_shift);
910 		return (0);
911 	}
912 
913 	/*
914 	 * The baseline weight is the metaslab's free space.
915 	 */
916 	space = sm->sm_size - smo->smo_alloc;
917 	weight = space;
918 
919 	/*
920 	 * Modern disks have uniform bit density and constant angular velocity.
921 	 * Therefore, the outer recording zones are faster (higher bandwidth)
922 	 * than the inner zones by the ratio of outer to inner track diameter,
923 	 * which is typically around 2:1.  We account for this by assigning
924 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
925 	 * In effect, this means that we'll select the metaslab with the most
926 	 * free bandwidth rather than simply the one with the most free space.
927 	 */
928 	weight = 2 * weight -
929 	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
930 	ASSERT(weight >= space && weight <= 2 * space);
931 
932 	/*
933 	 * For locality, assign higher weight to metaslabs which have
934 	 * a lower offset than what we've already activated.
935 	 */
936 	if (sm->sm_start <= mg->mg_bonus_area)
937 		weight *= (metaslab_smo_bonus_pct / 100);
938 	ASSERT(weight >= space &&
939 	    weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
940 
941 	if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
942 		/*
943 		 * If this metaslab is one we're actively using, adjust its
944 		 * weight to make it preferable to any inactive metaslab so
945 		 * we'll polish it off.
946 		 */
947 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
948 	}
949 	return (weight);
950 }
951 
952 static void
953 metaslab_prefetch(metaslab_group_t *mg)
954 {
955 	spa_t *spa = mg->mg_vd->vdev_spa;
956 	metaslab_t *msp;
957 	avl_tree_t *t = &mg->mg_metaslab_tree;
958 	int m;
959 
960 	mutex_enter(&mg->mg_lock);
961 
962 	/*
963 	 * Prefetch the next potential metaslabs
964 	 */
965 	for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
966 		space_map_t *sm = msp->ms_map;
967 		space_map_obj_t *smo = &msp->ms_smo;
968 
969 		/* If we have reached our prefetch limit then we're done */
970 		if (m >= metaslab_prefetch_limit)
971 			break;
972 
973 		if (!sm->sm_loaded && smo->smo_object != 0) {
974 			mutex_exit(&mg->mg_lock);
975 			dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
976 			    0ULL, smo->smo_objsize);
977 			mutex_enter(&mg->mg_lock);
978 		}
979 	}
980 	mutex_exit(&mg->mg_lock);
981 }
982 
/*
 * Make a metaslab active with the given activation weight bit(s).  If
 * the metaslab is not already active, its space map is loaded when
 * needed, segments recorded in the defer maps are claimed from it, the
 * group's bonus area is advanced if necessary, and the metaslab is
 * re-sorted with the activation bits ORed into its weight.
 *
 * The caller must hold the metaslab's ms_lock.  Returns 0 on success, or
 * the error from space_map_load(), in which case the metaslab's weight
 * has been reset to 0.
 */
static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = msp->ms_map;
	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		/* Wait first; only load below if the map is still unloaded. */
		space_map_load_wait(sm);
		if (!sm->sm_loaded) {
			space_map_obj_t *smo = &msp->ms_smo;

			int error = space_map_load(sm, sm_ops, SM_FREE, smo,
			    spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
			if (error)  {
				/* Weight 0 sorts this metaslab to the end. */
				metaslab_group_sort(msp->ms_group, msp, 0);
				return (error);
			}
			/*
			 * Claim every segment recorded in the defer maps
			 * from the freshly loaded map.
			 */
			for (int t = 0; t < TXG_DEFER_SIZE; t++)
				space_map_walk(msp->ms_defermap[t],
				    space_map_claim, sm);

		}

		/*
		 * Track the bonus area as we activate new metaslabs.
		 */
		if (sm->sm_start > mg->mg_bonus_area) {
			mutex_enter(&mg->mg_lock);
			mg->mg_bonus_area = sm->sm_start;
			mutex_exit(&mg->mg_lock);
		}

		metaslab_group_sort(msp->ms_group, msp,
		    msp->ms_weight | activation_weight);
	}
	ASSERT(sm->sm_loaded);
	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (0);
}
1026 
1027 static void
1028 metaslab_passivate(metaslab_t *msp, uint64_t size)
1029 {
1030 	/*
1031 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1032 	 * this metaslab again.  In that case, it had better be empty,
1033 	 * or we would be leaving space on the table.
1034 	 */
1035 	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map->sm_space == 0);
1036 	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1037 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1038 }
1039 
1040 /*
1041  * Determine if the in-core space map representation can be condensed on-disk.
1042  * We would like to use the following criteria to make our decision:
1043  *
1044  * 1. The size of the space map object should not dramatically increase as a
1045  * result of writing out our in-core free map.
1046  *
1047  * 2. The minimal on-disk space map representation is zfs_condense_pct/100
1048  * times the size than the in-core representation (i.e. zfs_condense_pct = 110
1049  * and in-core = 1MB, minimal = 1.1.MB).
1050  *
1051  * Checking the first condition is tricky since we don't want to walk
1052  * the entire AVL tree calculating the estimated on-disk size. Instead we
1053  * use the size-ordered AVL tree in the space map and calculate the
1054  * size required for the largest segment in our in-core free map. If the
1055  * size required to represent that segment on disk is larger than the space
1056  * map object then we avoid condensing this map.
1057  *
1058  * To determine the second criterion we use a best-case estimate and assume
1059  * each segment can be represented on-disk as a single 64-bit entry. We refer
1060  * to this best-case estimate as the space map's minimal form.
1061  */
1062 static boolean_t
1063 metaslab_should_condense(metaslab_t *msp)
1064 {
1065 	space_map_t *sm = msp->ms_map;
1066 	space_map_obj_t *smo = &msp->ms_smo_syncing;
1067 	space_seg_t *ss;
1068 	uint64_t size, entries, segsz;
1069 
1070 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1071 	ASSERT(sm->sm_loaded);
1072 
1073 	/*
1074 	 * Use the sm_pp_root AVL tree, which is ordered by size, to obtain
1075 	 * the largest segment in the in-core free map. If the tree is
1076 	 * empty then we should condense the map.
1077 	 */
1078 	ss = avl_last(sm->sm_pp_root);
1079 	if (ss == NULL)
1080 		return (B_TRUE);
1081 
1082 	/*
1083 	 * Calculate the number of 64-bit entries this segment would
1084 	 * require when written to disk. If this single segment would be
1085 	 * larger on-disk than the entire current on-disk structure, then
1086 	 * clearly condensing will increase the on-disk structure size.
1087 	 */
1088 	size = (ss->ss_end - ss->ss_start) >> sm->sm_shift;
1089 	entries = size / (MIN(size, SM_RUN_MAX));
1090 	segsz = entries * sizeof (uint64_t);
1091 
1092 	return (segsz <= smo->smo_objsize &&
1093 	    smo->smo_objsize >= (zfs_condense_pct *
1094 	    sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) / 100);
1095 }
1096 
1097 /*
1098  * Condense the on-disk space map representation to its minimized form.
1099  * The minimized form consists of a small number of allocations followed by
1100  * the in-core free map.
1101  */
1102 static void
1103 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1104 {
1105 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1106 	space_map_t *freemap = msp->ms_freemap[txg & TXG_MASK];
1107 	space_map_t condense_map;
1108 	space_map_t *sm = msp->ms_map;
1109 	objset_t *mos = spa_meta_objset(spa);
1110 	space_map_obj_t *smo = &msp->ms_smo_syncing;
1111 
1112 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1113 	ASSERT3U(spa_sync_pass(spa), ==, 1);
1114 	ASSERT(sm->sm_loaded);
1115 
1116 	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1117 	    "smo size %llu, segments %lu", txg,
1118 	    (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
1119 	    smo->smo_objsize, avl_numnodes(&sm->sm_root));
1120 
1121 	/*
1122 	 * Create an map that is a 100% allocated map. We remove segments
1123 	 * that have been freed in this txg, any deferred frees that exist,
1124 	 * and any allocation in the future. Removing segments should be
1125 	 * a relatively inexpensive operation since we expect these maps to
1126 	 * a small number of nodes.
1127 	 */
1128 	space_map_create(&condense_map, sm->sm_start, sm->sm_size,
1129 	    sm->sm_shift, sm->sm_lock);
1130 	space_map_add(&condense_map, condense_map.sm_start,
1131 	    condense_map.sm_size);
1132 
1133 	/*
1134 	 * Remove what's been freed in this txg from the condense_map.
1135 	 * Since we're in sync_pass 1, we know that all the frees from
1136 	 * this txg are in the freemap.
1137 	 */
1138 	space_map_walk(freemap, space_map_remove, &condense_map);
1139 
1140 	for (int t = 0; t < TXG_DEFER_SIZE; t++)
1141 		space_map_walk(msp->ms_defermap[t],
1142 		    space_map_remove, &condense_map);
1143 
1144 	for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
1145 		space_map_walk(msp->ms_allocmap[(txg + t) & TXG_MASK],
1146 		    space_map_remove, &condense_map);
1147 
1148 	/*
1149 	 * We're about to drop the metaslab's lock thus allowing
1150 	 * other consumers to change it's content. Set the
1151 	 * space_map's sm_condensing flag to ensure that
1152 	 * allocations on this metaslab do not occur while we're
1153 	 * in the middle of committing it to disk. This is only critical
1154 	 * for the ms_map as all other space_maps use per txg
1155 	 * views of their content.
1156 	 */
1157 	sm->sm_condensing = B_TRUE;
1158 
1159 	mutex_exit(&msp->ms_lock);
1160 	space_map_truncate(smo, mos, tx);
1161 	mutex_enter(&msp->ms_lock);
1162 
1163 	/*
1164 	 * While we would ideally like to create a space_map representation
1165 	 * that consists only of allocation records, doing so can be
1166 	 * prohibitively expensive because the in-core free map can be
1167 	 * large, and therefore computationally expensive to subtract
1168 	 * from the condense_map. Instead we sync out two maps, a cheap
1169 	 * allocation only map followed by the in-core free map. While not
1170 	 * optimal, this is typically close to optimal, and much cheaper to
1171 	 * compute.
1172 	 */
1173 	space_map_sync(&condense_map, SM_ALLOC, smo, mos, tx);
1174 	space_map_vacate(&condense_map, NULL, NULL);
1175 	space_map_destroy(&condense_map);
1176 
1177 	space_map_sync(sm, SM_FREE, smo, mos, tx);
1178 	sm->sm_condensing = B_FALSE;
1179 
1180 	spa_dbgmsg(spa, "condensed: txg %llu, msp[%llu] %p, "
1181 	    "smo size %llu", txg,
1182 	    (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
1183 	    smo->smo_objsize);
1184 }
1185 
1186 /*
1187  * Write a metaslab to disk in the context of the specified transaction group.
1188  */
1189 void
1190 metaslab_sync(metaslab_t *msp, uint64_t txg)
1191 {
1192 	vdev_t *vd = msp->ms_group->mg_vd;
1193 	spa_t *spa = vd->vdev_spa;
1194 	objset_t *mos = spa_meta_objset(spa);
1195 	space_map_t *allocmap = msp->ms_allocmap[txg & TXG_MASK];
1196 	space_map_t **freemap = &msp->ms_freemap[txg & TXG_MASK];
1197 	space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1198 	space_map_t *sm = msp->ms_map;
1199 	space_map_obj_t *smo = &msp->ms_smo_syncing;
1200 	dmu_buf_t *db;
1201 	dmu_tx_t *tx;
1202 
1203 	ASSERT(!vd->vdev_ishole);
1204 
1205 	/*
1206 	 * This metaslab has just been added so there's no work to do now.
1207 	 */
1208 	if (*freemap == NULL) {
1209 		ASSERT3P(allocmap, ==, NULL);
1210 		return;
1211 	}
1212 
1213 	ASSERT3P(allocmap, !=, NULL);
1214 	ASSERT3P(*freemap, !=, NULL);
1215 	ASSERT3P(*freed_map, !=, NULL);
1216 
1217 	if (allocmap->sm_space == 0 && (*freemap)->sm_space == 0)
1218 		return;
1219 
1220 	/*
1221 	 * The only state that can actually be changing concurrently with
1222 	 * metaslab_sync() is the metaslab's ms_map.  No other thread can
1223 	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
1224 	 * Therefore, we only hold ms_lock to satify space_map ASSERTs.
1225 	 * We drop it whenever we call into the DMU, because the DMU
1226 	 * can call down to us (e.g. via zio_free()) at any time.
1227 	 */
1228 
1229 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1230 
1231 	if (smo->smo_object == 0) {
1232 		ASSERT(smo->smo_objsize == 0);
1233 		ASSERT(smo->smo_alloc == 0);
1234 		smo->smo_object = dmu_object_alloc(mos,
1235 		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
1236 		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
1237 		ASSERT(smo->smo_object != 0);
1238 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
1239 		    (sm->sm_start >> vd->vdev_ms_shift),
1240 		    sizeof (uint64_t), &smo->smo_object, tx);
1241 	}
1242 
1243 	mutex_enter(&msp->ms_lock);
1244 
1245 	if (sm->sm_loaded && spa_sync_pass(spa) == 1 &&
1246 	    metaslab_should_condense(msp)) {
1247 		metaslab_condense(msp, txg, tx);
1248 	} else {
1249 		space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
1250 		space_map_sync(*freemap, SM_FREE, smo, mos, tx);
1251 	}
1252 
1253 	space_map_vacate(allocmap, NULL, NULL);
1254 
1255 	/*
1256 	 * For sync pass 1, we avoid walking the entire space map and
1257 	 * instead will just swap the pointers for freemap and
1258 	 * freed_map. We can safely do this since the freed_map is
1259 	 * guaranteed to be empty on the initial pass.
1260 	 */
1261 	if (spa_sync_pass(spa) == 1) {
1262 		ASSERT0((*freed_map)->sm_space);
1263 		ASSERT0(avl_numnodes(&(*freed_map)->sm_root));
1264 		space_map_swap(freemap, freed_map);
1265 	} else {
1266 		space_map_vacate(*freemap, space_map_add, *freed_map);
1267 	}
1268 
1269 	ASSERT0(msp->ms_allocmap[txg & TXG_MASK]->sm_space);
1270 	ASSERT0(msp->ms_freemap[txg & TXG_MASK]->sm_space);
1271 
1272 	mutex_exit(&msp->ms_lock);
1273 
1274 	VERIFY0(dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
1275 	dmu_buf_will_dirty(db, tx);
1276 	ASSERT3U(db->db_size, >=, sizeof (*smo));
1277 	bcopy(smo, db->db_data, sizeof (*smo));
1278 	dmu_buf_rele(db, FTAG);
1279 
1280 	dmu_tx_commit(tx);
1281 }
1282 
1283 /*
1284  * Called after a transaction group has completely synced to mark
1285  * all of the metaslab's free space as usable.
1286  */
1287 void
1288 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1289 {
1290 	space_map_obj_t *smo = &msp->ms_smo;
1291 	space_map_obj_t *smosync = &msp->ms_smo_syncing;
1292 	space_map_t *sm = msp->ms_map;
1293 	space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1294 	space_map_t **defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
1295 	metaslab_group_t *mg = msp->ms_group;
1296 	vdev_t *vd = mg->mg_vd;
1297 	int64_t alloc_delta, defer_delta;
1298 
1299 	ASSERT(!vd->vdev_ishole);
1300 
1301 	mutex_enter(&msp->ms_lock);
1302 
1303 	/*
1304 	 * If this metaslab is just becoming available, initialize its
1305 	 * allocmaps, freemaps, and defermap and add its capacity to the vdev.
1306 	 */
1307 	if (*freed_map == NULL) {
1308 		ASSERT(*defer_map == NULL);
1309 		for (int t = 0; t < TXG_SIZE; t++) {
1310 			msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t),
1311 			    KM_SLEEP);
1312 			space_map_create(msp->ms_allocmap[t], sm->sm_start,
1313 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
1314 			msp->ms_freemap[t] = kmem_zalloc(sizeof (space_map_t),
1315 			    KM_SLEEP);
1316 			space_map_create(msp->ms_freemap[t], sm->sm_start,
1317 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
1318 		}
1319 
1320 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1321 			msp->ms_defermap[t] = kmem_zalloc(sizeof (space_map_t),
1322 			    KM_SLEEP);
1323 			space_map_create(msp->ms_defermap[t], sm->sm_start,
1324 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
1325 		}
1326 
1327 		freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1328 		defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
1329 
1330 		vdev_space_update(vd, 0, 0, sm->sm_size);
1331 	}
1332 
1333 	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
1334 	defer_delta = (*freed_map)->sm_space - (*defer_map)->sm_space;
1335 
1336 	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1337 
1338 	ASSERT(msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0);
1339 	ASSERT(msp->ms_freemap[txg & TXG_MASK]->sm_space == 0);
1340 
1341 	/*
1342 	 * If there's a space_map_load() in progress, wait for it to complete
1343 	 * so that we have a consistent view of the in-core space map.
1344 	 */
1345 	space_map_load_wait(sm);
1346 
1347 	/*
1348 	 * Move the frees from the defer_map to this map (if it's loaded).
1349 	 * Swap the freed_map and the defer_map -- this is safe to do
1350 	 * because we've just emptied out the defer_map.
1351 	 */
1352 	space_map_vacate(*defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
1353 	ASSERT0((*defer_map)->sm_space);
1354 	ASSERT0(avl_numnodes(&(*defer_map)->sm_root));
1355 	space_map_swap(freed_map, defer_map);
1356 
1357 	*smo = *smosync;
1358 
1359 	msp->ms_deferspace += defer_delta;
1360 	ASSERT3S(msp->ms_deferspace, >=, 0);
1361 	ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
1362 	if (msp->ms_deferspace != 0) {
1363 		/*
1364 		 * Keep syncing this metaslab until all deferred frees
1365 		 * are back in circulation.
1366 		 */
1367 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1368 	}
1369 
1370 	/*
1371 	 * If the map is loaded but no longer active, evict it as soon as all
1372 	 * future allocations have synced.  (If we unloaded it now and then
1373 	 * loaded a moment later, the map wouldn't reflect those allocations.)
1374 	 */
1375 	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1376 		int evictable = 1;
1377 
1378 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
1379 			if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space)
1380 				evictable = 0;
1381 
1382 		if (evictable && !metaslab_debug)
1383 			space_map_unload(sm);
1384 	}
1385 
1386 	metaslab_group_sort(mg, msp, metaslab_weight(msp));
1387 
1388 	mutex_exit(&msp->ms_lock);
1389 }
1390 
1391 void
1392 metaslab_sync_reassess(metaslab_group_t *mg)
1393 {
1394 	vdev_t *vd = mg->mg_vd;
1395 	int64_t failures = mg->mg_alloc_failures;
1396 
1397 	metaslab_group_alloc_update(mg);
1398 
1399 	/*
1400 	 * Re-evaluate all metaslabs which have lower offsets than the
1401 	 * bonus area.
1402 	 */
1403 	for (int m = 0; m < vd->vdev_ms_count; m++) {
1404 		metaslab_t *msp = vd->vdev_ms[m];
1405 
1406 		if (msp->ms_map->sm_start > mg->mg_bonus_area)
1407 			break;
1408 
1409 		mutex_enter(&msp->ms_lock);
1410 		metaslab_group_sort(mg, msp, metaslab_weight(msp));
1411 		mutex_exit(&msp->ms_lock);
1412 	}
1413 
1414 	atomic_add_64(&mg->mg_alloc_failures, -failures);
1415 
1416 	/*
1417 	 * Prefetch the next potential metaslabs
1418 	 */
1419 	metaslab_prefetch(mg);
1420 }
1421 
1422 static uint64_t
1423 metaslab_distance(metaslab_t *msp, dva_t *dva)
1424 {
1425 	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1426 	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1427 	uint64_t start = msp->ms_map->sm_start >> ms_shift;
1428 
1429 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1430 		return (1ULL << 63);
1431 
1432 	if (offset < start)
1433 		return ((start - offset) << ms_shift);
1434 	if (offset > start)
1435 		return ((offset - start) << ms_shift);
1436 	return (0);
1437 }
1438 
1439 static uint64_t
1440 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1441     uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
1442 {
1443 	spa_t *spa = mg->mg_vd->vdev_spa;
1444 	metaslab_t *msp = NULL;
1445 	uint64_t offset = -1ULL;
1446 	avl_tree_t *t = &mg->mg_metaslab_tree;
1447 	uint64_t activation_weight;
1448 	uint64_t target_distance;
1449 	int i;
1450 
1451 	activation_weight = METASLAB_WEIGHT_PRIMARY;
1452 	for (i = 0; i < d; i++) {
1453 		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1454 			activation_weight = METASLAB_WEIGHT_SECONDARY;
1455 			break;
1456 		}
1457 	}
1458 
1459 	for (;;) {
1460 		boolean_t was_active;
1461 
1462 		mutex_enter(&mg->mg_lock);
1463 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1464 			if (msp->ms_weight < asize) {
1465 				spa_dbgmsg(spa, "%s: failed to meet weight "
1466 				    "requirement: vdev %llu, txg %llu, mg %p, "
1467 				    "msp %p, psize %llu, asize %llu, "
1468 				    "failures %llu, weight %llu",
1469 				    spa_name(spa), mg->mg_vd->vdev_id, txg,
1470 				    mg, msp, psize, asize,
1471 				    mg->mg_alloc_failures, msp->ms_weight);
1472 				mutex_exit(&mg->mg_lock);
1473 				return (-1ULL);
1474 			}
1475 
1476 			/*
1477 			 * If the selected metaslab is condensing, skip it.
1478 			 */
1479 			if (msp->ms_map->sm_condensing)
1480 				continue;
1481 
1482 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1483 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1484 				break;
1485 
1486 			target_distance = min_distance +
1487 			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
1488 
1489 			for (i = 0; i < d; i++)
1490 				if (metaslab_distance(msp, &dva[i]) <
1491 				    target_distance)
1492 					break;
1493 			if (i == d)
1494 				break;
1495 		}
1496 		mutex_exit(&mg->mg_lock);
1497 		if (msp == NULL)
1498 			return (-1ULL);
1499 
1500 		mutex_enter(&msp->ms_lock);
1501 
1502 		/*
1503 		 * If we've already reached the allowable number of failed
1504 		 * allocation attempts on this metaslab group then we
1505 		 * consider skipping it. We skip it only if we're allowed
1506 		 * to "fast" gang, the physical size is larger than
1507 		 * a gang block, and we're attempting to allocate from
1508 		 * the primary metaslab.
1509 		 */
1510 		if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1511 		    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1512 		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
1513 			spa_dbgmsg(spa, "%s: skipping metaslab group: "
1514 			    "vdev %llu, txg %llu, mg %p, psize %llu, "
1515 			    "asize %llu, failures %llu", spa_name(spa),
1516 			    mg->mg_vd->vdev_id, txg, mg, psize, asize,
1517 			    mg->mg_alloc_failures);
1518 			mutex_exit(&msp->ms_lock);
1519 			return (-1ULL);
1520 		}
1521 
1522 		/*
1523 		 * Ensure that the metaslab we have selected is still
1524 		 * capable of handling our request. It's possible that
1525 		 * another thread may have changed the weight while we
1526 		 * were blocked on the metaslab lock.
1527 		 */
1528 		if (msp->ms_weight < asize || (was_active &&
1529 		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1530 		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1531 			mutex_exit(&msp->ms_lock);
1532 			continue;
1533 		}
1534 
1535 		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1536 		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
1537 			metaslab_passivate(msp,
1538 			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1539 			mutex_exit(&msp->ms_lock);
1540 			continue;
1541 		}
1542 
1543 		if (metaslab_activate(msp, activation_weight) != 0) {
1544 			mutex_exit(&msp->ms_lock);
1545 			continue;
1546 		}
1547 
1548 		/*
1549 		 * If this metaslab is currently condensing then pick again as
1550 		 * we can't manipulate this metaslab until it's committed
1551 		 * to disk.
1552 		 */
1553 		if (msp->ms_map->sm_condensing) {
1554 			mutex_exit(&msp->ms_lock);
1555 			continue;
1556 		}
1557 
1558 		if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL)
1559 			break;
1560 
1561 		atomic_inc_64(&mg->mg_alloc_failures);
1562 
1563 		metaslab_passivate(msp, space_map_maxsize(msp->ms_map));
1564 
1565 		mutex_exit(&msp->ms_lock);
1566 	}
1567 
1568 	if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
1569 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1570 
1571 	space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, asize);
1572 
1573 	mutex_exit(&msp->ms_lock);
1574 
1575 	return (offset);
1576 }
1577 
1578 /*
1579  * Allocate a block for the specified i/o.
1580  */
1581 static int
1582 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1583     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1584 {
1585 	metaslab_group_t *mg, *rotor;
1586 	vdev_t *vd;
1587 	int dshift = 3;
1588 	int all_zero;
1589 	int zio_lock = B_FALSE;
1590 	boolean_t allocatable;
1591 	uint64_t offset = -1ULL;
1592 	uint64_t asize;
1593 	uint64_t distance;
1594 
1595 	ASSERT(!DVA_IS_VALID(&dva[d]));
1596 
1597 	/*
1598 	 * For testing, make some blocks above a certain size be gang blocks.
1599 	 */
1600 	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1601 		return (SET_ERROR(ENOSPC));
1602 
1603 	/*
1604 	 * Start at the rotor and loop through all mgs until we find something.
1605 	 * Note that there's no locking on mc_rotor or mc_aliquot because
1606 	 * nothing actually breaks if we miss a few updates -- we just won't
1607 	 * allocate quite as evenly.  It all balances out over time.
1608 	 *
1609 	 * If we are doing ditto or log blocks, try to spread them across
1610 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
1611 	 * allocated all of our ditto blocks, then try and spread them out on
1612 	 * that vdev as much as possible.  If it turns out to not be possible,
1613 	 * gradually lower our standards until anything becomes acceptable.
1614 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1615 	 * gives us hope of containing our fault domains to something we're
1616 	 * able to reason about.  Otherwise, any two top-level vdev failures
1617 	 * will guarantee the loss of data.  With consecutive allocation,
1618 	 * only two adjacent top-level vdev failures will result in data loss.
1619 	 *
1620 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
1621 	 * ourselves on the same vdev as our gang block header.  That
1622 	 * way, we can hope for locality in vdev_cache, plus it makes our
1623 	 * fault domains something tractable.
1624 	 */
1625 	if (hintdva) {
1626 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1627 
1628 		/*
1629 		 * It's possible the vdev we're using as the hint no
1630 		 * longer exists (i.e. removed). Consult the rotor when
1631 		 * all else fails.
1632 		 */
1633 		if (vd != NULL) {
1634 			mg = vd->vdev_mg;
1635 
1636 			if (flags & METASLAB_HINTBP_AVOID &&
1637 			    mg->mg_next != NULL)
1638 				mg = mg->mg_next;
1639 		} else {
1640 			mg = mc->mc_rotor;
1641 		}
1642 	} else if (d != 0) {
1643 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1644 		mg = vd->vdev_mg->mg_next;
1645 	} else {
1646 		mg = mc->mc_rotor;
1647 	}
1648 
1649 	/*
1650 	 * If the hint put us into the wrong metaslab class, or into a
1651 	 * metaslab group that has been passivated, just follow the rotor.
1652 	 */
1653 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1654 		mg = mc->mc_rotor;
1655 
1656 	rotor = mg;
1657 top:
1658 	all_zero = B_TRUE;
1659 	do {
1660 		ASSERT(mg->mg_activation_count == 1);
1661 
1662 		vd = mg->mg_vd;
1663 
1664 		/*
1665 		 * Don't allocate from faulted devices.
1666 		 */
1667 		if (zio_lock) {
1668 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1669 			allocatable = vdev_allocatable(vd);
1670 			spa_config_exit(spa, SCL_ZIO, FTAG);
1671 		} else {
1672 			allocatable = vdev_allocatable(vd);
1673 		}
1674 
1675 		/*
1676 		 * Determine if the selected metaslab group is eligible
1677 		 * for allocations. If we're ganging or have requested
1678 		 * an allocation for the smallest gang block size
1679 		 * then we don't want to avoid allocating to the this
1680 		 * metaslab group. If we're in this condition we should
1681 		 * try to allocate from any device possible so that we
1682 		 * don't inadvertently return ENOSPC and suspend the pool
1683 		 * even though space is still available.
1684 		 */
1685 		if (allocatable && CAN_FASTGANG(flags) &&
1686 		    psize > SPA_GANGBLOCKSIZE)
1687 			allocatable = metaslab_group_allocatable(mg);
1688 
1689 		if (!allocatable)
1690 			goto next;
1691 
1692 		/*
1693 		 * Avoid writing single-copy data to a failing vdev
1694 		 * unless the user instructs us that it is okay.
1695 		 */
1696 		if ((vd->vdev_stat.vs_write_errors > 0 ||
1697 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
1698 		    d == 0 && dshift == 3 &&
1699 		    !(zfs_write_to_degraded && vd->vdev_state ==
1700 		    VDEV_STATE_DEGRADED)) {
1701 			all_zero = B_FALSE;
1702 			goto next;
1703 		}
1704 
1705 		ASSERT(mg->mg_class == mc);
1706 
1707 		distance = vd->vdev_asize >> dshift;
1708 		if (distance <= (1ULL << vd->vdev_ms_shift))
1709 			distance = 0;
1710 		else
1711 			all_zero = B_FALSE;
1712 
1713 		asize = vdev_psize_to_asize(vd, psize);
1714 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1715 
1716 		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
1717 		    dva, d, flags);
1718 		if (offset != -1ULL) {
1719 			/*
1720 			 * If we've just selected this metaslab group,
1721 			 * figure out whether the corresponding vdev is
1722 			 * over- or under-used relative to the pool,
1723 			 * and set an allocation bias to even it out.
1724 			 */
1725 			if (mc->mc_aliquot == 0) {
1726 				vdev_stat_t *vs = &vd->vdev_stat;
1727 				int64_t vu, cu;
1728 
1729 				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
1730 				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
1731 
1732 				/*
1733 				 * Calculate how much more or less we should
1734 				 * try to allocate from this device during
1735 				 * this iteration around the rotor.
1736 				 * For example, if a device is 80% full
1737 				 * and the pool is 20% full then we should
1738 				 * reduce allocations by 60% on this device.
1739 				 *
1740 				 * mg_bias = (20 - 80) * 512K / 100 = -307K
1741 				 *
1742 				 * This reduces allocations by 307K for this
1743 				 * iteration.
1744 				 */
1745 				mg->mg_bias = ((cu - vu) *
1746 				    (int64_t)mg->mg_aliquot) / 100;
1747 			}
1748 
1749 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1750 			    mg->mg_aliquot + mg->mg_bias) {
1751 				mc->mc_rotor = mg->mg_next;
1752 				mc->mc_aliquot = 0;
1753 			}
1754 
1755 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
1756 			DVA_SET_OFFSET(&dva[d], offset);
1757 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
1758 			DVA_SET_ASIZE(&dva[d], asize);
1759 
1760 			return (0);
1761 		}
1762 next:
1763 		mc->mc_rotor = mg->mg_next;
1764 		mc->mc_aliquot = 0;
1765 	} while ((mg = mg->mg_next) != rotor);
1766 
1767 	if (!all_zero) {
1768 		dshift++;
1769 		ASSERT(dshift < 64);
1770 		goto top;
1771 	}
1772 
1773 	if (!allocatable && !zio_lock) {
1774 		dshift = 3;
1775 		zio_lock = B_TRUE;
1776 		goto top;
1777 	}
1778 
1779 	bzero(&dva[d], sizeof (dva_t));
1780 
1781 	return (SET_ERROR(ENOSPC));
1782 }
1783 
1784 /*
1785  * Free the block represented by DVA in the context of the specified
1786  * transaction group.
1787  */
1788 static void
1789 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
1790 {
1791 	uint64_t vdev = DVA_GET_VDEV(dva);
1792 	uint64_t offset = DVA_GET_OFFSET(dva);
1793 	uint64_t size = DVA_GET_ASIZE(dva);
1794 	vdev_t *vd;
1795 	metaslab_t *msp;
1796 
1797 	ASSERT(DVA_IS_VALID(dva));
1798 
1799 	if (txg > spa_freeze_txg(spa))
1800 		return;
1801 
1802 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1803 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
1804 		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
1805 		    (u_longlong_t)vdev, (u_longlong_t)offset);
1806 		ASSERT(0);
1807 		return;
1808 	}
1809 
1810 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1811 
1812 	if (DVA_GET_GANG(dva))
1813 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1814 
1815 	mutex_enter(&msp->ms_lock);
1816 
1817 	if (now) {
1818 		space_map_remove(msp->ms_allocmap[txg & TXG_MASK],
1819 		    offset, size);
1820 		space_map_free(msp->ms_map, offset, size);
1821 	} else {
1822 		if (msp->ms_freemap[txg & TXG_MASK]->sm_space == 0)
1823 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
1824 		space_map_add(msp->ms_freemap[txg & TXG_MASK], offset, size);
1825 	}
1826 
1827 	mutex_exit(&msp->ms_lock);
1828 }
1829 
1830 /*
1831  * Intent log support: upon opening the pool after a crash, notify the SPA
1832  * of blocks that the intent log has allocated for immediate write, but
1833  * which are still considered free by the SPA because the last transaction
1834  * group didn't commit yet.
1835  */
1836 static int
1837 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
1838 {
1839 	uint64_t vdev = DVA_GET_VDEV(dva);
1840 	uint64_t offset = DVA_GET_OFFSET(dva);
1841 	uint64_t size = DVA_GET_ASIZE(dva);
1842 	vdev_t *vd;
1843 	metaslab_t *msp;
1844 	int error = 0;
1845 
1846 	ASSERT(DVA_IS_VALID(dva));
1847 
1848 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1849 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
1850 		return (SET_ERROR(ENXIO));
1851 
1852 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1853 
1854 	if (DVA_GET_GANG(dva))
1855 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1856 
1857 	mutex_enter(&msp->ms_lock);
1858 
1859 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map->sm_loaded)
1860 		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
1861 
1862 	if (error == 0 && !space_map_contains(msp->ms_map, offset, size))
1863 		error = SET_ERROR(ENOENT);
1864 
1865 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
1866 		mutex_exit(&msp->ms_lock);
1867 		return (error);
1868 	}
1869 
1870 	space_map_claim(msp->ms_map, offset, size);
1871 
1872 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
1873 		if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
1874 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
1875 		space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, size);
1876 	}
1877 
1878 	mutex_exit(&msp->ms_lock);
1879 
1880 	return (0);
1881 }
1882 
1883 int
1884 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
1885     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
1886 {
1887 	dva_t *dva = bp->blk_dva;
1888 	dva_t *hintdva = hintbp->blk_dva;
1889 	int error = 0;
1890 
1891 	ASSERT(bp->blk_birth == 0);
1892 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
1893 
1894 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1895 
1896 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
1897 		spa_config_exit(spa, SCL_ALLOC, FTAG);
1898 		return (SET_ERROR(ENOSPC));
1899 	}
1900 
1901 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
1902 	ASSERT(BP_GET_NDVAS(bp) == 0);
1903 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
1904 
1905 	for (int d = 0; d < ndvas; d++) {
1906 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
1907 		    txg, flags);
1908 		if (error) {
1909 			for (d--; d >= 0; d--) {
1910 				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
1911 				bzero(&dva[d], sizeof (dva_t));
1912 			}
1913 			spa_config_exit(spa, SCL_ALLOC, FTAG);
1914 			return (error);
1915 		}
1916 	}
1917 	ASSERT(error == 0);
1918 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
1919 
1920 	spa_config_exit(spa, SCL_ALLOC, FTAG);
1921 
1922 	BP_SET_BIRTH(bp, txg, txg);
1923 
1924 	return (0);
1925 }
1926 
1927 void
1928 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
1929 {
1930 	const dva_t *dva = bp->blk_dva;
1931 	int ndvas = BP_GET_NDVAS(bp);
1932 
1933 	ASSERT(!BP_IS_HOLE(bp));
1934 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
1935 
1936 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
1937 
1938 	for (int d = 0; d < ndvas; d++)
1939 		metaslab_free_dva(spa, &dva[d], txg, now);
1940 
1941 	spa_config_exit(spa, SCL_FREE, FTAG);
1942 }
1943 
1944 int
1945 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
1946 {
1947 	const dva_t *dva = bp->blk_dva;
1948 	int ndvas = BP_GET_NDVAS(bp);
1949 	int error = 0;
1950 
1951 	ASSERT(!BP_IS_HOLE(bp));
1952 
1953 	if (txg != 0) {
1954 		/*
1955 		 * First do a dry run to make sure all DVAs are claimable,
1956 		 * so we don't have to unwind from partial failures below.
1957 		 */
1958 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
1959 			return (error);
1960 	}
1961 
1962 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1963 
1964 	for (int d = 0; d < ndvas; d++)
1965 		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
1966 			break;
1967 
1968 	spa_config_exit(spa, SCL_ALLOC, FTAG);
1969 
1970 	ASSERT(error == 0 || txg == 0);
1971 
1972 	return (error);
1973 }
1974 
1975 static void
1976 checkmap(space_map_t *sm, uint64_t off, uint64_t size)
1977 {
1978 	space_seg_t *ss;
1979 	avl_index_t where;
1980 
1981 	mutex_enter(sm->sm_lock);
1982 	ss = space_map_find(sm, off, size, &where);
1983 	if (ss != NULL)
1984 		panic("freeing free block; ss=%p", (void *)ss);
1985 	mutex_exit(sm->sm_lock);
1986 }
1987 
1988 void
1989 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
1990 {
1991 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
1992 		return;
1993 
1994 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
1995 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
1996 		uint64_t vdid = DVA_GET_VDEV(&bp->blk_dva[i]);
1997 		vdev_t *vd = vdev_lookup_top(spa, vdid);
1998 		uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[i]);
1999 		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2000 		metaslab_t *ms = vd->vdev_ms[off >> vd->vdev_ms_shift];
2001 
2002 		if (ms->ms_map->sm_loaded)
2003 			checkmap(ms->ms_map, off, size);
2004 
2005 		for (int j = 0; j < TXG_SIZE; j++)
2006 			checkmap(ms->ms_freemap[j], off, size);
2007 		for (int j = 0; j < TXG_DEFER_SIZE; j++)
2008 			checkmap(ms->ms_defermap[j], off, size);
2009 	}
2010 	spa_config_exit(spa, SCL_VDEV, FTAG);
2011 }
2012