xref: /illumos-gate/usr/src/uts/common/fs/zfs/metaslab.c (revision 98c507c4288789fc67365c4cb51f80eb641e7182)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/zfs_context.h>
27 #include <sys/dmu.h>
28 #include <sys/dmu_tx.h>
29 #include <sys/space_map.h>
30 #include <sys/metaslab_impl.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/zio.h>
33 
34 uint64_t metaslab_aliquot = 512ULL << 10;
35 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
36 
37 /*
38  * Metaslab debugging: when set, keeps all space maps in core to verify frees.
39  */
40 static int metaslab_debug = 0;
41 
42 /*
43  * Minimum size which forces the dynamic allocator to change
44  * its allocation strategy.  Once the space map cannot satisfy
45  * an allocation of this size, it switches to a more aggressive
46  * strategy (i.e. search by size rather than by offset).
47  */
48 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
49 
50 /*
51  * The minimum free space, in percent, which must be available
52  * in a space map to continue allocations in a first-fit fashion.
53  * Once the space_map's free space drops below this level, we dynamically
54  * switch to using best-fit allocations.
55  */
56 int metaslab_df_free_pct = 30;
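/*
 * Both thresholds are consulted in metaslab_df_alloc(): once the largest
 * free segment falls below metaslab_df_alloc_threshold, or the map's free
 * percentage falls below metaslab_df_free_pct, allocations come from the
 * size-sorted tree rather than the offset-sorted one.
 */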
57 
58 /*
59  * ==========================================================================
60  * Metaslab classes
61  * ==========================================================================
62  */
63 metaslab_class_t *
64 metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
65 {
66 	metaslab_class_t *mc;
67 
68 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
69 
70 	mc->mc_spa = spa;
71 	mc->mc_rotor = NULL;
72 	mc->mc_ops = ops;
73 
74 	return (mc);
75 }
76 
77 void
78 metaslab_class_destroy(metaslab_class_t *mc)
79 {
80 	ASSERT(mc->mc_rotor == NULL);
81 	ASSERT(mc->mc_alloc == 0);
82 	ASSERT(mc->mc_deferred == 0);
83 	ASSERT(mc->mc_space == 0);
84 	ASSERT(mc->mc_dspace == 0);
85 
86 	kmem_free(mc, sizeof (metaslab_class_t));
87 }
88 
89 int
90 metaslab_class_validate(metaslab_class_t *mc)
91 {
92 	metaslab_group_t *mg;
93 	vdev_t *vd;
94 
95 	/*
96 	 * Must hold one of the spa_config locks.
97 	 */
98 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
99 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
100 
101 	if ((mg = mc->mc_rotor) == NULL)
102 		return (0);
103 
104 	do {
105 		vd = mg->mg_vd;
106 		ASSERT(vd->vdev_mg != NULL);
107 		ASSERT3P(vd->vdev_top, ==, vd);
108 		ASSERT3P(mg->mg_class, ==, mc);
109 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
110 	} while ((mg = mg->mg_next) != mc->mc_rotor);
111 
112 	return (0);
113 }
114 
115 void
116 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
117     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
118 {
119 	atomic_add_64(&mc->mc_alloc, alloc_delta);
120 	atomic_add_64(&mc->mc_deferred, defer_delta);
121 	atomic_add_64(&mc->mc_space, space_delta);
122 	atomic_add_64(&mc->mc_dspace, dspace_delta);
123 }
124 
125 uint64_t
126 metaslab_class_get_alloc(metaslab_class_t *mc)
127 {
128 	return (mc->mc_alloc);
129 }
130 
131 uint64_t
132 metaslab_class_get_deferred(metaslab_class_t *mc)
133 {
134 	return (mc->mc_deferred);
135 }
136 
137 uint64_t
138 metaslab_class_get_space(metaslab_class_t *mc)
139 {
140 	return (mc->mc_space);
141 }
142 
143 uint64_t
144 metaslab_class_get_dspace(metaslab_class_t *mc)
145 {
146 	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
147 }
148 
149 /*
150  * ==========================================================================
151  * Metaslab groups
152  * ==========================================================================
153  */
154 static int
155 metaslab_compare(const void *x1, const void *x2)
156 {
157 	const metaslab_t *m1 = x1;
158 	const metaslab_t *m2 = x2;
159 
160 	if (m1->ms_weight < m2->ms_weight)
161 		return (1);
162 	if (m1->ms_weight > m2->ms_weight)
163 		return (-1);
164 
165 	/*
166 	 * If the weights are identical, use the offset to force uniqueness.
167 	 */
168 	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
169 		return (-1);
170 	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
171 		return (1);
172 
173 	ASSERT3P(m1, ==, m2);
174 
175 	return (0);
176 }
177 
178 metaslab_group_t *
179 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
180 {
181 	metaslab_group_t *mg;
182 
183 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
184 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
185 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
186 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
187 	mg->mg_vd = vd;
188 	mg->mg_class = mc;
189 	mg->mg_activation_count = 0;
190 
191 	return (mg);
192 }
193 
194 void
195 metaslab_group_destroy(metaslab_group_t *mg)
196 {
197 	ASSERT(mg->mg_prev == NULL);
198 	ASSERT(mg->mg_next == NULL);
199 	ASSERT(mg->mg_activation_count + mg->mg_vd->vdev_removing == 0);
200 
201 	avl_destroy(&mg->mg_metaslab_tree);
202 	mutex_destroy(&mg->mg_lock);
203 	kmem_free(mg, sizeof (metaslab_group_t));
204 }
205 
206 void
207 metaslab_group_activate(metaslab_group_t *mg)
208 {
209 	metaslab_class_t *mc = mg->mg_class;
210 	metaslab_group_t *mgprev, *mgnext;
211 
212 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
213 
214 	ASSERT(mc->mc_rotor != mg);
215 	ASSERT(mg->mg_prev == NULL);
216 	ASSERT(mg->mg_next == NULL);
217 	ASSERT(mg->mg_activation_count <= 0);
218 
219 	if (++mg->mg_activation_count <= 0)
220 		return;
221 
222 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
223 
224 	if ((mgprev = mc->mc_rotor) == NULL) {
225 		mg->mg_prev = mg;
226 		mg->mg_next = mg;
227 	} else {
228 		mgnext = mgprev->mg_next;
229 		mg->mg_prev = mgprev;
230 		mg->mg_next = mgnext;
231 		mgprev->mg_next = mg;
232 		mgnext->mg_prev = mg;
233 	}
234 	mc->mc_rotor = mg;
235 }
236 
237 void
238 metaslab_group_passivate(metaslab_group_t *mg)
239 {
240 	metaslab_class_t *mc = mg->mg_class;
241 	metaslab_group_t *mgprev, *mgnext;
242 
243 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
244 
245 	if (--mg->mg_activation_count != 0) {
246 		ASSERT(mc->mc_rotor != mg);
247 		ASSERT(mg->mg_prev == NULL);
248 		ASSERT(mg->mg_next == NULL);
249 		ASSERT(mg->mg_activation_count < 0);
250 		return;
251 	}
252 
253 	mgprev = mg->mg_prev;
254 	mgnext = mg->mg_next;
255 
256 	if (mg == mgnext) {
257 		mc->mc_rotor = NULL;
258 	} else {
259 		mc->mc_rotor = mgnext;
260 		mgprev->mg_next = mgnext;
261 		mgnext->mg_prev = mgprev;
262 	}
263 
264 	mg->mg_prev = NULL;
265 	mg->mg_next = NULL;
266 }
267 
268 static void
269 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
270 {
271 	mutex_enter(&mg->mg_lock);
272 	ASSERT(msp->ms_group == NULL);
273 	msp->ms_group = mg;
274 	msp->ms_weight = 0;
275 	avl_add(&mg->mg_metaslab_tree, msp);
276 	mutex_exit(&mg->mg_lock);
277 }
278 
279 static void
280 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
281 {
282 	mutex_enter(&mg->mg_lock);
283 	ASSERT(msp->ms_group == mg);
284 	avl_remove(&mg->mg_metaslab_tree, msp);
285 	msp->ms_group = NULL;
286 	mutex_exit(&mg->mg_lock);
287 }
288 
289 static void
290 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
291 {
292 	/*
293 	 * Although in principle the weight can be any value, in
294 	 * practice we do not use values in the range [1, 510].
295 	 */
296 	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
297 	ASSERT(MUTEX_HELD(&msp->ms_lock));
298 
299 	mutex_enter(&mg->mg_lock);
300 	ASSERT(msp->ms_group == mg);
301 	avl_remove(&mg->mg_metaslab_tree, msp);
302 	msp->ms_weight = weight;
303 	avl_add(&mg->mg_metaslab_tree, msp);
304 	mutex_exit(&mg->mg_lock);
305 }
306 
307 /*
308  * This is a helper function that the allocators use to find a suitable
309  * free segment.  It searches the given AVL tree, starting at *cursor, for
310  * a segment that can hold 'size' bytes at the requested alignment.
311  */
312 static uint64_t
313 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
314     uint64_t align)
315 {
316 	space_seg_t *ss, ssearch;
317 	avl_index_t where;
318 
319 	ssearch.ss_start = *cursor;
320 	ssearch.ss_end = *cursor + size;
321 
322 	ss = avl_find(t, &ssearch, &where);
323 	if (ss == NULL)
324 		ss = avl_nearest(t, where, AVL_AFTER);
325 
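	/*
	 * Walk forward from the cursor position until we find a segment
	 * large enough to hold an aligned allocation of 'size'.
	 */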
326 	while (ss != NULL) {
327 		uint64_t offset = P2ROUNDUP(ss->ss_start, align);
328 
329 		if (offset + size <= ss->ss_end) {
330 			*cursor = offset + size;
331 			return (offset);
332 		}
333 		ss = AVL_NEXT(t, ss);
334 	}
335 
336 	/*
337 	 * If we know we've searched the whole map (*cursor == 0), give up.
338 	 * Otherwise, reset the cursor to the beginning and try again.
339 	 */
340 	if (*cursor == 0)
341 		return (-1ULL);
342 
343 	*cursor = 0;
344 	return (metaslab_block_picker(t, cursor, size, align));
345 }
346 
347 /*
348  * ==========================================================================
349  * The first-fit block allocator
350  * ==========================================================================
351  */
352 static void
353 metaslab_ff_load(space_map_t *sm)
354 {
355 	ASSERT(sm->sm_ppd == NULL);
356 	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
357 	sm->sm_pp_root = NULL;
358 }
359 
360 static void
361 metaslab_ff_unload(space_map_t *sm)
362 {
363 	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
364 	sm->sm_ppd = NULL;
365 }
366 
367 static uint64_t
368 metaslab_ff_alloc(space_map_t *sm, uint64_t size)
369 {
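	/*
	 * 'align' is the largest power of two that divides 'size'; sm_ppd
	 * holds one cursor per power-of-two alignment, indexed by
	 * highbit(align) - 1, so that differently aligned allocations each
	 * keep their own scan position.
	 */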
370 	avl_tree_t *t = &sm->sm_root;
371 	uint64_t align = size & -size;
372 	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
373 
374 	return (metaslab_block_picker(t, cursor, size, align));
375 }
376 
377 /* ARGSUSED */
378 static void
379 metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
380 {
381 	/* No need to update cursor */
382 }
383 
384 /* ARGSUSED */
385 static void
386 metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
387 {
388 	/* No need to update cursor */
389 }
390 
391 static space_map_ops_t metaslab_ff_ops = {
392 	metaslab_ff_load,
393 	metaslab_ff_unload,
394 	metaslab_ff_alloc,
395 	metaslab_ff_claim,
396 	metaslab_ff_free,
397 	NULL	/* maxsize */
398 };
399 
400 /*
401  * Dynamic block allocator -
402  * Uses the first-fit allocation scheme until space gets low, then
403  * switches to a best-fit allocation method.  Uses metaslab_df_alloc_threshold
404  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
405  */
406 
407 uint64_t
408 metaslab_df_maxsize(space_map_t *sm)
409 {
410 	avl_tree_t *t = sm->sm_pp_root;
411 	space_seg_t *ss;
412 
413 	if (t == NULL || (ss = avl_last(t)) == NULL)
414 		return (0ULL);
415 
416 	return (ss->ss_end - ss->ss_start);
417 }
418 
419 static int
420 metaslab_df_seg_compare(const void *x1, const void *x2)
421 {
422 	const space_seg_t *s1 = x1;
423 	const space_seg_t *s2 = x2;
424 	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
425 	uint64_t ss_size2 = s2->ss_end - s2->ss_start;
426 
427 	if (ss_size1 < ss_size2)
428 		return (-1);
429 	if (ss_size1 > ss_size2)
430 		return (1);
431 
432 	if (s1->ss_start < s2->ss_start)
433 		return (-1);
434 	if (s1->ss_start > s2->ss_start)
435 		return (1);
436 
437 	return (0);
438 }
439 
440 static void
441 metaslab_df_load(space_map_t *sm)
442 {
443 	space_seg_t *ss;
444 
445 	ASSERT(sm->sm_ppd == NULL);
446 	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
447 
448 	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
449 	avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
450 	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
451 
452 	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
453 		avl_add(sm->sm_pp_root, ss);
454 }
455 
456 static void
457 metaslab_df_unload(space_map_t *sm)
458 {
459 	void *cookie = NULL;
460 
461 	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
462 	sm->sm_ppd = NULL;
463 
464 	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
465 		/* tear down the tree */
466 	}
467 
468 	avl_destroy(sm->sm_pp_root);
469 	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
470 	sm->sm_pp_root = NULL;
471 }
472 
473 static uint64_t
474 metaslab_df_alloc(space_map_t *sm, uint64_t size)
475 {
476 	avl_tree_t *t = &sm->sm_root;
477 	uint64_t align = size & -size;
478 	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
479 	uint64_t max_size = metaslab_df_maxsize(sm);
480 	int free_pct = sm->sm_space * 100 / sm->sm_size;
481 
482 	ASSERT(MUTEX_HELD(sm->sm_lock));
483 	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
484 
485 	if (max_size < size)
486 		return (-1ULL);
487 
488 	/*
489 	 * If we're running low on space, switch to using the size-sorted
490 	 * AVL tree (best-fit).
491 	 */
492 	if (max_size < metaslab_df_alloc_threshold ||
493 	    free_pct < metaslab_df_free_pct) {
494 		t = sm->sm_pp_root;
495 		*cursor = 0;
496 	}
497 
498 	return (metaslab_block_picker(t, cursor, size, 1ULL));
499 }
500 
501 /* ARGSUSED */
502 static void
503 metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
504 {
505 	/* No need to update cursor */
506 }
507 
508 /* ARGSUSED */
509 static void
510 metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
511 {
512 	/* No need to update cursor */
513 }
514 
515 static space_map_ops_t metaslab_df_ops = {
516 	metaslab_df_load,
517 	metaslab_df_unload,
518 	metaslab_df_alloc,
519 	metaslab_df_claim,
520 	metaslab_df_free,
521 	metaslab_df_maxsize
522 };
523 
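/*
 * The dynamic (df) allocator is the one actually selected below;
 * metaslab_ff_ops above is retained but not referenced in this file.
 */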
524 space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
525 
526 /*
527  * ==========================================================================
528  * Metaslabs
529  * ==========================================================================
530  */
531 metaslab_t *
532 metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
533 	uint64_t start, uint64_t size, uint64_t txg)
534 {
535 	vdev_t *vd = mg->mg_vd;
536 	metaslab_t *msp;
537 
538 	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
539 	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
540 
541 	msp->ms_smo_syncing = *smo;
542 
543 	/*
544 	 * We create the main space map here, but we don't create the
545 	 * allocmaps and freemaps until metaslab_sync_done().  This serves
546 	 * two purposes: it allows metaslab_sync_done() to detect the
547  * addition of new space; and for debugging, it ensures that we would
548  * take a data fault on any attempt to use this metaslab before it's ready.
549 	 */
550 	space_map_create(&msp->ms_map, start, size,
551 	    vd->vdev_ashift, &msp->ms_lock);
552 
553 	metaslab_group_add(mg, msp);
554 
555 	if (metaslab_debug && smo->smo_object != 0) {
556 		mutex_enter(&msp->ms_lock);
557 		VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
558 		    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
559 		mutex_exit(&msp->ms_lock);
560 	}
561 
562 	/*
563 	 * If we're opening an existing pool (txg == 0) or creating
564 	 * a new one (txg == TXG_INITIAL), all space is available now.
565 	 * If we're adding space to an existing pool, the new space
566 	 * does not become available until after this txg has synced.
567 	 */
568 	if (txg <= TXG_INITIAL)
569 		metaslab_sync_done(msp, 0);
570 
571 	if (txg != 0) {
572 		vdev_dirty(vd, 0, NULL, txg);
573 		vdev_dirty(vd, VDD_METASLAB, msp, txg);
574 	}
575 
576 	return (msp);
577 }
578 
579 void
580 metaslab_fini(metaslab_t *msp)
581 {
582 	metaslab_group_t *mg = msp->ms_group;
583 
584 	vdev_space_update(mg->mg_vd,
585 	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
586 
587 	metaslab_group_remove(mg, msp);
588 
589 	mutex_enter(&msp->ms_lock);
590 
591 	space_map_unload(&msp->ms_map);
592 	space_map_destroy(&msp->ms_map);
593 
594 	for (int t = 0; t < TXG_SIZE; t++) {
595 		space_map_destroy(&msp->ms_allocmap[t]);
596 		space_map_destroy(&msp->ms_freemap[t]);
597 	}
598 
599 	for (int t = 0; t < TXG_DEFER_SIZE; t++)
600 		space_map_destroy(&msp->ms_defermap[t]);
601 
602 	ASSERT3S(msp->ms_deferspace, ==, 0);
603 
604 	mutex_exit(&msp->ms_lock);
605 	mutex_destroy(&msp->ms_lock);
606 
607 	kmem_free(msp, sizeof (metaslab_t));
608 }
609 
610 #define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
611 #define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
612 #define	METASLAB_ACTIVE_MASK		\
613 	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
614 #define	METASLAB_SMO_BONUS_MULTIPLIER	2
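/*
 * The two high-order weight bits mark a metaslab as being actively used:
 * PRIMARY when it is the main target for an allocation, SECONDARY when the
 * vdev already holds another DVA of the same block.  Because these are the
 * highest bits, active metaslabs always sort ahead of inactive ones.
 */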
615 
616 static uint64_t
617 metaslab_weight(metaslab_t *msp)
618 {
619 	metaslab_group_t *mg = msp->ms_group;
620 	space_map_t *sm = &msp->ms_map;
621 	space_map_obj_t *smo = &msp->ms_smo;
622 	vdev_t *vd = mg->mg_vd;
623 	uint64_t weight, space;
624 
625 	ASSERT(MUTEX_HELD(&msp->ms_lock));
626 
627 	/*
628 	 * The baseline weight is the metaslab's free space.
629 	 */
630 	space = sm->sm_size - smo->smo_alloc;
631 	weight = space;
632 
633 	/*
634 	 * Modern disks have uniform bit density and constant angular velocity.
635 	 * Therefore, the outer recording zones are faster (higher bandwidth)
636 	 * than the inner zones by the ratio of outer to inner track diameter,
637 	 * which is typically around 2:1.  We account for this by assigning
638 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
639 	 * In effect, this means that we'll select the metaslab with the most
640 	 * free bandwidth rather than simply the one with the most free space.
641 	 */
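	/*
	 * For example, the metaslab at the start of the vdev gets a weight
	 * of 2 * space, one in the middle gets roughly 1.5 * space, and the
	 * last one gets just over 1 * space.
	 */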
642 	weight = 2 * weight -
643 	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
644 	ASSERT(weight >= space && weight <= 2 * space);
645 
646 	/*
647 	 * For locality, assign higher weight to metaslabs we've used before.
648 	 */
649 	if (smo->smo_object != 0)
650 		weight *= METASLAB_SMO_BONUS_MULTIPLIER;
651 	ASSERT(weight >= space &&
652 	    weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);
653 
654 	/*
655 	 * If this metaslab is one we're actively using, adjust its weight to
656 	 * make it preferable to any inactive metaslab so we'll polish it off.
657 	 */
658 	weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
659 
660 	return (weight);
661 }
662 
663 static int
664 metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
665 {
666 	space_map_t *sm = &msp->ms_map;
667 	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
668 
669 	ASSERT(MUTEX_HELD(&msp->ms_lock));
670 
671 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
672 		space_map_load_wait(sm);
673 		if (!sm->sm_loaded) {
674 			int error = space_map_load(sm, sm_ops, SM_FREE,
675 			    &msp->ms_smo,
676 			    spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
677 			if (error) {
678 				metaslab_group_sort(msp->ms_group, msp, 0);
679 				return (error);
680 			}
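			/*
			 * Deferred frees were written to the on-disk map as
			 * free but are not yet allocatable, so claim them
			 * back out of the freshly loaded map.
			 */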
681 			for (int t = 0; t < TXG_DEFER_SIZE; t++)
682 				space_map_walk(&msp->ms_defermap[t],
683 				    space_map_claim, sm);
684 		}
685 
686 		/*
687 		 * If we were able to load the map, then make sure
688 		 * that this map is still able to satisfy our request.
689 		 */
690 		if (msp->ms_weight < size)
691 			return (ENOSPC);
692 
693 		metaslab_group_sort(msp->ms_group, msp,
694 		    msp->ms_weight | activation_weight);
695 	}
696 	ASSERT(sm->sm_loaded);
697 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
698 
699 	return (0);
700 }
701 
702 static void
703 metaslab_passivate(metaslab_t *msp, uint64_t size)
704 {
705 	/*
706 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
707 	 * this metaslab again.  In that case, it had better be empty,
708 	 * or we would be leaving space on the table.
709 	 */
710 #if 0
711 	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
712 #endif
713 	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
714 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
715 }
716 
717 /*
718  * Write a metaslab to disk in the context of the specified transaction group.
719  */
720 void
721 metaslab_sync(metaslab_t *msp, uint64_t txg)
722 {
723 	vdev_t *vd = msp->ms_group->mg_vd;
724 	spa_t *spa = vd->vdev_spa;
725 	objset_t *mos = spa_meta_objset(spa);
726 	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
727 	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
728 	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
729 	space_map_t *sm = &msp->ms_map;
730 	space_map_obj_t *smo = &msp->ms_smo_syncing;
731 	dmu_buf_t *db;
732 	dmu_tx_t *tx;
733 
734 	ASSERT(!vd->vdev_ishole);
735 
736 	if (allocmap->sm_space == 0 && freemap->sm_space == 0)
737 		return;
738 
739 	/*
740 	 * The only state that can actually be changing concurrently with
741 	 * metaslab_sync() is the metaslab's ms_map.  No other thread can
742 	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
743 	 * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
744 	 * We drop it whenever we call into the DMU, because the DMU
745 	 * can call down to us (e.g. via zio_free()) at any time.
746 	 */
747 
748 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
749 
750 	if (smo->smo_object == 0) {
751 		ASSERT(smo->smo_objsize == 0);
752 		ASSERT(smo->smo_alloc == 0);
753 		smo->smo_object = dmu_object_alloc(mos,
754 		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
755 		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
756 		ASSERT(smo->smo_object != 0);
757 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
758 		    (sm->sm_start >> vd->vdev_ms_shift),
759 		    sizeof (uint64_t), &smo->smo_object, tx);
760 	}
761 
762 	mutex_enter(&msp->ms_lock);
763 
764 	space_map_walk(freemap, space_map_add, freed_map);
765 
766 	if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
767 	    2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
768 		/*
769 		 * The in-core space map representation is twice as compact
770 		 * as the on-disk one, so it's time to condense the latter
771 		 * by generating a pure allocmap from first principles.
772 		 *
773 		 * This metaslab is 100% allocated,
774 		 * minus the content of the in-core map (sm),
775 		 * minus what's been freed this txg (freed_map),
776 		 * minus deferred frees (ms_defermap[]),
777 		 * minus allocations from txgs in the future
778 		 * (because they haven't been committed yet).
779 		 */
780 		space_map_vacate(allocmap, NULL, NULL);
781 		space_map_vacate(freemap, NULL, NULL);
782 
783 		space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);
784 
785 		space_map_walk(sm, space_map_remove, allocmap);
786 		space_map_walk(freed_map, space_map_remove, allocmap);
787 
788 		for (int t = 0; t < TXG_DEFER_SIZE; t++)
789 			space_map_walk(&msp->ms_defermap[t],
790 			    space_map_remove, allocmap);
791 
792 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
793 			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
794 			    space_map_remove, allocmap);
795 
796 		mutex_exit(&msp->ms_lock);
797 		space_map_truncate(smo, mos, tx);
798 		mutex_enter(&msp->ms_lock);
799 	}
800 
801 	space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
802 	space_map_sync(freemap, SM_FREE, smo, mos, tx);
803 
804 	mutex_exit(&msp->ms_lock);
805 
806 	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
807 	dmu_buf_will_dirty(db, tx);
808 	ASSERT3U(db->db_size, >=, sizeof (*smo));
809 	bcopy(smo, db->db_data, sizeof (*smo));
810 	dmu_buf_rele(db, FTAG);
811 
812 	dmu_tx_commit(tx);
813 }
814 
815 /*
816  * Called after a transaction group has completely synced to mark
817  * all of the metaslab's free space as usable.
818  */
819 void
820 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
821 {
822 	space_map_obj_t *smo = &msp->ms_smo;
823 	space_map_obj_t *smosync = &msp->ms_smo_syncing;
824 	space_map_t *sm = &msp->ms_map;
825 	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
826 	space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
827 	metaslab_group_t *mg = msp->ms_group;
828 	vdev_t *vd = mg->mg_vd;
829 	int64_t alloc_delta, defer_delta;
830 
831 	ASSERT(!vd->vdev_ishole);
832 
833 	mutex_enter(&msp->ms_lock);
834 
835 	/*
836 	 * If this metaslab is just becoming available, initialize its
837 	 * allocmaps and freemaps and add its capacity to the vdev.
838 	 */
839 	if (freed_map->sm_size == 0) {
840 		for (int t = 0; t < TXG_SIZE; t++) {
841 			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
842 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
843 			space_map_create(&msp->ms_freemap[t], sm->sm_start,
844 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
845 		}
846 
847 		for (int t = 0; t < TXG_DEFER_SIZE; t++)
848 			space_map_create(&msp->ms_defermap[t], sm->sm_start,
849 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
850 
851 		vdev_space_update(vd, 0, 0, sm->sm_size);
852 	}
853 
854 	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
855 	defer_delta = freed_map->sm_space - defer_map->sm_space;
856 
857 	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
858 
859 	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
860 	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
861 
862 	/*
863 	 * If there's a space_map_load() in progress, wait for it to complete
864 	 * so that we have a consistent view of the in-core space map.
865 	 * Then, add defer_map (oldest deferred frees) to this map and
866 	 * transfer freed_map (this txg's frees) to defer_map.
867 	 */
868 	space_map_load_wait(sm);
869 	space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
870 	space_map_vacate(freed_map, space_map_add, defer_map);
871 
872 	*smo = *smosync;
873 
874 	msp->ms_deferspace += defer_delta;
875 	ASSERT3S(msp->ms_deferspace, >=, 0);
876 	ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
877 	if (msp->ms_deferspace != 0) {
878 		/*
879 		 * Keep syncing this metaslab until all deferred frees
880 		 * are back in circulation.
881 		 */
882 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
883 	}
884 
885 	/*
886 	 * If the map is loaded but no longer active, evict it as soon as all
887 	 * future allocations have synced.  (If we unloaded it now and then
888 	 * loaded a moment later, the map wouldn't reflect those allocations.)
889 	 */
890 	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
891 		int evictable = 1;
892 
893 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
894 			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
895 				evictable = 0;
896 
897 		if (evictable && !metaslab_debug)
898 			space_map_unload(sm);
899 	}
900 
901 	metaslab_group_sort(mg, msp, metaslab_weight(msp));
902 
903 	mutex_exit(&msp->ms_lock);
904 }
905 
906 static uint64_t
907 metaslab_distance(metaslab_t *msp, dva_t *dva)
908 {
909 	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
910 	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
911 	uint64_t start = msp->ms_map.sm_start >> ms_shift;
912 
913 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
914 		return (1ULL << 63);
915 
916 	if (offset < start)
917 		return ((start - offset) << ms_shift);
918 	if (offset > start)
919 		return ((offset - start) << ms_shift);
920 	return (0);
921 }
922 
923 static uint64_t
924 metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
925     uint64_t min_distance, dva_t *dva, int d)
926 {
927 	metaslab_t *msp = NULL;
928 	uint64_t offset = -1ULL;
929 	avl_tree_t *t = &mg->mg_metaslab_tree;
930 	uint64_t activation_weight;
931 	uint64_t target_distance;
932 	int i;
933 
934 	activation_weight = METASLAB_WEIGHT_PRIMARY;
935 	for (i = 0; i < d; i++) {
936 		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
937 			activation_weight = METASLAB_WEIGHT_SECONDARY;
938 			break;
939 		}
940 	}
941 
942 	for (;;) {
943 		boolean_t was_active;
944 
945 		mutex_enter(&mg->mg_lock);
946 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
947 			if (msp->ms_weight < size) {
948 				mutex_exit(&mg->mg_lock);
949 				return (-1ULL);
950 			}
951 
952 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
953 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
954 				break;
955 
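			/*
			 * For a secondary (ditto) allocation, insist that the
			 * candidate lie at least min_distance away from the
			 * DVAs already allocated; metaslabs with nothing
			 * allocated yet must be 1.5x that far away.
			 */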
956 			target_distance = min_distance +
957 			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
958 
959 			for (i = 0; i < d; i++)
960 				if (metaslab_distance(msp, &dva[i]) <
961 				    target_distance)
962 					break;
963 			if (i == d)
964 				break;
965 		}
966 		mutex_exit(&mg->mg_lock);
967 		if (msp == NULL)
968 			return (-1ULL);
969 
970 		mutex_enter(&msp->ms_lock);
971 
972 		/*
973 		 * Ensure that the metaslab we have selected is still
974 		 * capable of handling our request. It's possible that
975 		 * another thread may have changed the weight while we
976 		 * were blocked on the metaslab lock.
977 		 */
978 		if (msp->ms_weight < size || (was_active &&
979 		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
980 		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
981 			mutex_exit(&msp->ms_lock);
982 			continue;
983 		}
984 
985 		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
986 		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
987 			metaslab_passivate(msp,
988 			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
989 			mutex_exit(&msp->ms_lock);
990 			continue;
991 		}
992 
993 		if (metaslab_activate(msp, activation_weight, size) != 0) {
994 			mutex_exit(&msp->ms_lock);
995 			continue;
996 		}
997 
998 		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
999 			break;
1000 
1001 		metaslab_passivate(msp, size - 1);
1002 
1003 		mutex_exit(&msp->ms_lock);
1004 	}
1005 
1006 	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
1007 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1008 
1009 	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
1010 
1011 	mutex_exit(&msp->ms_lock);
1012 
1013 	return (offset);
1014 }
1015 
1016 /*
1017  * Allocate a block for the specified i/o.
1018  */
1019 static int
1020 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1021     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1022 {
1023 	metaslab_group_t *mg, *rotor;
1024 	vdev_t *vd;
1025 	int dshift = 3;
1026 	int all_zero;
1027 	int zio_lock = B_FALSE;
1028 	boolean_t allocatable;
1029 	uint64_t offset = -1ULL;
1030 	uint64_t asize;
1031 	uint64_t distance;
1032 
1033 	ASSERT(!DVA_IS_VALID(&dva[d]));
1034 
1035 	/*
1036 	 * For testing, make some blocks above a certain size be gang blocks.
1037 	 */
1038 	if (psize >= metaslab_gang_bang && (lbolt & 3) == 0)
1039 		return (ENOSPC);
1040 
1041 	/*
1042 	 * Start at the rotor and loop through all mgs until we find something.
1043 	 * Note that there's no locking on mc_rotor or mc_aliquot because
1044 	 * nothing actually breaks if we miss a few updates -- we just won't
1045 	 * allocate quite as evenly.  It all balances out over time.
1046 	 *
1047 	 * If we are doing ditto or log blocks, try to spread them across
1048 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
1049 	 * allocated all of our ditto blocks, then try and spread them out on
1050 	 * that vdev as much as possible.  If it turns out to not be possible,
1051 	 * gradually lower our standards until anything becomes acceptable.
1052 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1053 	 * gives us hope of containing our fault domains to something we're
1054 	 * able to reason about.  Otherwise, any two top-level vdev failures
1055 	 * will guarantee the loss of data.  With consecutive allocation,
1056 	 * only two adjacent top-level vdev failures will result in data loss.
1057 	 *
1058 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
1059 	 * ourselves on the same vdev as our gang block header.  That
1060 	 * way, we can hope for locality in vdev_cache, plus it makes our
1061 	 * fault domains something tractable.
1062 	 */
1063 	if (hintdva) {
1064 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1065 
1066 		/*
1067 		 * It's possible the vdev we're using as the hint no
1068 		 * longer exists (i.e. removed). Consult the rotor when
1069 		 * all else fails.
1070 		 */
1071 		if (vd != NULL) {
1072 			mg = vd->vdev_mg;
1073 
1074 			if (flags & METASLAB_HINTBP_AVOID &&
1075 			    mg->mg_next != NULL)
1076 				mg = mg->mg_next;
1077 		} else {
1078 			mg = mc->mc_rotor;
1079 		}
1080 	} else if (d != 0) {
1081 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1082 		mg = vd->vdev_mg->mg_next;
1083 	} else {
1084 		mg = mc->mc_rotor;
1085 	}
1086 
1087 	/*
1088 	 * If the hint put us into the wrong metaslab class, or into a
1089 	 * metaslab group that has been passivated, just follow the rotor.
1090 	 */
1091 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1092 		mg = mc->mc_rotor;
1093 
1094 	rotor = mg;
1095 top:
1096 	all_zero = B_TRUE;
1097 	do {
1098 		ASSERT(mg->mg_activation_count == 1);
1099 
1100 		vd = mg->mg_vd;
1101 
1102 		/*
1103 		 * Don't allocate from faulted devices.
1104 		 */
1105 		if (zio_lock) {
1106 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1107 			allocatable = vdev_allocatable(vd);
1108 			spa_config_exit(spa, SCL_ZIO, FTAG);
1109 		} else {
1110 			allocatable = vdev_allocatable(vd);
1111 		}
1112 		if (!allocatable)
1113 			goto next;
1114 
1115 		/*
1116 		 * Avoid writing single-copy data to a failing vdev
1117 		 */
1118 		if ((vd->vdev_stat.vs_write_errors > 0 ||
1119 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
1120 		    d == 0 && dshift == 3) {
1121 			all_zero = B_FALSE;
1122 			goto next;
1123 		}
1124 
1125 		ASSERT(mg->mg_class == mc);
1126 
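		/*
		 * Require this DVA to be at least vdev_asize / 2^dshift away
		 * from the others; each failed pass over the rotor increments
		 * dshift below, halving the requirement until it reaches zero.
		 */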
1127 		distance = vd->vdev_asize >> dshift;
1128 		if (distance <= (1ULL << vd->vdev_ms_shift))
1129 			distance = 0;
1130 		else
1131 			all_zero = B_FALSE;
1132 
1133 		asize = vdev_psize_to_asize(vd, psize);
1134 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1135 
1136 		offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
1137 		if (offset != -1ULL) {
1138 			/*
1139 			 * If we've just selected this metaslab group,
1140 			 * figure out whether the corresponding vdev is
1141 			 * over- or under-used relative to the pool,
1142 			 * and set an allocation bias to even it out.
1143 			 */
1144 			if (mc->mc_aliquot == 0) {
1145 				vdev_stat_t *vs = &vd->vdev_stat;
1146 				int64_t vu, cu;
1147 
1148 				/*
1149 				 * Determine percent used in units of 0..1024.
1150 				 * (This is just to avoid floating point.)
1151 				 */
1152 				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
1153 				cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
1154 
1155 				/*
1156 				 * Bias by at most +/- 25% of the aliquot.
1157 				 */
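				/*
				 * For example, a vdev at 10% used (vu ~= 102)
				 * in a class at 35% used (cu ~= 358) gets a
				 * bias of about +6% of its aliquot, steering
				 * extra writes toward the emptier vdev.
				 */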
1158 				mg->mg_bias = ((cu - vu) *
1159 				    (int64_t)mg->mg_aliquot) / (1024 * 4);
1160 			}
1161 
1162 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1163 			    mg->mg_aliquot + mg->mg_bias) {
1164 				mc->mc_rotor = mg->mg_next;
1165 				mc->mc_aliquot = 0;
1166 			}
1167 
1168 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
1169 			DVA_SET_OFFSET(&dva[d], offset);
1170 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
1171 			DVA_SET_ASIZE(&dva[d], asize);
1172 
1173 			return (0);
1174 		}
1175 next:
1176 		mc->mc_rotor = mg->mg_next;
1177 		mc->mc_aliquot = 0;
1178 	} while ((mg = mg->mg_next) != rotor);
1179 
1180 	if (!all_zero) {
1181 		dshift++;
1182 		ASSERT(dshift < 64);
1183 		goto top;
1184 	}
1185 
1186 	if (!allocatable && !zio_lock) {
1187 		dshift = 3;
1188 		zio_lock = B_TRUE;
1189 		goto top;
1190 	}
1191 
1192 	bzero(&dva[d], sizeof (dva_t));
1193 
1194 	return (ENOSPC);
1195 }
1196 
1197 /*
1198  * Free the block represented by DVA in the context of the specified
1199  * transaction group.
1200  */
1201 static void
1202 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
1203 {
1204 	uint64_t vdev = DVA_GET_VDEV(dva);
1205 	uint64_t offset = DVA_GET_OFFSET(dva);
1206 	uint64_t size = DVA_GET_ASIZE(dva);
1207 	vdev_t *vd;
1208 	metaslab_t *msp;
1209 
1210 	ASSERT(DVA_IS_VALID(dva));
1211 
1212 	if (txg > spa_freeze_txg(spa))
1213 		return;
1214 
1215 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1216 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
1217 		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
1218 		    (u_longlong_t)vdev, (u_longlong_t)offset);
1219 		ASSERT(0);
1220 		return;
1221 	}
1222 
1223 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1224 
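	/*
	 * A gang header only occupies SPA_GANGBLOCKSIZE on the vdev, so
	 * recompute the size to free rather than using the asize recorded
	 * in the DVA.
	 */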
1225 	if (DVA_GET_GANG(dva))
1226 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1227 
1228 	mutex_enter(&msp->ms_lock);
1229 
1230 	if (now) {
1231 		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
1232 		    offset, size);
1233 		space_map_free(&msp->ms_map, offset, size);
1234 	} else {
1235 		if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
1236 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
1237 		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
1238 	}
1239 
1240 	mutex_exit(&msp->ms_lock);
1241 }
1242 
1243 /*
1244  * Intent log support: upon opening the pool after a crash, notify the SPA
1245  * of blocks that the intent log has allocated for immediate write, but
1246  * which are still considered free by the SPA because the last transaction
1247  * group didn't commit yet.
1248  */
1249 static int
1250 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
1251 {
1252 	uint64_t vdev = DVA_GET_VDEV(dva);
1253 	uint64_t offset = DVA_GET_OFFSET(dva);
1254 	uint64_t size = DVA_GET_ASIZE(dva);
1255 	vdev_t *vd;
1256 	metaslab_t *msp;
1257 	int error = 0;
1258 
1259 	ASSERT(DVA_IS_VALID(dva));
1260 
1261 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1262 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
1263 		return (ENXIO);
1264 
1265 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1266 
1267 	if (DVA_GET_GANG(dva))
1268 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1269 
1270 	mutex_enter(&msp->ms_lock);
1271 
1272 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
1273 		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
1274 
1275 	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
1276 		error = ENOENT;
1277 
1278 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
1279 		mutex_exit(&msp->ms_lock);
1280 		return (error);
1281 	}
1282 
1283 	space_map_claim(&msp->ms_map, offset, size);
1284 
1285 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
1286 		if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
1287 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
1288 		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
1289 	}
1290 
1291 	mutex_exit(&msp->ms_lock);
1292 
1293 	return (0);
1294 }
1295 
1296 int
1297 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
1298     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
1299 {
1300 	dva_t *dva = bp->blk_dva;
1301 	dva_t *hintdva = hintbp->blk_dva;
1302 	int error = 0;
1303 
1304 	ASSERT(bp->blk_birth == 0);
1305 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
1306 
1307 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1308 
1309 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
1310 		spa_config_exit(spa, SCL_ALLOC, FTAG);
1311 		return (ENOSPC);
1312 	}
1313 
1314 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
1315 	ASSERT(BP_GET_NDVAS(bp) == 0);
1316 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
1317 
1318 	for (int d = 0; d < ndvas; d++) {
1319 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
1320 		    txg, flags);
1321 		if (error) {
1322 			for (d--; d >= 0; d--) {
1323 				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
1324 				bzero(&dva[d], sizeof (dva_t));
1325 			}
1326 			spa_config_exit(spa, SCL_ALLOC, FTAG);
1327 			return (error);
1328 		}
1329 	}
1330 	ASSERT(error == 0);
1331 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
1332 
1333 	spa_config_exit(spa, SCL_ALLOC, FTAG);
1334 
1335 	BP_SET_BIRTH(bp, txg, txg);
1336 
1337 	return (0);
1338 }
1339 
1340 void
1341 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
1342 {
1343 	const dva_t *dva = bp->blk_dva;
1344 	int ndvas = BP_GET_NDVAS(bp);
1345 
1346 	ASSERT(!BP_IS_HOLE(bp));
1347 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
1348 
1349 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
1350 
1351 	for (int d = 0; d < ndvas; d++)
1352 		metaslab_free_dva(spa, &dva[d], txg, now);
1353 
1354 	spa_config_exit(spa, SCL_FREE, FTAG);
1355 }
1356 
1357 int
1358 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
1359 {
1360 	const dva_t *dva = bp->blk_dva;
1361 	int ndvas = BP_GET_NDVAS(bp);
1362 	int error = 0;
1363 
1364 	ASSERT(!BP_IS_HOLE(bp));
1365 
1366 	if (txg != 0) {
1367 		/*
1368 		 * First do a dry run to make sure all DVAs are claimable,
1369 		 * so we don't have to unwind from partial failures below.
1370 		 */
1371 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
1372 			return (error);
1373 	}
1374 
1375 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1376 
1377 	for (int d = 0; d < ndvas; d++)
1378 		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
1379 			break;
1380 
1381 	spa_config_exit(spa, SCL_ALLOC, FTAG);
1382 
1383 	ASSERT(error == 0 || txg == 0);
1384 
1385 	return (error);
1386 }
1387