/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
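/*
 * A metaslab class ties together the metaslab groups of the vdevs it
 * contains.  The groups form a circular, doubly-linked list; mc_rotor
 * points at the group the allocator will try next (see metaslab_alloc()).
 */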
metaslab_class_t *
metaslab_class_create(void)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_rotor = NULL;

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	metaslab_group_t *mg;

	while ((mg = mc->mc_rotor) != NULL) {
		metaslab_class_remove(mc, mg);
		metaslab_group_destroy(mg);
	}

	kmem_free(mc, sizeof (metaslab_class_t));
}

void
metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
{
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(mg->mg_class == NULL);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
	mg->mg_class = mc;
}

void
metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
{
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(mg->mg_class == mc);

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
	mg->mg_class = NULL;
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
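/*
 * A metaslab group is the per-vdev container for metaslabs.  The group
 * keeps its metaslabs in an AVL tree sorted by weight, heaviest first,
 * so that metaslab_pick() can scan from the most desirable metaslab
 * downward; ties are broken by starting offset.  mg_lock protects the tree.
 */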
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
		return (-1);
	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_aliquot = 2ULL << 20;		/* XXX -- tweak me */
	mg->mg_vd = vd;
	metaslab_class_add(mc, mg);

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == NULL);
	msp->ms_group = mg;
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */
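/*
 * Each metaslab tracks its free space in an in-core space map (ms_map)
 * backed by an on-disk space map object (ms_smo).  Allocations and frees
 * for each open transaction group accumulate in the per-txg ms_allocmap[]
 * and ms_freemap[] arrays and are written out by metaslab_sync().
 */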
void
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_t **mspp,
	uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;
	int fm;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);

	msp->ms_smo = smo;

	space_map_create(&msp->ms_map, start, size, vd->vdev_ashift,
	    &msp->ms_lock);

	for (fm = 0; fm < TXG_SIZE; fm++) {
		space_map_create(&msp->ms_allocmap[fm], start, size,
		    vd->vdev_ashift, &msp->ms_lock);
		space_map_create(&msp->ms_freemap[fm], start, size,
		    vd->vdev_ashift, &msp->ms_lock);
	}

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 * We enforce this by assigning an initial weight of 0 to new space.
	 *
	 * (Transactional allocations for this txg would actually be OK;
	 * it's intent log allocations that cause trouble.  If we wrote
	 * a log block in this txg and lost power, the log replay would be
	 * based on the DVA translations that had been synced in txg - 1.
	 * Those translations would not include this metaslab's vdev.)
	 */
	metaslab_group_add(mg, msp, txg > TXG_INITIAL ? 0 : size);

	if (txg == 0) {
		/*
		 * We're opening the pool.  Make the metaslab's
		 * free space available immediately.
		 */
		vdev_space_update(vd, size, smo->smo_alloc);
		metaslab_sync_done(msp, 0);
	} else {
		/*
		 * We're adding a new metaslab to an already-open pool.
		 * Declare all of the metaslab's space to be free.
		 *
		 * Note that older transaction groups cannot allocate
		 * from this metaslab until its existence is committed,
		 * because we set ms_last_alloc to the current txg.
		 */
		smo->smo_alloc = 0;
		msp->ms_usable_space = size;
		mutex_enter(&msp->ms_lock);
		space_map_add(&msp->ms_map, start, size);
		msp->ms_map_incore = 1;
		mutex_exit(&msp->ms_lock);

		/* XXX -- we'll need a call to picker_init here */
		msp->ms_dirty[txg & TXG_MASK] |= MSD_ADD;
		msp->ms_last_alloc = txg;
		vdev_dirty(vd, VDD_ADD, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	*mspp = msp;
}

void
metaslab_fini(metaslab_t *msp)
{
	int fm;
	metaslab_group_t *mg = msp->ms_group;

	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
	    -msp->ms_smo->smo_alloc);

	metaslab_group_remove(mg, msp);

	/* XXX -- we'll need a call to picker_fini here */

	mutex_enter(&msp->ms_lock);

	space_map_vacate(&msp->ms_map, NULL, NULL);
	msp->ms_map_incore = 0;
	space_map_destroy(&msp->ms_map);

	for (fm = 0; fm < TXG_SIZE; fm++) {
		space_map_destroy(&msp->ms_allocmap[fm]);
		space_map_destroy(&msp->ms_freemap[fm]);
	}

	mutex_exit(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

/*
 * Write a metaslab to disk in the context of the specified transaction group.
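 *
 * The txg's allocmap and freemap are pushed into the on-disk space map
 * (allocating the space map object on first use) and the updated space
 * map header is copied into the object's bonus buffer.  Frees for this
 * txg are transferred into the freemap of the txg that is becoming clean
 * (freed_map) and are handed back to the allocatable map by
 * metaslab_sync_done(), which runs once the sync completes because the
 * metaslab is re-queued on the vdev's metaslab list for the clean txg.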
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *os = spa->spa_meta_objset;
	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_obj_t *smo = msp->ms_smo;
	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
	uint64_t alloc_delta;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	dprintf("%s offset %llx\n", vdev_description(vd), msp->ms_map.sm_start);

	mutex_enter(&msp->ms_lock);

	if (*dirty & MSD_ADD)
		vdev_space_update(vd, msp->ms_map.sm_size, 0);

	if (*dirty & (MSD_ALLOC | MSD_FREE)) {
		tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

		if (smo->smo_object == 0) {
			ASSERT(smo->smo_objsize == 0);
			ASSERT(smo->smo_alloc == 0);
			smo->smo_object = dmu_object_alloc(os,
			    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
			    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
			ASSERT(smo->smo_object != 0);
			dmu_write(os, vd->vdev_ms_array, sizeof (uint64_t) *
			    (msp->ms_map.sm_start >> vd->vdev_ms_shift),
			    sizeof (uint64_t), &smo->smo_object, tx);
		}

		alloc_delta = allocmap->sm_space - freemap->sm_space;
		vdev_space_update(vd, 0, alloc_delta);
		smo->smo_alloc += alloc_delta;

		if (msp->ms_last_alloc == txg && msp->ms_map.sm_space == 0 &&
		    (*dirty & MSD_CONDENSE) == 0) {
			space_map_t *sm = &msp->ms_map;
			space_map_t *tsm;
			int i;

			ASSERT(msp->ms_map_incore);

			space_map_merge(freemap, freed_map);
			space_map_vacate(allocmap, NULL, NULL);

			/*
			 * Write out the current state of the allocation
			 * world.  The current metaslab is full, minus
			 * stuff that's been freed this txg (freed_map),
			 * minus allocations from txgs in the future.
			 */
			space_map_add(sm, sm->sm_start, sm->sm_size);
			for (i = 1; i < TXG_CONCURRENT_STATES; i++) {
				tsm = &msp->ms_allocmap[(txg + i) & TXG_MASK];
				space_map_iterate(tsm, space_map_remove, sm);
			}
			space_map_iterate(freed_map, space_map_remove, sm);

			space_map_write(sm, smo, os, tx);

			ASSERT(sm->sm_space == 0);
			ASSERT(freemap->sm_space == 0);
			ASSERT(allocmap->sm_space == 0);

			*dirty |= MSD_CONDENSE;
		} else {
			space_map_sync(allocmap, NULL, smo, SM_ALLOC, os, tx);
			space_map_sync(freemap, freed_map, smo, SM_FREE,
			    os, tx);
		}

		db = dmu_bonus_hold(os, smo->smo_object);
		dmu_buf_will_dirty(db, tx);
		ASSERT3U(db->db_size, ==, sizeof (*smo));
		bcopy(smo, db->db_data, db->db_size);
		dmu_buf_rele(db);

		dmu_tx_commit(tx);
	}

	*dirty &= ~(MSD_ALLOC | MSD_FREE | MSD_ADD);

	mutex_exit(&msp->ms_lock);

	(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
}

/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
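 *
 * This recomputes the metaslab's usable space from the freshly synced
 * space map header, folds the freed map for this txg back into the
 * in-core free map (or vacates everything if the metaslab is empty),
 * clears ms_last_alloc so that other txgs may allocate here again, and
 * re-sorts the metaslab in its group by the new weight.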
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	uint64_t weight;
	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
	space_map_obj_t *smo = msp->ms_smo;

	dprintf("%s offset %llx txg %llu\n",
	    vdev_description(msp->ms_group->mg_vd), msp->ms_map.sm_start, txg);

	mutex_enter(&msp->ms_lock);

	ASSERT3U((*dirty & (MSD_ALLOC | MSD_FREE | MSD_ADD)), ==, 0);

	msp->ms_usable_space = msp->ms_map.sm_size - smo->smo_alloc;
	msp->ms_usable_end = smo->smo_objsize;

	weight = msp->ms_usable_space;

	if (txg != 0) {
		space_map_t *freed_map =
		    &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];

		/* XXX -- we'll need a call to picker_fini here */

		/* If we're empty, don't bother sticking around */
		if (msp->ms_usable_space == 0) {
			space_map_vacate(&msp->ms_map, NULL, NULL);
			msp->ms_map_incore = 0;
			ASSERT3U(freed_map->sm_space, ==, 0);
			weight = 0;
		} else {
			/* Add the freed blocks to the available space map */
			if (msp->ms_map_incore)
				space_map_merge(freed_map, &msp->ms_map);
			else
				space_map_vacate(freed_map, NULL, NULL);
			weight += msp->ms_map.sm_size;
		}

		if (msp->ms_last_alloc == txg)
			/* Safe to use for allocation now */
			msp->ms_last_alloc = 0;

		*dirty = 0;
	}

	mutex_exit(&msp->ms_lock);

	metaslab_group_sort(msp->ms_group, msp, weight);
}

/*
 * The first-fit block picker.  There is no picker_init or picker_fini;
 * this is just an experiment to see how it feels to separate out
 * the block selection policy from the map updates.
 * Note: the 'cursor' argument is a form of PPD.
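 * The cursor remembers where the previous allocation of this alignment
 * class left off; the search resumes there, wraps back to the start of
 * the map at most once, and returns -1ULL if no segment can hold an
 * aligned run of 'size' bytes.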
 */
static uint64_t
metaslab_pick_block(space_map_t *sm, uint64_t size, uint64_t *cursor)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	space_seg_t *ss, ssearch;
	avl_index_t where;
	int tried_once = 0;

again:
	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL)
		ss = avl_nearest(t, where, AVL_AFTER);

	while (ss != NULL) {
		uint64_t offset = P2ROUNDUP(ss->ss_start, align);

		if (offset + size <= ss->ss_end) {
			*cursor = offset + size;
			return (offset);
		}
		ss = AVL_NEXT(t, ss);
	}

	/* If we couldn't find a block after cursor, search again */
	if (tried_once == 0) {
		tried_once = 1;
		*cursor = 0;
		goto again;
	}

	return (-1ULL);
}

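/*
 * Allocate 'size' bytes from this metaslab's in-core free map and record
 * the allocation in the txg's allocmap.  A separate cursor is kept for
 * each power-of-two alignment class so that same-sized allocations keep
 * marching forward rather than rescanning from the start of the map.
 */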
static uint64_t
metaslab_getblock(metaslab_t *msp, uint64_t size, uint64_t txg)
{
	space_map_t *sm = &msp->ms_map;
	vdev_t *vd = msp->ms_group->mg_vd;
	uint64_t offset;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_map_incore);
	ASSERT(sm->sm_space != 0);
	ASSERT(P2PHASE(size, 1ULL << vd->vdev_ashift) == 0);

	offset = metaslab_pick_block(sm, size,
	    &msp->ms_map_cursor[highbit(size & -size) - vd->vdev_ashift - 1]);
	if (offset != -1ULL) {
		space_map_remove(sm, offset, size);
		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
	}
	return (offset);
}

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
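 *
 * metaslab_claim() loads the metaslab's space map on demand, removes the
 * claimed range from the free map, and records it in the txg's allocmap,
 * dirtying the metaslab just as an ordinary allocation would.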
 */
int
metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	objset_t *os = spa->spa_meta_objset;
	vdev_t *vd;
	metaslab_t *msp;
	space_map_t *sm;
	space_map_obj_t *smo;
	int error;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
		return (ENXIO);

	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
		return (ENXIO);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	sm = &msp->ms_map;
	smo = msp->ms_smo;

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if (msp->ms_map_incore == 0) {
		error = space_map_load(sm, smo, SM_FREE, os,
		    msp->ms_usable_end, sm->sm_size - msp->ms_usable_space);
		ASSERT(error == 0);
		if (error) {
			mutex_exit(&msp->ms_lock);
			return (error);
		}
		msp->ms_map_incore = 1;
		/* XXX -- we'll need a call to picker_init here */
		bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
	}

	space_map_remove(sm, offset, size);
	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);

	if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
		msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
		msp->ms_last_alloc = txg;
		vdev_dirty(vd, VDD_ALLOC, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

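/*
 * Decide whether this metaslab can satisfy an allocation of 'size' bytes
 * in 'txg'.  A metaslab that last allocated in some other txg that has
 * not yet synced is rejected outright, which keeps each txg's
 * allocations segregated.
 */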
static int
metaslab_usable(metaslab_t *msp, uint64_t size, uint64_t txg)
{
	/*
	 * Enforce segregation across transaction groups.
	 */
	/* XXX -- We should probably not assume we know what ms_weight means */
	if (msp->ms_last_alloc == txg)
		return (msp->ms_map.sm_space >= size && msp->ms_weight >= size);

	if (msp->ms_last_alloc != 0)
		return (0);

	if (msp->ms_map.sm_space >= size && msp->ms_weight >= size)
		return (1);

	/* XXX -- the weight test should be in terms of MINFREE */
	return (msp->ms_usable_space >= size && msp->ms_weight >= size);
}

static metaslab_t *
metaslab_pick(metaslab_group_t *mg, uint64_t size, uint64_t txg)
{
	metaslab_t *msp;
	avl_tree_t *t = &mg->mg_metaslab_tree;

	mutex_enter(&mg->mg_lock);
	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp))
		if (metaslab_usable(msp, size, txg))
			break;
	mutex_exit(&mg->mg_lock);

	return (msp);
}

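/*
 * Try the group's metaslabs, heaviest first, loading each one's space map
 * on demand.  If a metaslab fails to yield a block, its weight is dropped
 * to size - 1 so it no longer passes metaslab_usable() for requests of
 * this size or larger.
 */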
static metaslab_t *
metaslab_group_alloc(spa_t *spa, metaslab_group_t *mg, uint64_t size,
    uint64_t *offp, uint64_t txg)
{
	metaslab_t *msp;
	int error;

	while ((msp = metaslab_pick(mg, size, txg)) != NULL) {
		space_map_obj_t *smo = msp->ms_smo;
		mutex_enter(&msp->ms_lock);
		if (!metaslab_usable(msp, size, txg)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}
		if (msp->ms_map_incore == 0) {
			error = space_map_load(&msp->ms_map, smo, SM_FREE,
			    spa->spa_meta_objset, msp->ms_usable_end,
			    msp->ms_map.sm_size - msp->ms_usable_space);
			ASSERT(error == 0);
			if (error) {
				mutex_exit(&msp->ms_lock);
				metaslab_group_sort(mg, msp, 0);
				continue;
			}
			msp->ms_map_incore = 1;
			/* XXX -- we'll need a call to picker_init here */
			bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
		}
		*offp = metaslab_getblock(msp, size, txg);
		if (*offp != -1ULL) {
			if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
				vdev_t *vd = mg->mg_vd;
				msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
				msp->ms_last_alloc = txg;
				vdev_dirty(vd, VDD_ALLOC, txg);
				(void) txg_list_add(&vd->vdev_ms_list,
				    msp, txg);
			}
			mutex_exit(&msp->ms_lock);
			return (msp);
		}
		mutex_exit(&msp->ms_lock);
		metaslab_group_sort(msp->ms_group, msp, size - 1);
	}

	return (NULL);
}

/*
 * Allocate a block for the specified i/o.
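 *
 * The allocator walks the class's metaslab groups round-robin, starting
 * at the rotor.  Once roughly mg_aliquot bytes (adjusted by a per-vdev
 * bias) have been allocated from a group, the rotor advances to the next
 * group, spreading writes across all top-level vdevs.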
 */
int
metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg)
{
	metaslab_t *msp;
	metaslab_group_t *mg, *rotor;
	metaslab_class_t *mc;
	vdev_t *vd;
	uint64_t offset = -1ULL;
	uint64_t asize;

	mc = spa_metaslab_class_select(spa);

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_allocated because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 */
	mg = rotor = mc->mc_rotor;
	do {
		vd = mg->mg_vd;
		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		msp = metaslab_group_alloc(spa, mg, asize, &offset, txg);
		if (msp != NULL) {
			ASSERT(offset != -1ULL);

			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 */
			if (mc->mc_allocated == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				uint64_t alloc, space;
				int64_t vu, su;

				alloc = spa_get_alloc(spa);
				space = spa_get_space(spa);

				/*
				 * Determine percent used in units of 0..1024.
				 * (This is just to avoid floating point.)
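				 * For example, a vdev that is half full
				 * yields vu of about 512.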
				 */
				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
				su = (alloc << 10) / (space + 1);

				/*
				 * Bias by at most +/- 25% of the aliquot.
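				 * Since |su - vu| <= 1024, the bias below
				 * is bounded by mg_aliquot / 4.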
				 */
				mg->mg_bias = ((su - vu) *
				    (int64_t)mg->mg_aliquot) / (1024 * 4);

				dprintf("bias = %lld\n", mg->mg_bias);
			}

			if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_allocated = 0;
			}

			DVA_SET_VDEV(dva, vd->vdev_id);
			DVA_SET_OFFSET(dva, offset);
			DVA_SET_GANG(dva, 0);
			DVA_SET_ASIZE(dva, asize);

			return (0);
		}
		mc->mc_rotor = mg->mg_next;
		mc->mc_allocated = 0;
	} while ((mg = mg->mg_next) != rotor);

	dprintf("spa=%p, psize=%llu, txg=%llu: no\n", spa, psize, txg);

	DVA_SET_VDEV(dva, 0);
	DVA_SET_OFFSET(dva, 0);
	DVA_SET_GANG(dva, 0);

	return (ENOSPC);
}

/*
 * Free the block represented by DVA in the context of the specified
 * transaction group.
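 *
 * The range is added to the metaslab's freemap for this txg; the space
 * only becomes allocatable again after the txg syncs and
 * metaslab_sync_done() merges the freed space back into the free map.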
 */
void
metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
		cmn_err(CE_WARN, "metaslab_free(): bad vdev %llu",
		    (u_longlong_t)vdev);
		ASSERT(0);
		return;
	}

	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_free(): bad offset %llu",
		    (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if ((msp->ms_dirty[txg & TXG_MASK] & MSD_FREE) == 0) {
		msp->ms_dirty[txg & TXG_MASK] |= MSD_FREE;
		vdev_dirty(vd, VDD_FREE, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);

	mutex_exit(&msp->ms_lock);
}
797