xref: /illumos-gate/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h (revision a07094369b21309434206d9b3601d162693466fc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifndef _SYS_METASLAB_IMPL_H
28 #define	_SYS_METASLAB_IMPL_H
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #include <sys/metaslab.h>
33 #include <sys/space_map.h>
34 #include <sys/vdev.h>
35 #include <sys/txg.h>
36 #include <sys/avl.h>
37 
38 #ifdef	__cplusplus
39 extern "C" {
40 #endif
41 
/*
 * Metaslab class: the object each metaslab group points back to through
 * its mg_class field.
 *
 * NOTE(review): presumably mc_rotor is the group at which the next
 * allocation attempt starts (groups are chained via mg_prev/mg_next) and
 * mc_allocated is a running total for the whole class -- neither the
 * update sites nor the units are visible in this header; confirm against
 * the implementation file.
 */
42 struct metaslab_class {
43 	metaslab_group_t	*mc_rotor;
44 	uint64_t		mc_allocated;
45 };
46 
/*
 * Metaslab group: a collection of metaslabs kept in an AVL tree.
 *
 * Each metaslab joins mg_metaslab_tree through its ms_group_node member
 * and points back at the group through ms_group.  mg_class refers to the
 * metaslab class and mg_vd to a vdev; mg_prev and mg_next point at other
 * groups.
 *
 * NOTE(review): presumably mg_vd is the vdev whose space this group
 * manages, mg_prev/mg_next form the list walked via mc_rotor, and
 * mg_lock protects the tree -- confirm against the implementation.
 * The meaning and units of mg_aliquot and mg_bias (note: signed) cannot
 * be determined from this header.
 */
47 struct metaslab_group {
48 	kmutex_t		mg_lock;
49 	avl_tree_t		mg_metaslab_tree;
50 	uint64_t		mg_aliquot;
51 	int64_t			mg_bias;
52 	metaslab_class_t	*mg_class;
53 	vdev_t			*mg_vd;
54 	metaslab_group_t	*mg_prev;
55 	metaslab_group_t	*mg_next;
56 };
57 
58 /*
59  * Each metaslab's free block list is kept in its own DMU object in the
60  * metaslab freelist dataset.  To minimize space consumption, the list
61  * is circular.
62  *
63  * Allocations and frees can happen in multiple transaction groups at
64  * the same time, which makes it a bit challenging to keep the metaslab
65  * consistent.  For example, we cannot allow frees from different
66  * transaction groups to be interleaved in the metaslab's free block list.
67  *
68  * We address this in several ways:
69  *
70  *	We don't allow allocations from the same metaslab in concurrent
71  *	transaction groups.  metaslab_alloc() enforces this by checking
72  *	the ms_last_alloc field, which specifies the last txg in which
73  *	the metaslab was used for allocations.
74  *
75  *	We can't segregate frees this way because we can't choose which
76  *	DVAs someone wants to free.  So we keep separate in-core freelists
77  *	for each active transaction group.  This in-core data is only
78  *	written to the metaslab's on-disk freelist in metaslab_sync(),
79  *	which solves the interleave problem: we only append frees from
80  *	the syncing txg to the on-disk freelist, so the appends all occur
81  *	in txg order.
82  *
83  *	We cannot allow a block which was freed in a given txg to be
84  *	allocated again until that txg has closed; otherwise, if we
85  *	failed to sync that txg and had to roll back to txg - 1,
86  *	changes in txg + 1 could have overwritten the data.  Therefore,
87  *	we partition the free blocks into "available" and "limbo" states.
88  *	A block is available if the txg in which it was freed has closed;
89  *	until then, the block is in limbo.  Each time metaslab_sync() runs,
90  *	it first adds any limbo blocks to the avail list, clears the limbo
91  *	list, and starts writing the new limbo blocks (i.e. the ones that
92  *	were freed in the syncing txg).
93  */
94 
/*
 * In-core state of a single metaslab.
 *
 * ms_dirty[], ms_allocmap[] and ms_freemap[] each hold TXG_SIZE entries,
 * i.e. one slot per transaction group that can be in flight
 * (NOTE(review): presumably indexed by txg & TXG_MASK -- confirm).
 * ms_group and ms_group_node tie the metaslab into its group's AVL tree.
 */
95 struct metaslab {
96 	kmutex_t	ms_lock;	/* metaslab lock		*/
97 	space_map_obj_t	*ms_smo;	/* space map object		*/
98 	uint64_t	ms_last_alloc;	/* txg of last alloc		*/
99 	uint64_t	ms_usable_end;	/* end of free_obj at last sync	*/
100 	uint64_t	ms_usable_space; /* usable space at last sync	*/
101 	metaslab_group_t *ms_group;	/* metaslab group		*/
102 	avl_node_t	ms_group_node;	/* node in metaslab group tree	*/
103 	uint64_t	ms_weight;	/* weight vs. others in group	*/
104 	uint8_t		ms_dirty[TXG_SIZE];	/* per-txg MSD_* flags	*/
105 	space_map_t	ms_allocmap[TXG_SIZE];  /* allocated this txg	*/
106 	space_map_t	ms_freemap[TXG_SIZE];	/* freed this txg	*/
107 	txg_node_t	ms_txg_node;	/* per-txg dirty metaslab links	*/
108 	space_map_t	ms_map;		/* in-core free space map	*/
109 	uint8_t		ms_map_incore;  /* space map contents are valid */
	/*
	 * One cursor per SPA_ASIZEBITS slot.  NOTE(review): presumably one
	 * allocation cursor per block-size class; the original "XXX -- PPD"
	 * marker is unexplained here -- confirm intent with the allocator.
	 */
110 	uint64_t	ms_map_cursor[SPA_ASIZEBITS]; /* XXX -- PPD	*/
111 };
112 
113 /*
114  * ms_dirty[] flags: distinct bits, so several may be set in one txg slot
115  */
116 #define	MSD_ALLOC	0x01	/* allocated from in this txg		*/
117 #define	MSD_FREE	0x02	/* freed to in this txg			*/
118 #define	MSD_ADD		0x04	/* added to the pool in this txg	*/
119 #define	MSD_CONDENSE	0x08	/* condensed in this txg		*/
120 
121 #ifdef	__cplusplus
122 }
123 #endif
124 
125 #endif	/* _SYS_METASLAB_IMPL_H */
126