xref: /illumos-gate/usr/src/uts/common/fs/zfs/bplist.c (revision e0d8bef656731559f2d1d5dca0077a6a66c6f8b9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/bplist.h>
27 #include <sys/zfs_context.h>
28 
29 static int
30 bplist_hold(bplist_t *bpl)
31 {
32 	ASSERT(MUTEX_HELD(&bpl->bpl_lock));
33 	if (bpl->bpl_dbuf == NULL) {
34 		int err = dmu_bonus_hold(bpl->bpl_mos,
35 		    bpl->bpl_object, bpl, &bpl->bpl_dbuf);
36 		if (err)
37 			return (err);
38 		bpl->bpl_phys = bpl->bpl_dbuf->db_data;
39 	}
40 	return (0);
41 }
42 
43 uint64_t
44 bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
45 {
46 	int size;
47 
48 	size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ?
49 	    BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
50 
51 	return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
52 	    DMU_OT_BPLIST_HDR, size, tx));
53 }
54 
55 void
56 bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
57 {
58 	VERIFY(dmu_object_free(mos, object, tx) == 0);
59 }
60 
61 int
62 bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
63 {
64 	dmu_object_info_t doi;
65 	int err;
66 
67 	err = dmu_object_info(mos, object, &doi);
68 	if (err)
69 		return (err);
70 
71 	mutex_enter(&bpl->bpl_lock);
72 
73 	ASSERT(bpl->bpl_dbuf == NULL);
74 	ASSERT(bpl->bpl_phys == NULL);
75 	ASSERT(bpl->bpl_cached_dbuf == NULL);
76 	ASSERT(bpl->bpl_queue == NULL);
77 	ASSERT(object != 0);
78 	ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST);
79 	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR);
80 
81 	bpl->bpl_mos = mos;
82 	bpl->bpl_object = object;
83 	bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
84 	bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
85 	bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t));
86 
87 	mutex_exit(&bpl->bpl_lock);
88 	return (0);
89 }
90 
91 void
92 bplist_close(bplist_t *bpl)
93 {
94 	mutex_enter(&bpl->bpl_lock);
95 
96 	ASSERT(bpl->bpl_queue == NULL);
97 
98 	if (bpl->bpl_cached_dbuf) {
99 		dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
100 		bpl->bpl_cached_dbuf = NULL;
101 	}
102 	if (bpl->bpl_dbuf) {
103 		dmu_buf_rele(bpl->bpl_dbuf, bpl);
104 		bpl->bpl_dbuf = NULL;
105 		bpl->bpl_phys = NULL;
106 	}
107 
108 	mutex_exit(&bpl->bpl_lock);
109 }
110 
111 boolean_t
112 bplist_empty(bplist_t *bpl)
113 {
114 	boolean_t rv;
115 
116 	if (bpl->bpl_object == 0)
117 		return (B_TRUE);
118 
119 	mutex_enter(&bpl->bpl_lock);
120 	VERIFY(0 == bplist_hold(bpl)); /* XXX */
121 	rv = (bpl->bpl_phys->bpl_entries == 0);
122 	mutex_exit(&bpl->bpl_lock);
123 
124 	return (rv);
125 }
126 
127 static int
128 bplist_cache(bplist_t *bpl, uint64_t blkid)
129 {
130 	int err = 0;
131 
132 	if (bpl->bpl_cached_dbuf == NULL ||
133 	    bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
134 		if (bpl->bpl_cached_dbuf != NULL)
135 			dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
136 		err = dmu_buf_hold(bpl->bpl_mos,
137 		    bpl->bpl_object, blkid << bpl->bpl_blockshift,
138 		    bpl, &bpl->bpl_cached_dbuf);
139 		ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
140 		    1ULL << bpl->bpl_blockshift);
141 	}
142 	return (err);
143 }
144 
145 int
146 bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
147 {
148 	uint64_t blk, off;
149 	blkptr_t *bparray;
150 	int err;
151 
152 	mutex_enter(&bpl->bpl_lock);
153 
154 	err = bplist_hold(bpl);
155 	if (err) {
156 		mutex_exit(&bpl->bpl_lock);
157 		return (err);
158 	}
159 
160 	if (*itorp >= bpl->bpl_phys->bpl_entries) {
161 		mutex_exit(&bpl->bpl_lock);
162 		return (ENOENT);
163 	}
164 
165 	blk = *itorp >> bpl->bpl_bpshift;
166 	off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
167 
168 	err = bplist_cache(bpl, blk);
169 	if (err) {
170 		mutex_exit(&bpl->bpl_lock);
171 		return (err);
172 	}
173 
174 	bparray = bpl->bpl_cached_dbuf->db_data;
175 	*bp = bparray[off];
176 	(*itorp)++;
177 	mutex_exit(&bpl->bpl_lock);
178 	return (0);
179 }
180 
181 int
182 bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
183 {
184 	uint64_t blk, off;
185 	blkptr_t *bparray;
186 	int err;
187 
188 	ASSERT(!BP_IS_HOLE(bp));
189 	mutex_enter(&bpl->bpl_lock);
190 	err = bplist_hold(bpl);
191 	if (err)
192 		return (err);
193 
194 	blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
195 	off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
196 
197 	err = bplist_cache(bpl, blk);
198 	if (err) {
199 		mutex_exit(&bpl->bpl_lock);
200 		return (err);
201 	}
202 
203 	dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
204 	bparray = bpl->bpl_cached_dbuf->db_data;
205 	bparray[off] = *bp;
206 
207 	/* We never need the fill count. */
208 	bparray[off].blk_fill = 0;
209 
210 	/* The bplist will compress better if we can leave off the checksum */
211 	bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
212 
213 	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
214 	bpl->bpl_phys->bpl_entries++;
215 	bpl->bpl_phys->bpl_bytes +=
216 	    bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp);
217 	if (bpl->bpl_havecomp) {
218 		bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
219 		bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
220 	}
221 	mutex_exit(&bpl->bpl_lock);
222 
223 	return (0);
224 }
225 
226 /*
227  * Deferred entry; will be written later by bplist_sync().
228  */
229 void
230 bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp)
231 {
232 	bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
233 
234 	ASSERT(!BP_IS_HOLE(bp));
235 	mutex_enter(&bpl->bpl_lock);
236 	bpq->bpq_blk = *bp;
237 	bpq->bpq_next = bpl->bpl_queue;
238 	bpl->bpl_queue = bpq;
239 	mutex_exit(&bpl->bpl_lock);
240 }
241 
242 void
243 bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
244 {
245 	bplist_q_t *bpq;
246 
247 	mutex_enter(&bpl->bpl_lock);
248 	while ((bpq = bpl->bpl_queue) != NULL) {
249 		bpl->bpl_queue = bpq->bpq_next;
250 		mutex_exit(&bpl->bpl_lock);
251 		VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
252 		kmem_free(bpq, sizeof (*bpq));
253 		mutex_enter(&bpl->bpl_lock);
254 	}
255 	mutex_exit(&bpl->bpl_lock);
256 }
257 
258 void
259 bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
260 {
261 	mutex_enter(&bpl->bpl_lock);
262 	ASSERT3P(bpl->bpl_queue, ==, NULL);
263 	VERIFY(0 == bplist_hold(bpl));
264 	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
265 	VERIFY(0 == dmu_free_range(bpl->bpl_mos,
266 	    bpl->bpl_object, 0, -1ULL, tx));
267 	bpl->bpl_phys->bpl_entries = 0;
268 	bpl->bpl_phys->bpl_bytes = 0;
269 	if (bpl->bpl_havecomp) {
270 		bpl->bpl_phys->bpl_comp = 0;
271 		bpl->bpl_phys->bpl_uncomp = 0;
272 	}
273 	mutex_exit(&bpl->bpl_lock);
274 }
275 
276 int
277 bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
278 {
279 	int err;
280 
281 	mutex_enter(&bpl->bpl_lock);
282 
283 	err = bplist_hold(bpl);
284 	if (err) {
285 		mutex_exit(&bpl->bpl_lock);
286 		return (err);
287 	}
288 
289 	*usedp = bpl->bpl_phys->bpl_bytes;
290 	if (bpl->bpl_havecomp) {
291 		*compp = bpl->bpl_phys->bpl_comp;
292 		*uncompp = bpl->bpl_phys->bpl_uncomp;
293 	}
294 	mutex_exit(&bpl->bpl_lock);
295 
296 	if (!bpl->bpl_havecomp) {
297 		uint64_t itor = 0, comp = 0, uncomp = 0;
298 		blkptr_t bp;
299 
300 		while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
301 			comp += BP_GET_PSIZE(&bp);
302 			uncomp += BP_GET_UCSIZE(&bp);
303 		}
304 		if (err == ENOENT)
305 			err = 0;
306 		*compp = comp;
307 		*uncompp = uncomp;
308 	}
309 
310 	return (err);
311 }
312 
313 /*
314  * Return (in *dasizep) the amount of space on the deadlist which is:
315  * mintxg < blk_birth <= maxtxg
316  */
317 int
318 bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg,
319     uint64_t *dasizep)
320 {
321 	uint64_t size = 0;
322 	uint64_t itor = 0;
323 	blkptr_t bp;
324 	int err;
325 
326 	/*
327 	 * As an optimization, if they want the whole txg range, just
328 	 * get bpl_bytes rather than iterating over the bps.
329 	 */
330 	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) {
331 		mutex_enter(&bpl->bpl_lock);
332 		err = bplist_hold(bpl);
333 		if (err == 0)
334 			*dasizep = bpl->bpl_phys->bpl_bytes;
335 		mutex_exit(&bpl->bpl_lock);
336 		return (err);
337 	}
338 
339 	while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
340 		if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) {
341 			size +=
342 			    bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp);
343 		}
344 	}
345 	if (err == ENOENT)
346 		err = 0;
347 	*dasizep = size;
348 	return (err);
349 }
350