/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zio.h>
#include <sys/space_map.h>

/*
 * Space map routines.
 * NOTE: caller is responsible for all locking.
 */
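
/*
 * Order segments by starting offset.  Two overlapping segments compare
 * as equal, so an avl_find() on a search segment locates any segment
 * it intersects.
 */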
static int
space_map_seg_compare(const void *x1, const void *x2)
{
	const space_seg_t *s1 = x1;
	const space_seg_t *s2 = x2;

	if (s1->ss_start < s2->ss_start) {
		if (s1->ss_end > s2->ss_start)
			return (0);
		return (-1);
	}
	if (s1->ss_start > s2->ss_start) {
		if (s1->ss_start < s2->ss_end)
			return (0);
		return (1);
	}
	return (0);
}

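/*
 * Initialize an empty space map covering the range
 * [start, start + size), with a resolution of 1 << shift bytes.
 */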
void
space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
	kmutex_t *lp)
{
	bzero(sm, sizeof (*sm));

	avl_create(&sm->sm_root, space_map_seg_compare,
	    sizeof (space_seg_t), offsetof(struct space_seg, ss_node));

	sm->sm_start = start;
	sm->sm_size = size;
	sm->sm_shift = shift;
	sm->sm_lock = lp;
}

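/*
 * Destroy a space map.  It must already be unloaded and empty.
 */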
void
space_map_destroy(space_map_t *sm)
{
	ASSERT(!sm->sm_loaded && !sm->sm_loading);
	VERIFY3U(sm->sm_space, ==, 0);
	avl_destroy(&sm->sm_root);
}

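/*
 * Add the range [start, start + size) to the space map, coalescing
 * with adjacent segments.  The range must be properly aligned and
 * must not overlap anything already in the map.
 */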
void
space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
{
	avl_index_t where;
	space_seg_t ssearch, *ss_before, *ss_after, *ss;
	uint64_t end = start + size;
	int merge_before, merge_after;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	VERIFY(size != 0);
	VERIFY3U(start, >=, sm->sm_start);
	VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
	VERIFY(sm->sm_space + size <= sm->sm_size);
	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);

	ssearch.ss_start = start;
	ssearch.ss_end = end;
	ss = avl_find(&sm->sm_root, &ssearch, &where);

	/* Make sure we don't overlap with either of our neighbors */
	VERIFY(ss == NULL);

	ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
	ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);

	merge_before = (ss_before != NULL && ss_before->ss_end == start);
	merge_after = (ss_after != NULL && ss_after->ss_start == end);

	if (merge_before && merge_after) {
		avl_remove(&sm->sm_root, ss_before);
		ss_after->ss_start = ss_before->ss_start;
		kmem_free(ss_before, sizeof (*ss_before));
	} else if (merge_before) {
		ss_before->ss_end = end;
	} else if (merge_after) {
		ss_after->ss_start = start;
	} else {
		ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
		ss->ss_start = start;
		ss->ss_end = end;
		avl_insert(&sm->sm_root, ss, where);
	}

	sm->sm_space += size;
}

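/*
 * Remove the range [start, start + size) from the space map.  The
 * range must lie entirely within a single segment; removing it from
 * the middle of a segment splits the segment in two.
 */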
void
space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
{
	avl_index_t where;
	space_seg_t ssearch, *ss, *newseg;
	uint64_t end = start + size;
	int left_over, right_over;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	VERIFY(size != 0);
	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);

	ssearch.ss_start = start;
	ssearch.ss_end = end;
	ss = avl_find(&sm->sm_root, &ssearch, &where);

	/* Make sure we completely overlap with someone */
	VERIFY(ss != NULL);
	VERIFY3U(ss->ss_start, <=, start);
	VERIFY3U(ss->ss_end, >=, end);
	VERIFY(sm->sm_space - size <= sm->sm_size);

	left_over = (ss->ss_start != start);
	right_over = (ss->ss_end != end);

	if (left_over && right_over) {
		newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
		newseg->ss_start = end;
		newseg->ss_end = ss->ss_end;
		ss->ss_end = start;
		avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
	} else if (left_over) {
		ss->ss_end = start;
	} else if (right_over) {
		ss->ss_start = end;
	} else {
		avl_remove(&sm->sm_root, ss);
		kmem_free(ss, sizeof (*ss));
	}

	sm->sm_space -= size;
}

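/*
 * Return nonzero if the entire range [start, start + size) is
 * contained within a single segment of the space map.
 */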
int
space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
{
	avl_index_t where;
	space_seg_t ssearch, *ss;
	uint64_t end = start + size;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	VERIFY(size != 0);
	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);

	ssearch.ss_start = start;
	ssearch.ss_end = end;
	ss = avl_find(&sm->sm_root, &ssearch, &where);

	return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end);
}

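/*
 * Empty the space map, invoking func(mdest, start, size) on each
 * segment (if func is non-NULL) as it is removed.
 */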
void
space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
{
	space_seg_t *ss;
	void *cookie = NULL;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
		if (func != NULL)
			func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
		kmem_free(ss, sizeof (*ss));
	}
	sm->sm_space = 0;
}

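/*
 * Invoke func(mdest, start, size) on each segment without
 * modifying the space map.
 */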
void
space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
{
	space_seg_t *ss;

	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
		func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
}

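/*
 * Remove any portion of [start, start + size) that intersects the
 * space map.  Unlike space_map_remove(), the range need not be
 * present in its entirety; only the intersections are removed.
 */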
void
space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	avl_index_t where;
	space_seg_t *ss, search;
	uint64_t end = start + size;
	uint64_t rm_start, rm_end;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	search.ss_start = start;
	search.ss_end = start;

	for (;;) {
		ss = avl_find(t, &search, &where);

		if (ss == NULL)
			ss = avl_nearest(t, where, AVL_AFTER);

		if (ss == NULL || ss->ss_start >= end)
			break;

		rm_start = MAX(ss->ss_start, start);
		rm_end = MIN(ss->ss_end, end);

		space_map_remove(sm, rm_start, rm_end - rm_start);
	}
}

/*
 * Replace smd with the union of smd and sms.
 */
void
space_map_union(space_map_t *smd, space_map_t *sms)
{
	avl_tree_t *t = &sms->sm_root;
	space_seg_t *ss;

	ASSERT(MUTEX_HELD(smd->sm_lock));

	/*
	 * For each source segment, remove any intersections with the
	 * destination, then add the source segment to the destination.
	 */
	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
		space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
		space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
	}
}

/*
 * Wait for any in-progress space_map_load() to complete.
 */
void
space_map_load_wait(space_map_t *sm)
{
	ASSERT(MUTEX_HELD(sm->sm_lock));

	while (sm->sm_loading)
		cv_wait(&sm->sm_load_cv, sm->sm_lock);
}

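/*
 * On disk, a space map is a log of allocations and frees, one 64-bit
 * word per entry: ordinary entries encode an offset (in units of
 * 1 << sm_shift, relative to sm_start), an alloc/free type bit, and a
 * run length; entries with the debug bit set record only the action,
 * sync pass, and txg of the write (see the SM_* macros in
 * sys/space_map.h).
 */
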
/*
 * Note: space_map_load() will drop sm_lock across dmu_read() calls.
 * The caller must be OK with this.
 */
int
space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
	space_map_obj_t *smo, objset_t *os)
{
	uint64_t *entry, *entry_map, *entry_map_end;
	uint64_t bufsize, size, offset;
	uint64_t mapstart = sm->sm_start;
	uint64_t end = smo->smo_objsize;
	uint64_t space = smo->smo_alloc;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	space_map_load_wait(sm);

	if (sm->sm_loaded)
		return (0);

	sm->sm_loading = B_TRUE;

	ASSERT(sm->sm_ops == NULL);
	VERIFY3U(sm->sm_space, ==, 0);

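	/*
	 * To reconstruct the free map, start with the entire range free
	 * and replay the log: alloc entries remove space, free entries
	 * put it back.  The final amount of free space is therefore
	 * sm_size - smo_alloc.
	 */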
	if (maptype == SM_FREE) {
		space_map_add(sm, sm->sm_start, sm->sm_size);
		space = sm->sm_size - space;
	}

	bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT;
	entry_map = zio_buf_alloc(bufsize);

	mutex_exit(sm->sm_lock);
	if (end > bufsize)
		dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize);
	mutex_enter(sm->sm_lock);

	for (offset = 0; offset < end; offset += bufsize) {
		size = MIN(end - offset, bufsize);
		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
		VERIFY(size != 0);

		dprintf("object=%llu  offset=%llx  size=%llx\n",
		    smo->smo_object, offset, size);

		mutex_exit(sm->sm_lock);
		VERIFY3U(dmu_read(os, smo->smo_object, offset, size,
		    entry_map), ==, 0);
		mutex_enter(sm->sm_lock);

		entry_map_end = entry_map + (size / sizeof (uint64_t));
		for (entry = entry_map; entry < entry_map_end; entry++) {
			uint64_t e = *entry;

			if (SM_DEBUG_DECODE(e))		/* Skip debug entries */
				continue;

			(SM_TYPE_DECODE(e) == maptype ?
			    space_map_add : space_map_remove)(sm,
			    (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
			    SM_RUN_DECODE(e) << sm->sm_shift);
		}
	}
	VERIFY3U(sm->sm_space, ==, space);

	zio_buf_free(entry_map, bufsize);

	sm->sm_loading = B_FALSE;
	sm->sm_loaded = B_TRUE;
	sm->sm_ops = ops;

	cv_broadcast(&sm->sm_load_cv);

	if (ops != NULL)
		ops->smop_load(sm);

	return (0);
}

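/*
 * Unload the space map: notify the block picker (if any) that the map
 * is going away, then discard all in-core segments.
 */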
void
space_map_unload(space_map_t *sm)
{
	ASSERT(MUTEX_HELD(sm->sm_lock));

	if (sm->sm_loaded && sm->sm_ops != NULL)
		sm->sm_ops->smop_unload(sm);

	sm->sm_loaded = B_FALSE;
	sm->sm_ops = NULL;

	space_map_vacate(sm, NULL, NULL);
}

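/*
 * Have the block picker choose a region of `size' bytes; on success,
 * remove it from the map and return its starting offset.  Returns
 * -1ULL if no suitable region was found.
 */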
uint64_t
space_map_alloc(space_map_t *sm, uint64_t size)
{
	uint64_t start;

	start = sm->sm_ops->smop_alloc(sm, size);
	if (start != -1ULL)
		space_map_remove(sm, start, size);
	return (start);
}

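/*
 * Claim a specific range: notify the block picker, then remove the
 * range from the map.
 */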
void
space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	sm->sm_ops->smop_claim(sm, start, size);
	space_map_remove(sm, start, size);
}

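/*
 * Return a range to the map, then notify the block picker.
 */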
void
space_map_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	space_map_add(sm, start, size);
	sm->sm_ops->smop_free(sm, start, size);
}

/*
 * Append the contents of the in-core space map to the on-disk object
 * as entries of the given maptype, emptying the in-core map as we go.
 *
 * Note: space_map_sync() will drop sm_lock across dmu_write() calls.
 */
void
space_map_sync(space_map_t *sm, uint8_t maptype,
	space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	void *cookie = NULL;
	space_seg_t *ss;
	uint64_t bufsize, start, size, run_len;
	uint64_t *entry, *entry_map, *entry_map_end;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	if (sm->sm_space == 0)
		return;

	dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
	    smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
	    maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
	    sm->sm_space);

	if (maptype == SM_ALLOC)
		smo->smo_alloc += sm->sm_space;
	else
		smo->smo_alloc -= sm->sm_space;

	bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
	bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT);
	entry_map = zio_buf_alloc(bufsize);
	entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
	entry = entry_map;

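	/*
	 * The first entry of every sync is a debug entry recording the
	 * action, sync pass, and txg of this write.
	 */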
	*entry++ = SM_DEBUG_ENCODE(1) |
	    SM_DEBUG_ACTION_ENCODE(maptype) |
	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));

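	/*
	 * Convert each segment into one or more entries, splitting runs
	 * longer than SM_RUN_MAX and flushing the entry buffer to the
	 * object whenever it fills.
	 */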
	while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
		size = ss->ss_end - ss->ss_start;
		start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;

		sm->sm_space -= size;
		size >>= sm->sm_shift;

		while (size) {
			run_len = MIN(size, SM_RUN_MAX);

			if (entry == entry_map_end) {
				mutex_exit(sm->sm_lock);
				dmu_write(os, smo->smo_object, smo->smo_objsize,
				    bufsize, entry_map, tx);
				mutex_enter(sm->sm_lock);
				smo->smo_objsize += bufsize;
				entry = entry_map;
			}

			*entry++ = SM_OFFSET_ENCODE(start) |
			    SM_TYPE_ENCODE(maptype) |
			    SM_RUN_ENCODE(run_len);

			start += run_len;
			size -= run_len;
		}
		kmem_free(ss, sizeof (*ss));
	}

	if (entry != entry_map) {
		size = (entry - entry_map) * sizeof (uint64_t);
		mutex_exit(sm->sm_lock);
		dmu_write(os, smo->smo_object, smo->smo_objsize,
		    size, entry_map, tx);
		mutex_enter(sm->sm_lock);
		smo->smo_objsize += size;
	}

	zio_buf_free(entry_map, bufsize);

	VERIFY3U(sm->sm_space, ==, 0);
}

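/*
 * Discard the on-disk representation entirely: free the object's
 * contents and reset its size and allocated-space bookkeeping.
 */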
void
space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
{
	VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0);

	smo->smo_objsize = 0;
	smo->smo_alloc = 0;
}