xref: /illumos-gate/usr/src/uts/common/fs/zfs/range_tree.c (revision 4c28a617e3922d92a58e813a5b955eb526b9c386)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/dmu.h>
32 #include <sys/dnode.h>
33 #include <sys/zio.h>
34 #include <sys/range_tree.h>
35 
36 kmem_cache_t *range_seg_cache;
37 
38 void
39 range_tree_init(void)
40 {
41 	ASSERT(range_seg_cache == NULL);
42 	range_seg_cache = kmem_cache_create("range_seg_cache",
43 	    sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
44 }
45 
46 void
47 range_tree_fini(void)
48 {
49 	kmem_cache_destroy(range_seg_cache);
50 	range_seg_cache = NULL;
51 }
52 
53 void
54 range_tree_stat_verify(range_tree_t *rt)
55 {
56 	range_seg_t *rs;
57 	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
58 	int i;
59 
60 	for (rs = avl_first(&rt->rt_root); rs != NULL;
61 	    rs = AVL_NEXT(&rt->rt_root, rs)) {
62 		uint64_t size = rs->rs_end - rs->rs_start;
63 		int idx	= highbit64(size) - 1;
64 
65 		hist[idx]++;
66 		ASSERT3U(hist[idx], !=, 0);
67 	}
68 
69 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
70 		if (hist[i] != rt->rt_histogram[i]) {
71 			zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
72 			    i, hist, hist[i], rt->rt_histogram[i]);
73 		}
74 		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
75 	}
76 }
77 
78 static void
79 range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
80 {
81 	uint64_t size = rs->rs_end - rs->rs_start;
82 	int idx = highbit64(size) - 1;
83 
84 	ASSERT(size != 0);
85 	ASSERT3U(idx, <,
86 	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
87 
88 	rt->rt_histogram[idx]++;
89 	ASSERT3U(rt->rt_histogram[idx], !=, 0);
90 }
91 
92 static void
93 range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
94 {
95 	uint64_t size = rs->rs_end - rs->rs_start;
96 	int idx = highbit64(size) - 1;
97 
98 	ASSERT(size != 0);
99 	ASSERT3U(idx, <,
100 	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
101 
102 	ASSERT3U(rt->rt_histogram[idx], !=, 0);
103 	rt->rt_histogram[idx]--;
104 }
105 
106 /*
107  * NOTE: caller is responsible for all locking.
108  */
109 static int
110 range_tree_seg_compare(const void *x1, const void *x2)
111 {
112 	const range_seg_t *r1 = x1;
113 	const range_seg_t *r2 = x2;
114 
115 	if (r1->rs_start < r2->rs_start) {
116 		if (r1->rs_end > r2->rs_start)
117 			return (0);
118 		return (-1);
119 	}
120 	if (r1->rs_start > r2->rs_start) {
121 		if (r1->rs_start < r2->rs_end)
122 			return (0);
123 		return (1);
124 	}
125 	return (0);
126 }
127 
128 range_tree_t *
129 range_tree_create(range_tree_ops_t *ops, void *arg)
130 {
131 	range_tree_t *rt;
132 
133 	rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
134 
135 	avl_create(&rt->rt_root, range_tree_seg_compare,
136 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
137 
138 	rt->rt_ops = ops;
139 	rt->rt_arg = arg;
140 
141 	if (rt->rt_ops != NULL)
142 		rt->rt_ops->rtop_create(rt, rt->rt_arg);
143 
144 	return (rt);
145 }
146 
147 void
148 range_tree_destroy(range_tree_t *rt)
149 {
150 	VERIFY0(rt->rt_space);
151 
152 	if (rt->rt_ops != NULL)
153 		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
154 
155 	avl_destroy(&rt->rt_root);
156 	kmem_free(rt, sizeof (*rt));
157 }
158 
159 void
160 range_tree_add(void *arg, uint64_t start, uint64_t size)
161 {
162 	range_tree_t *rt = arg;
163 	avl_index_t where;
164 	range_seg_t rsearch, *rs_before, *rs_after, *rs;
165 	uint64_t end = start + size;
166 	boolean_t merge_before, merge_after;
167 
168 	VERIFY(size != 0);
169 
170 	rsearch.rs_start = start;
171 	rsearch.rs_end = end;
172 	rs = avl_find(&rt->rt_root, &rsearch, &where);
173 
174 	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
175 		zfs_panic_recover("zfs: allocating allocated segment"
176 		    "(offset=%llu size=%llu)\n",
177 		    (longlong_t)start, (longlong_t)size);
178 		return;
179 	}
180 
181 	/* Make sure we don't overlap with either of our neighbors */
182 	VERIFY3P(rs, ==, NULL);
183 
184 	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
185 	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
186 
187 	merge_before = (rs_before != NULL && rs_before->rs_end == start);
188 	merge_after = (rs_after != NULL && rs_after->rs_start == end);
189 
190 	if (merge_before && merge_after) {
191 		avl_remove(&rt->rt_root, rs_before);
192 		if (rt->rt_ops != NULL) {
193 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
194 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
195 		}
196 
197 		range_tree_stat_decr(rt, rs_before);
198 		range_tree_stat_decr(rt, rs_after);
199 
200 		rs_after->rs_start = rs_before->rs_start;
201 		kmem_cache_free(range_seg_cache, rs_before);
202 		rs = rs_after;
203 	} else if (merge_before) {
204 		if (rt->rt_ops != NULL)
205 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
206 
207 		range_tree_stat_decr(rt, rs_before);
208 
209 		rs_before->rs_end = end;
210 		rs = rs_before;
211 	} else if (merge_after) {
212 		if (rt->rt_ops != NULL)
213 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
214 
215 		range_tree_stat_decr(rt, rs_after);
216 
217 		rs_after->rs_start = start;
218 		rs = rs_after;
219 	} else {
220 		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
221 		rs->rs_start = start;
222 		rs->rs_end = end;
223 		avl_insert(&rt->rt_root, rs, where);
224 	}
225 
226 	if (rt->rt_ops != NULL)
227 		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
228 
229 	range_tree_stat_incr(rt, rs);
230 	rt->rt_space += size;
231 }
232 
233 void
234 range_tree_remove(void *arg, uint64_t start, uint64_t size)
235 {
236 	range_tree_t *rt = arg;
237 	avl_index_t where;
238 	range_seg_t rsearch, *rs, *newseg;
239 	uint64_t end = start + size;
240 	boolean_t left_over, right_over;
241 
242 	VERIFY3U(size, !=, 0);
243 	VERIFY3U(size, <=, rt->rt_space);
244 
245 	rsearch.rs_start = start;
246 	rsearch.rs_end = end;
247 	rs = avl_find(&rt->rt_root, &rsearch, &where);
248 
249 	/* Make sure we completely overlap with someone */
250 	if (rs == NULL) {
251 		zfs_panic_recover("zfs: freeing free segment "
252 		    "(offset=%llu size=%llu)",
253 		    (longlong_t)start, (longlong_t)size);
254 		return;
255 	}
256 	VERIFY3U(rs->rs_start, <=, start);
257 	VERIFY3U(rs->rs_end, >=, end);
258 
259 	left_over = (rs->rs_start != start);
260 	right_over = (rs->rs_end != end);
261 
262 	range_tree_stat_decr(rt, rs);
263 
264 	if (rt->rt_ops != NULL)
265 		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
266 
267 	if (left_over && right_over) {
268 		newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
269 		newseg->rs_start = end;
270 		newseg->rs_end = rs->rs_end;
271 		range_tree_stat_incr(rt, newseg);
272 
273 		rs->rs_end = start;
274 
275 		avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
276 		if (rt->rt_ops != NULL)
277 			rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
278 	} else if (left_over) {
279 		rs->rs_end = start;
280 	} else if (right_over) {
281 		rs->rs_start = end;
282 	} else {
283 		avl_remove(&rt->rt_root, rs);
284 		kmem_cache_free(range_seg_cache, rs);
285 		rs = NULL;
286 	}
287 
288 	if (rs != NULL) {
289 		range_tree_stat_incr(rt, rs);
290 
291 		if (rt->rt_ops != NULL)
292 			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
293 	}
294 
295 	rt->rt_space -= size;
296 }
297 
298 static range_seg_t *
299 range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
300 {
301 	range_seg_t rsearch;
302 	uint64_t end = start + size;
303 
304 	VERIFY(size != 0);
305 
306 	rsearch.rs_start = start;
307 	rsearch.rs_end = end;
308 	return (avl_find(&rt->rt_root, &rsearch, NULL));
309 }
310 
311 static range_seg_t *
312 range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
313 {
314 	range_seg_t *rs = range_tree_find_impl(rt, start, size);
315 	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
316 		return (rs);
317 	return (NULL);
318 }
319 
320 void
321 range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
322 {
323 	range_seg_t *rs;
324 
325 	rs = range_tree_find(rt, off, size);
326 	if (rs != NULL)
327 		panic("freeing free block; rs=%p", (void *)rs);
328 }
329 
330 boolean_t
331 range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
332 {
333 	return (range_tree_find(rt, start, size) != NULL);
334 }
335 
336 /*
337  * Ensure that this range is not in the tree, regardless of whether
338  * it is currently in the tree.
339  */
340 void
341 range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
342 {
343 	range_seg_t *rs;
344 
345 	if (size == 0)
346 		return;
347 
348 	while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
349 		uint64_t free_start = MAX(rs->rs_start, start);
350 		uint64_t free_end = MIN(rs->rs_end, start + size);
351 		range_tree_remove(rt, free_start, free_end - free_start);
352 	}
353 }
354 
355 void
356 range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
357 {
358 	range_tree_t *rt;
359 
360 	ASSERT0(range_tree_space(*rtdst));
361 	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
362 
363 	rt = *rtsrc;
364 	*rtsrc = *rtdst;
365 	*rtdst = rt;
366 }
367 
368 void
369 range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
370 {
371 	range_seg_t *rs;
372 	void *cookie = NULL;
373 
374 
375 	if (rt->rt_ops != NULL)
376 		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
377 
378 	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
379 		if (func != NULL)
380 			func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
381 		kmem_cache_free(range_seg_cache, rs);
382 	}
383 
384 	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
385 	rt->rt_space = 0;
386 }
387 
388 void
389 range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
390 {
391 	range_seg_t *rs;
392 
393 	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
394 		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
395 }
396 
397 uint64_t
398 range_tree_space(range_tree_t *rt)
399 {
400 	return (rt->rt_space);
401 }
402 
403 boolean_t
404 range_tree_is_empty(range_tree_t *rt)
405 {
406 	ASSERT(rt != NULL);
407 	return (range_tree_space(rt) == 0);
408 }
409 
410 uint64_t
411 range_tree_min(range_tree_t *rt)
412 {
413 	range_seg_t *rs = avl_first(&rt->rt_root);
414 	return (rs != NULL ? rs->rs_start : 0);
415 }
416 
417 uint64_t
418 range_tree_max(range_tree_t *rt)
419 {
420 	range_seg_t *rs = avl_last(&rt->rt_root);
421 	return (rs != NULL ? rs->rs_end : 0);
422 }
423 
424 uint64_t
425 range_tree_span(range_tree_t *rt)
426 {
427 	return (range_tree_max(rt) - range_tree_min(rt));
428 }
429