xref: /titanic_52/usr/src/uts/common/fs/zfs/range_tree.c (revision 241996a3debb0b7690aef018488c76770d83311e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2013 by Delphix. All rights reserved.
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/dmu.h>
32 #include <sys/dnode.h>
33 #include <sys/zio.h>
34 #include <sys/range_tree.h>
35 
36 static kmem_cache_t *range_seg_cache;
37 
38 void
39 range_tree_init(void)
40 {
41 	ASSERT(range_seg_cache == NULL);
42 	range_seg_cache = kmem_cache_create("range_seg_cache",
43 	    sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
44 }
45 
46 void
47 range_tree_fini(void)
48 {
49 	kmem_cache_destroy(range_seg_cache);
50 	range_seg_cache = NULL;
51 }
52 
53 void
54 range_tree_stat_verify(range_tree_t *rt)
55 {
56 	range_seg_t *rs;
57 	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
58 	int i;
59 
60 	for (rs = avl_first(&rt->rt_root); rs != NULL;
61 	    rs = AVL_NEXT(&rt->rt_root, rs)) {
62 		uint64_t size = rs->rs_end - rs->rs_start;
63 		int idx	= highbit(size) - 1;
64 
65 		hist[idx]++;
66 		ASSERT3U(hist[idx], !=, 0);
67 	}
68 
69 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
70 		if (hist[i] != rt->rt_histogram[i]) {
71 			zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
72 			    i, hist, hist[i], rt->rt_histogram[i]);
73 		}
74 		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
75 	}
76 }
77 
78 static void
79 range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
80 {
81 	uint64_t size = rs->rs_end - rs->rs_start;
82 	int idx = highbit(size) - 1;
83 
84 	ASSERT3U(idx, <,
85 	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
86 
87 	ASSERT(MUTEX_HELD(rt->rt_lock));
88 	rt->rt_histogram[idx]++;
89 	ASSERT3U(rt->rt_histogram[idx], !=, 0);
90 }
91 
92 static void
93 range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
94 {
95 	uint64_t size = rs->rs_end - rs->rs_start;
96 	int idx = highbit(size) - 1;
97 
98 	ASSERT3U(idx, <,
99 	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
100 
101 	ASSERT(MUTEX_HELD(rt->rt_lock));
102 	ASSERT3U(rt->rt_histogram[idx], !=, 0);
103 	rt->rt_histogram[idx]--;
104 }
105 
106 /*
107  * NOTE: caller is responsible for all locking.
108  */
109 static int
110 range_tree_seg_compare(const void *x1, const void *x2)
111 {
112 	const range_seg_t *r1 = x1;
113 	const range_seg_t *r2 = x2;
114 
115 	if (r1->rs_start < r2->rs_start) {
116 		if (r1->rs_end > r2->rs_start)
117 			return (0);
118 		return (-1);
119 	}
120 	if (r1->rs_start > r2->rs_start) {
121 		if (r1->rs_start < r2->rs_end)
122 			return (0);
123 		return (1);
124 	}
125 	return (0);
126 }
127 
128 range_tree_t *
129 range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
130 {
131 	range_tree_t *rt;
132 
133 	rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
134 
135 	avl_create(&rt->rt_root, range_tree_seg_compare,
136 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
137 
138 	rt->rt_lock = lp;
139 	rt->rt_ops = ops;
140 	rt->rt_arg = arg;
141 
142 	if (rt->rt_ops != NULL)
143 		rt->rt_ops->rtop_create(rt, rt->rt_arg);
144 
145 	return (rt);
146 }
147 
148 void
149 range_tree_destroy(range_tree_t *rt)
150 {
151 	VERIFY0(rt->rt_space);
152 
153 	if (rt->rt_ops != NULL)
154 		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
155 
156 	avl_destroy(&rt->rt_root);
157 	kmem_free(rt, sizeof (*rt));
158 }
159 
160 void
161 range_tree_add(void *arg, uint64_t start, uint64_t size)
162 {
163 	range_tree_t *rt = arg;
164 	avl_index_t where;
165 	range_seg_t rsearch, *rs_before, *rs_after, *rs;
166 	uint64_t end = start + size;
167 	boolean_t merge_before, merge_after;
168 
169 	ASSERT(MUTEX_HELD(rt->rt_lock));
170 	VERIFY(size != 0);
171 
172 	rsearch.rs_start = start;
173 	rsearch.rs_end = end;
174 	rs = avl_find(&rt->rt_root, &rsearch, &where);
175 
176 	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
177 		zfs_panic_recover("zfs: allocating allocated segment"
178 		    "(offset=%llu size=%llu)\n",
179 		    (longlong_t)start, (longlong_t)size);
180 		return;
181 	}
182 
183 	/* Make sure we don't overlap with either of our neighbors */
184 	VERIFY(rs == NULL);
185 
186 	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
187 	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
188 
189 	merge_before = (rs_before != NULL && rs_before->rs_end == start);
190 	merge_after = (rs_after != NULL && rs_after->rs_start == end);
191 
192 	if (merge_before && merge_after) {
193 		avl_remove(&rt->rt_root, rs_before);
194 		if (rt->rt_ops != NULL) {
195 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
196 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
197 		}
198 
199 		range_tree_stat_decr(rt, rs_before);
200 		range_tree_stat_decr(rt, rs_after);
201 
202 		rs_after->rs_start = rs_before->rs_start;
203 		kmem_cache_free(range_seg_cache, rs_before);
204 		rs = rs_after;
205 	} else if (merge_before) {
206 		if (rt->rt_ops != NULL)
207 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
208 
209 		range_tree_stat_decr(rt, rs_before);
210 
211 		rs_before->rs_end = end;
212 		rs = rs_before;
213 	} else if (merge_after) {
214 		if (rt->rt_ops != NULL)
215 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
216 
217 		range_tree_stat_decr(rt, rs_after);
218 
219 		rs_after->rs_start = start;
220 		rs = rs_after;
221 	} else {
222 		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
223 		rs->rs_start = start;
224 		rs->rs_end = end;
225 		avl_insert(&rt->rt_root, rs, where);
226 	}
227 
228 	if (rt->rt_ops != NULL)
229 		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
230 
231 	range_tree_stat_incr(rt, rs);
232 	rt->rt_space += size;
233 }
234 
235 void
236 range_tree_remove(void *arg, uint64_t start, uint64_t size)
237 {
238 	range_tree_t *rt = arg;
239 	avl_index_t where;
240 	range_seg_t rsearch, *rs, *newseg;
241 	uint64_t end = start + size;
242 	boolean_t left_over, right_over;
243 
244 	ASSERT(MUTEX_HELD(rt->rt_lock));
245 	VERIFY3U(size, !=, 0);
246 	VERIFY3U(size, <=, rt->rt_space);
247 
248 	rsearch.rs_start = start;
249 	rsearch.rs_end = end;
250 	rs = avl_find(&rt->rt_root, &rsearch, &where);
251 
252 	/* Make sure we completely overlap with someone */
253 	if (rs == NULL) {
254 		zfs_panic_recover("zfs: freeing free segment "
255 		    "(offset=%llu size=%llu)",
256 		    (longlong_t)start, (longlong_t)size);
257 		return;
258 	}
259 	VERIFY3U(rs->rs_start, <=, start);
260 	VERIFY3U(rs->rs_end, >=, end);
261 
262 	left_over = (rs->rs_start != start);
263 	right_over = (rs->rs_end != end);
264 
265 	range_tree_stat_decr(rt, rs);
266 
267 	if (rt->rt_ops != NULL)
268 		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
269 
270 	if (left_over && right_over) {
271 		newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
272 		newseg->rs_start = end;
273 		newseg->rs_end = rs->rs_end;
274 		range_tree_stat_incr(rt, newseg);
275 
276 		rs->rs_end = start;
277 
278 		avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
279 		if (rt->rt_ops != NULL)
280 			rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
281 	} else if (left_over) {
282 		rs->rs_end = start;
283 	} else if (right_over) {
284 		rs->rs_start = end;
285 	} else {
286 		avl_remove(&rt->rt_root, rs);
287 		kmem_cache_free(range_seg_cache, rs);
288 		rs = NULL;
289 	}
290 
291 	if (rs != NULL) {
292 		range_tree_stat_incr(rt, rs);
293 
294 		if (rt->rt_ops != NULL)
295 			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
296 	}
297 
298 	rt->rt_space -= size;
299 }
300 
301 static range_seg_t *
302 range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size,
303     avl_index_t *wherep)
304 {
305 	range_seg_t rsearch, *rs;
306 	uint64_t end = start + size;
307 
308 	ASSERT(MUTEX_HELD(rt->rt_lock));
309 	VERIFY(size != 0);
310 
311 	rsearch.rs_start = start;
312 	rsearch.rs_end = end;
313 	rs = avl_find(&rt->rt_root, &rsearch, wherep);
314 
315 	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end)
316 		return (rs);
317 	return (NULL);
318 }
319 
320 void
321 range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
322 {
323 	range_seg_t *rs;
324 	avl_index_t where;
325 
326 	mutex_enter(rt->rt_lock);
327 	rs = range_tree_find(rt, off, size, &where);
328 	if (rs != NULL)
329 		panic("freeing free block; rs=%p", (void *)rs);
330 	mutex_exit(rt->rt_lock);
331 }
332 
333 boolean_t
334 range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
335 {
336 	avl_index_t where;
337 
338 	return (range_tree_find(rt, start, size, &where) != NULL);
339 }
340 
341 void
342 range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
343 {
344 	range_tree_t *rt;
345 
346 	ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
347 	ASSERT0(range_tree_space(*rtdst));
348 	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
349 
350 	rt = *rtsrc;
351 	*rtsrc = *rtdst;
352 	*rtdst = rt;
353 }
354 
355 void
356 range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
357 {
358 	range_seg_t *rs;
359 	void *cookie = NULL;
360 
361 	ASSERT(MUTEX_HELD(rt->rt_lock));
362 
363 	if (rt->rt_ops != NULL)
364 		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
365 
366 	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
367 		if (func != NULL)
368 			func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
369 		kmem_cache_free(range_seg_cache, rs);
370 	}
371 
372 	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
373 	rt->rt_space = 0;
374 }
375 
376 void
377 range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
378 {
379 	range_seg_t *rs;
380 
381 	ASSERT(MUTEX_HELD(rt->rt_lock));
382 
383 	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
384 		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
385 }
386 
387 uint64_t
388 range_tree_space(range_tree_t *rt)
389 {
390 	return (rt->rt_space);
391 }
392