/* xref: /freebsd/sys/contrib/openzfs/module/zfs/range_tree.c (revision b59a0cde6a5253f94494397ce5b18dbfa071e08c) */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>

/*
 * Range trees are tree-based data structures that can be used to
 * track free space or generally any space allocation information.
 * A range tree keeps track of individual segments and automatically
 * provides facilities such as adjacent extent merging and extent
 * splitting in response to range add/remove requests.
 *
 * A range tree starts out completely empty, with no segments in it.
 * Adding an allocation via zfs_range_tree_add to the range tree can either:
 * 1) create a new extent
 * 2) extend an adjacent extent
 * 3) merge two adjacent extents
 * Conversely, removing an allocation via zfs_range_tree_remove can:
 * 1) completely remove an extent
 * 2) shorten an extent (if the allocation was near one of its ends)
 * 3) split an extent into two extents, in effect punching a hole
 *
 * A range tree is also capable of 'bridging' gaps when adding
 * allocations. This is useful for cases when close proximity of
 * allocations is an important detail that needs to be represented
 * in the range tree. See zfs_range_tree_set_gap(). The default behavior
 * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
 *
 * In order to traverse a range tree, use either the zfs_range_tree_walk()
 * or zfs_range_tree_vacate() functions.
 *
 * To obtain more accurate information on individual segment
 * operations that the range tree performs "under the hood", you can
 * specify a set of callbacks by passing a zfs_range_tree_ops_t structure
 * to the zfs_range_tree_create function. Any callbacks that are non-NULL
 * are then called at the appropriate times.
 *
 * The range tree code also supports a special variant of range trees
 * that can bridge small gaps between segments. This kind of tree is used
 * by the dsl scanning code to group I/Os into mostly sequential chunks to
 * optimize disk performance. The code here attempts to do this with as
 * little memory and computational overhead as possible. One limitation of
 * this implementation is that segments of range trees with gaps can only
 * support removing complete segments.
 */
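/*
 * A minimal usage sketch (illustrative only, not part of the ZFS code
 * itself), assuming a caller that needs no ops callbacks and uses plain
 * 64-bit segments; dump_cb is a hypothetical zfs_range_tree_func_t,
 * i.e. void dump_cb(void *arg, uint64_t start, uint64_t size):
 *
 *	zfs_range_tree_t *rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
 *	    NULL, 0, 0);
 *	zfs_range_tree_add(rt, 0x1000, 0x1000);   adds [0x1000, 0x2000)
 *	zfs_range_tree_add(rt, 0x2000, 0x1000);   merges into [0x1000, 0x3000)
 *	zfs_range_tree_remove(rt, 0x1800, 0x400); splits it, punching a hole
 *	zfs_range_tree_walk(rt, dump_cb, NULL);   visits both remaining segments
 *	zfs_range_tree_vacate(rt, NULL, NULL);    empties the tree
 *	zfs_range_tree_destroy(rt);
 */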

static inline void
zfs_rs_copy(zfs_range_seg_t *src, zfs_range_seg_t *dest, zfs_range_tree_t *rt)
{
	ASSERT3U(rt->rt_type, <, ZFS_RANGE_SEG_NUM_TYPES);
	size_t size = 0;
	switch (rt->rt_type) {
	case ZFS_RANGE_SEG32:
		size = sizeof (zfs_range_seg32_t);
		break;
	case ZFS_RANGE_SEG64:
		size = sizeof (zfs_range_seg64_t);
		break;
	case ZFS_RANGE_SEG_GAP:
		size = sizeof (zfs_range_seg_gap_t);
		break;
	default:
		__builtin_unreachable();
	}
	memcpy(dest, src, size);
}

void
zfs_range_tree_stat_verify(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs;
	zfs_btree_index_t where;
	uint64_t hist[ZFS_RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
	int i;

	for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL;
	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		uint64_t size = zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt);
		int idx = highbit64(size) - 1;

		hist[idx]++;
		ASSERT3U(hist[idx], !=, 0);
	}

	for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
		if (hist[i] != rt->rt_histogram[i]) {
			zfs_dbgmsg("i=%d, hist=%px, hist=%llu, rt_hist=%llu",
			    i, hist, (u_longlong_t)hist[i],
			    (u_longlong_t)rt->rt_histogram[i]);
		}
		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
	}
}

static void
zfs_range_tree_stat_incr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
{
	uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	rt->rt_histogram[idx]++;
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
}

static void
zfs_range_tree_stat_decr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
{
	uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	ASSERT3U(rt->rt_histogram[idx], !=, 0);
	rt->rt_histogram[idx]--;
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg32_compare(const void *x1, const void *x2)
{
	const zfs_range_seg32_t *r1 = x1;
	const zfs_range_seg32_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg64_compare(const void *x1, const void *x2)
{
	const zfs_range_seg64_t *r1 = x1;
	const zfs_range_seg64_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg_gap_compare(const void *x1, const void *x2)
{
	const zfs_range_seg_gap_t *r1 = x1;
	const zfs_range_seg_gap_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg32_find_in_buf, zfs_range_seg32_t,
    zfs_range_tree_seg32_compare)

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t,
    zfs_range_tree_seg64_compare)

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf,
    zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare)

zfs_range_tree_t *
zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
    uint64_t gap)
{
	zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP);

	ASSERT3U(shift, <, 64);
	ASSERT3U(type, <=, ZFS_RANGE_SEG_NUM_TYPES);
	size_t size;
	int (*compare) (const void *, const void *);
	bt_find_in_buf_f bt_find;
	switch (type) {
	case ZFS_RANGE_SEG32:
		size = sizeof (zfs_range_seg32_t);
		compare = zfs_range_tree_seg32_compare;
		bt_find = zfs_range_tree_seg32_find_in_buf;
		break;
	case ZFS_RANGE_SEG64:
		size = sizeof (zfs_range_seg64_t);
		compare = zfs_range_tree_seg64_compare;
		bt_find = zfs_range_tree_seg64_find_in_buf;
		break;
	case ZFS_RANGE_SEG_GAP:
		size = sizeof (zfs_range_seg_gap_t);
		compare = zfs_range_tree_seg_gap_compare;
		bt_find = zfs_range_tree_seg_gap_find_in_buf;
		break;
	default:
		panic("Invalid range seg type %d", type);
	}
	zfs_btree_create(&rt->rt_root, compare, bt_find, size);

	rt->rt_ops = ops;
	rt->rt_gap = gap;
	rt->rt_arg = arg;
	rt->rt_type = type;
	rt->rt_start = start;
	rt->rt_shift = shift;

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
		rt->rt_ops->rtop_create(rt, rt->rt_arg);

	return (rt);
}

zfs_range_tree_t *
zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift)
{
	return (zfs_range_tree_create_gap(ops, type, arg, start, shift, 0));
}

void
zfs_range_tree_destroy(zfs_range_tree_t *rt)
{
	VERIFY0(rt->rt_space);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

	zfs_btree_destroy(&rt->rt_root);
	kmem_free(rt, sizeof (*rt));
}

void
zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    int64_t delta)
{
	if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) {
		zfs_panic_recover("zfs: attempting to decrease fill to or "
		    "below 0; probable double remove in segment [%llx:%llx]",
		    (longlong_t)zfs_rs_get_start(rs, rt),
		    (longlong_t)zfs_rs_get_end(rs, rt));
	}
	if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) -
	    zfs_rs_get_start(rs, rt)) {
		zfs_panic_recover("zfs: attempting to increase fill beyond "
		    "max; probable double add in segment [%llx:%llx]",
		    (longlong_t)zfs_rs_get_start(rs, rt),
		    (longlong_t)zfs_rs_get_end(rs, rt));
	}

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
	zfs_rs_set_fill(rs, rt, zfs_rs_get_fill(rs, rt) + delta);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}

static void
zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
{
	zfs_range_tree_t *rt = arg;
	zfs_btree_index_t where;
	zfs_range_seg_t *rs_before, *rs_after, *rs;
	zfs_range_seg_max_t tmp, rsearch;
	uint64_t end = start + size, gap = rt->rt_gap;
	uint64_t bridge_size = 0;
	boolean_t merge_before, merge_after;

	ASSERT3U(size, !=, 0);
	ASSERT3U(fill, <=, size);
	ASSERT3U(start + size, >, start);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);

	/*
	 * If this is a gap-supporting range tree, it is possible that we
	 * are inserting into an existing segment. In this case simply
	 * bump the fill count and call the remove / add callbacks. If the
	 * new range will extend an existing segment, we remove the
	 * existing one, apply the new extent to it and re-insert it using
	 * the normal code paths.
	 */
	if (rs != NULL) {
		if (gap == 0) {
			zfs_panic_recover("zfs: adding existent segment to "
			    "range tree (offset=%llx size=%llx)",
			    (longlong_t)start, (longlong_t)size);
			return;
		}
		uint64_t rstart = zfs_rs_get_start(rs, rt);
		uint64_t rend = zfs_rs_get_end(rs, rt);
		if (rstart <= start && rend >= end) {
			zfs_range_tree_adjust_fill(rt, rs, fill);
			return;
		}

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs);
		rt->rt_space -= rend - rstart;

		fill += zfs_rs_get_fill(rs, rt);
		start = MIN(start, rstart);
		end = MAX(end, rend);
		size = end - start;

		zfs_btree_remove(&rt->rt_root, rs);
		zfs_range_tree_add_impl(rt, start, size, fill);
		return;
	}

	ASSERT3P(rs, ==, NULL);

	/*
	 * Determine whether or not we will have to merge with our neighbors.
	 * If gap != 0, we might need to merge with our neighbors even if we
	 * aren't directly touching.
	 */
	zfs_btree_index_t where_before, where_after;
	rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before);
	rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after);

	merge_before = (rs_before != NULL && zfs_rs_get_end(rs_before, rt) >=
	    start - gap);
	merge_after = (rs_after != NULL && zfs_rs_get_start(rs_after, rt) <=
	    end + gap);

	if (merge_before && gap != 0)
		bridge_size += start - zfs_rs_get_end(rs_before, rt);
	if (merge_after && gap != 0)
		bridge_size += zfs_rs_get_start(rs_after, rt) - end;

	if (merge_before && merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
		}

		zfs_range_tree_stat_decr(rt, rs_before);
		zfs_range_tree_stat_decr(rt, rs_after);

		zfs_rs_copy(rs_after, &tmp, rt);
		uint64_t before_start = zfs_rs_get_start_raw(rs_before, rt);
		uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
		uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
		zfs_btree_remove_idx(&rt->rt_root, &where_before);

		/*
		 * We have to re-find the node because our old reference is
		 * invalid as soon as we do any mutating btree operations.
		 */
		rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after);
		ASSERT3P(rs_after, !=, NULL);
		zfs_rs_set_start_raw(rs_after, rt, before_start);
		zfs_rs_set_fill(rs_after, rt, after_fill + before_fill + fill);
		rs = rs_after;
	} else if (merge_before) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs_before);

		uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
		zfs_rs_set_end(rs_before, rt, end);
		zfs_rs_set_fill(rs_before, rt, before_fill + fill);
		rs = rs_before;
	} else if (merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs_after);

		uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
		zfs_rs_set_start(rs_after, rt, start);
		zfs_rs_set_fill(rs_after, rt, after_fill + fill);
		rs = rs_after;
	} else {
		rs = &tmp;

		zfs_rs_set_start(rs, rt, start);
		zfs_rs_set_end(rs, rt, end);
		zfs_rs_set_fill(rs, rt, fill);
		zfs_btree_add_idx(&rt->rt_root, rs, &where);
	}

	if (gap != 0) {
		ASSERT3U(zfs_rs_get_fill(rs, rt), <=, zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	} else {
		ASSERT3U(zfs_rs_get_fill(rs, rt), ==, zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	}

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	zfs_range_tree_stat_incr(rt, rs);
	rt->rt_space += size + bridge_size;
}

void
zfs_range_tree_add(void *arg, uint64_t start, uint64_t size)
{
	zfs_range_tree_add_impl(arg, start, size, size);
}
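
/*
 * Gap-bridging sketch (illustrative only, not part of this file), assuming
 * a hypothetical tree created with zfs_range_tree_create_gap() and gap = 4:
 * adding [0, 10) and then [14, 20) leaves a single bridged segment [0, 20)
 * whose fill is 16 (10 + 6), while rt_space also counts the 4 bridged
 * units, so zfs_range_tree_space() reports 20:
 *
 *	zfs_range_tree_t *gt = zfs_range_tree_create_gap(NULL,
 *	    ZFS_RANGE_SEG_GAP, NULL, 0, 0, 4);
 *	zfs_range_tree_add(gt, 0, 10);
 *	zfs_range_tree_add(gt, 14, 6);
 *	one segment [0, 20), fill 16, zfs_range_tree_space(gt) == 20
 */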

static void
zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
    boolean_t do_fill)
{
	zfs_btree_index_t where;
	zfs_range_seg_t *rs;
	zfs_range_seg_max_t rsearch, rs_tmp;
	uint64_t end = start + size;
	boolean_t left_over, right_over;

	VERIFY3U(size, !=, 0);
	VERIFY3U(size, <=, rt->rt_space);
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);

	/* Make sure we completely overlap with someone */
	if (rs == NULL) {
		zfs_panic_recover("zfs: removing nonexistent segment from "
		    "range tree (offset=%llx size=%llx)",
		    (longlong_t)start, (longlong_t)size);
		return;
	}

	/*
	 * Range trees with gap support must only remove complete segments
	 * from the tree. This allows us to maintain accurate fill accounting
	 * and to ensure that bridged sections are not leaked. If we need to
	 * remove less than the full segment, we can only adjust the fill count.
	 */
	if (rt->rt_gap != 0) {
		if (do_fill) {
			if (zfs_rs_get_fill(rs, rt) == size) {
				start = zfs_rs_get_start(rs, rt);
				end = zfs_rs_get_end(rs, rt);
				size = end - start;
			} else {
				zfs_range_tree_adjust_fill(rt, rs, -size);
				return;
			}
		} else if (zfs_rs_get_start(rs, rt) != start ||
		    zfs_rs_get_end(rs, rt) != end) {
			zfs_panic_recover("zfs: freeing partial segment of "
			    "gap tree (offset=%llx size=%llx) of "
			    "(offset=%llx size=%llx)",
			    (longlong_t)start, (longlong_t)size,
			    (longlong_t)zfs_rs_get_start(rs, rt),
			    (longlong_t)zfs_rs_get_end(rs, rt) -
			    zfs_rs_get_start(rs, rt));
			return;
		}
	}

	VERIFY3U(zfs_rs_get_start(rs, rt), <=, start);
	VERIFY3U(zfs_rs_get_end(rs, rt), >=, end);

	left_over = (zfs_rs_get_start(rs, rt) != start);
	right_over = (zfs_rs_get_end(rs, rt) != end);

	zfs_range_tree_stat_decr(rt, rs);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	if (left_over && right_over) {
		zfs_range_seg_max_t newseg;
		zfs_rs_set_start(&newseg, rt, end);
		zfs_rs_set_end_raw(&newseg, rt, zfs_rs_get_end_raw(rs, rt));
		zfs_rs_set_fill(&newseg, rt, zfs_rs_get_end(rs, rt) - end);
		zfs_range_tree_stat_incr(rt, &newseg);

		// This modifies the buffer already inside the range tree
		zfs_rs_set_end(rs, rt, start);

		zfs_rs_copy(rs, &rs_tmp, rt);
		if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL)
			zfs_btree_add_idx(&rt->rt_root, &newseg, &where);
		else
			zfs_btree_add(&rt->rt_root, &newseg);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg);
	} else if (left_over) {
		// This modifies the buffer already inside the range tree
		zfs_rs_set_end(rs, rt, start);
		zfs_rs_copy(rs, &rs_tmp, rt);
	} else if (right_over) {
		// This modifies the buffer already inside the range tree
		zfs_rs_set_start(rs, rt, end);
		zfs_rs_copy(rs, &rs_tmp, rt);
	} else {
		zfs_btree_remove_idx(&rt->rt_root, &where);
		rs = NULL;
	}

	if (rs != NULL) {
		/*
		 * The fill of the leftover segment will always be equal to
		 * the size, since we do not support removing partial segments
		 * of range trees with gaps.
		 */
		zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
		    zfs_rs_get_start_raw(rs, rt));
		zfs_range_tree_stat_incr(rt, &rs_tmp);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg);
	}

	rt->rt_space -= size;
}

void
zfs_range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
	zfs_range_tree_remove_impl(arg, start, size, B_FALSE);
}

void
zfs_range_tree_remove_fill(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_tree_remove_impl(rt, start, size, B_TRUE);
}

void
zfs_range_tree_resize_segment(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    uint64_t newstart, uint64_t newsize)
{
	int64_t delta = newsize - (zfs_rs_get_end(rs, rt) -
	    zfs_rs_get_start(rs, rt));

	zfs_range_tree_stat_decr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	zfs_rs_set_start(rs, rt, newstart);
	zfs_rs_set_end(rs, rt, newstart + newsize);

	zfs_range_tree_stat_incr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	rt->rt_space += delta;
}

static zfs_range_seg_t *
zfs_range_tree_find_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_seg_max_t rsearch;
	uint64_t end = start + size;

	VERIFY(size != 0);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	return (zfs_btree_find(&rt->rt_root, &rsearch, NULL));
}

zfs_range_seg_t *
zfs_range_tree_find(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_range_seg_t *rs = zfs_range_tree_find_impl(rt, start, size);
	if (rs != NULL && zfs_rs_get_start(rs, rt) <= start &&
	    zfs_rs_get_end(rs, rt) >= start + size) {
		return (rs);
	}
	return (NULL);
}

void
zfs_range_tree_verify_not_present(zfs_range_tree_t *rt, uint64_t off,
    uint64_t size)
{
	zfs_range_seg_t *rs = zfs_range_tree_find(rt, off, size);
	if (rs != NULL)
		panic("segment already in tree; rs=%p", (void *)rs);
}

boolean_t
zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	return (zfs_range_tree_find(rt, start, size) != NULL);
}

/*
 * Returns the first subset of the given range which overlaps with the range
 * tree. Returns true if there is a segment in the range, and false if there
 * isn't.
 */
boolean_t
zfs_range_tree_find_in(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
    uint64_t *ostart, uint64_t *osize)
{
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_range_seg_max_t rsearch;
	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end_raw(&rsearch, rt, zfs_rs_get_start_raw(&rsearch, rt) +
	    1);

	zfs_btree_index_t where;
	zfs_range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
	if (rs != NULL) {
		*ostart = start;
		*osize = MIN(size, zfs_rs_get_end(rs, rt) - start);
		return (B_TRUE);
	}

	rs = zfs_btree_next(&rt->rt_root, &where, &where);
	if (rs == NULL || zfs_rs_get_start(rs, rt) > start + size)
		return (B_FALSE);

	*ostart = zfs_rs_get_start(rs, rt);
	*osize = MIN(start + size, zfs_rs_get_end(rs, rt)) -
	    zfs_rs_get_start(rs, rt);
	return (B_TRUE);
}
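
/*
 * Sketch of the semantics above (illustrative only, not part of this file):
 * if the tree holds the single segment [0x100, 0x200), a query over
 * [0x80, 0x180) reports the overlapping prefix that begins at the segment:
 *
 *	uint64_t ostart, osize;
 *	if (zfs_range_tree_find_in(rt, 0x80, 0x100, &ostart, &osize)) {
 *		here ostart == 0x100 and osize == 0x80
 *	}
 */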

/*
 * Ensure that this range is not in the tree, regardless of whether
 * it is currently in the tree.
 */
void
zfs_range_tree_clear(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_seg_t *rs;

	if (size == 0)
		return;

	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	while ((rs = zfs_range_tree_find_impl(rt, start, size)) != NULL) {
		uint64_t free_start = MAX(zfs_rs_get_start(rs, rt), start);
		uint64_t free_end = MIN(zfs_rs_get_end(rs, rt), start + size);
		zfs_range_tree_remove(rt, free_start, free_end - free_start);
	}
}

void
zfs_range_tree_swap(zfs_range_tree_t **rtsrc, zfs_range_tree_t **rtdst)
{
	zfs_range_tree_t *rt;

	ASSERT0(zfs_range_tree_space(*rtdst));
	ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root));

	rt = *rtsrc;
	*rtsrc = *rtdst;
	*rtdst = rt;
}

void
zfs_range_tree_vacate(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
    void *arg)
{
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

	if (func != NULL) {
		zfs_range_seg_t *rs;
		zfs_btree_index_t *cookie = NULL;

		while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) !=
		    NULL) {
			func(arg, zfs_rs_get_start(rs, rt),
			    zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt));
		}
	} else {
		zfs_btree_clear(&rt->rt_root);
	}

	memset(rt->rt_histogram, 0, sizeof (rt->rt_histogram));
	rt->rt_space = 0;
}

void
zfs_range_tree_walk(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
    void *arg)
{
	zfs_btree_index_t where;
	for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where);
	    rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		func(arg, zfs_rs_get_start(rs, rt), zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	}
}

zfs_range_seg_t *
zfs_range_tree_first(zfs_range_tree_t *rt)
{
	return (zfs_btree_first(&rt->rt_root, NULL));
}

uint64_t
zfs_range_tree_space(zfs_range_tree_t *rt)
{
	return (rt->rt_space);
}

uint64_t
zfs_range_tree_numsegs(zfs_range_tree_t *rt)
{
	return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root));
}

boolean_t
zfs_range_tree_is_empty(zfs_range_tree_t *rt)
{
	ASSERT(rt != NULL);
	return (zfs_range_tree_space(rt) == 0);
}

/*
 * Remove any overlapping ranges between the given segment [start, end)
 * from removefrom. Add non-overlapping leftovers to addto.
 */
void
zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
    zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
{
	zfs_btree_index_t where;
	zfs_range_seg_max_t starting_rs;
	zfs_rs_set_start(&starting_rs, removefrom, start);
	zfs_rs_set_end_raw(&starting_rs, removefrom,
	    zfs_rs_get_start_raw(&starting_rs, removefrom) + 1);

	zfs_range_seg_t *curr = zfs_btree_find(&removefrom->rt_root,
	    &starting_rs, &where);

	if (curr == NULL)
		curr = zfs_btree_next(&removefrom->rt_root, &where, &where);

	zfs_range_seg_t *next;
	for (; curr != NULL; curr = next) {
		if (start == end)
			return;
		VERIFY3U(start, <, end);

		/* there is no overlap */
		if (end <= zfs_rs_get_start(curr, removefrom)) {
			zfs_range_tree_add(addto, start, end - start);
			return;
		}

		uint64_t overlap_start = MAX(zfs_rs_get_start(curr, removefrom),
		    start);
		uint64_t overlap_end = MIN(zfs_rs_get_end(curr, removefrom),
		    end);
		uint64_t overlap_size = overlap_end - overlap_start;
		ASSERT3S(overlap_size, >, 0);
		zfs_range_seg_max_t rs;
		zfs_rs_copy(curr, &rs, removefrom);

		zfs_range_tree_remove(removefrom, overlap_start, overlap_size);

		if (start < overlap_start)
			zfs_range_tree_add(addto, start, overlap_start - start);

		start = overlap_end;
		next = zfs_btree_find(&removefrom->rt_root, &rs, &where);
		/*
		 * If we find something here, we only removed part of the
		 * curr segment. Either there's some left at the end
		 * because we've reached the end of the range we're removing,
		 * or there's some left at the start because we started
		 * partway through the range.  Either way, we continue with
		 * the loop. If it's the former, we'll return at the start of
		 * the loop, and if it's the latter we'll see if there is more
		 * area to process.
		 */
		if (next != NULL) {
			ASSERT(start == end || start == zfs_rs_get_end(&rs,
			    removefrom));
		}

		next = zfs_btree_next(&removefrom->rt_root, &where, &where);
	}
	VERIFY3P(curr, ==, NULL);

	if (start != end) {
		VERIFY3U(start, <, end);
		zfs_range_tree_add(addto, start, end - start);
	} else {
		VERIFY3U(start, ==, end);
	}
}

/*
 * For each entry in rt, if it exists in removefrom, remove it
 * from removefrom. Otherwise, add it to addto.
 */
void
zfs_range_tree_remove_xor_add(zfs_range_tree_t *rt,
    zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
{
	zfs_btree_index_t where;
	for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs;
	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		zfs_range_tree_remove_xor_add_segment(zfs_rs_get_start(rs, rt),
		    zfs_rs_get_end(rs, rt), removefrom, addto);
	}
}
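
/*
 * Worked example (illustrative only, not part of this file): if rt holds
 * the single segment [10, 30) and removefrom holds [20, 40), then
 *
 *	zfs_range_tree_remove_xor_add(rt, removefrom, addto);
 *
 * trims the overlap [20, 30) out of removefrom (leaving [30, 40)) and adds
 * the non-overlapping remainder [10, 20) to addto.
 */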

uint64_t
zfs_range_tree_min(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL);
	return (rs != NULL ? zfs_rs_get_start(rs, rt) : 0);
}

uint64_t
zfs_range_tree_max(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL);
	return (rs != NULL ? zfs_rs_get_end(rs, rt) : 0);
}

uint64_t
zfs_range_tree_span(zfs_range_tree_t *rt)
{
	return (zfs_range_tree_max(rt) - zfs_range_tree_min(rt));
}