xref: /freebsd/sys/contrib/openzfs/module/zfs/range_tree.c (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 /*
27  * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
28  * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
29  */
30 
31 #include <sys/zfs_context.h>
32 #include <sys/spa.h>
33 #include <sys/dmu.h>
34 #include <sys/dnode.h>
35 #include <sys/zio.h>
36 #include <sys/range_tree.h>
37 
38 /*
39  * Range trees are tree-based data structures that can be used to
40  * track free space or generally any space allocation information.
41  * A range tree keeps track of individual segments and automatically
42  * provides facilities such as adjacent extent merging and extent
43  * splitting in response to range add/remove requests.
44  *
45  * A range tree starts out completely empty, with no segments in it.
46  * Adding an allocation via zfs_range_tree_add to the range tree can either:
47  * 1) create a new extent
48  * 2) extend an adjacent extent
49  * 3) merge two adjacent extents
50  * Conversely, removing an allocation via zfs_range_tree_remove can:
51  * 1) completely remove an extent
52  * 2) shorten an extent (if the allocation was near one of its ends)
53  * 3) split an extent into two extents, in effect punching a hole
54  *
55  * A range tree is also capable of 'bridging' gaps when adding
56  * allocations. This is useful for cases when close proximity of
57  * allocations is an important detail that needs to be represented
58  * in the range tree. See zfs_range_tree_create_gap(). The default behavior
59  * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
60  *
61  * In order to traverse a range tree, use either the zfs_range_tree_walk()
62  * or zfs_range_tree_vacate() functions.
63  *
64  * To obtain more accurate information on individual segment
65  * operations that the range tree performs "under the hood", you can
66  * specify a set of callbacks by passing a zfs_range_tree_ops_t structure
67  * to the zfs_range_tree_create function. Any callbacks that are non-NULL
68  * are then called at the appropriate times.
69  *
70  * The range tree code also supports a special variant of range trees
71  * that can bridge small gaps between segments. This kind of tree is used
72  * by the dsl scanning code to group I/Os into mostly sequential chunks to
73  * optimize disk performance. The code here attempts to do this with as
74  * little memory and computational overhead as possible. One limitation of
75  * this implementation is that range trees with gaps only support the
76  * removal of complete segments.
77  */
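
/*
 * Illustrative sketch: a minimal round trip through the interfaces described
 * above, assuming a 64-bit tree with no gap bridging and no callbacks.  The
 * example_* names are placeholders invented for this sketch and are not part
 * of the range tree interface; the block is compiled out.
 */
#if 0
static void
example_print_seg(void *arg, uint64_t start, uint64_t size)
{
	(void) arg;
	zfs_dbgmsg("segment [%llx, %llx)", (u_longlong_t)start,
	    (u_longlong_t)(start + size));
}

static void
example_range_tree_usage(void)
{
	zfs_range_tree_t *rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
	    NULL, 0, 0);

	/* Adjacent additions are merged into a single segment. */
	zfs_range_tree_add(rt, 0x1000, 0x1000);
	zfs_range_tree_add(rt, 0x2000, 0x1000);

	/* Removing from the middle punches a hole, splitting the segment. */
	zfs_range_tree_remove(rt, 0x1800, 0x400);

	/* Visits [0x1000, 0x1800) and [0x1c00, 0x3000). */
	zfs_range_tree_walk(rt, example_print_seg, NULL);

	/* The tree must be empty before it is destroyed. */
	zfs_range_tree_vacate(rt, NULL, NULL);
	zfs_range_tree_destroy(rt);
}
#endif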
78 
79 static inline void
80 zfs_rs_copy(zfs_range_seg_t *src, zfs_range_seg_t *dest, zfs_range_tree_t *rt)
81 {
82 	ASSERT3U(rt->rt_type, <, ZFS_RANGE_SEG_NUM_TYPES);
83 	size_t size = 0;
84 	switch (rt->rt_type) {
85 	case ZFS_RANGE_SEG32:
86 		size = sizeof (zfs_range_seg32_t);
87 		break;
88 	case ZFS_RANGE_SEG64:
89 		size = sizeof (zfs_range_seg64_t);
90 		break;
91 	case ZFS_RANGE_SEG_GAP:
92 		size = sizeof (zfs_range_seg_gap_t);
93 		break;
94 	default:
95 		__builtin_unreachable();
96 	}
97 	memcpy(dest, src, size);
98 }
99 
100 void
101 zfs_range_tree_stat_verify(zfs_range_tree_t *rt)
102 {
103 	zfs_range_seg_t *rs;
104 	zfs_btree_index_t where;
105 	uint64_t hist[ZFS_RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
106 	int i;
107 
108 	for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL;
109 	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
110 		uint64_t size = zfs_rs_get_end(rs, rt) -
111 		    zfs_rs_get_start(rs, rt);
112 		int idx	= highbit64(size) - 1;
113 
114 		hist[idx]++;
115 		ASSERT3U(hist[idx], !=, 0);
116 	}
117 
118 	for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
119 		if (hist[i] != rt->rt_histogram[i]) {
120 			    zfs_dbgmsg("i=%d, hist=%px, hist[i]=%llu, rt_hist[i]=%llu",
121 			    i, hist, (u_longlong_t)hist[i],
122 			    (u_longlong_t)rt->rt_histogram[i]);
123 		}
124 		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
125 	}
126 }
127 
128 static void
129 zfs_range_tree_stat_incr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
130 {
131 	uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
132 	int idx = highbit64(size) - 1;
133 
134 	ASSERT(size != 0);
135 	ASSERT3U(idx, <,
136 	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
137 
138 	rt->rt_histogram[idx]++;
139 	ASSERT3U(rt->rt_histogram[idx], !=, 0);
140 }
141 
142 static void
143 zfs_range_tree_stat_decr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
144 {
145 	uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
146 	int idx = highbit64(size) - 1;
147 
148 	ASSERT(size != 0);
149 	ASSERT3U(idx, <,
150 	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
151 
152 	ASSERT3U(rt->rt_histogram[idx], !=, 0);
153 	rt->rt_histogram[idx]--;
154 }
155 
156 __attribute__((always_inline)) inline
157 static int
158 zfs_range_tree_seg32_compare(const void *x1, const void *x2)
159 {
160 	const zfs_range_seg32_t *r1 = x1;
161 	const zfs_range_seg32_t *r2 = x2;
162 
163 	ASSERT3U(r1->rs_start, <=, r1->rs_end);
164 	ASSERT3U(r2->rs_start, <=, r2->rs_end);
165 
166 	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
167 }
168 
169 __attribute__((always_inline)) inline
170 static int
171 zfs_range_tree_seg64_compare(const void *x1, const void *x2)
172 {
173 	const zfs_range_seg64_t *r1 = x1;
174 	const zfs_range_seg64_t *r2 = x2;
175 
176 	ASSERT3U(r1->rs_start, <=, r1->rs_end);
177 	ASSERT3U(r2->rs_start, <=, r2->rs_end);
178 
179 	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
180 }
181 
182 __attribute__((always_inline)) inline
183 static int
184 zfs_range_tree_seg_gap_compare(const void *x1, const void *x2)
185 {
186 	const zfs_range_seg_gap_t *r1 = x1;
187 	const zfs_range_seg_gap_t *r2 = x2;
188 
189 	ASSERT3U(r1->rs_start, <=, r1->rs_end);
190 	ASSERT3U(r2->rs_start, <=, r2->rs_end);
191 
192 	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
193 }
194 
195 ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg32_find_in_buf, zfs_range_seg32_t,
196     zfs_range_tree_seg32_compare)
197 
198 ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t,
199     zfs_range_tree_seg64_compare)
200 
201 ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf,
202     zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare)
203 
204 zfs_range_tree_t *
205 zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
206     zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
207     uint64_t gap)
208 {
209 	zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP);
210 
211 	ASSERT3U(shift, <, 64);
212 	ASSERT3U(type, <=, ZFS_RANGE_SEG_NUM_TYPES);
213 	size_t size;
214 	int (*compare) (const void *, const void *);
215 	bt_find_in_buf_f bt_find;
216 	switch (type) {
217 	case ZFS_RANGE_SEG32:
218 		size = sizeof (zfs_range_seg32_t);
219 		compare = zfs_range_tree_seg32_compare;
220 		bt_find = zfs_range_tree_seg32_find_in_buf;
221 		break;
222 	case ZFS_RANGE_SEG64:
223 		size = sizeof (zfs_range_seg64_t);
224 		compare = zfs_range_tree_seg64_compare;
225 		bt_find = zfs_range_tree_seg64_find_in_buf;
226 		break;
227 	case ZFS_RANGE_SEG_GAP:
228 		size = sizeof (zfs_range_seg_gap_t);
229 		compare = zfs_range_tree_seg_gap_compare;
230 		bt_find = zfs_range_tree_seg_gap_find_in_buf;
231 		break;
232 	default:
233 		panic("Invalid range seg type %d", type);
234 	}
235 	zfs_btree_create(&rt->rt_root, compare, bt_find, size);
236 
237 	rt->rt_ops = ops;
238 	rt->rt_gap = gap;
239 	rt->rt_arg = arg;
240 	rt->rt_type = type;
241 	rt->rt_start = start;
242 	rt->rt_shift = shift;
243 
244 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
245 		rt->rt_ops->rtop_create(rt, rt->rt_arg);
246 
247 	return (rt);
248 }
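
/*
 * Illustrative sketch: a gap-bridging tree created with the function above,
 * assuming no callbacks.  With a maximum allowed gap of 0x100, two nearby
 * additions are bridged into one segment whose fill reflects only the space
 * actually added.  The example_* name is a placeholder; the block is
 * compiled out.
 */
#if 0
static void
example_gap_tree_usage(void)
{
	zfs_range_tree_t *rt = zfs_range_tree_create_gap(NULL,
	    ZFS_RANGE_SEG_GAP, NULL, 0, 0, 0x100);

	zfs_range_tree_add(rt, 0x0, 0x80);
	zfs_range_tree_add(rt, 0x100, 0x80);

	/* One bridged segment [0x0, 0x180); space includes the bridge. */
	ASSERT3U(zfs_range_tree_numsegs(rt), ==, 1);
	ASSERT3U(zfs_range_tree_space(rt), ==, 0x180);

	zfs_range_tree_vacate(rt, NULL, NULL);
	zfs_range_tree_destroy(rt);
}
#endif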
249 
250 zfs_range_tree_t *
251 zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
252     zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift)
253 {
254 	return (zfs_range_tree_create_gap(ops, type, arg, start, shift, 0));
255 }
256 
257 void
258 zfs_range_tree_destroy(zfs_range_tree_t *rt)
259 {
260 	VERIFY0(rt->rt_space);
261 
262 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
263 		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
264 
265 	zfs_btree_destroy(&rt->rt_root);
266 	kmem_free(rt, sizeof (*rt));
267 }
268 
269 void
270 zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
271     int64_t delta)
272 {
273 	if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) {
274 		zfs_panic_recover("zfs: attempting to decrease fill to or "
275 		    "below 0; probable double remove in segment [%llx:%llx]",
276 		    (longlong_t)zfs_rs_get_start(rs, rt),
277 		    (longlong_t)zfs_rs_get_end(rs, rt));
278 	}
279 	if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) -
280 	    zfs_rs_get_start(rs, rt)) {
281 		zfs_panic_recover("zfs: attempting to increase fill beyond "
282 		    "max; probable double add in segment [%llx:%llx]",
283 		    (longlong_t)zfs_rs_get_start(rs, rt),
284 		    (longlong_t)zfs_rs_get_end(rs, rt));
285 	}
286 
287 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
288 		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
289 	zfs_rs_set_fill(rs, rt, zfs_rs_get_fill(rs, rt) + delta);
290 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
291 		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
292 }
293 
294 static void
295 zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
296 {
297 	zfs_range_tree_t *rt = arg;
298 	zfs_btree_index_t where;
299 	zfs_range_seg_t *rs_before, *rs_after, *rs;
300 	zfs_range_seg_max_t tmp, rsearch;
301 	uint64_t end = start + size, gap = rt->rt_gap;
302 	uint64_t bridge_size = 0;
303 	boolean_t merge_before, merge_after;
304 
305 	ASSERT3U(size, !=, 0);
306 	ASSERT3U(fill, <=, size);
307 	ASSERT3U(start + size, >, start);
308 
309 	zfs_rs_set_start(&rsearch, rt, start);
310 	zfs_rs_set_end(&rsearch, rt, end);
311 	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
312 
313 	/*
314 	 * If this is a gap-supporting range tree, it is possible that we
315 	 * are inserting into an existing segment. In this case simply
316 	 * bump the fill count and call the remove / add callbacks. If the
317 	 * new range will extend an existing segment, we remove the
318 	 * existing one, apply the new extent to it and re-insert it using
319 	 * the normal code paths.
320 	 */
321 	if (rs != NULL) {
322 		if (gap == 0) {
323 			zfs_panic_recover("zfs: adding existent segment to "
324 			    "range tree (offset=%llx size=%llx)",
325 			    (longlong_t)start, (longlong_t)size);
326 			return;
327 		}
328 		uint64_t rstart = zfs_rs_get_start(rs, rt);
329 		uint64_t rend = zfs_rs_get_end(rs, rt);
330 		if (rstart <= start && rend >= end) {
331 			zfs_range_tree_adjust_fill(rt, rs, fill);
332 			return;
333 		}
334 
335 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
336 			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
337 
338 		zfs_range_tree_stat_decr(rt, rs);
339 		rt->rt_space -= rend - rstart;
340 
341 		fill += zfs_rs_get_fill(rs, rt);
342 		start = MIN(start, rstart);
343 		end = MAX(end, rend);
344 		size = end - start;
345 
346 		zfs_btree_remove(&rt->rt_root, rs);
347 		zfs_range_tree_add_impl(rt, start, size, fill);
348 		return;
349 	}
350 
351 	ASSERT3P(rs, ==, NULL);
352 
353 	/*
354 	 * Determine whether or not we will have to merge with our neighbors.
355 	 * If gap != 0, we might need to merge with our neighbors even if we
356 	 * aren't directly touching.
357 	 */
358 	zfs_btree_index_t where_before, where_after;
359 	rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before);
360 	rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after);
361 
362 	merge_before = (rs_before != NULL && zfs_rs_get_end(rs_before, rt) >=
363 	    start - gap);
364 	merge_after = (rs_after != NULL && zfs_rs_get_start(rs_after, rt) <=
365 	    end + gap);
366 
367 	if (merge_before && gap != 0)
368 		bridge_size += start - zfs_rs_get_end(rs_before, rt);
369 	if (merge_after && gap != 0)
370 		bridge_size += zfs_rs_get_start(rs_after, rt) - end;
371 
372 	if (merge_before && merge_after) {
373 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
374 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
375 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
376 		}
377 
378 		zfs_range_tree_stat_decr(rt, rs_before);
379 		zfs_range_tree_stat_decr(rt, rs_after);
380 
381 		zfs_rs_copy(rs_after, &tmp, rt);
382 		uint64_t before_start = zfs_rs_get_start_raw(rs_before, rt);
383 		uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
384 		uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
385 		zfs_btree_remove_idx(&rt->rt_root, &where_before);
386 
387 		/*
388 		 * We have to re-find the node because our old reference is
389 		 * invalid as soon as we do any mutating btree operations.
390 		 */
391 		rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after);
392 		ASSERT3P(rs_after, !=, NULL);
393 		zfs_rs_set_start_raw(rs_after, rt, before_start);
394 		zfs_rs_set_fill(rs_after, rt, after_fill + before_fill + fill);
395 		rs = rs_after;
396 	} else if (merge_before) {
397 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
398 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
399 
400 		zfs_range_tree_stat_decr(rt, rs_before);
401 
402 		uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
403 		zfs_rs_set_end(rs_before, rt, end);
404 		zfs_rs_set_fill(rs_before, rt, before_fill + fill);
405 		rs = rs_before;
406 	} else if (merge_after) {
407 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
408 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
409 
410 		zfs_range_tree_stat_decr(rt, rs_after);
411 
412 		uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
413 		zfs_rs_set_start(rs_after, rt, start);
414 		zfs_rs_set_fill(rs_after, rt, after_fill + fill);
415 		rs = rs_after;
416 	} else {
417 		rs = &tmp;
418 
419 		zfs_rs_set_start(rs, rt, start);
420 		zfs_rs_set_end(rs, rt, end);
421 		zfs_rs_set_fill(rs, rt, fill);
422 		zfs_btree_add_idx(&rt->rt_root, rs, &where);
423 	}
424 
425 	if (gap != 0) {
426 		ASSERT3U(zfs_rs_get_fill(rs, rt), <=, zfs_rs_get_end(rs, rt) -
427 		    zfs_rs_get_start(rs, rt));
428 	} else {
429 		ASSERT3U(zfs_rs_get_fill(rs, rt), ==, zfs_rs_get_end(rs, rt) -
430 		    zfs_rs_get_start(rs, rt));
431 	}
432 
433 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
434 		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
435 
436 	zfs_range_tree_stat_incr(rt, rs);
437 	rt->rt_space += size + bridge_size;
438 }
439 
440 void
441 zfs_range_tree_add(void *arg, uint64_t start, uint64_t size)
442 {
443 	zfs_range_tree_add_impl(arg, start, size, size);
444 }
445 
446 static void
447 zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
448     boolean_t do_fill)
449 {
450 	zfs_btree_index_t where;
451 	zfs_range_seg_t *rs;
452 	zfs_range_seg_max_t rsearch, rs_tmp;
453 	uint64_t end = start + size;
454 	boolean_t left_over, right_over;
455 
456 	VERIFY3U(size, !=, 0);
457 	VERIFY3U(size, <=, rt->rt_space);
458 	if (rt->rt_type == ZFS_RANGE_SEG64)
459 		ASSERT3U(start + size, >, start);
460 
461 	zfs_rs_set_start(&rsearch, rt, start);
462 	zfs_rs_set_end(&rsearch, rt, end);
463 	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
464 
465 	/* Make sure we completely overlap with someone */
466 	if (rs == NULL) {
467 		zfs_panic_recover("zfs: removing nonexistent segment from "
468 		    "range tree (offset=%llx size=%llx)",
469 		    (longlong_t)start, (longlong_t)size);
470 		return;
471 	}
472 
473 	/*
474 	 * Range trees with gap support must only remove complete segments
475 	 * from the tree. This allows us to maintain accurate fill accounting
476 	 * and to ensure that bridged sections are not leaked. If we need to
477 	 * remove less than the full segment, we can only adjust the fill count.
478 	 */
479 	if (rt->rt_gap != 0) {
480 		if (do_fill) {
481 			if (zfs_rs_get_fill(rs, rt) == size) {
482 				start = zfs_rs_get_start(rs, rt);
483 				end = zfs_rs_get_end(rs, rt);
484 				size = end - start;
485 			} else {
486 				zfs_range_tree_adjust_fill(rt, rs, -size);
487 				return;
488 			}
489 		} else if (zfs_rs_get_start(rs, rt) != start ||
490 		    zfs_rs_get_end(rs, rt) != end) {
491 			zfs_panic_recover("zfs: freeing partial segment of "
492 			    "gap tree (offset=%llx size=%llx) of "
493 			    "(offset=%llx size=%llx)",
494 			    (longlong_t)start, (longlong_t)size,
495 			    (longlong_t)zfs_rs_get_start(rs, rt),
496 			    (longlong_t)zfs_rs_get_end(rs, rt) -
497 			    zfs_rs_get_start(rs, rt));
498 			return;
499 		}
500 	}
501 
502 	VERIFY3U(zfs_rs_get_start(rs, rt), <=, start);
503 	VERIFY3U(zfs_rs_get_end(rs, rt), >=, end);
504 
505 	left_over = (zfs_rs_get_start(rs, rt) != start);
506 	right_over = (zfs_rs_get_end(rs, rt) != end);
507 
508 	zfs_range_tree_stat_decr(rt, rs);
509 
510 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
511 		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
512 
513 	if (left_over && right_over) {
514 		zfs_range_seg_max_t newseg;
515 		zfs_rs_set_start(&newseg, rt, end);
516 		zfs_rs_set_end_raw(&newseg, rt, zfs_rs_get_end_raw(rs, rt));
517 		zfs_rs_set_fill(&newseg, rt, zfs_rs_get_end(rs, rt) - end);
518 		zfs_range_tree_stat_incr(rt, &newseg);
519 
520 		// This modifies the buffer already inside the range tree
521 		zfs_rs_set_end(rs, rt, start);
522 
523 		zfs_rs_copy(rs, &rs_tmp, rt);
524 		if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL)
525 			zfs_btree_add_idx(&rt->rt_root, &newseg, &where);
526 		else
527 			zfs_btree_add(&rt->rt_root, &newseg);
528 
529 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
530 			rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg);
531 	} else if (left_over) {
532 		// This modifies the buffer already inside the range tree
533 		zfs_rs_set_end(rs, rt, start);
534 		zfs_rs_copy(rs, &rs_tmp, rt);
535 	} else if (right_over) {
536 		// This modifies the buffer already inside the range tree
537 		zfs_rs_set_start(rs, rt, end);
538 		zfs_rs_copy(rs, &rs_tmp, rt);
539 	} else {
540 		zfs_btree_remove_idx(&rt->rt_root, &where);
541 		rs = NULL;
542 	}
543 
544 	if (rs != NULL) {
545 		/*
546 		 * The fill of the leftover segment will always be equal to
547 		 * the size, since we do not support removing partial segments
548 		 * of range trees with gaps.
549 		 */
550 		zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
551 		    zfs_rs_get_start_raw(rs, rt));
552 		zfs_range_tree_stat_incr(rt, &rs_tmp);
553 
554 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
555 			rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg);
556 	}
557 
558 	rt->rt_space -= size;
559 }
560 
561 void
562 zfs_range_tree_remove(void *arg, uint64_t start, uint64_t size)
563 {
564 	zfs_range_tree_remove_impl(arg, start, size, B_FALSE);
565 }
566 
567 void
568 zfs_range_tree_remove_fill(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
569 {
570 	zfs_range_tree_remove_impl(rt, start, size, B_TRUE);
571 }
572 
573 void
574 zfs_range_tree_resize_segment(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
575     uint64_t newstart, uint64_t newsize)
576 {
577 	int64_t delta = newsize - (zfs_rs_get_end(rs, rt) -
578 	    zfs_rs_get_start(rs, rt));
579 
580 	zfs_range_tree_stat_decr(rt, rs);
581 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
582 		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
583 
584 	zfs_rs_set_start(rs, rt, newstart);
585 	zfs_rs_set_end(rs, rt, newstart + newsize);
586 
587 	zfs_range_tree_stat_incr(rt, rs);
588 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
589 		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
590 
591 	rt->rt_space += delta;
592 }
593 
594 static zfs_range_seg_t *
595 zfs_range_tree_find_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
596 {
597 	zfs_range_seg_max_t rsearch;
598 	uint64_t end = start + size;
599 
600 	VERIFY(size != 0);
601 
602 	zfs_rs_set_start(&rsearch, rt, start);
603 	zfs_rs_set_end(&rsearch, rt, end);
604 	return (zfs_btree_find(&rt->rt_root, &rsearch, NULL));
605 }
606 
607 zfs_range_seg_t *
608 zfs_range_tree_find(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
609 {
610 	if (rt->rt_type == ZFS_RANGE_SEG64)
611 		ASSERT3U(start + size, >, start);
612 
613 	zfs_range_seg_t *rs = zfs_range_tree_find_impl(rt, start, size);
614 	if (rs != NULL && zfs_rs_get_start(rs, rt) <= start &&
615 	    zfs_rs_get_end(rs, rt) >= start + size) {
616 		return (rs);
617 	}
618 	return (NULL);
619 }
620 
621 void
622 zfs_range_tree_verify_not_present(zfs_range_tree_t *rt, uint64_t off,
623     uint64_t size)
624 {
625 	zfs_range_seg_t *rs = zfs_range_tree_find(rt, off, size);
626 	if (rs != NULL)
627 		panic("segment already in tree; rs=%p", (void *)rs);
628 }
629 
630 boolean_t
631 zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
632 {
633 	return (zfs_range_tree_find(rt, start, size) != NULL);
634 }
635 
636 /*
637  * Returns the first subset of the given range which overlaps with the range
638  * tree. Returns true if there is a segment in the range, and false if there
639  * isn't.
640  */
641 boolean_t
642 zfs_range_tree_find_in(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
643     uint64_t *ostart, uint64_t *osize)
644 {
645 	if (rt->rt_type == ZFS_RANGE_SEG64)
646 		ASSERT3U(start + size, >, start);
647 
648 	zfs_range_seg_max_t rsearch;
649 	zfs_rs_set_start(&rsearch, rt, start);
650 	zfs_rs_set_end_raw(&rsearch, rt, zfs_rs_get_start_raw(&rsearch, rt) +
651 	    1);
652 
653 	zfs_btree_index_t where;
654 	zfs_range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
655 	if (rs != NULL) {
656 		*ostart = start;
657 		*osize = MIN(size, zfs_rs_get_end(rs, rt) - start);
658 		return (B_TRUE);
659 	}
660 
661 	rs = zfs_btree_next(&rt->rt_root, &where, &where);
662 	if (rs == NULL || zfs_rs_get_start(rs, rt) > start + size)
663 		return (B_FALSE);
664 
665 	*ostart = zfs_rs_get_start(rs, rt);
666 	*osize = MIN(start + size, zfs_rs_get_end(rs, rt)) -
667 	    zfs_rs_get_start(rs, rt);
668 	return (B_TRUE);
669 }
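
/*
 * Illustrative sketch of the zfs_range_tree_find_in() semantics above,
 * assuming rt is an empty 64-bit tree without gap bridging supplied by the
 * caller: with a single segment [0x100, 0x200), querying [0x80, 0x180)
 * reports the overlapping subset [0x100, 0x180).  The example_* name is a
 * placeholder; the block is compiled out.
 */
#if 0
static void
example_find_in(zfs_range_tree_t *rt)
{
	uint64_t ostart, osize;

	zfs_range_tree_add(rt, 0x100, 0x100);
	if (zfs_range_tree_find_in(rt, 0x80, 0x100, &ostart, &osize)) {
		ASSERT3U(ostart, ==, 0x100);
		ASSERT3U(osize, ==, 0x80);
	}
}
#endif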
670 
671 /*
672  * Ensure that this range is not in the tree, regardless of whether
673  * it is currently in the tree.
674  */
675 void
676 zfs_range_tree_clear(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
677 {
678 	zfs_range_seg_t *rs;
679 
680 	if (size == 0)
681 		return;
682 
683 	if (rt->rt_type == ZFS_RANGE_SEG64)
684 		ASSERT3U(start + size, >, start);
685 
686 	while ((rs = zfs_range_tree_find_impl(rt, start, size)) != NULL) {
687 		uint64_t free_start = MAX(zfs_rs_get_start(rs, rt), start);
688 		uint64_t free_end = MIN(zfs_rs_get_end(rs, rt), start + size);
689 		zfs_range_tree_remove(rt, free_start, free_end - free_start);
690 	}
691 }
692 
693 void
694 zfs_range_tree_swap(zfs_range_tree_t **rtsrc, zfs_range_tree_t **rtdst)
695 {
696 	zfs_range_tree_t *rt;
697 
698 	ASSERT0(zfs_range_tree_space(*rtdst));
699 	ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root));
700 
701 	rt = *rtsrc;
702 	*rtsrc = *rtdst;
703 	*rtdst = rt;
704 }
705 
706 void
707 zfs_range_tree_vacate(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
708     void *arg)
709 {
710 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
711 		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
712 
713 	if (func != NULL) {
714 		zfs_range_seg_t *rs;
715 		zfs_btree_index_t *cookie = NULL;
716 
717 		while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) !=
718 		    NULL) {
719 			func(arg, zfs_rs_get_start(rs, rt),
720 			    zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt));
721 		}
722 	} else {
723 		zfs_btree_clear(&rt->rt_root);
724 	}
725 
726 	memset(rt->rt_histogram, 0, sizeof (rt->rt_histogram));
727 	rt->rt_space = 0;
728 }
729 
730 void
731 zfs_range_tree_walk(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
732     void *arg)
733 {
734 	zfs_btree_index_t where;
735 	for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where);
736 	    rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
737 		func(arg, zfs_rs_get_start(rs, rt), zfs_rs_get_end(rs, rt) -
738 		    zfs_rs_get_start(rs, rt));
739 	}
740 }
741 
742 zfs_range_seg_t *
743 zfs_range_tree_first(zfs_range_tree_t *rt)
744 {
745 	return (zfs_btree_first(&rt->rt_root, NULL));
746 }
747 
748 uint64_t
749 zfs_range_tree_space(zfs_range_tree_t *rt)
750 {
751 	return (rt->rt_space);
752 }
753 
754 uint64_t
755 zfs_range_tree_numsegs(zfs_range_tree_t *rt)
756 {
757 	return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root));
758 }
759 
760 boolean_t
761 zfs_range_tree_is_empty(zfs_range_tree_t *rt)
762 {
763 	ASSERT(rt != NULL);
764 	return (zfs_range_tree_space(rt) == 0);
765 }
766 
767 /*
768  * Remove any overlapping ranges between the given segment [start, end)
769  * from removefrom. Add non-overlapping leftovers to addto.
770  */
771 void
772 zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
773     zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
774 {
775 	zfs_btree_index_t where;
776 	zfs_range_seg_max_t starting_rs;
777 	zfs_rs_set_start(&starting_rs, removefrom, start);
778 	zfs_rs_set_end_raw(&starting_rs, removefrom,
779 	    zfs_rs_get_start_raw(&starting_rs, removefrom) + 1);
780 
781 	zfs_range_seg_t *curr = zfs_btree_find(&removefrom->rt_root,
782 	    &starting_rs, &where);
783 
784 	if (curr == NULL)
785 		curr = zfs_btree_next(&removefrom->rt_root, &where, &where);
786 
787 	zfs_range_seg_t *next;
788 	for (; curr != NULL; curr = next) {
789 		if (start == end)
790 			return;
791 		VERIFY3U(start, <, end);
792 
793 		/* there is no overlap */
794 		if (end <= zfs_rs_get_start(curr, removefrom)) {
795 			zfs_range_tree_add(addto, start, end - start);
796 			return;
797 		}
798 
799 		uint64_t overlap_start = MAX(zfs_rs_get_start(curr, removefrom),
800 		    start);
801 		uint64_t overlap_end = MIN(zfs_rs_get_end(curr, removefrom),
802 		    end);
803 		uint64_t overlap_size = overlap_end - overlap_start;
804 		ASSERT3S(overlap_size, >, 0);
805 		zfs_range_seg_max_t rs;
806 		zfs_rs_copy(curr, &rs, removefrom);
807 
808 		zfs_range_tree_remove(removefrom, overlap_start, overlap_size);
809 
810 		if (start < overlap_start)
811 			zfs_range_tree_add(addto, start, overlap_start - start);
812 
813 		start = overlap_end;
814 		next = zfs_btree_find(&removefrom->rt_root, &rs, &where);
815 		/*
816 		 * If we find something here, we only removed part of the
817 		 * curr segment. Either there's some left at the end
818 		 * because we've reached the end of the range we're removing,
819 		 * or there's some left at the start because we started
820 		 * partway through the range.  Either way, we continue with
821 		 * the loop. If it's the former, we'll return at the start of
822 		 * the loop, and if it's the latter we'll see if there is more
823 		 * area to process.
824 		 */
825 		if (next != NULL) {
826 			ASSERT(start == end || start == zfs_rs_get_end(&rs,
827 			    removefrom));
828 		}
829 
830 		next = zfs_btree_next(&removefrom->rt_root, &where, &where);
831 	}
832 	VERIFY3P(curr, ==, NULL);
833 
834 	if (start != end) {
835 		VERIFY3U(start, <, end);
836 		zfs_range_tree_add(addto, start, end - start);
837 	} else {
838 		VERIFY3U(start, ==, end);
839 	}
840 }
841 
842 /*
843  * For each entry in rt, if it exists in removefrom, remove it
844  * from removefrom. Otherwise, add it to addto.
845  */
846 void
847 zfs_range_tree_remove_xor_add(zfs_range_tree_t *rt,
848     zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
849 {
850 	zfs_btree_index_t where;
851 	for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs;
852 	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
853 		zfs_range_tree_remove_xor_add_segment(zfs_rs_get_start(rs, rt),
854 		    zfs_rs_get_end(rs, rt), removefrom, addto);
855 	}
856 }
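
/*
 * Illustrative sketch of zfs_range_tree_remove_xor_add() above, assuming
 * three empty, compatible trees without gap bridging supplied by the caller:
 * the portion of rt covered by removefrom is removed from removefrom, and
 * the remainder of rt is added to addto.  The example_* name is a
 * placeholder; the block is compiled out.
 */
#if 0
static void
example_remove_xor_add(zfs_range_tree_t *rt, zfs_range_tree_t *removefrom,
    zfs_range_tree_t *addto)
{
	zfs_range_tree_add(rt, 0x0, 0x100);
	zfs_range_tree_add(removefrom, 0x40, 0x40);

	zfs_range_tree_remove_xor_add(rt, removefrom, addto);

	/* removefrom is drained; addto holds [0x0, 0x40) and [0x80, 0x100). */
	ASSERT(zfs_range_tree_is_empty(removefrom));
	ASSERT(zfs_range_tree_contains(addto, 0x0, 0x40));
	ASSERT(zfs_range_tree_contains(addto, 0x80, 0x80));
}
#endif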
857 
858 uint64_t
859 zfs_range_tree_min(zfs_range_tree_t *rt)
860 {
861 	zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL);
862 	return (rs != NULL ? zfs_rs_get_start(rs, rt) : 0);
863 }
864 
865 uint64_t
866 zfs_range_tree_max(zfs_range_tree_t *rt)
867 {
868 	zfs_range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL);
869 	return (rs != NULL ? zfs_rs_get_end(rs, rt) : 0);
870 }
871 
872 uint64_t
873 zfs_range_tree_span(zfs_range_tree_t *rt)
874 {
875 	return (zfs_range_tree_max(rt) - zfs_range_tree_min(rt));
876 }
877