// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>

/*
 * Range trees are tree-based data structures that can be used to
 * track free space or generally any space allocation information.
 * A range tree keeps track of individual segments and automatically
 * provides facilities such as adjacent extent merging and extent
 * splitting in response to range add/remove requests.
 *
 * A range tree starts out completely empty, with no segments in it.
 * Adding an allocation via zfs_range_tree_add to the range tree can either:
 * 1) create a new extent
 * 2) extend an adjacent extent
 * 3) merge two adjacent extents
 * Conversely, removing an allocation via zfs_range_tree_remove can:
 * 1) completely remove an extent
 * 2) shorten an extent (if the allocation was near one of its ends)
 * 3) split an extent into two extents, in effect punching a hole
 *
 * A range tree is also capable of 'bridging' gaps when adding
 * allocations. This is useful for cases when close proximity of
 * allocations is an important detail that needs to be represented
 * in the range tree. See zfs_range_tree_set_gap(). The default behavior
 * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
 *
 * In order to traverse a range tree, use either the zfs_range_tree_walk()
 * or zfs_range_tree_vacate() functions.
 *
 * To obtain more accurate information on individual segment
 * operations that the range tree performs "under the hood", you can
 * specify a set of callbacks by passing a zfs_range_tree_ops_t structure
 * to the zfs_range_tree_create function. Any callbacks that are non-NULL
 * are then called at the appropriate times.
 *
 * The range tree code also supports a special variant of range trees
 * that can bridge small gaps between segments. This kind of tree is used
 * by the dsl scanning code to group I/Os into mostly sequential chunks to
 * optimize disk performance. The code here attempts to do this with as
 * little memory and computational overhead as possible. One limitation of
 * this implementation is that segments of range trees with gaps can only
 * support removing complete segments.
 */
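
/*
 * Illustrative usage sketch (not part of the original source), assuming a
 * 64-bit segment tree with no ops callbacks, no gap bridging, and no
 * start/shift translation, and a zfs_range_tree_func_t walker of the form
 * used by zfs_range_tree_walk()/zfs_range_tree_vacate() below:
 *
 *	static void
 *	dump_seg(void *arg, uint64_t start, uint64_t size)
 *	{
 *		(void) arg;
 *		zfs_dbgmsg("seg [%llx, %llx)", (u_longlong_t)start,
 *		    (u_longlong_t)(start + size));
 *	}
 *
 *	zfs_range_tree_t *rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
 *	    NULL, 0, 0);
 *	zfs_range_tree_add(rt, 0x1000, 0x1000);
 *	zfs_range_tree_add(rt, 0x2000, 0x1000);
 *	zfs_range_tree_remove(rt, 0x1800, 0x400);
 *	zfs_range_tree_walk(rt, dump_seg, NULL);
 *	zfs_range_tree_vacate(rt, NULL, NULL);
 *	zfs_range_tree_destroy(rt);
 *
 * The two adds merge into a single segment [0x1000, 0x3000); the remove
 * then punches a hole, so the walk reports [0x1000, 0x1800) and
 * [0x1c00, 0x3000). The tree must be emptied (here via
 * zfs_range_tree_vacate()) before zfs_range_tree_destroy() is called,
 * which verifies that no space remains accounted.
 */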

static inline void
zfs_rs_copy(zfs_range_seg_t *src, zfs_range_seg_t *dest, zfs_range_tree_t *rt)
{
	ASSERT3U(rt->rt_type, <, ZFS_RANGE_SEG_NUM_TYPES);
	size_t size = 0;
	switch (rt->rt_type) {
	case ZFS_RANGE_SEG32:
		size = sizeof (zfs_range_seg32_t);
		break;
	case ZFS_RANGE_SEG64:
		size = sizeof (zfs_range_seg64_t);
		break;
	case ZFS_RANGE_SEG_GAP:
		size = sizeof (zfs_range_seg_gap_t);
		break;
	default:
		__builtin_unreachable();
	}
	memcpy(dest, src, size);
}

void
zfs_range_tree_stat_verify(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs;
	zfs_btree_index_t where;
	uint64_t hist[ZFS_RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
	int i;

	for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL;
	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		uint64_t size = zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt);
		int idx	= highbit64(size) - 1;

		hist[idx]++;
		ASSERT3U(hist[idx], !=, 0);
	}

	for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
		if (hist[i] != rt->rt_histogram[i]) {
			zfs_dbgmsg("i=%d, hist=%px, hist=%llu, rt_hist=%llu",
			    i, hist, (u_longlong_t)hist[i],
			    (u_longlong_t)rt->rt_histogram[i]);
		}
		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
	}
}

static void
zfs_range_tree_stat_incr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
{
	uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	rt->rt_histogram[idx]++;
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
}

static void
zfs_range_tree_stat_decr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
{
	uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	ASSERT3U(rt->rt_histogram[idx], !=, 0);
	rt->rt_histogram[idx]--;
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg32_compare(const void *x1, const void *x2)
{
	const zfs_range_seg32_t *r1 = x1;
	const zfs_range_seg32_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg64_compare(const void *x1, const void *x2)
{
	const zfs_range_seg64_t *r1 = x1;
	const zfs_range_seg64_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg_gap_compare(const void *x1, const void *x2)
{
	const zfs_range_seg_gap_t *r1 = x1;
	const zfs_range_seg_gap_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg32_find_in_buf, zfs_range_seg32_t,
    zfs_range_tree_seg32_compare)

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t,
    zfs_range_tree_seg64_compare)

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf,
    zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare)

static zfs_range_tree_t *
zfs_range_tree_create_impl(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
    uint64_t gap, uint64_t flags, const char *name)
{
	zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP);

	ASSERT3U(shift, <, 64);
	ASSERT3U(type, <=, ZFS_RANGE_SEG_NUM_TYPES);
	size_t size;
	int (*compare) (const void *, const void *);
	bt_find_in_buf_f bt_find;
	switch (type) {
	case ZFS_RANGE_SEG32:
		size = sizeof (zfs_range_seg32_t);
		compare = zfs_range_tree_seg32_compare;
		bt_find = zfs_range_tree_seg32_find_in_buf;
		break;
	case ZFS_RANGE_SEG64:
		size = sizeof (zfs_range_seg64_t);
		compare = zfs_range_tree_seg64_compare;
		bt_find = zfs_range_tree_seg64_find_in_buf;
		break;
	case ZFS_RANGE_SEG_GAP:
		size = sizeof (zfs_range_seg_gap_t);
		compare = zfs_range_tree_seg_gap_compare;
		bt_find = zfs_range_tree_seg_gap_find_in_buf;
		break;
	default:
		panic("Invalid range seg type %d", type);
	}
	zfs_btree_create(&rt->rt_root, compare, bt_find, size);

	rt->rt_ops = ops;
	rt->rt_gap = gap;
	rt->rt_flags = flags;
	rt->rt_name = name;
	rt->rt_arg = arg;
	rt->rt_type = type;
	rt->rt_start = start;
	rt->rt_shift = shift;

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
		rt->rt_ops->rtop_create(rt, rt->rt_arg);

	return (rt);
}

zfs_range_tree_t *
zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
    uint64_t gap)
{
	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, gap,
	    0, NULL));
}

zfs_range_tree_t *
zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift)
{
	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
	    0, NULL));
}

zfs_range_tree_t *
zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
    uint64_t flags, const char *name)
{
	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
	    flags, name));
}

void
zfs_range_tree_destroy(zfs_range_tree_t *rt)
{
	VERIFY0(rt->rt_space);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

	if (rt->rt_name != NULL && (rt->rt_flags & ZFS_RT_F_DYN_NAME))
		kmem_strfree((char *)(uintptr_t)rt->rt_name);

	zfs_btree_destroy(&rt->rt_root);
	kmem_free(rt, sizeof (*rt));
}

void
zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    int64_t delta)
{
	if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) {
		zfs_panic_recover("zfs: rt=%s: attempting to decrease fill to "
		    "or below 0; probable double remove in segment [%llx:%llx]",
		    ZFS_RT_NAME(rt),
		    (longlong_t)zfs_rs_get_start(rs, rt),
		    (longlong_t)zfs_rs_get_end(rs, rt));
	}
	if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) -
	    zfs_rs_get_start(rs, rt)) {
		zfs_panic_recover("zfs: rt=%s: attempting to increase fill "
		    "beyond max; probable double add in segment [%llx:%llx]",
		    ZFS_RT_NAME(rt),
		    (longlong_t)zfs_rs_get_start(rs, rt),
		    (longlong_t)zfs_rs_get_end(rs, rt));
	}

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
	zfs_rs_set_fill(rs, rt, zfs_rs_get_fill(rs, rt) + delta);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}

static void
zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
{
	zfs_range_tree_t *rt = arg;
	zfs_btree_index_t where;
	zfs_range_seg_t *rs_before, *rs_after, *rs;
	zfs_range_seg_max_t tmp, rsearch;
	uint64_t end = start + size, gap = rt->rt_gap;
	uint64_t bridge_size = 0;
	boolean_t merge_before, merge_after;

	ASSERT3U(size, !=, 0);
	ASSERT3U(fill, <=, size);
	ASSERT3U(start + size, >, start);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);

	/*
	 * If this is a gap-supporting range tree, it is possible that we
	 * are inserting into an existing segment. In this case simply
	 * bump the fill count and call the remove / add callbacks. If the
	 * new range will extend an existing segment, we remove the
	 * existing one, apply the new extent to it and re-insert it using
	 * the normal code paths.
	 */
	if (rs != NULL) {
		uint64_t rstart = zfs_rs_get_start(rs, rt);
		uint64_t rend = zfs_rs_get_end(rs, rt);
		if (gap == 0) {
			zfs_panic_recover("zfs: rt=%s: adding segment "
			    "(offset=%llx size=%llx) overlapping with existing "
			    "one (offset=%llx size=%llx)",
			    ZFS_RT_NAME(rt),
			    (longlong_t)start, (longlong_t)size,
			    (longlong_t)rstart, (longlong_t)(rend - rstart));
			return;
		}
		if (rstart <= start && rend >= end) {
			zfs_range_tree_adjust_fill(rt, rs, fill);
			return;
		}

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs);
		rt->rt_space -= rend - rstart;

		fill += zfs_rs_get_fill(rs, rt);
		start = MIN(start, rstart);
		end = MAX(end, rend);
		size = end - start;

		zfs_btree_remove(&rt->rt_root, rs);
		zfs_range_tree_add_impl(rt, start, size, fill);
		return;
	}

	ASSERT3P(rs, ==, NULL);

	/*
	 * Determine whether or not we will have to merge with our neighbors.
	 * If gap != 0, we might need to merge with our neighbors even if we
	 * aren't directly touching.
	 */
	zfs_btree_index_t where_before, where_after;
	rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before);
	rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after);

	merge_before = (rs_before != NULL && zfs_rs_get_end(rs_before, rt) >=
	    start - gap);
	merge_after = (rs_after != NULL && zfs_rs_get_start(rs_after, rt) <=
	    end + gap);

	if (merge_before && gap != 0)
		bridge_size += start - zfs_rs_get_end(rs_before, rt);
	if (merge_after && gap != 0)
		bridge_size += zfs_rs_get_start(rs_after, rt) - end;

	if (merge_before && merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
		}

		zfs_range_tree_stat_decr(rt, rs_before);
		zfs_range_tree_stat_decr(rt, rs_after);

		zfs_rs_copy(rs_after, &tmp, rt);
		uint64_t before_start = zfs_rs_get_start_raw(rs_before, rt);
		uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
		uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
		zfs_btree_remove_idx(&rt->rt_root, &where_before);

		/*
		 * We have to re-find the node because our old reference is
		 * invalid as soon as we do any mutating btree operations.
		 */
		rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after);
		ASSERT3P(rs_after, !=, NULL);
		zfs_rs_set_start_raw(rs_after, rt, before_start);
		zfs_rs_set_fill(rs_after, rt, after_fill + before_fill + fill);
		rs = rs_after;
	} else if (merge_before) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs_before);

		uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
		zfs_rs_set_end(rs_before, rt, end);
		zfs_rs_set_fill(rs_before, rt, before_fill + fill);
		rs = rs_before;
	} else if (merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs_after);

		uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
		zfs_rs_set_start(rs_after, rt, start);
		zfs_rs_set_fill(rs_after, rt, after_fill + fill);
		rs = rs_after;
	} else {
		rs = &tmp;

		zfs_rs_set_start(rs, rt, start);
		zfs_rs_set_end(rs, rt, end);
		zfs_rs_set_fill(rs, rt, fill);
		zfs_btree_add_idx(&rt->rt_root, rs, &where);
	}

	if (gap != 0) {
		ASSERT3U(zfs_rs_get_fill(rs, rt), <=, zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	} else {
		ASSERT3U(zfs_rs_get_fill(rs, rt), ==, zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	}

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	zfs_range_tree_stat_incr(rt, rs);
	rt->rt_space += size + bridge_size;
}

void
zfs_range_tree_add(void *arg, uint64_t start, uint64_t size)
{
	zfs_range_tree_add_impl(arg, start, size, size);
}

static void
zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
    boolean_t do_fill)
{
	zfs_btree_index_t where;
	zfs_range_seg_t *rs;
	zfs_range_seg_max_t rsearch, rs_tmp;
	uint64_t end = start + size;
	uint64_t rstart, rend;
	boolean_t left_over, right_over;

	VERIFY3U(size, !=, 0);
	VERIFY3U(size, <=, rt->rt_space);
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);

	/* Make sure we completely overlap with someone */
	if (rs == NULL) {
		zfs_panic_recover("zfs: rt=%s: removing nonexistent segment "
		    "from range tree (offset=%llx size=%llx)",
		    ZFS_RT_NAME(rt), (longlong_t)start, (longlong_t)size);
		return;
	}

	rstart = zfs_rs_get_start(rs, rt);
	rend = zfs_rs_get_end(rs, rt);

	/*
	 * Range trees with gap support must only remove complete segments
	 * from the tree. This allows us to maintain accurate fill accounting
	 * and to ensure that bridged sections are not leaked. If we need to
	 * remove less than the full segment, we can only adjust the fill count.
	 */
	if (rt->rt_gap != 0) {
		if (do_fill) {
			if (zfs_rs_get_fill(rs, rt) == size) {
				start = rstart;
				end = rend;
				size = end - start;
			} else {
				zfs_range_tree_adjust_fill(rt, rs, -size);
				return;
			}
		} else if (rstart != start || rend != end) {
			zfs_panic_recover("zfs: rt=%s: freeing partial segment "
			    "of gap tree (offset=%llx size=%llx) of "
			    "(offset=%llx size=%llx)",
			    ZFS_RT_NAME(rt),
			    (longlong_t)start, (longlong_t)size,
			    (longlong_t)rstart, (longlong_t)(rend - rstart));
			return;
		}
	}

	if (!(rstart <= start && rend >= end)) {
		panic("zfs: rt=%s: removing segment "
		    "(offset=%llx size=%llx) not completely overlapped by "
		    "existing one (offset=%llx size=%llx)",
		    ZFS_RT_NAME(rt),
		    (longlong_t)start, (longlong_t)size,
		    (longlong_t)rstart, (longlong_t)(rend - rstart));
		return;
	}

	left_over = (rstart != start);
	right_over = (rend != end);

	zfs_range_tree_stat_decr(rt, rs);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	if (left_over && right_over) {
		zfs_range_seg_max_t newseg;
		zfs_rs_set_start(&newseg, rt, end);
		zfs_rs_set_end_raw(&newseg, rt, zfs_rs_get_end_raw(rs, rt));
		zfs_rs_set_fill(&newseg, rt, zfs_rs_get_end(rs, rt) - end);
		zfs_range_tree_stat_incr(rt, &newseg);

		// This modifies the buffer already inside the range tree
		zfs_rs_set_end(rs, rt, start);

		zfs_rs_copy(rs, &rs_tmp, rt);
		if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL)
			zfs_btree_add_idx(&rt->rt_root, &newseg, &where);
		else
			zfs_btree_add(&rt->rt_root, &newseg);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg);
	} else if (left_over) {
		// This modifies the buffer already inside the range tree
		zfs_rs_set_end(rs, rt, start);
		zfs_rs_copy(rs, &rs_tmp, rt);
	} else if (right_over) {
		// This modifies the buffer already inside the range tree
		zfs_rs_set_start(rs, rt, end);
		zfs_rs_copy(rs, &rs_tmp, rt);
	} else {
		zfs_btree_remove_idx(&rt->rt_root, &where);
		rs = NULL;
	}

	if (rs != NULL) {
		/*
		 * The fill of the leftover segment will always be equal to
		 * the size, since we do not support removing partial segments
		 * of range trees with gaps.
		 */
		zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
		    zfs_rs_get_start_raw(rs, rt));
		zfs_range_tree_stat_incr(rt, &rs_tmp);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg);
	}

	rt->rt_space -= size;
}

void
zfs_range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
	zfs_range_tree_remove_impl(arg, start, size, B_FALSE);
}

void
zfs_range_tree_remove_fill(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_tree_remove_impl(rt, start, size, B_TRUE);
}

void
zfs_range_tree_resize_segment(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    uint64_t newstart, uint64_t newsize)
{
	int64_t delta = newsize - (zfs_rs_get_end(rs, rt) -
	    zfs_rs_get_start(rs, rt));

	zfs_range_tree_stat_decr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	zfs_rs_set_start(rs, rt, newstart);
	zfs_rs_set_end(rs, rt, newstart + newsize);

	zfs_range_tree_stat_incr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	rt->rt_space += delta;
}

static zfs_range_seg_t *
zfs_range_tree_find_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_seg_max_t rsearch;
	uint64_t end = start + size;

	VERIFY(size != 0);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	return (zfs_btree_find(&rt->rt_root, &rsearch, NULL));
}

zfs_range_seg_t *
zfs_range_tree_find(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_range_seg_t *rs = zfs_range_tree_find_impl(rt, start, size);
	if (rs != NULL && zfs_rs_get_start(rs, rt) <= start &&
	    zfs_rs_get_end(rs, rt) >= start + size) {
		return (rs);
	}
	return (NULL);
}

void
zfs_range_tree_verify_not_present(zfs_range_tree_t *rt, uint64_t off,
    uint64_t size)
{
	zfs_range_seg_t *rs = zfs_range_tree_find(rt, off, size);
	if (rs != NULL)
		panic("segment already in tree; rs=%p", (void *)rs);
}

boolean_t
zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	return (zfs_range_tree_find(rt, start, size) != NULL);
}

/*
 * Returns the first subset of the given range which overlaps with the range
 * tree. Returns true if there is a segment in the range, and false if there
 * isn't.
 */
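/*
 * Example (illustrative, not part of the original source): if the tree holds
 * a single segment [0x1000, 0x2000) and the caller asks about the range
 * start=0x800, size=0x1000, only [0x1000, 0x1800) of that range is covered,
 * so *ostart is set to 0x1000, *osize to 0x800, and B_TRUE is returned.
 */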
boolean_t
zfs_range_tree_find_in(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
    uint64_t *ostart, uint64_t *osize)
{
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_range_seg_max_t rsearch;
	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end_raw(&rsearch, rt, zfs_rs_get_start_raw(&rsearch, rt) +
	    1);

	zfs_btree_index_t where;
	zfs_range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
	if (rs != NULL) {
		*ostart = start;
		*osize = MIN(size, zfs_rs_get_end(rs, rt) - start);
		return (B_TRUE);
	}

	rs = zfs_btree_next(&rt->rt_root, &where, &where);
	if (rs == NULL || zfs_rs_get_start(rs, rt) >= start + size)
		return (B_FALSE);

	*ostart = zfs_rs_get_start(rs, rt);
	*osize = MIN(start + size, zfs_rs_get_end(rs, rt)) -
	    zfs_rs_get_start(rs, rt);
	return (B_TRUE);
}

/*
 * Ensure that this range is not in the tree, regardless of whether
 * it is currently in the tree.
 */
void
zfs_range_tree_clear(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_seg_t *rs;

	if (size == 0)
		return;

	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	while ((rs = zfs_range_tree_find_impl(rt, start, size)) != NULL) {
		uint64_t free_start = MAX(zfs_rs_get_start(rs, rt), start);
		uint64_t free_end = MIN(zfs_rs_get_end(rs, rt), start + size);
		zfs_range_tree_remove(rt, free_start, free_end - free_start);
	}
}

void
zfs_range_tree_swap(zfs_range_tree_t **rtsrc, zfs_range_tree_t **rtdst)
{
	zfs_range_tree_t *rt;

	ASSERT0(zfs_range_tree_space(*rtdst));
	ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root));

	rt = *rtsrc;
	*rtsrc = *rtdst;
	*rtdst = rt;
}

void
zfs_range_tree_vacate(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
    void *arg)
{
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

	if (func != NULL) {
		zfs_range_seg_t *rs;
		zfs_btree_index_t *cookie = NULL;

		while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) !=
		    NULL) {
			func(arg, zfs_rs_get_start(rs, rt),
			    zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt));
		}
	} else {
		zfs_btree_clear(&rt->rt_root);
	}

	memset(rt->rt_histogram, 0, sizeof (rt->rt_histogram));
	rt->rt_space = 0;
}

void
zfs_range_tree_walk(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
    void *arg)
{
	zfs_btree_index_t where;
	for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where);
	    rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		func(arg, zfs_rs_get_start(rs, rt), zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	}
}

zfs_range_seg_t *
zfs_range_tree_first(zfs_range_tree_t *rt)
{
	return (zfs_btree_first(&rt->rt_root, NULL));
}

uint64_t
zfs_range_tree_space(zfs_range_tree_t *rt)
{
	return (rt->rt_space);
}

uint64_t
zfs_range_tree_numsegs(zfs_range_tree_t *rt)
{
	return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root));
}

boolean_t
zfs_range_tree_is_empty(zfs_range_tree_t *rt)
{
	ASSERT(rt != NULL);
	return (zfs_range_tree_space(rt) == 0);
}

/*
 * Remove any overlapping ranges between the given segment [start, end)
 * from removefrom. Add non-overlapping leftovers to addto.
 */
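/*
 * Example (illustrative, not part of the original source): if removefrom
 * holds [0x1000, 0x2000) and this is called with start=0x800, end=0x1800,
 * then the overlap [0x1000, 0x1800) is removed from removefrom (leaving
 * [0x1800, 0x2000) there) and the uncovered leftover [0x800, 0x1000) is
 * added to addto.
 */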
void
zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
    zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
{
	zfs_btree_index_t where;
	zfs_range_seg_max_t starting_rs;
	zfs_rs_set_start(&starting_rs, removefrom, start);
	zfs_rs_set_end_raw(&starting_rs, removefrom,
	    zfs_rs_get_start_raw(&starting_rs, removefrom) + 1);

	zfs_range_seg_t *curr = zfs_btree_find(&removefrom->rt_root,
	    &starting_rs, &where);

	if (curr == NULL)
		curr = zfs_btree_next(&removefrom->rt_root, &where, &where);

	zfs_range_seg_t *next;
	for (; curr != NULL; curr = next) {
		if (start == end)
			return;
		VERIFY3U(start, <, end);

		/* there is no overlap */
		if (end <= zfs_rs_get_start(curr, removefrom)) {
			zfs_range_tree_add(addto, start, end - start);
			return;
		}

		uint64_t overlap_start = MAX(zfs_rs_get_start(curr, removefrom),
		    start);
		uint64_t overlap_end = MIN(zfs_rs_get_end(curr, removefrom),
		    end);
		uint64_t overlap_size = overlap_end - overlap_start;
		ASSERT3S(overlap_size, >, 0);
		zfs_range_seg_max_t rs;
		zfs_rs_copy(curr, &rs, removefrom);

		zfs_range_tree_remove(removefrom, overlap_start, overlap_size);

		if (start < overlap_start)
			zfs_range_tree_add(addto, start, overlap_start - start);

		start = overlap_end;
		next = zfs_btree_find(&removefrom->rt_root, &rs, &where);
		/*
		 * If we find something here, we only removed part of the
		 * curr segment. Either there's some left at the end
		 * because we've reached the end of the range we're removing,
		 * or there's some left at the start because we started
		 * partway through the range.  Either way, we continue with
		 * the loop. If it's the former, we'll return at the start of
		 * the loop, and if it's the latter we'll see if there is more
		 * area to process.
		 */
		if (next != NULL) {
			ASSERT(start == end || start == zfs_rs_get_end(&rs,
			    removefrom));
		}

		next = zfs_btree_next(&removefrom->rt_root, &where, &where);
	}
	VERIFY3P(curr, ==, NULL);

	if (start != end) {
		VERIFY3U(start, <, end);
		zfs_range_tree_add(addto, start, end - start);
	} else {
		VERIFY3U(start, ==, end);
	}
}

/*
 * For each entry in rt, if it exists in removefrom, remove it
 * from removefrom. Otherwise, add it to addto.
 */
void
zfs_range_tree_remove_xor_add(zfs_range_tree_t *rt,
    zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
{
	zfs_btree_index_t where;
	for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs;
	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		zfs_range_tree_remove_xor_add_segment(zfs_rs_get_start(rs, rt),
		    zfs_rs_get_end(rs, rt), removefrom, addto);
	}
}

uint64_t
zfs_range_tree_min(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL);
	return (rs != NULL ? zfs_rs_get_start(rs, rt) : 0);
}

uint64_t
zfs_range_tree_max(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL);
	return (rs != NULL ? zfs_rs_get_end(rs, rt) : 0);
}

uint64_t
zfs_range_tree_span(zfs_range_tree_t *rt)
{
	return (zfs_range_tree_max(rt) - zfs_range_tree_min(rt));
}