// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>

/*
 * Range trees are tree-based data structures that can be used to
 * track free space or generally any space allocation information.
 * A range tree keeps track of individual segments and automatically
 * provides facilities such as adjacent extent merging and extent
 * splitting in response to range add/remove requests.
 *
 * A range tree starts out completely empty, with no segments in it.
 * Adding an allocation via zfs_range_tree_add to the range tree can either:
 * 1) create a new extent
 * 2) extend an adjacent extent
 * 3) merge two adjacent extents
 * Conversely, removing an allocation via zfs_range_tree_remove can:
 * 1) completely remove an extent
 * 2) shorten an extent (if the allocation was near one of its ends)
 * 3) split an extent into two extents, in effect punching a hole
 *
 * A range tree is also capable of 'bridging' gaps when adding
 * allocations. This is useful for cases when close proximity of
 * allocations is an important detail that needs to be represented
 * in the range tree. See zfs_range_tree_set_gap(). The default behavior
 * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
 *
 * In order to traverse a range tree, use either the zfs_range_tree_walk()
 * or zfs_range_tree_vacate() functions.
 *
 * To obtain more accurate information on individual segment
 * operations that the range tree performs "under the hood", you can
 * specify a set of callbacks by passing a zfs_range_tree_ops_t structure
 * to the zfs_range_tree_create function. Any callbacks that are non-NULL
 * are then called at the appropriate times.
 *
 * The range tree code also supports a special variant of range trees
 * that can bridge small gaps between segments. This kind of tree is used
 * by the dsl scanning code to group I/Os into mostly sequential chunks to
 * optimize disk performance. The code here attempts to do this with as
 * little memory and computational overhead as possible. One limitation of
 * this implementation is that segments of range trees with gaps can only
 * support removing complete segments.
 */
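
/*
 * Illustrative usage sketch (not part of the implementation; offsets and
 * sizes are arbitrary). A caller typically creates a tree, adds and removes
 * ranges, walks or vacates it, and destroys it once it is empty:
 *
 *	zfs_range_tree_t *rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
 *	    NULL, 0, 0);
 *	zfs_range_tree_add(rt, 0x1000, 0x2000);	   segment [0x1000, 0x3000)
 *	zfs_range_tree_remove(rt, 0x1800, 0x800);  punches a hole
 *	zfs_range_tree_walk(rt, my_func, my_arg);  visits both leftovers
 *	zfs_range_tree_vacate(rt, NULL, NULL);	   empties the tree
 *	zfs_range_tree_destroy(rt);
 *
 * Here my_func stands for any caller-provided zfs_range_tree_func_t, i.e. a
 * function taking (void *arg, uint64_t start, uint64_t size).
 */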

static inline void
zfs_rs_copy(zfs_range_seg_t *src, zfs_range_seg_t *dest, zfs_range_tree_t *rt)
{
	ASSERT3U(rt->rt_type, <, ZFS_RANGE_SEG_NUM_TYPES);
	size_t size = 0;
	switch (rt->rt_type) {
	case ZFS_RANGE_SEG32:
		size = sizeof (zfs_range_seg32_t);
		break;
	case ZFS_RANGE_SEG64:
		size = sizeof (zfs_range_seg64_t);
		break;
	case ZFS_RANGE_SEG_GAP:
		size = sizeof (zfs_range_seg_gap_t);
		break;
	default:
		__builtin_unreachable();
	}
	memcpy(dest, src, size);
}

void
zfs_range_tree_stat_verify(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs;
	zfs_btree_index_t where;
	uint64_t hist[ZFS_RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
	int i;

	for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL;
	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		uint64_t size = zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt);
		int idx = highbit64(size) - 1;

		hist[idx]++;
		ASSERT3U(hist[idx], !=, 0);
	}

	for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
		if (hist[i] != rt->rt_histogram[i]) {
			zfs_dbgmsg("i=%d, hist=%px, hist=%llu, rt_hist=%llu",
			    i, hist, (u_longlong_t)hist[i],
			    (u_longlong_t)rt->rt_histogram[i]);
		}
		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
	}
}

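/*
 * The size histogram buckets segments by power-of-two length: a segment of
 * size s is counted in bucket highbit64(s) - 1, so bucket i holds segments
 * with 2^i <= size < 2^(i+1). For example (illustrative value), a 128K
 * (0x20000) segment lands in rt_histogram[17].
 */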
static void
zfs_range_tree_stat_incr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
{
	uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	rt->rt_histogram[idx]++;
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
}

static void
zfs_range_tree_stat_decr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
{
	uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	ASSERT3U(rt->rt_histogram[idx], !=, 0);
	rt->rt_histogram[idx]--;
}

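/*
 * The comparators below treat any two overlapping segments as equal, so a
 * btree lookup "finds" a segment whenever the search range overlaps it.
 * For example (arbitrary offsets), comparing [10, 20) against [20, 30)
 * yields -1 (disjoint, sorts first), while comparing [10, 20) against
 * [15, 25) yields 0 (overlap, treated as a match).
 */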
__attribute__((always_inline)) inline
static int
zfs_range_tree_seg32_compare(const void *x1, const void *x2)
{
	const zfs_range_seg32_t *r1 = x1;
	const zfs_range_seg32_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg64_compare(const void *x1, const void *x2)
{
	const zfs_range_seg64_t *r1 = x1;
	const zfs_range_seg64_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg_gap_compare(const void *x1, const void *x2)
{
	const zfs_range_seg_gap_t *r1 = x1;
	const zfs_range_seg_gap_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg32_find_in_buf, zfs_range_seg32_t,
    zfs_range_tree_seg32_compare)

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t,
    zfs_range_tree_seg64_compare)

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf,
    zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare)

zfs_range_tree_t *
zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
    uint64_t gap)
{
	zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP);

	ASSERT3U(shift, <, 64);
	ASSERT3U(type, <=, ZFS_RANGE_SEG_NUM_TYPES);
	size_t size;
	int (*compare) (const void *, const void *);
	bt_find_in_buf_f bt_find;
	switch (type) {
	case ZFS_RANGE_SEG32:
		size = sizeof (zfs_range_seg32_t);
		compare = zfs_range_tree_seg32_compare;
		bt_find = zfs_range_tree_seg32_find_in_buf;
		break;
	case ZFS_RANGE_SEG64:
		size = sizeof (zfs_range_seg64_t);
		compare = zfs_range_tree_seg64_compare;
		bt_find = zfs_range_tree_seg64_find_in_buf;
		break;
	case ZFS_RANGE_SEG_GAP:
		size = sizeof (zfs_range_seg_gap_t);
		compare = zfs_range_tree_seg_gap_compare;
		bt_find = zfs_range_tree_seg_gap_find_in_buf;
		break;
	default:
		panic("Invalid range seg type %d", type);
	}
	zfs_btree_create(&rt->rt_root, compare, bt_find, size);

	rt->rt_ops = ops;
	rt->rt_gap = gap;
	rt->rt_arg = arg;
	rt->rt_type = type;
	rt->rt_start = start;
	rt->rt_shift = shift;

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
		rt->rt_ops->rtop_create(rt, rt->rt_arg);

	return (rt);
}
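
/*
 * Sketch of a gap-bridging tree (values are arbitrary): with a gap of 0x400,
 * two nearby additions are merged into one bridged segment whose fill only
 * counts the bytes actually added:
 *
 *	zfs_range_tree_t *rt = zfs_range_tree_create_gap(NULL,
 *	    ZFS_RANGE_SEG_GAP, NULL, 0, 0, 0x400);
 *	zfs_range_tree_add(rt, 0x10000, 0x1000);  segment [0x10000, 0x11000)
 *	zfs_range_tree_add(rt, 0x11200, 0x1000);  bridged to [0x10000, 0x12200)
 *
 * The resulting segment spans 0x2200 bytes but has a fill of only 0x2000.
 */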

zfs_range_tree_t *
zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift)
{
	return (zfs_range_tree_create_gap(ops, type, arg, start, shift, 0));
}

void
zfs_range_tree_destroy(zfs_range_tree_t *rt)
{
	VERIFY0(rt->rt_space);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

	zfs_btree_destroy(&rt->rt_root);
	kmem_free(rt, sizeof (*rt));
}

void
zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    int64_t delta)
{
	if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) {
		zfs_panic_recover("zfs: attempting to decrease fill to or "
		    "below 0; probable double remove in segment [%llx:%llx]",
		    (longlong_t)zfs_rs_get_start(rs, rt),
		    (longlong_t)zfs_rs_get_end(rs, rt));
	}
	if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) -
	    zfs_rs_get_start(rs, rt)) {
		zfs_panic_recover("zfs: attempting to increase fill beyond "
		    "max; probable double add in segment [%llx:%llx]",
		    (longlong_t)zfs_rs_get_start(rs, rt),
		    (longlong_t)zfs_rs_get_end(rs, rt));
	}

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
	zfs_rs_set_fill(rs, rt, zfs_rs_get_fill(rs, rt) + delta);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}

static void
zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
{
	zfs_range_tree_t *rt = arg;
	zfs_btree_index_t where;
	zfs_range_seg_t *rs_before, *rs_after, *rs;
	zfs_range_seg_max_t tmp, rsearch;
	uint64_t end = start + size, gap = rt->rt_gap;
	uint64_t bridge_size = 0;
	boolean_t merge_before, merge_after;

	ASSERT3U(size, !=, 0);
	ASSERT3U(fill, <=, size);
	ASSERT3U(start + size, >, start);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);

	/*
	 * If this is a gap-supporting range tree, it is possible that we
	 * are inserting into an existing segment. In this case simply
	 * bump the fill count and call the remove / add callbacks. If the
	 * new range will extend an existing segment, we remove the
	 * existing one, apply the new extent to it and re-insert it using
	 * the normal code paths.
	 */
	if (rs != NULL) {
		if (gap == 0) {
			zfs_panic_recover("zfs: adding existent segment to "
			    "range tree (offset=%llx size=%llx)",
			    (longlong_t)start, (longlong_t)size);
			return;
		}
		uint64_t rstart = zfs_rs_get_start(rs, rt);
		uint64_t rend = zfs_rs_get_end(rs, rt);
		if (rstart <= start && rend >= end) {
			zfs_range_tree_adjust_fill(rt, rs, fill);
			return;
		}

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs);
		rt->rt_space -= rend - rstart;

		fill += zfs_rs_get_fill(rs, rt);
		start = MIN(start, rstart);
		end = MAX(end, rend);
		size = end - start;

		zfs_btree_remove(&rt->rt_root, rs);
		zfs_range_tree_add_impl(rt, start, size, fill);
		return;
	}

	ASSERT3P(rs, ==, NULL);

	/*
	 * Determine whether or not we will have to merge with our neighbors.
	 * If gap != 0, we might need to merge with our neighbors even if we
	 * aren't directly touching.
	 */
	zfs_btree_index_t where_before, where_after;
	rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before);
	rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after);

	merge_before = (rs_before != NULL && zfs_rs_get_end(rs_before, rt) >=
	    start - gap);
	merge_after = (rs_after != NULL && zfs_rs_get_start(rs_after, rt) <=
	    end + gap);

	if (merge_before && gap != 0)
		bridge_size += start - zfs_rs_get_end(rs_before, rt);
	if (merge_after && gap != 0)
		bridge_size += zfs_rs_get_start(rs_after, rt) - end;

	if (merge_before && merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
		}

		zfs_range_tree_stat_decr(rt, rs_before);
		zfs_range_tree_stat_decr(rt, rs_after);

		zfs_rs_copy(rs_after, &tmp, rt);
		uint64_t before_start = zfs_rs_get_start_raw(rs_before, rt);
		uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
		uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
		zfs_btree_remove_idx(&rt->rt_root, &where_before);

		/*
		 * We have to re-find the node because our old reference is
		 * invalid as soon as we do any mutating btree operations.
		 */
		rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after);
		ASSERT3P(rs_after, !=, NULL);
		zfs_rs_set_start_raw(rs_after, rt, before_start);
		zfs_rs_set_fill(rs_after, rt, after_fill + before_fill + fill);
		rs = rs_after;
	} else if (merge_before) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs_before);

		uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
		zfs_rs_set_end(rs_before, rt, end);
		zfs_rs_set_fill(rs_before, rt, before_fill + fill);
		rs = rs_before;
	} else if (merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs_after);

		uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
		zfs_rs_set_start(rs_after, rt, start);
		zfs_rs_set_fill(rs_after, rt, after_fill + fill);
		rs = rs_after;
	} else {
		rs = &tmp;

		zfs_rs_set_start(rs, rt, start);
		zfs_rs_set_end(rs, rt, end);
		zfs_rs_set_fill(rs, rt, fill);
		zfs_btree_add_idx(&rt->rt_root, rs, &where);
	}

	if (gap != 0) {
		ASSERT3U(zfs_rs_get_fill(rs, rt), <=, zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	} else {
		ASSERT3U(zfs_rs_get_fill(rs, rt), ==, zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	}

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	zfs_range_tree_stat_incr(rt, rs);
	rt->rt_space += size + bridge_size;
}

void
zfs_range_tree_add(void *arg, uint64_t start, uint64_t size)
{
	zfs_range_tree_add_impl(arg, start, size, size);
}
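
/*
 * For example (arbitrary offsets), on a tree without gap bridging:
 *
 *	zfs_range_tree_add(rt, 0x0, 0x1000);	creates [0x0, 0x1000)
 *	zfs_range_tree_add(rt, 0x2000, 0x1000);	creates [0x2000, 0x3000)
 *	zfs_range_tree_add(rt, 0x1000, 0x1000);	merges all into [0x0, 0x3000)
 */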

static void
zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
    boolean_t do_fill)
{
	zfs_btree_index_t where;
	zfs_range_seg_t *rs;
	zfs_range_seg_max_t rsearch, rs_tmp;
	uint64_t end = start + size;
	boolean_t left_over, right_over;

	VERIFY3U(size, !=, 0);
	VERIFY3U(size, <=, rt->rt_space);
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);

	/* Make sure we completely overlap with someone */
	if (rs == NULL) {
		zfs_panic_recover("zfs: removing nonexistent segment from "
		    "range tree (offset=%llx size=%llx)",
		    (longlong_t)start, (longlong_t)size);
		return;
	}

	/*
	 * Range trees with gap support must only remove complete segments
	 * from the tree. This allows us to maintain accurate fill accounting
	 * and to ensure that bridged sections are not leaked. If we need to
	 * remove less than the full segment, we can only adjust the fill count.
	 */
	if (rt->rt_gap != 0) {
		if (do_fill) {
			if (zfs_rs_get_fill(rs, rt) == size) {
				start = zfs_rs_get_start(rs, rt);
				end = zfs_rs_get_end(rs, rt);
				size = end - start;
			} else {
				zfs_range_tree_adjust_fill(rt, rs, -size);
				return;
			}
		} else if (zfs_rs_get_start(rs, rt) != start ||
		    zfs_rs_get_end(rs, rt) != end) {
			zfs_panic_recover("zfs: freeing partial segment of "
			    "gap tree (offset=%llx size=%llx) of "
			    "(offset=%llx size=%llx)",
			    (longlong_t)start, (longlong_t)size,
			    (longlong_t)zfs_rs_get_start(rs, rt),
			    (longlong_t)zfs_rs_get_end(rs, rt) -
			    zfs_rs_get_start(rs, rt));
			return;
		}
	}

	VERIFY3U(zfs_rs_get_start(rs, rt), <=, start);
	VERIFY3U(zfs_rs_get_end(rs, rt), >=, end);

	left_over = (zfs_rs_get_start(rs, rt) != start);
	right_over = (zfs_rs_get_end(rs, rt) != end);

	zfs_range_tree_stat_decr(rt, rs);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	if (left_over && right_over) {
		zfs_range_seg_max_t newseg;
		zfs_rs_set_start(&newseg, rt, end);
		zfs_rs_set_end_raw(&newseg, rt, zfs_rs_get_end_raw(rs, rt));
		zfs_rs_set_fill(&newseg, rt, zfs_rs_get_end(rs, rt) - end);
		zfs_range_tree_stat_incr(rt, &newseg);

		// This modifies the buffer already inside the range tree
		zfs_rs_set_end(rs, rt, start);

		zfs_rs_copy(rs, &rs_tmp, rt);
		if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL)
			zfs_btree_add_idx(&rt->rt_root, &newseg, &where);
		else
			zfs_btree_add(&rt->rt_root, &newseg);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg);
	} else if (left_over) {
		// This modifies the buffer already inside the range tree
		zfs_rs_set_end(rs, rt, start);
		zfs_rs_copy(rs, &rs_tmp, rt);
	} else if (right_over) {
		// This modifies the buffer already inside the range tree
		zfs_rs_set_start(rs, rt, end);
		zfs_rs_copy(rs, &rs_tmp, rt);
	} else {
		zfs_btree_remove_idx(&rt->rt_root, &where);
		rs = NULL;
	}

	if (rs != NULL) {
		/*
		 * The fill of the leftover segment will always be equal to
		 * the size, since we do not support removing partial segments
		 * of range trees with gaps.
		 */
		zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
		    zfs_rs_get_start_raw(rs, rt));
		zfs_range_tree_stat_incr(rt, &rs_tmp);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg);
	}

	rt->rt_space -= size;
}

void
zfs_range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
	zfs_range_tree_remove_impl(arg, start, size, B_FALSE);
}
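
/*
 * For example (arbitrary offsets), removing from the middle of a segment
 * splits it, punching a hole:
 *
 *	tree contains [0x0, 0x3000)
 *	zfs_range_tree_remove(rt, 0x1000, 0x1000);
 *	tree now contains [0x0, 0x1000) and [0x2000, 0x3000)
 */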

void
zfs_range_tree_remove_fill(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_tree_remove_impl(rt, start, size, B_TRUE);
}

void
zfs_range_tree_resize_segment(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    uint64_t newstart, uint64_t newsize)
{
	int64_t delta = newsize - (zfs_rs_get_end(rs, rt) -
	    zfs_rs_get_start(rs, rt));

	zfs_range_tree_stat_decr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	zfs_rs_set_start(rs, rt, newstart);
	zfs_rs_set_end(rs, rt, newstart + newsize);

	zfs_range_tree_stat_incr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	rt->rt_space += delta;
}

static zfs_range_seg_t *
zfs_range_tree_find_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_seg_max_t rsearch;
	uint64_t end = start + size;

	VERIFY(size != 0);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	return (zfs_btree_find(&rt->rt_root, &rsearch, NULL));
}

zfs_range_seg_t *
zfs_range_tree_find(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_range_seg_t *rs = zfs_range_tree_find_impl(rt, start, size);
	if (rs != NULL && zfs_rs_get_start(rs, rt) <= start &&
	    zfs_rs_get_end(rs, rt) >= start + size) {
		return (rs);
	}
	return (NULL);
}

void
zfs_range_tree_verify_not_present(zfs_range_tree_t *rt, uint64_t off,
    uint64_t size)
{
	zfs_range_seg_t *rs = zfs_range_tree_find(rt, off, size);
	if (rs != NULL)
		panic("segment already in tree; rs=%p", (void *)rs);
}

boolean_t
zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	return (zfs_range_tree_find(rt, start, size) != NULL);
}

/*
 * Returns the first subset of the given range which overlaps with the range
 * tree. Returns true if there is a segment in the range, and false if there
 * isn't.
 */
boolean_t
zfs_range_tree_find_in(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
    uint64_t *ostart, uint64_t *osize)
{
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_range_seg_max_t rsearch;
	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end_raw(&rsearch, rt, zfs_rs_get_start_raw(&rsearch, rt) +
	    1);

	zfs_btree_index_t where;
	zfs_range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
	if (rs != NULL) {
		*ostart = start;
		*osize = MIN(size, zfs_rs_get_end(rs, rt) - start);
		return (B_TRUE);
	}

	rs = zfs_btree_next(&rt->rt_root, &where, &where);
	if (rs == NULL || zfs_rs_get_start(rs, rt) > start + size)
		return (B_FALSE);

	*ostart = zfs_rs_get_start(rs, rt);
	*osize = MIN(start + size, zfs_rs_get_end(rs, rt)) -
	    zfs_rs_get_start(rs, rt);
	return (B_TRUE);
}
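
/*
 * For example (arbitrary offsets), if the tree contains only [10, 20), then
 * zfs_range_tree_find_in(rt, 5, 30, &ostart, &osize) returns B_TRUE with
 * ostart = 10 and osize = 10, the first subset of [5, 35) present in the
 * tree.
 */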

/*
 * Ensure that this range is not in the tree, regardless of whether
 * it is currently in the tree.
 */
void
zfs_range_tree_clear(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_seg_t *rs;

	if (size == 0)
		return;

	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	while ((rs = zfs_range_tree_find_impl(rt, start, size)) != NULL) {
		uint64_t free_start = MAX(zfs_rs_get_start(rs, rt), start);
		uint64_t free_end = MIN(zfs_rs_get_end(rs, rt), start + size);
		zfs_range_tree_remove(rt, free_start, free_end - free_start);
	}
}

void
zfs_range_tree_swap(zfs_range_tree_t **rtsrc, zfs_range_tree_t **rtdst)
{
	zfs_range_tree_t *rt;

	ASSERT0(zfs_range_tree_space(*rtdst));
	ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root));

	rt = *rtsrc;
	*rtsrc = *rtdst;
	*rtdst = rt;
}

void
zfs_range_tree_vacate(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
    void *arg)
{
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

	if (func != NULL) {
		zfs_range_seg_t *rs;
		zfs_btree_index_t *cookie = NULL;

		while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) !=
		    NULL) {
			func(arg, zfs_rs_get_start(rs, rt),
			    zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt));
		}
	} else {
		zfs_btree_clear(&rt->rt_root);
	}

	memset(rt->rt_histogram, 0, sizeof (rt->rt_histogram));
	rt->rt_space = 0;
}
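
/*
 * One common vacate pattern (a sketch; rt and other_rt are caller-provided
 * trees) is to hand every segment to another tree while emptying this one,
 * since zfs_range_tree_add matches the zfs_range_tree_func_t signature:
 *
 *	zfs_range_tree_vacate(rt, zfs_range_tree_add, other_rt);
 */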

void
zfs_range_tree_walk(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
    void *arg)
{
	zfs_btree_index_t where;
	for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where);
	    rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		func(arg, zfs_rs_get_start(rs, rt), zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	}
}

zfs_range_seg_t *
zfs_range_tree_first(zfs_range_tree_t *rt)
{
	return (zfs_btree_first(&rt->rt_root, NULL));
}

uint64_t
zfs_range_tree_space(zfs_range_tree_t *rt)
{
	return (rt->rt_space);
}

uint64_t
zfs_range_tree_numsegs(zfs_range_tree_t *rt)
{
	return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root));
}

boolean_t
zfs_range_tree_is_empty(zfs_range_tree_t *rt)
{
	ASSERT(rt != NULL);
	return (zfs_range_tree_space(rt) == 0);
}

/*
 * Remove any overlapping ranges between the given segment [start, end)
 * from removefrom. Add non-overlapping leftovers to addto.
 */
void
zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
    zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
{
	zfs_btree_index_t where;
	zfs_range_seg_max_t starting_rs;
	zfs_rs_set_start(&starting_rs, removefrom, start);
	zfs_rs_set_end_raw(&starting_rs, removefrom,
	    zfs_rs_get_start_raw(&starting_rs, removefrom) + 1);

	zfs_range_seg_t *curr = zfs_btree_find(&removefrom->rt_root,
	    &starting_rs, &where);

	if (curr == NULL)
		curr = zfs_btree_next(&removefrom->rt_root, &where, &where);

	zfs_range_seg_t *next;
	for (; curr != NULL; curr = next) {
		if (start == end)
			return;
		VERIFY3U(start, <, end);

		/* there is no overlap */
		if (end <= zfs_rs_get_start(curr, removefrom)) {
			zfs_range_tree_add(addto, start, end - start);
			return;
		}

		uint64_t overlap_start = MAX(zfs_rs_get_start(curr, removefrom),
		    start);
		uint64_t overlap_end = MIN(zfs_rs_get_end(curr, removefrom),
		    end);
		uint64_t overlap_size = overlap_end - overlap_start;
		ASSERT3S(overlap_size, >, 0);
		zfs_range_seg_max_t rs;
		zfs_rs_copy(curr, &rs, removefrom);

		zfs_range_tree_remove(removefrom, overlap_start, overlap_size);

		if (start < overlap_start)
			zfs_range_tree_add(addto, start, overlap_start - start);

		start = overlap_end;
		next = zfs_btree_find(&removefrom->rt_root, &rs, &where);
		/*
		 * If we find something here, we only removed part of the
		 * curr segment. Either there's some left at the end
		 * because we've reached the end of the range we're removing,
		 * or there's some left at the start because we started
		 * partway through the range. Either way, we continue with
		 * the loop. If it's the former, we'll return at the start of
		 * the loop, and if it's the latter we'll see if there is more
		 * area to process.
		 */
		if (next != NULL) {
			ASSERT(start == end || start == zfs_rs_get_end(&rs,
			    removefrom));
		}

		next = zfs_btree_next(&removefrom->rt_root, &where, &where);
	}
	VERIFY3P(curr, ==, NULL);

	if (start != end) {
		VERIFY3U(start, <, end);
		zfs_range_tree_add(addto, start, end - start);
	} else {
		VERIFY3U(start, ==, end);
	}
}

/*
 * For each entry in rt, if it exists in removefrom, remove it
 * from removefrom. Otherwise, add it to addto.
 */
void
zfs_range_tree_remove_xor_add(zfs_range_tree_t *rt,
    zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
{
	zfs_btree_index_t where;
	for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs;
	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		zfs_range_tree_remove_xor_add_segment(zfs_rs_get_start(rs, rt),
		    zfs_rs_get_end(rs, rt), removefrom, addto);
	}
}
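
/*
 * For example (arbitrary offsets), with rt = {[5, 15)} and
 * removefrom = {[0, 10)}: the overlap [5, 10) is removed from removefrom,
 * leaving {[0, 5)}, and the non-overlapping leftover [10, 15) is added to
 * addto.
 */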

uint64_t
zfs_range_tree_min(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL);
	return (rs != NULL ? zfs_rs_get_start(rs, rt) : 0);
}

uint64_t
zfs_range_tree_max(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL);
	return (rs != NULL ? zfs_rs_get_end(rs, rt) : 0);
}

uint64_t
zfs_range_tree_span(zfs_range_tree_t *rt)
{
	return (zfs_range_tree_max(rt) - zfs_range_tree_min(rt));
}
