/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>

/*
 * Range trees are tree-based data structures that can be used to
 * track free space or generally any space allocation information.
 * A range tree keeps track of individual segments and automatically
 * provides facilities such as adjacent extent merging and extent
 * splitting in response to range add/remove requests.
 *
 * A range tree starts out completely empty, with no segments in it.
 * Adding an allocation via zfs_range_tree_add to the range tree can:
 * 1) create a new extent
 * 2) extend an adjacent extent
 * 3) merge two adjacent extents
 * Conversely, removing an allocation via zfs_range_tree_remove can:
 * 1) completely remove an extent
 * 2) shorten an extent (if the allocation was near one of its ends)
 * 3) split an extent into two extents, in effect punching a hole
 *
 * A range tree is also capable of 'bridging' gaps when adding
 * allocations. This is useful for cases when close proximity of
 * allocations is an important detail that needs to be represented
 * in the range tree. See zfs_range_tree_set_gap(). The default behavior
 * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
 *
 * In order to traverse a range tree, use either the zfs_range_tree_walk()
 * or zfs_range_tree_vacate() functions.
 *
 * To obtain more accurate information on individual segment
 * operations that the range tree performs "under the hood", you can
 * specify a set of callbacks by passing a zfs_range_tree_ops_t structure
 * to the zfs_range_tree_create function. Any callbacks that are non-NULL
 * are then called at the appropriate times.
 *
 * The range tree code also supports a special variant of range trees
 * that can bridge small gaps between segments. This kind of tree is used
 * by the dsl scanning code to group I/Os into mostly sequential chunks to
 * optimize disk performance. The code here attempts to do this with as
 * little memory and computational overhead as possible. One limitation of
 * this implementation is that range trees with gaps only support removing
 * complete segments.
 */
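
/*
 * Example usage (an illustrative sketch, not compiled into this module).
 * A tree must be emptied (e.g. via zfs_range_tree_vacate()) before
 * zfs_range_tree_destroy(), which verifies that no space remains:
 *
 *	zfs_range_tree_t *rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
 *	    NULL, 0, 0);
 *	zfs_range_tree_add(rt, 0x1000, 0x1000);    creates [0x1000, 0x2000)
 *	zfs_range_tree_add(rt, 0x2000, 0x1000);    extends it to [0x1000, 0x3000)
 *	zfs_range_tree_remove(rt, 0x1800, 0x400);  punches a hole: two segments
 *	zfs_range_tree_vacate(rt, NULL, NULL);     empties the tree
 *	zfs_range_tree_destroy(rt);
 */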

static inline void
zfs_rs_copy(zfs_range_seg_t *src, zfs_range_seg_t *dest, zfs_range_tree_t *rt)
{
	ASSERT3U(rt->rt_type, <, ZFS_RANGE_SEG_NUM_TYPES);
	size_t size = 0;
	switch (rt->rt_type) {
	case ZFS_RANGE_SEG32:
		size = sizeof (zfs_range_seg32_t);
		break;
	case ZFS_RANGE_SEG64:
		size = sizeof (zfs_range_seg64_t);
		break;
	case ZFS_RANGE_SEG_GAP:
		size = sizeof (zfs_range_seg_gap_t);
		break;
	default:
		__builtin_unreachable();
	}
	memcpy(dest, src, size);
}

void
zfs_range_tree_stat_verify(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs;
	zfs_btree_index_t where;
	uint64_t hist[ZFS_RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
	int i;

	for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL;
	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		uint64_t size = zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt);
		int idx = highbit64(size) - 1;

		hist[idx]++;
		ASSERT3U(hist[idx], !=, 0);
	}

	for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
		if (hist[i] != rt->rt_histogram[i]) {
			zfs_dbgmsg("i=%d, hist=%px, hist=%llu, rt_hist=%llu",
			    i, hist, (u_longlong_t)hist[i],
			    (u_longlong_t)rt->rt_histogram[i]);
		}
		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
	}
}

static void
zfs_range_tree_stat_incr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
{
	uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	rt->rt_histogram[idx]++;
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
}

static void
zfs_range_tree_stat_decr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
{
	uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	ASSERT3U(rt->rt_histogram[idx], !=, 0);
	rt->rt_histogram[idx]--;
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg32_compare(const void *x1, const void *x2)
{
	const zfs_range_seg32_t *r1 = x1;
	const zfs_range_seg32_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg64_compare(const void *x1, const void *x2)
{
	const zfs_range_seg64_t *r1 = x1;
	const zfs_range_seg64_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg_gap_compare(const void *x1, const void *x2)
{
	const zfs_range_seg_gap_t *r1 = x1;
	const zfs_range_seg_gap_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg32_find_in_buf, zfs_range_seg32_t,
    zfs_range_tree_seg32_compare)

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t,
    zfs_range_tree_seg64_compare)

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf,
    zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare)

zfs_range_tree_t *
zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
    uint64_t gap)
{
	zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP);

	ASSERT3U(shift, <, 64);
	ASSERT3U(type, <=, ZFS_RANGE_SEG_NUM_TYPES);
	size_t size;
	int (*compare) (const void *, const void *);
	bt_find_in_buf_f bt_find;
	switch (type) {
	case ZFS_RANGE_SEG32:
		size = sizeof (zfs_range_seg32_t);
		compare = zfs_range_tree_seg32_compare;
		bt_find = zfs_range_tree_seg32_find_in_buf;
		break;
	case ZFS_RANGE_SEG64:
		size = sizeof (zfs_range_seg64_t);
		compare = zfs_range_tree_seg64_compare;
		bt_find = zfs_range_tree_seg64_find_in_buf;
		break;
	case ZFS_RANGE_SEG_GAP:
		size = sizeof (zfs_range_seg_gap_t);
		compare = zfs_range_tree_seg_gap_compare;
		bt_find = zfs_range_tree_seg_gap_find_in_buf;
		break;
	default:
		panic("Invalid range seg type %d", type);
	}
	zfs_btree_create(&rt->rt_root, compare, bt_find, size);

	rt->rt_ops = ops;
	rt->rt_gap = gap;
	rt->rt_arg = arg;
	rt->rt_type = type;
	rt->rt_start = start;
	rt->rt_shift = shift;

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
		rt->rt_ops->rtop_create(rt, rt->rt_arg);

	return (rt);
}

zfs_range_tree_t *
zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift)
{
	return (zfs_range_tree_create_gap(ops, type, arg, start, shift, 0));
}

void
zfs_range_tree_destroy(zfs_range_tree_t *rt)
{
	VERIFY0(rt->rt_space);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

	zfs_btree_destroy(&rt->rt_root);
	kmem_free(rt, sizeof (*rt));
}

void
zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    int64_t delta)
{
	if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) {
		zfs_panic_recover("zfs: attempting to decrease fill to or "
		    "below 0; probable double remove in segment [%llx:%llx]",
		    (longlong_t)zfs_rs_get_start(rs, rt),
		    (longlong_t)zfs_rs_get_end(rs, rt));
	}
	if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) -
	    zfs_rs_get_start(rs, rt)) {
		zfs_panic_recover("zfs: attempting to increase fill beyond "
		    "max; probable double add in segment [%llx:%llx]",
		    (longlong_t)zfs_rs_get_start(rs, rt),
		    (longlong_t)zfs_rs_get_end(rs, rt));
	}

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
	zfs_rs_set_fill(rs, rt, zfs_rs_get_fill(rs, rt) + delta);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}

static void
zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
{
	zfs_range_tree_t *rt = arg;
	zfs_btree_index_t where;
	zfs_range_seg_t *rs_before, *rs_after, *rs;
	zfs_range_seg_max_t tmp, rsearch;
	uint64_t end = start + size, gap = rt->rt_gap;
	uint64_t bridge_size = 0;
	boolean_t merge_before, merge_after;

	ASSERT3U(size, !=, 0);
	ASSERT3U(fill, <=, size);
	ASSERT3U(start + size, >, start);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);

	/*
	 * If this is a gap-supporting range tree, it is possible that we
	 * are inserting into an existing segment. In this case simply
	 * bump the fill count and call the remove / add callbacks. If the
	 * new range will extend an existing segment, we remove the
	 * existing one, apply the new extent to it and re-insert it using
	 * the normal code paths.
	 */
	if (rs != NULL) {
		if (gap == 0) {
			zfs_panic_recover("zfs: adding existent segment to "
			    "range tree (offset=%llx size=%llx)",
			    (longlong_t)start, (longlong_t)size);
			return;
		}
		uint64_t rstart = zfs_rs_get_start(rs, rt);
		uint64_t rend = zfs_rs_get_end(rs, rt);
		if (rstart <= start && rend >= end) {
			zfs_range_tree_adjust_fill(rt, rs, fill);
			return;
		}

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs);
		rt->rt_space -= rend - rstart;

		fill += zfs_rs_get_fill(rs, rt);
		start = MIN(start, rstart);
		end = MAX(end, rend);
		size = end - start;

		zfs_btree_remove(&rt->rt_root, rs);
		zfs_range_tree_add_impl(rt, start, size, fill);
		return;
	}

	ASSERT3P(rs, ==, NULL);

	/*
	 * Determine whether or not we will have to merge with our neighbors.
	 * If gap != 0, we might need to merge with our neighbors even if we
	 * aren't directly touching.
	 */
	zfs_btree_index_t where_before, where_after;
	rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before);
	rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after);

	merge_before = (rs_before != NULL && zfs_rs_get_end(rs_before, rt) >=
	    start - gap);
	merge_after = (rs_after != NULL && zfs_rs_get_start(rs_after, rt) <=
	    end + gap);

	if (merge_before && gap != 0)
		bridge_size += start - zfs_rs_get_end(rs_before, rt);
	if (merge_after && gap != 0)
		bridge_size += zfs_rs_get_start(rs_after, rt) - end;

	if (merge_before && merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
		}

		zfs_range_tree_stat_decr(rt, rs_before);
		zfs_range_tree_stat_decr(rt, rs_after);

		zfs_rs_copy(rs_after, &tmp, rt);
		uint64_t before_start = zfs_rs_get_start_raw(rs_before, rt);
		uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
		uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
		zfs_btree_remove_idx(&rt->rt_root, &where_before);

		/*
		 * We have to re-find the node because our old reference is
		 * invalid as soon as we do any mutating btree operations.
		 */
		rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after);
		ASSERT3P(rs_after, !=, NULL);
		zfs_rs_set_start_raw(rs_after, rt, before_start);
		zfs_rs_set_fill(rs_after, rt, after_fill + before_fill + fill);
		rs = rs_after;
	} else if (merge_before) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs_before);

		uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
		zfs_rs_set_end(rs_before, rt, end);
		zfs_rs_set_fill(rs_before, rt, before_fill + fill);
		rs = rs_before;
	} else if (merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs_after);

		uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
		zfs_rs_set_start(rs_after, rt, start);
		zfs_rs_set_fill(rs_after, rt, after_fill + fill);
		rs = rs_after;
	} else {
		rs = &tmp;

		zfs_rs_set_start(rs, rt, start);
		zfs_rs_set_end(rs, rt, end);
		zfs_rs_set_fill(rs, rt, fill);
		zfs_btree_add_idx(&rt->rt_root, rs, &where);
	}

	if (gap != 0) {
		ASSERT3U(zfs_rs_get_fill(rs, rt), <=, zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	} else {
		ASSERT3U(zfs_rs_get_fill(rs, rt), ==, zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	}

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	zfs_range_tree_stat_incr(rt, rs);
	rt->rt_space += size + bridge_size;
}

void
zfs_range_tree_add(void *arg, uint64_t start, uint64_t size)
{
	zfs_range_tree_add_impl(arg, start, size, size);
}

static void
zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
    boolean_t do_fill)
{
	zfs_btree_index_t where;
	zfs_range_seg_t *rs;
	zfs_range_seg_max_t rsearch, rs_tmp;
	uint64_t end = start + size;
	boolean_t left_over, right_over;

	VERIFY3U(size, !=, 0);
	VERIFY3U(size, <=, rt->rt_space);
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);

	/* Make sure we completely overlap with someone */
	if (rs == NULL) {
		zfs_panic_recover("zfs: removing nonexistent segment from "
		    "range tree (offset=%llx size=%llx)",
		    (longlong_t)start, (longlong_t)size);
		return;
	}

	/*
	 * Range trees with gap support must only remove complete segments
	 * from the tree. This allows us to maintain accurate fill accounting
	 * and to ensure that bridged sections are not leaked. If we need to
	 * remove less than the full segment, we can only adjust the fill count.
	 */
	if (rt->rt_gap != 0) {
		if (do_fill) {
			if (zfs_rs_get_fill(rs, rt) == size) {
				start = zfs_rs_get_start(rs, rt);
				end = zfs_rs_get_end(rs, rt);
				size = end - start;
			} else {
				zfs_range_tree_adjust_fill(rt, rs, -size);
				return;
			}
		} else if (zfs_rs_get_start(rs, rt) != start ||
		    zfs_rs_get_end(rs, rt) != end) {
			zfs_panic_recover("zfs: freeing partial segment of "
			    "gap tree (offset=%llx size=%llx) of "
			    "(offset=%llx size=%llx)",
			    (longlong_t)start, (longlong_t)size,
			    (longlong_t)zfs_rs_get_start(rs, rt),
			    (longlong_t)zfs_rs_get_end(rs, rt) -
			    zfs_rs_get_start(rs, rt));
			return;
		}
	}

	VERIFY3U(zfs_rs_get_start(rs, rt), <=, start);
	VERIFY3U(zfs_rs_get_end(rs, rt), >=, end);

	left_over = (zfs_rs_get_start(rs, rt) != start);
	right_over = (zfs_rs_get_end(rs, rt) != end);

	zfs_range_tree_stat_decr(rt, rs);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	if (left_over && right_over) {
		zfs_range_seg_max_t newseg;
		zfs_rs_set_start(&newseg, rt, end);
		zfs_rs_set_end_raw(&newseg, rt, zfs_rs_get_end_raw(rs, rt));
		zfs_rs_set_fill(&newseg, rt, zfs_rs_get_end(rs, rt) - end);
		zfs_range_tree_stat_incr(rt, &newseg);

		// This modifies the buffer already inside the range tree
		zfs_rs_set_end(rs, rt, start);

		zfs_rs_copy(rs, &rs_tmp, rt);
		if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL)
			zfs_btree_add_idx(&rt->rt_root, &newseg, &where);
		else
			zfs_btree_add(&rt->rt_root, &newseg);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg);
	} else if (left_over) {
		// This modifies the buffer already inside the range tree
		zfs_rs_set_end(rs, rt, start);
		zfs_rs_copy(rs, &rs_tmp, rt);
	} else if (right_over) {
		// This modifies the buffer already inside the range tree
		zfs_rs_set_start(rs, rt, end);
		zfs_rs_copy(rs, &rs_tmp, rt);
	} else {
		zfs_btree_remove_idx(&rt->rt_root, &where);
		rs = NULL;
	}

	if (rs != NULL) {
		/*
		 * The fill of the leftover segment will always be equal to
		 * the size, since we do not support removing partial segments
		 * of range trees with gaps.
		 */
		zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
		    zfs_rs_get_start_raw(rs, rt));
		zfs_range_tree_stat_incr(rt, &rs_tmp);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg);
	}

	rt->rt_space -= size;
}

void
zfs_range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
	zfs_range_tree_remove_impl(arg, start, size, B_FALSE);
}

void
zfs_range_tree_remove_fill(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_tree_remove_impl(rt, start, size, B_TRUE);
}

void
zfs_range_tree_resize_segment(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    uint64_t newstart, uint64_t newsize)
{
	int64_t delta = newsize - (zfs_rs_get_end(rs, rt) -
	    zfs_rs_get_start(rs, rt));

	zfs_range_tree_stat_decr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	zfs_rs_set_start(rs, rt, newstart);
	zfs_rs_set_end(rs, rt, newstart + newsize);

	zfs_range_tree_stat_incr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	rt->rt_space += delta;
}

static zfs_range_seg_t *
zfs_range_tree_find_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_seg_max_t rsearch;
	uint64_t end = start + size;

	VERIFY(size != 0);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	return (zfs_btree_find(&rt->rt_root, &rsearch, NULL));
}

zfs_range_seg_t *
zfs_range_tree_find(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_range_seg_t *rs = zfs_range_tree_find_impl(rt, start, size);
	if (rs != NULL && zfs_rs_get_start(rs, rt) <= start &&
	    zfs_rs_get_end(rs, rt) >= start + size) {
		return (rs);
	}
	return (NULL);
}

void
zfs_range_tree_verify_not_present(zfs_range_tree_t *rt, uint64_t off,
    uint64_t size)
{
	zfs_range_seg_t *rs = zfs_range_tree_find(rt, off, size);
	if (rs != NULL)
		panic("segment already in tree; rs=%p", (void *)rs);
}

boolean_t
zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	return (zfs_range_tree_find(rt, start, size) != NULL);
}

/*
 * Returns the first subset of the given range which overlaps with the range
 * tree. Returns true if there is a segment in the range, and false if there
 * isn't.
 */
boolean_t
zfs_range_tree_find_in(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
    uint64_t *ostart, uint64_t *osize)
{
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_range_seg_max_t rsearch;
	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end_raw(&rsearch, rt, zfs_rs_get_start_raw(&rsearch, rt) +
	    1);

	zfs_btree_index_t where;
	zfs_range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
	if (rs != NULL) {
		*ostart = start;
		*osize = MIN(size, zfs_rs_get_end(rs, rt) - start);
		return (B_TRUE);
	}

	rs = zfs_btree_next(&rt->rt_root, &where, &where);
	if (rs == NULL || zfs_rs_get_start(rs, rt) > start + size)
		return (B_FALSE);

	*ostart = zfs_rs_get_start(rs, rt);
	*osize = MIN(start + size, zfs_rs_get_end(rs, rt)) -
	    zfs_rs_get_start(rs, rt);
	return (B_TRUE);
}
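
/*
 * Illustrative sketch (not part of this file): a caller can visit every
 * portion of [start, start + size) that is present in the tree by calling
 * zfs_range_tree_find_in() repeatedly and advancing past each chunk.
 * process() below is a hypothetical per-chunk callback:
 *
 *	uint64_t ostart, osize;
 *	while (size > 0 &&
 *	    zfs_range_tree_find_in(rt, start, size, &ostart, &osize)) {
 *		process(ostart, osize);
 *		uint64_t skip = (ostart + osize) - start;
 *		start += skip;
 *		size -= skip;
 *	}
 */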

/*
 * Ensure that this range is not in the tree, regardless of whether
 * it is currently in the tree.
 */
void
zfs_range_tree_clear(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_seg_t *rs;

	if (size == 0)
		return;

	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	while ((rs = zfs_range_tree_find_impl(rt, start, size)) != NULL) {
		uint64_t free_start = MAX(zfs_rs_get_start(rs, rt), start);
		uint64_t free_end = MIN(zfs_rs_get_end(rs, rt), start + size);
		zfs_range_tree_remove(rt, free_start, free_end - free_start);
	}
}

void
zfs_range_tree_swap(zfs_range_tree_t **rtsrc, zfs_range_tree_t **rtdst)
{
	zfs_range_tree_t *rt;

	ASSERT0(zfs_range_tree_space(*rtdst));
	ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root));

	rt = *rtsrc;
	*rtsrc = *rtdst;
	*rtdst = rt;
}

void
zfs_range_tree_vacate(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
    void *arg)
{
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

	if (func != NULL) {
		zfs_range_seg_t *rs;
		zfs_btree_index_t *cookie = NULL;

		while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) !=
		    NULL) {
			func(arg, zfs_rs_get_start(rs, rt),
			    zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt));
		}
	} else {
		zfs_btree_clear(&rt->rt_root);
	}

	memset(rt->rt_histogram, 0, sizeof (rt->rt_histogram));
	rt->rt_space = 0;
}

void
zfs_range_tree_walk(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
    void *arg)
{
	zfs_btree_index_t where;
	for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where);
	    rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		func(arg, zfs_rs_get_start(rs, rt), zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	}
}

zfs_range_seg_t *
zfs_range_tree_first(zfs_range_tree_t *rt)
{
	return (zfs_btree_first(&rt->rt_root, NULL));
}

uint64_t
zfs_range_tree_space(zfs_range_tree_t *rt)
{
	return (rt->rt_space);
}

uint64_t
zfs_range_tree_numsegs(zfs_range_tree_t *rt)
{
	return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root));
}

boolean_t
zfs_range_tree_is_empty(zfs_range_tree_t *rt)
{
	ASSERT(rt != NULL);
	return (zfs_range_tree_space(rt) == 0);
}

/*
 * Remove any overlapping ranges between the given segment [start, end)
 * from removefrom. Add non-overlapping leftovers to addto.
 */
void
zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
    zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
{
	zfs_btree_index_t where;
	zfs_range_seg_max_t starting_rs;
	zfs_rs_set_start(&starting_rs, removefrom, start);
	zfs_rs_set_end_raw(&starting_rs, removefrom,
	    zfs_rs_get_start_raw(&starting_rs, removefrom) + 1);

	zfs_range_seg_t *curr = zfs_btree_find(&removefrom->rt_root,
	    &starting_rs, &where);

	if (curr == NULL)
		curr = zfs_btree_next(&removefrom->rt_root, &where, &where);

	zfs_range_seg_t *next;
	for (; curr != NULL; curr = next) {
		if (start == end)
			return;
		VERIFY3U(start, <, end);

		/* there is no overlap */
		if (end <= zfs_rs_get_start(curr, removefrom)) {
			zfs_range_tree_add(addto, start, end - start);
			return;
		}

		uint64_t overlap_start = MAX(zfs_rs_get_start(curr, removefrom),
		    start);
		uint64_t overlap_end = MIN(zfs_rs_get_end(curr, removefrom),
		    end);
		uint64_t overlap_size = overlap_end - overlap_start;
		ASSERT3S(overlap_size, >, 0);
		zfs_range_seg_max_t rs;
		zfs_rs_copy(curr, &rs, removefrom);

		zfs_range_tree_remove(removefrom, overlap_start, overlap_size);

		if (start < overlap_start)
			zfs_range_tree_add(addto, start, overlap_start - start);

		start = overlap_end;
		next = zfs_btree_find(&removefrom->rt_root, &rs, &where);
		/*
		 * If we find something here, we only removed part of the
		 * curr segment. Either there's some left at the end
		 * because we've reached the end of the range we're removing,
		 * or there's some left at the start because we started
		 * partway through the range. Either way, we continue with
		 * the loop. If it's the former, we'll return at the start of
		 * the loop, and if it's the latter we'll see if there is more
		 * area to process.
		 */
		if (next != NULL) {
			ASSERT(start == end || start == zfs_rs_get_end(&rs,
			    removefrom));
		}

		next = zfs_btree_next(&removefrom->rt_root, &where, &where);
	}
	VERIFY3P(curr, ==, NULL);

	if (start != end) {
		VERIFY3U(start, <, end);
		zfs_range_tree_add(addto, start, end - start);
	} else {
		VERIFY3U(start, ==, end);
	}
}

/*
 * For each entry in rt, if it exists in removefrom, remove it
 * from removefrom. Otherwise, add it to addto.
 */
void
zfs_range_tree_remove_xor_add(zfs_range_tree_t *rt,
    zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
{
	zfs_btree_index_t where;
	for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs;
	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		zfs_range_tree_remove_xor_add_segment(zfs_rs_get_start(rs, rt),
		    zfs_rs_get_end(rs, rt), removefrom, addto);
	}
}
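
/*
 * Worked example (illustrative): if rt contains [0, 10) and [20, 30) while
 * removefrom contains [5, 25), then after
 *
 *	zfs_range_tree_remove_xor_add(rt, removefrom, addto);
 *
 * removefrom holds only [10, 20), and addto has gained [0, 5) and [25, 30),
 * i.e. the portions of rt that were not present in removefrom.
 */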

uint64_t
zfs_range_tree_min(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL);
	return (rs != NULL ? zfs_rs_get_start(rs, rt) : 0);
}

uint64_t
zfs_range_tree_max(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL);
	return (rs != NULL ? zfs_rs_get_end(rs, rt) : 0);
}

uint64_t
zfs_range_tree_span(zfs_range_tree_t *rt)
{
	return (zfs_range_tree_max(rt) - zfs_range_tree_min(rt));
}