// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>

/*
 * Range trees are tree-based data structures that can be used to
 * track free space or generally any space allocation information.
 * A range tree keeps track of individual segments and automatically
 * provides facilities such as adjacent extent merging and extent
 * splitting in response to range add/remove requests.
 *
 * A range tree starts out completely empty, with no segments in it.
 * Adding an allocation via zfs_range_tree_add to the range tree can either:
 * 1) create a new extent
 * 2) extend an adjacent extent
 * 3) merge two adjacent extents
 * Conversely, removing an allocation via zfs_range_tree_remove can:
 * 1) completely remove an extent
 * 2) shorten an extent (if the allocation was near one of its ends)
 * 3) split an extent into two extents, in effect punching a hole
 *
 * A range tree is also capable of 'bridging' gaps when adding
 * allocations. This is useful for cases when close proximity of
 * allocations is an important detail that needs to be represented
 * in the range tree. See zfs_range_tree_set_gap(). The default behavior
 * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
 *
 * In order to traverse a range tree, use either the zfs_range_tree_walk()
 * or zfs_range_tree_vacate() functions.
 *
 * To obtain more accurate information on individual segment
 * operations that the range tree performs "under the hood", you can
 * specify a set of callbacks by passing a zfs_range_tree_ops_t structure
 * to the zfs_range_tree_create function. Any callbacks that are non-NULL
 * are then called at the appropriate times.
 *
 * The range tree code also supports a special variant of range trees
 * that can bridge small gaps between segments. This kind of tree is used
 * by the dsl scanning code to group I/Os into mostly sequential chunks to
 * optimize disk performance. The code here attempts to do this with as
 * little memory and computational overhead as possible. One limitation of
 * this implementation is that segments of range trees with gaps can only
 * support removing complete segments.
 */
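
/*
 * Illustrative usage (a minimal, non-compiled sketch; the metaslab and
 * scan code are the real consumers). Note that a tree must be emptied
 * before it is destroyed, since zfs_range_tree_destroy() verifies that
 * no space remains accounted:
 *
 *	zfs_range_tree_t *rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
 *	    NULL, 0, 0);
 *	zfs_range_tree_add(rt, 0x1000, 0x2000);    // segment [0x1000, 0x3000)
 *	zfs_range_tree_add(rt, 0x3000, 0x1000);    // merges into [0x1000, 0x4000)
 *	zfs_range_tree_remove(rt, 0x2000, 0x800);  // punches a hole
 *	ASSERT3U(zfs_range_tree_space(rt), ==, 0x2800);
 *	zfs_range_tree_vacate(rt, NULL, NULL);     // empty it before destroy
 *	zfs_range_tree_destroy(rt);
 */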

static inline void
zfs_rs_copy(zfs_range_seg_t *src, zfs_range_seg_t *dest, zfs_range_tree_t *rt)
{
	ASSERT3U(rt->rt_type, <, ZFS_RANGE_SEG_NUM_TYPES);
	size_t size = 0;
	switch (rt->rt_type) {
	case ZFS_RANGE_SEG32:
		size = sizeof (zfs_range_seg32_t);
		break;
	case ZFS_RANGE_SEG64:
		size = sizeof (zfs_range_seg64_t);
		break;
	case ZFS_RANGE_SEG_GAP:
		size = sizeof (zfs_range_seg_gap_t);
		break;
	default:
		__builtin_unreachable();
	}
	memcpy(dest, src, size);
}

void
zfs_range_tree_stat_verify(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs;
	zfs_btree_index_t where;
	uint64_t hist[ZFS_RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
	int i;

	for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL;
	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		uint64_t size = zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt);
		int idx = highbit64(size) - 1;

		hist[idx]++;
		ASSERT3U(hist[idx], !=, 0);
	}

	for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
		if (hist[i] != rt->rt_histogram[i]) {
			zfs_dbgmsg("i=%d, hist=%px, hist=%llu, rt_hist=%llu",
			    i, hist, (u_longlong_t)hist[i],
			    (u_longlong_t)rt->rt_histogram[i]);
		}
		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
	}
}

static void
zfs_range_tree_stat_incr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
{
	uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	rt->rt_histogram[idx]++;
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
}

static void
zfs_range_tree_stat_decr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
{
	uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	ASSERT3U(rt->rt_histogram[idx], !=, 0);
	rt->rt_histogram[idx]--;
}
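
/*
 * Note on the histogram maintained above: since idx = highbit64(size) - 1,
 * bucket i counts segments whose size falls in [2^i, 2^(i+1)). For example,
 * a 4096-byte segment lands in bucket 12.
 */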

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg32_compare(const void *x1, const void *x2)
{
	const zfs_range_seg32_t *r1 = x1;
	const zfs_range_seg32_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg64_compare(const void *x1, const void *x2)
{
	const zfs_range_seg64_t *r1 = x1;
	const zfs_range_seg64_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg_gap_compare(const void *x1, const void *x2)
{
	const zfs_range_seg_gap_t *r1 = x1;
	const zfs_range_seg_gap_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
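
/*
 * The comparators above encode "entirely after" as 1, "entirely before" as
 * -1, and any overlap as 0, so zfs_btree_find() treats any overlapping
 * segment as a match. For example, comparing [5, 15) against [10, 20)
 * yields (5 >= 20) - (15 <= 10) = 0 - 0 = 0.
 */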

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg32_find_in_buf, zfs_range_seg32_t,
    zfs_range_tree_seg32_compare)

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t,
    zfs_range_tree_seg64_compare)

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf,
    zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare)

static zfs_range_tree_t *
zfs_range_tree_create_impl(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
    uint64_t gap, uint64_t flags, const char *name)
{
	zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP);

	ASSERT3U(shift, <, 64);
	ASSERT3U(type, <=, ZFS_RANGE_SEG_NUM_TYPES);
	size_t size;
	int (*compare) (const void *, const void *);
	bt_find_in_buf_f bt_find;
	switch (type) {
	case ZFS_RANGE_SEG32:
		size = sizeof (zfs_range_seg32_t);
		compare = zfs_range_tree_seg32_compare;
		bt_find = zfs_range_tree_seg32_find_in_buf;
		break;
	case ZFS_RANGE_SEG64:
		size = sizeof (zfs_range_seg64_t);
		compare = zfs_range_tree_seg64_compare;
		bt_find = zfs_range_tree_seg64_find_in_buf;
		break;
	case ZFS_RANGE_SEG_GAP:
		size = sizeof (zfs_range_seg_gap_t);
		compare = zfs_range_tree_seg_gap_compare;
		bt_find = zfs_range_tree_seg_gap_find_in_buf;
		break;
	default:
		panic("Invalid range seg type %d", type);
	}
	zfs_btree_create(&rt->rt_root, compare, bt_find, size);

	rt->rt_ops = ops;
	rt->rt_gap = gap;
	rt->rt_flags = flags;
	rt->rt_name = name;
	rt->rt_arg = arg;
	rt->rt_type = type;
	rt->rt_start = start;
	rt->rt_shift = shift;

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
		rt->rt_ops->rtop_create(rt, rt->rt_arg);

	return (rt);
}

zfs_range_tree_t *
zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
    uint64_t gap)
{
	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, gap,
	    0, NULL));
}

zfs_range_tree_t *
zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift)
{
	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
	    0, NULL));
}

zfs_range_tree_t *
zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
    uint64_t flags, const char *name)
{
	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
	    flags, name));
}

void
zfs_range_tree_destroy(zfs_range_tree_t *rt)
{
	VERIFY0(rt->rt_space);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

	if (rt->rt_name != NULL && (rt->rt_flags & ZFS_RT_F_DYN_NAME))
		kmem_strfree((char *)(uintptr_t)rt->rt_name);

	zfs_btree_destroy(&rt->rt_root);
	kmem_free(rt, sizeof (*rt));
}

void
zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    int64_t delta)
{
	if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) {
		zfs_panic_recover("zfs: rt=%s: attempting to decrease fill to "
		    "or below 0; probable double remove in segment [%llx:%llx]",
		    ZFS_RT_NAME(rt),
		    (longlong_t)zfs_rs_get_start(rs, rt),
		    (longlong_t)zfs_rs_get_end(rs, rt));
	}
	if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) -
	    zfs_rs_get_start(rs, rt)) {
		zfs_panic_recover("zfs: rt=%s: attempting to increase fill "
		    "beyond max; probable double add in segment [%llx:%llx]",
		    ZFS_RT_NAME(rt),
		    (longlong_t)zfs_rs_get_start(rs, rt),
		    (longlong_t)zfs_rs_get_end(rs, rt));
	}

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
	zfs_rs_set_fill(rs, rt, zfs_rs_get_fill(rs, rt) + delta);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}

static void
zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
{
	zfs_range_tree_t *rt = arg;
	zfs_btree_index_t where;
	zfs_range_seg_t *rs_before, *rs_after, *rs;
	zfs_range_seg_max_t tmp, rsearch;
	uint64_t end = start + size, gap = rt->rt_gap;
	uint64_t bridge_size = 0;
	boolean_t merge_before, merge_after;

	ASSERT3U(size, !=, 0);
	ASSERT3U(fill, <=, size);
	ASSERT3U(start + size, >, start);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);

	/*
	 * If this is a gap-supporting range tree, it is possible that we
	 * are inserting into an existing segment. In this case simply
	 * bump the fill count and call the remove / add callbacks. If the
	 * new range will extend an existing segment, we remove the
	 * existing one, apply the new extent to it and re-insert it using
	 * the normal code paths.
	 */
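	/*
	 * For example, in a tree created with gap = 2 that holds the
	 * segment [0, 10) with fill 10, adding [11, 15) with fill 4 does
	 * not overlap, so it falls through to the merge logic below:
	 * the 1-unit bridge is within the gap, the segment becomes
	 * [0, 15) with fill 14, and rt_space grows by size + bridge = 5.
	 */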
	if (rs != NULL) {
		uint64_t rstart = zfs_rs_get_start(rs, rt);
		uint64_t rend = zfs_rs_get_end(rs, rt);
		if (gap == 0) {
			zfs_panic_recover("zfs: rt=%s: adding segment "
			    "(offset=%llx size=%llx) overlapping with existing "
			    "one (offset=%llx size=%llx)",
			    ZFS_RT_NAME(rt),
			    (longlong_t)start, (longlong_t)size,
			    (longlong_t)rstart, (longlong_t)(rend - rstart));
			return;
		}
		if (rstart <= start && rend >= end) {
			zfs_range_tree_adjust_fill(rt, rs, fill);
			return;
		}

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs);
		rt->rt_space -= rend - rstart;

		fill += zfs_rs_get_fill(rs, rt);
		start = MIN(start, rstart);
		end = MAX(end, rend);
		size = end - start;

		zfs_btree_remove(&rt->rt_root, rs);
		zfs_range_tree_add_impl(rt, start, size, fill);
		return;
	}

	ASSERT3P(rs, ==, NULL);

	/*
	 * Determine whether or not we will have to merge with our neighbors.
	 * If gap != 0, we might need to merge with our neighbors even if we
	 * aren't directly touching.
	 */
	zfs_btree_index_t where_before, where_after;
	rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before);
	rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after);

	merge_before = (rs_before != NULL && zfs_rs_get_end(rs_before, rt) >=
	    start - gap);
	merge_after = (rs_after != NULL && zfs_rs_get_start(rs_after, rt) <=
	    end + gap);

	if (merge_before && gap != 0)
		bridge_size += start - zfs_rs_get_end(rs_before, rt);
	if (merge_after && gap != 0)
		bridge_size += zfs_rs_get_start(rs_after, rt) - end;

	if (merge_before && merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
		}

		zfs_range_tree_stat_decr(rt, rs_before);
		zfs_range_tree_stat_decr(rt, rs_after);

		zfs_rs_copy(rs_after, &tmp, rt);
		uint64_t before_start = zfs_rs_get_start_raw(rs_before, rt);
		uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
		uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
		zfs_btree_remove_idx(&rt->rt_root, &where_before);

		/*
		 * We have to re-find the node because our old reference is
		 * invalid as soon as we do any mutating btree operations.
		 */
		rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after);
		ASSERT3P(rs_after, !=, NULL);
		zfs_rs_set_start_raw(rs_after, rt, before_start);
		zfs_rs_set_fill(rs_after, rt, after_fill + before_fill + fill);
		rs = rs_after;
	} else if (merge_before) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs_before);

		uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
		zfs_rs_set_end(rs_before, rt, end);
		zfs_rs_set_fill(rs_before, rt, before_fill + fill);
		rs = rs_before;
	} else if (merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs_after);

		uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
		zfs_rs_set_start(rs_after, rt, start);
		zfs_rs_set_fill(rs_after, rt, after_fill + fill);
		rs = rs_after;
	} else {
		rs = &tmp;

		zfs_rs_set_start(rs, rt, start);
		zfs_rs_set_end(rs, rt, end);
		zfs_rs_set_fill(rs, rt, fill);
		zfs_btree_add_idx(&rt->rt_root, rs, &where);
	}

	if (gap != 0) {
		ASSERT3U(zfs_rs_get_fill(rs, rt), <=, zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	} else {
		ASSERT3U(zfs_rs_get_fill(rs, rt), ==, zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	}

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	zfs_range_tree_stat_incr(rt, rs);
	rt->rt_space += size + bridge_size;
}

void
zfs_range_tree_add(void *arg, uint64_t start, uint64_t size)
{
	zfs_range_tree_add_impl(arg, start, size, size);
}

static void
zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
    boolean_t do_fill)
{
	zfs_btree_index_t where;
	zfs_range_seg_t *rs;
	zfs_range_seg_max_t rsearch, rs_tmp;
	uint64_t end = start + size;
	uint64_t rstart, rend;
	boolean_t left_over, right_over;

	VERIFY3U(size, !=, 0);
	VERIFY3U(size, <=, rt->rt_space);
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);

	/* Make sure we completely overlap with someone */
	if (rs == NULL) {
		zfs_panic_recover("zfs: rt=%s: removing nonexistent segment "
		    "from range tree (offset=%llx size=%llx)",
		    ZFS_RT_NAME(rt), (longlong_t)start, (longlong_t)size);
		return;
	}

	rstart = zfs_rs_get_start(rs, rt);
	rend = zfs_rs_get_end(rs, rt);

	/*
	 * Range trees with gap support must only remove complete segments
	 * from the tree. This allows us to maintain accurate fill accounting
	 * and to ensure that bridged sections are not leaked. If we need to
	 * remove less than the full segment, we can only adjust the fill count.
	 */
	if (rt->rt_gap != 0) {
		if (do_fill) {
			if (zfs_rs_get_fill(rs, rt) == size) {
				start = rstart;
				end = rend;
				size = end - start;
			} else {
				zfs_range_tree_adjust_fill(rt, rs, -size);
				return;
			}
		} else if (rstart != start || rend != end) {
			zfs_panic_recover("zfs: rt=%s: freeing partial segment "
			    "of gap tree (offset=%llx size=%llx) of "
			    "(offset=%llx size=%llx)",
			    ZFS_RT_NAME(rt),
			    (longlong_t)start, (longlong_t)size,
			    (longlong_t)rstart, (longlong_t)(rend - rstart));
			return;
		}
	}

	if (!(rstart <= start && rend >= end)) {
		panic("zfs: rt=%s: removing segment "
		    "(offset=%llx size=%llx) not completely overlapped by "
		    "existing one (offset=%llx size=%llx)",
		    ZFS_RT_NAME(rt),
		    (longlong_t)start, (longlong_t)size,
		    (longlong_t)rstart, (longlong_t)(rend - rstart));
		return;
	}

	left_over = (rstart != start);
	right_over = (rend != end);

	zfs_range_tree_stat_decr(rt, rs);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	if (left_over && right_over) {
		zfs_range_seg_max_t newseg;
		zfs_rs_set_start(&newseg, rt, end);
		zfs_rs_set_end_raw(&newseg, rt, zfs_rs_get_end_raw(rs, rt));
		zfs_rs_set_fill(&newseg, rt, zfs_rs_get_end(rs, rt) - end);
		zfs_range_tree_stat_incr(rt, &newseg);

		// This modifies the buffer already inside the range tree
		zfs_rs_set_end(rs, rt, start);

		zfs_rs_copy(rs, &rs_tmp, rt);
		if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL)
			zfs_btree_add_idx(&rt->rt_root, &newseg, &where);
		else
			zfs_btree_add(&rt->rt_root, &newseg);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg);
	} else if (left_over) {
		// This modifies the buffer already inside the range tree
		zfs_rs_set_end(rs, rt, start);
		zfs_rs_copy(rs, &rs_tmp, rt);
	} else if (right_over) {
		// This modifies the buffer already inside the range tree
		zfs_rs_set_start(rs, rt, end);
		zfs_rs_copy(rs, &rs_tmp, rt);
	} else {
		zfs_btree_remove_idx(&rt->rt_root, &where);
		rs = NULL;
	}

	if (rs != NULL) {
		/*
		 * The fill of the leftover segment will always be equal to
		 * the size, since we do not support removing partial segments
		 * of range trees with gaps.
		 */
		zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
		    zfs_rs_get_start_raw(rs, rt));
		zfs_range_tree_stat_incr(rt, &rs_tmp);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg);
	}

	rt->rt_space -= size;
}

void
zfs_range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
	zfs_range_tree_remove_impl(arg, start, size, B_FALSE);
}

void
zfs_range_tree_remove_fill(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_tree_remove_impl(rt, start, size, B_TRUE);
}

void
zfs_range_tree_resize_segment(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    uint64_t newstart, uint64_t newsize)
{
	int64_t delta = newsize - (zfs_rs_get_end(rs, rt) -
	    zfs_rs_get_start(rs, rt));

	zfs_range_tree_stat_decr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	zfs_rs_set_start(rs, rt, newstart);
	zfs_rs_set_end(rs, rt, newstart + newsize);

	zfs_range_tree_stat_incr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	rt->rt_space += delta;
}

static zfs_range_seg_t *
zfs_range_tree_find_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_seg_max_t rsearch;
	uint64_t end = start + size;

	VERIFY(size != 0);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	return (zfs_btree_find(&rt->rt_root, &rsearch, NULL));
}

zfs_range_seg_t *
zfs_range_tree_find(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_range_seg_t *rs = zfs_range_tree_find_impl(rt, start, size);
	if (rs != NULL && zfs_rs_get_start(rs, rt) <= start &&
	    zfs_rs_get_end(rs, rt) >= start + size) {
		return (rs);
	}
	return (NULL);
}

void
zfs_range_tree_verify_not_present(zfs_range_tree_t *rt, uint64_t off,
    uint64_t size)
{
	zfs_range_seg_t *rs = zfs_range_tree_find(rt, off, size);
	if (rs != NULL)
		panic("segment already in tree; rs=%p", (void *)rs);
}

boolean_t
zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	return (zfs_range_tree_find(rt, start, size) != NULL);
}

/*
 * Returns the first subset of the given range which overlaps with the range
 * tree. Returns true if there is a segment in the range, and false if there
 * isn't.
 */
boolean_t
zfs_range_tree_find_in(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
    uint64_t *ostart, uint64_t *osize)
{
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_range_seg_max_t rsearch;
	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end_raw(&rsearch, rt, zfs_rs_get_start_raw(&rsearch, rt) +
	    1);

	zfs_btree_index_t where;
	zfs_range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
	if (rs != NULL) {
		*ostart = start;
		*osize = MIN(size, zfs_rs_get_end(rs, rt) - start);
		return (B_TRUE);
	}

	rs = zfs_btree_next(&rt->rt_root, &where, &where);
	if (rs == NULL || zfs_rs_get_start(rs, rt) >= start + size)
		return (B_FALSE);

	*ostart = zfs_rs_get_start(rs, rt);
	*osize = MIN(start + size, zfs_rs_get_end(rs, rt)) -
	    zfs_rs_get_start(rs, rt);
	return (B_TRUE);
}
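
/*
 * For example, if the tree holds only [100, 200), then
 * zfs_range_tree_find_in(rt, 50, 100, &ostart, &osize) returns B_TRUE with
 * ostart = 100 and osize = 50, the portion of [50, 150) that the tree covers.
 */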

/*
 * Ensure that this range is not in the tree, regardless of whether
 * it is currently in the tree.
 */
void
zfs_range_tree_clear(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_seg_t *rs;

	if (size == 0)
		return;

	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	while ((rs = zfs_range_tree_find_impl(rt, start, size)) != NULL) {
		uint64_t free_start = MAX(zfs_rs_get_start(rs, rt), start);
		uint64_t free_end = MIN(zfs_rs_get_end(rs, rt), start + size);
		zfs_range_tree_remove(rt, free_start, free_end - free_start);
	}
}

void
zfs_range_tree_swap(zfs_range_tree_t **rtsrc, zfs_range_tree_t **rtdst)
{
	zfs_range_tree_t *rt;

	ASSERT0(zfs_range_tree_space(*rtdst));
	ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root));

	rt = *rtsrc;
	*rtsrc = *rtdst;
	*rtdst = rt;
}

void
zfs_range_tree_vacate(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
    void *arg)
{
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

	if (func != NULL) {
		zfs_range_seg_t *rs;
		zfs_btree_index_t *cookie = NULL;

		while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) !=
		    NULL) {
			func(arg, zfs_rs_get_start(rs, rt),
			    zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt));
		}
	} else {
		zfs_btree_clear(&rt->rt_root);
	}

	memset(rt->rt_histogram, 0, sizeof (rt->rt_histogram));
	rt->rt_space = 0;
}

void
zfs_range_tree_walk(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
    void *arg)
{
	zfs_btree_index_t where;
	for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where);
	    rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		func(arg, zfs_rs_get_start(rs, rt), zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	}
}

zfs_range_seg_t *
zfs_range_tree_first(zfs_range_tree_t *rt)
{
	return (zfs_btree_first(&rt->rt_root, NULL));
}

uint64_t
zfs_range_tree_space(zfs_range_tree_t *rt)
{
	return (rt->rt_space);
}

uint64_t
zfs_range_tree_numsegs(zfs_range_tree_t *rt)
{
	return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root));
}

boolean_t
zfs_range_tree_is_empty(zfs_range_tree_t *rt)
{
	ASSERT(rt != NULL);
	return (zfs_range_tree_space(rt) == 0);
}

/*
 * Remove any overlapping ranges between the given segment [start, end)
 * from removefrom. Add non-overlapping leftovers to addto.
 */
void
zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
    zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
{
	zfs_btree_index_t where;
	zfs_range_seg_max_t starting_rs;
	zfs_rs_set_start(&starting_rs, removefrom, start);
	zfs_rs_set_end_raw(&starting_rs, removefrom,
	    zfs_rs_get_start_raw(&starting_rs, removefrom) + 1);

	zfs_range_seg_t *curr = zfs_btree_find(&removefrom->rt_root,
	    &starting_rs, &where);

	if (curr == NULL)
		curr = zfs_btree_next(&removefrom->rt_root, &where, &where);

	zfs_range_seg_t *next;
	for (; curr != NULL; curr = next) {
		if (start == end)
			return;
		VERIFY3U(start, <, end);

		/* there is no overlap */
		if (end <= zfs_rs_get_start(curr, removefrom)) {
			zfs_range_tree_add(addto, start, end - start);
			return;
		}

		uint64_t overlap_start = MAX(zfs_rs_get_start(curr, removefrom),
		    start);
		uint64_t overlap_end = MIN(zfs_rs_get_end(curr, removefrom),
		    end);
		uint64_t overlap_size = overlap_end - overlap_start;
		ASSERT3S(overlap_size, >, 0);
		zfs_range_seg_max_t rs;
		zfs_rs_copy(curr, &rs, removefrom);

		zfs_range_tree_remove(removefrom, overlap_start, overlap_size);

		if (start < overlap_start)
			zfs_range_tree_add(addto, start, overlap_start - start);

		start = overlap_end;
		next = zfs_btree_find(&removefrom->rt_root, &rs, &where);
		/*
		 * If we find something here, we only removed part of the
		 * curr segment. Either there's some left at the end
		 * because we've reached the end of the range we're removing,
		 * or there's some left at the start because we started
		 * partway through the range. Either way, we continue with
		 * the loop. If it's the former, we'll return at the start of
		 * the loop, and if it's the latter we'll see if there is more
		 * area to process.
		 */
		if (next != NULL) {
			ASSERT(start == end || start == zfs_rs_get_end(&rs,
			    removefrom));
		}

		next = zfs_btree_next(&removefrom->rt_root, &where, &where);
	}
	VERIFY3P(curr, ==, NULL);

	if (start != end) {
		VERIFY3U(start, <, end);
		zfs_range_tree_add(addto, start, end - start);
	} else {
		VERIFY3U(start, ==, end);
	}
}

/*
 * For each entry in rt, if it exists in removefrom, remove it
 * from removefrom. Otherwise, add it to addto.
 */
void
zfs_range_tree_remove_xor_add(zfs_range_tree_t *rt,
    zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
{
	zfs_btree_index_t where;
	for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs;
	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		zfs_range_tree_remove_xor_add_segment(zfs_rs_get_start(rs, rt),
		    zfs_rs_get_end(rs, rt), removefrom, addto);
	}
}
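
/*
 * For example, with rt = {[0, 10)}, removefrom = {[5, 15)} and an empty
 * addto, this leaves removefrom = {[10, 15)} and adds [0, 5) to addto:
 * the overlapping part [5, 10) is removed from removefrom, and the part
 * of rt that removefrom did not cover is added to addto.
 */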

uint64_t
zfs_range_tree_min(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL);
	return (rs != NULL ? zfs_rs_get_start(rs, rt) : 0);
}

uint64_t
zfs_range_tree_max(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL);
	return (rs != NULL ? zfs_rs_get_end(rs, rt) : 0);
}

uint64_t
zfs_range_tree_span(zfs_range_tree_t *rt)
{
	return (zfs_range_tree_max(rt) - zfs_range_tree_min(rt));
}