1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * This file and its contents are supplied under the terms of the
6 * Common Development and Distribution License ("CDDL"), version 1.0.
7 * You may only use this file in accordance with the terms of version
8 * 1.0 of the CDDL.
9 *
10 * A full copy of the text of the CDDL should have accompanied this
11 * source. A copy of the CDDL is also available via the Internet at
12 * http://www.illumos.org/license/CDDL.
13 *
14 * CDDL HEADER END
15 */
16 /*
17 * Copyright (c) 2019 by Delphix. All rights reserved.
18 */
19
20 #include <sys/btree.h>
21 #include <sys/bitops.h>
22 #include <sys/zfs_context.h>
23
24 kmem_cache_t *zfs_btree_leaf_cache;
25
26 /*
27 * Control the extent of the verification that occurs when zfs_btree_verify is
28 * called. Primarily used for debugging when extending the btree logic and
29 * functionality. As the intensity is increased, new verification steps are
30 * added. These steps are cumulative; intensity = 3 includes the intensity = 1
31 * and intensity = 2 steps as well.
32 *
33 * Intensity 1: Verify that the tree's height is consistent throughout.
34 * Intensity 2: Verify that a core node's children's parent pointers point
35 * to the core node.
36 * Intensity 3: Verify that the total number of elements in the tree matches the
37 * sum of the number of elements in each node. Also verifies that each node's
38 * count obeys the invariants (less than or equal to maximum value, greater than
39 * or equal to half the maximum minus one).
40 * Intensity 4: Verify that each element compares less than the element
41 * immediately after it and greater than the one immediately before it using the
42 * comparator function. For core nodes, also checks that each element is greater
43 * than the last element in the first of the two nodes it separates, and less
44 * than the first element in the second of the two nodes.
45 * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside
46 * of each node is poisoned appropriately. Note that poisoning always occurs if
47 * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal
48 * operation.
49 *
50 * Intensity 4 and 5 are particularly expensive to perform; the previous levels
51 * are a few memory operations per node, while these levels require multiple
52 * operations per element. In addition, when creating large btrees, these
53 * operations are called at every step, resulting in extremely slow operation
54 * (while the asymptotic complexity of the other steps is the same, the
55 * importance of the constant factors cannot be denied).
56 */
57 uint_t zfs_btree_verify_intensity = 0;
58
59 /*
60 * Convenience functions to silence warnings from memcpy/memmove's
61 * return values and change argument order to src, dest.
62 */
/*
 * Copy size bytes from src to dest (non-overlapping), taking arguments in
 * (src, dest) order and discarding memcpy's return value.
 */
static void
bcpy(const void *src, void *dest, size_t size)
{
	(void) memcpy(dest, src, size);
}
68
/*
 * Move size bytes from src to dest (regions may overlap), taking arguments
 * in (src, dest) order and discarding memmove's return value.
 */
static void
bmov(const void *src, void *dest, size_t size)
{
	(void) memmove(dest, src, size);
}
74
75 static boolean_t
zfs_btree_is_core(struct zfs_btree_hdr * hdr)76 zfs_btree_is_core(struct zfs_btree_hdr *hdr)
77 {
78 return (hdr->bth_first == -1);
79 }
80
81 #ifdef _ILP32
82 #define BTREE_POISON 0xabadb10c
83 #else
84 #define BTREE_POISON 0xabadb10cdeadbeef
85 #endif
86
/*
 * Fill every unused element slot (and, for core nodes, unused child-pointer
 * slot) of hdr with a poison pattern, so stale reads can be caught by
 * zfs_btree_verify_poison_at().  Compiled to a no-op unless ZFS_DEBUG is
 * defined.
 */
static void
zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		/* A core node with bth_count elements has count + 1 children. */
		for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
		    i++) {
			node->btc_children[i] =
			    (zfs_btree_hdr_t *)BTREE_POISON;
		}
		/* Poison the element slots past the last valid element. */
		(void) memset(node->btc_elems + hdr->bth_count * size, 0x0f,
		    (BTREE_CORE_ELEMS - hdr->bth_count) * size);
	} else {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		/*
		 * Leaf elements occupy btl_elems[bth_first .. bth_first +
		 * bth_count - 1]; poison the unused space on both sides.
		 */
		(void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size);
		(void) memset(leaf->btl_elems +
		    (hdr->bth_first + hdr->bth_count) * size, 0x0f,
		    tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) -
		    (hdr->bth_first + hdr->bth_count) * size);
	}
#endif
}
111
/*
 * Poison count element slots (and, for core nodes, the child pointers that
 * follow them) starting at logical index idx.  Used after elements are
 * removed or shifted away to leave freed slots in a detectable state.
 * Compiled to a no-op unless ZFS_DEBUG is defined.
 */
static inline void
zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    uint32_t idx, uint32_t count)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		/* Only slots at/above the live range may be poisoned. */
		ASSERT3U(idx, >=, hdr->bth_count);
		ASSERT3U(idx, <=, BTREE_CORE_ELEMS);
		ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS);
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		/* Child i + 1 sits to the right of element i. */
		for (uint32_t i = 1; i <= count; i++) {
			node->btc_children[idx + i] =
			    (zfs_btree_hdr_t *)BTREE_POISON;
		}
		(void) memset(node->btc_elems + idx * size, 0x0f, count * size);
	} else {
		ASSERT3U(idx, <=, tree->bt_leaf_cap);
		ASSERT3U(idx + count, <=, tree->bt_leaf_cap);
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		/* Leaf indices are relative to bth_first. */
		(void) memset(leaf->btl_elems +
		    (hdr->bth_first + idx) * size, 0x0f, count * size);
	}
#endif
}
137
/*
 * Verify that the element slot at logical index idx (and, for core nodes,
 * the child pointer immediately after it) still contains the poison pattern
 * written by the poisoning functions above.  Compiled to a no-op unless
 * ZFS_DEBUG is defined.
 */
static inline void
zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    uint32_t idx)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		ASSERT3U(idx, <, BTREE_CORE_ELEMS);
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON;
		VERIFY3P(node->btc_children[idx + 1], ==, cval);
		for (size_t i = 0; i < size; i++)
			VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f);
	} else {
		ASSERT3U(idx, <, tree->bt_leaf_cap);
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		/*
		 * A slot beyond the end of the buffer (the live range is
		 * offset by bth_first) was never poisoned; nothing to check.
		 */
		if (idx >= tree->bt_leaf_cap - hdr->bth_first)
			return;
		for (size_t i = 0; i < size; i++) {
			VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx)
			    * size + i], ==, 0x0f);
		}
	}
#endif
}
163
/*
 * Create the kmem cache used for default-sized (BTREE_LEAF_SIZE) leaf
 * nodes.  Must be called before any btree is created.
 */
void
zfs_btree_init(void)
{
	zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache",
	    BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
}
170
/* Destroy the leaf-node kmem cache created by zfs_btree_init(). */
void
zfs_btree_fini(void)
{
	kmem_cache_destroy(zfs_btree_leaf_cache);
}
176
177 static void *
zfs_btree_leaf_alloc(zfs_btree_t * tree)178 zfs_btree_leaf_alloc(zfs_btree_t *tree)
179 {
180 if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
181 return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP));
182 else
183 return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP));
184 }
185
186 static void
zfs_btree_leaf_free(zfs_btree_t * tree,void * ptr)187 zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr)
188 {
189 if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
190 return (kmem_cache_free(zfs_btree_leaf_cache, ptr));
191 else
192 return (kmem_free(ptr, tree->bt_leaf_size));
193 }
194
/*
 * Initialize tree with the given comparator and element size, using the
 * default leaf size (BTREE_LEAF_SIZE).  bt_find_in_buf may be NULL to use
 * the generic binary-search helper.
 */
void
zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
    bt_find_in_buf_f bt_find_in_buf, size_t size)
{
	zfs_btree_create_custom(tree, compar, bt_find_in_buf, size,
	    BTREE_LEAF_SIZE);
}
202
203 static void *
204 zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
205 const void *value, zfs_btree_index_t *where);
206
/*
 * Initialize tree with a caller-chosen leaf size lsize.  The element size
 * must be at most half the leaf's element space, so a leaf always holds at
 * least two elements.  If bt_find_in_buf is NULL, the generic binary-search
 * helper is used.
 */
void
zfs_btree_create_custom(zfs_btree_t *tree,
    int (*compar) (const void *, const void *),
    bt_find_in_buf_f bt_find_in_buf,
    size_t size, size_t lsize)
{
	/* Bytes available for elements within an lsize-byte leaf. */
	size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems);

	ASSERT3U(size, <=, esize / 2);
	memset(tree, 0, sizeof (*tree));
	tree->bt_compar = compar;
	tree->bt_find_in_buf = (bt_find_in_buf == NULL) ?
	    zfs_btree_find_in_buf : bt_find_in_buf;
	tree->bt_elem_size = size;
	tree->bt_leaf_size = lsize;
	/* Round the per-leaf element capacity down to an even count. */
	tree->bt_leaf_cap = P2ALIGN_TYPED(esize / size, 2, size_t);
	/* Height -1 denotes an empty tree. */
	tree->bt_height = -1;
	tree->bt_bulk = NULL;
}
226
227 /*
228 * Find value in the array of elements provided. Uses a simple binary search.
229 */
230 static void *
zfs_btree_find_in_buf(zfs_btree_t * tree,uint8_t * buf,uint32_t nelems,const void * value,zfs_btree_index_t * where)231 zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
232 const void *value, zfs_btree_index_t *where)
233 {
234 uint32_t max = nelems;
235 uint32_t min = 0;
236 while (max > min) {
237 uint32_t idx = (min + max) / 2;
238 uint8_t *cur = buf + idx * tree->bt_elem_size;
239 int comp = tree->bt_compar(cur, value);
240 if (comp < 0) {
241 min = idx + 1;
242 } else if (comp > 0) {
243 max = idx;
244 } else {
245 where->bti_offset = idx;
246 where->bti_before = B_FALSE;
247 return (cur);
248 }
249 }
250
251 where->bti_offset = max;
252 where->bti_before = B_TRUE;
253 return (NULL);
254 }
255
/*
 * Find the given value in the tree. where may be passed as null to use as a
 * membership test or if the btree is being used as a map.
 *
 * Returns a pointer to the matching element, or NULL if absent; in the
 * latter case *where (if non-NULL) identifies the insertion point.
 */
void *
zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
{
	/* Height -1 means the tree is empty. */
	if (tree->bt_height == -1) {
		if (where != NULL) {
			where->bti_node = NULL;
			where->bti_offset = 0;
		}
		ASSERT0(tree->bt_num_elems);
		return (NULL);
	}

	/*
	 * If we're in bulk-insert mode, we check the last spot in the tree
	 * and the last leaf in the tree before doing the normal search,
	 * because for most workloads the vast majority of finds in
	 * bulk-insert mode are to insert new elements.
	 */
	zfs_btree_index_t idx;
	size_t size = tree->bt_elem_size;
	if (tree->bt_bulk != NULL) {
		zfs_btree_leaf_t *last_leaf = tree->bt_bulk;
		/* Compare against the very last element in the tree. */
		int comp = tree->bt_compar(last_leaf->btl_elems +
		    (last_leaf->btl_hdr.bth_first +
		    last_leaf->btl_hdr.bth_count - 1) * size, value);
		if (comp < 0) {
			/*
			 * If what they're looking for is after the last
			 * element, it's not in the tree.
			 */
			if (where != NULL) {
				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
				where->bti_offset =
				    last_leaf->btl_hdr.bth_count;
				where->bti_before = B_TRUE;
			}
			return (NULL);
		} else if (comp == 0) {
			/* Exact match with the tree's last element. */
			if (where != NULL) {
				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
				where->bti_offset =
				    last_leaf->btl_hdr.bth_count - 1;
				where->bti_before = B_FALSE;
			}
			return (last_leaf->btl_elems +
			    (last_leaf->btl_hdr.bth_first +
			    last_leaf->btl_hdr.bth_count - 1) * size);
		}
		if (tree->bt_compar(last_leaf->btl_elems +
		    last_leaf->btl_hdr.bth_first * size, value) <= 0) {
			/*
			 * If what they're looking for is after the first
			 * element in the last leaf, it's in the last leaf or
			 * it's not in the tree.
			 */
			void *d = tree->bt_find_in_buf(tree,
			    last_leaf->btl_elems +
			    last_leaf->btl_hdr.bth_first * size,
			    last_leaf->btl_hdr.bth_count, value, &idx);

			if (where != NULL) {
				idx.bti_node = (zfs_btree_hdr_t *)last_leaf;
				*where = idx;
			}
			return (d);
		}
	}

	zfs_btree_core_t *node = NULL;
	uint32_t child = 0;
	uint32_t depth = 0;

	/*
	 * Iterate down the tree, finding which child the value should be in
	 * by comparing with the separators.
	 */
	for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
	    node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
		ASSERT3P(node, !=, NULL);
		void *d = tree->bt_find_in_buf(tree, node->btc_elems,
		    node->btc_hdr.bth_count, value, &idx);
		EQUIV(d != NULL, !idx.bti_before);
		if (d != NULL) {
			/* The value is a separator in this core node. */
			if (where != NULL) {
				idx.bti_node = (zfs_btree_hdr_t *)node;
				*where = idx;
			}
			return (d);
		}
		ASSERT(idx.bti_before);
		child = idx.bti_offset;
	}

	/*
	 * The value is in this leaf, or it would be if it were in the
	 * tree. Find its proper location and return it.
	 */
	zfs_btree_leaf_t *leaf = (depth == 0 ?
	    (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
	void *d = tree->bt_find_in_buf(tree, leaf->btl_elems +
	    leaf->btl_hdr.bth_first * size,
	    leaf->btl_hdr.bth_count, value, &idx);

	if (where != NULL) {
		idx.bti_node = (zfs_btree_hdr_t *)leaf;
		*where = idx;
	}

	return (d);
}
370
371 /*
372 * To explain the following functions, it is useful to understand the four
373 * kinds of shifts used in btree operation. First, a shift is a movement of
374 * elements within a node. It is used to create gaps for inserting new
375 * elements and children, or cover gaps created when things are removed. A
376 * shift has two fundamental properties, each of which can be one of two
377 * values, making four types of shifts. There is the direction of the shift
 * (left or right) and the shape of the shift (parallelogram or isosceles
379 * trapezoid (shortened to trapezoid hereafter)). The shape distinction only
380 * applies to shifts of core nodes.
381 *
382 * The names derive from the following imagining of the layout of a node:
383 *
384 * Elements: * * * * * * * ... * * *
385 * Children: * * * * * * * * ... * * *
386 *
387 * This layout follows from the fact that the elements act as separators
388 * between pairs of children, and that children root subtrees "below" the
389 * current node. A left and right shift are fairly self-explanatory; a left
390 * shift moves things to the left, while a right shift moves things to the
391 * right. A parallelogram shift is a shift with the same number of elements
392 * and children being moved, while a trapezoid shift is a shift that moves one
393 * more children than elements. An example follows:
394 *
395 * A parallelogram shift could contain the following:
396 * _______________
397 * \* * * * \ * * * ... * * *
398 * * \ * * * *\ * * * ... * * *
399 * ---------------
400 * A trapezoid shift could contain the following:
401 * ___________
402 * * / * * * \ * * * ... * * *
403 * * / * * * *\ * * * ... * * *
404 * ---------------
405 *
406 * Note that a parallelogram shift is always shaped like a "left-leaning"
407 * parallelogram, where the starting index of the children being moved is
408 * always one higher than the starting index of the elements being moved. No
409 * "right-leaning" parallelogram shifts are needed (shifts where the starting
410 * element index and starting child index being moved are the same) to achieve
411 * any btree operations, so we ignore them.
412 */
413
/*
 * Shape of a core-node shift: a trapezoid shift moves one more child than
 * elements, a parallelogram shift moves equal numbers of each (see the
 * diagram above).
 */
enum bt_shift_shape {
	BSS_TRAPEZOID,
	BSS_PARALLELOGRAM
};

/* Direction in which a shift moves elements and children. */
enum bt_shift_direction {
	BSD_LEFT,
	BSD_RIGHT
};
423
424 /*
425 * Shift elements and children in the provided core node by off spots. The
426 * first element moved is idx, and count elements are moved. The shape of the
427 * shift is determined by shape. The direction is determined by dir.
428 */
429 static inline void
bt_shift_core(zfs_btree_t * tree,zfs_btree_core_t * node,uint32_t idx,uint32_t count,uint32_t off,enum bt_shift_shape shape,enum bt_shift_direction dir)430 bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
431 uint32_t count, uint32_t off, enum bt_shift_shape shape,
432 enum bt_shift_direction dir)
433 {
434 size_t size = tree->bt_elem_size;
435 ASSERT(zfs_btree_is_core(&node->btc_hdr));
436
437 uint8_t *e_start = node->btc_elems + idx * size;
438 uint8_t *e_out = (dir == BSD_LEFT ? e_start - off * size :
439 e_start + off * size);
440 bmov(e_start, e_out, count * size);
441
442 zfs_btree_hdr_t **c_start = node->btc_children + idx +
443 (shape == BSS_TRAPEZOID ? 0 : 1);
444 zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off :
445 c_start + off);
446 uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
447 bmov(c_start, c_out, c_count * sizeof (*c_start));
448 }
449
450 /*
451 * Shift elements and children in the provided core node left by one spot.
452 * The first element moved is idx, and count elements are moved. The
 * shape of the shift is determined by shape: BSS_TRAPEZOID for a trapezoid
 * shift, BSS_PARALLELOGRAM for a parallelogram shift.
455 */
static inline void
bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, enum bt_shift_shape shape)
{
	/* A core-node shift left by exactly one position. */
	bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT);
}
462
463 /*
464 * Shift elements and children in the provided core node right by one spot.
465 * Starts with elements[idx] and children[idx] and one more child than element.
466 */
static inline void
bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, enum bt_shift_shape shape)
{
	/* A core-node shift right by exactly one position. */
	bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT);
}
473
474 /*
475 * Shift elements and children in the provided leaf node by off spots.
476 * The first element moved is idx, and count elements are moved. The direction
477 * is determined by left.
478 */
479 static inline void
bt_shift_leaf(zfs_btree_t * tree,zfs_btree_leaf_t * node,uint32_t idx,uint32_t count,uint32_t off,enum bt_shift_direction dir)480 bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx,
481 uint32_t count, uint32_t off, enum bt_shift_direction dir)
482 {
483 size_t size = tree->bt_elem_size;
484 zfs_btree_hdr_t *hdr = &node->btl_hdr;
485 ASSERT(!zfs_btree_is_core(hdr));
486
487 if (count == 0)
488 return;
489 uint8_t *start = node->btl_elems + (hdr->bth_first + idx) * size;
490 uint8_t *out = (dir == BSD_LEFT ? start - off * size :
491 start + off * size);
492 bmov(start, out, count * size);
493 }
494
/*
 * Grow leaf for n new elements before idx.
 *
 * Opens an n-slot gap at logical index idx by shifting existing elements
 * left, right, or both ways, depending on where free space exists within
 * the leaf's buffer.  The gap's contents are uninitialized; the caller
 * fills them in.
 */
static void
bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
    uint32_t n)
{
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));
	ASSERT3U(idx, <=, hdr->bth_count);
	uint32_t capacity = tree->bt_leaf_cap;
	ASSERT3U(hdr->bth_count + n, <=, capacity);
	/* cl/cr: is there room for n more slots to the left/right? */
	boolean_t cl = (hdr->bth_first >= n);
	boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity);

	if (cl && (!cr || idx <= hdr->bth_count / 2)) {
		/* Grow left. */
		hdr->bth_first -= n;
		bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT);
	} else if (cr) {
		/* Grow right. */
		bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n,
		    BSD_RIGHT);
	} else {
		/* Grow both ways, recentering the occupied range. */
		uint32_t fn = hdr->bth_first -
		    (capacity - (hdr->bth_count + n)) / 2;
		hdr->bth_first -= fn;
		bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT);
		bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx,
		    n - fn, BSD_RIGHT);
	}
	hdr->bth_count += n;
}
529
/*
 * Shrink leaf for count elements starting from idx.
 *
 * Removes n elements at logical index idx by shifting whichever side of
 * the leaf holds fewer elements over the gap, poisoning the vacated slots
 * in debug builds.
 */
static void
bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
    uint32_t n)
{
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));
	ASSERT3U(idx, <=, hdr->bth_count);
	ASSERT3U(idx + n, <=, hdr->bth_count);

	/* Shift the side with fewer elements. */
	if (idx <= (hdr->bth_count - n) / 2) {
		bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT);
		zfs_btree_poison_node_at(tree, hdr, 0, n);
		hdr->bth_first += n;
	} else {
		bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n,
		    BSD_LEFT);
		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n);
	}
	hdr->bth_count -= n;
}
553
554 /*
555 * Move children and elements from one core node to another. The shape
556 * parameter behaves the same as it does in the shift logic.
557 */
558 static inline void
bt_transfer_core(zfs_btree_t * tree,zfs_btree_core_t * source,uint32_t sidx,uint32_t count,zfs_btree_core_t * dest,uint32_t didx,enum bt_shift_shape shape)559 bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx,
560 uint32_t count, zfs_btree_core_t *dest, uint32_t didx,
561 enum bt_shift_shape shape)
562 {
563 size_t size = tree->bt_elem_size;
564 ASSERT(zfs_btree_is_core(&source->btc_hdr));
565 ASSERT(zfs_btree_is_core(&dest->btc_hdr));
566
567 bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size,
568 count * size);
569
570 uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
571 bcpy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1),
572 dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1),
573 c_count * sizeof (*source->btc_children));
574 }
575
576 static inline void
bt_transfer_leaf(zfs_btree_t * tree,zfs_btree_leaf_t * source,uint32_t sidx,uint32_t count,zfs_btree_leaf_t * dest,uint32_t didx)577 bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx,
578 uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx)
579 {
580 size_t size = tree->bt_elem_size;
581 ASSERT(!zfs_btree_is_core(&source->btl_hdr));
582 ASSERT(!zfs_btree_is_core(&dest->btl_hdr));
583
584 bcpy(source->btl_elems + (source->btl_hdr.bth_first + sidx) * size,
585 dest->btl_elems + (dest->btl_hdr.bth_first + didx) * size,
586 count * size);
587 }
588
589 /*
590 * Find the first element in the subtree rooted at hdr, return its value and
591 * put its location in where if non-null.
592 */
593 static void *
zfs_btree_first_helper(zfs_btree_t * tree,zfs_btree_hdr_t * hdr,zfs_btree_index_t * where)594 zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
595 zfs_btree_index_t *where)
596 {
597 zfs_btree_hdr_t *node;
598
599 for (node = hdr; zfs_btree_is_core(node);
600 node = ((zfs_btree_core_t *)node)->btc_children[0])
601 ;
602
603 ASSERT(!zfs_btree_is_core(node));
604 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
605 if (where != NULL) {
606 where->bti_node = node;
607 where->bti_offset = 0;
608 where->bti_before = B_FALSE;
609 }
610 return (&leaf->btl_elems[node->bth_first * tree->bt_elem_size]);
611 }
612
/*
 * Insert an element and a child into a core node at the given offset.
 * buf becomes element[offset] and new_node becomes child[offset + 1];
 * everything at and after that point shifts right by one.  The parent
 * must not already be full.
 */
static void
zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent,
    uint32_t offset, zfs_btree_hdr_t *new_node, void *buf)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
	ASSERT3P(par_hdr, ==, new_node->bth_parent);
	ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS);

	if (zfs_btree_verify_intensity >= 5) {
		/* The slot we're about to shift into must still be poison. */
		zfs_btree_verify_poison_at(tree, par_hdr,
		    par_hdr->bth_count);
	}
	/* Shift existing elements and children */
	uint32_t count = par_hdr->bth_count - offset;
	bt_shift_core_right(tree, parent, offset, count,
	    BSS_PARALLELOGRAM);

	/* Insert new values */
	parent->btc_children[offset + 1] = new_node;
	bcpy(buf, parent->btc_elems + offset * size, size);
	par_hdr->bth_count++;
}
637
/*
 * Insert new_node into the parent of old_node directly after old_node, with
 * buf as the dividing element between the two.
 *
 * If the parent is full it is split in two and the insertion recurses up
 * the tree; splitting the root grows the tree by one level.
 */
static void
zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
    zfs_btree_hdr_t *new_node, void *buf)
{
	ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent);
	size_t size = tree->bt_elem_size;
	zfs_btree_core_t *parent = old_node->bth_parent;

	/*
	 * If this is the root node we were splitting, we create a new root
	 * and increase the height of the tree.
	 */
	if (parent == NULL) {
		ASSERT3P(old_node, ==, tree->bt_root);
		tree->bt_num_nodes++;
		zfs_btree_core_t *new_root =
		    kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS *
		    size, KM_SLEEP);
		zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr;
		new_root_hdr->bth_parent = NULL;
		/* bth_first == -1 marks this header as a core node. */
		new_root_hdr->bth_first = -1;
		new_root_hdr->bth_count = 1;

		old_node->bth_parent = new_node->bth_parent = new_root;
		new_root->btc_children[0] = old_node;
		new_root->btc_children[1] = new_node;
		bcpy(buf, new_root->btc_elems, size);

		tree->bt_height++;
		tree->bt_root = new_root_hdr;
		zfs_btree_poison_node(tree, new_root_hdr);
		return;
	}

	/*
	 * Since we have the new separator, binary search for where to put
	 * new_node.
	 */
	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
	zfs_btree_index_t idx;
	ASSERT(zfs_btree_is_core(par_hdr));
	VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
	    par_hdr->bth_count, buf, &idx), ==, NULL);
	ASSERT(idx.bti_before);
	uint32_t offset = idx.bti_offset;
	ASSERT3U(offset, <=, par_hdr->bth_count);
	ASSERT3P(parent->btc_children[offset], ==, old_node);

	/*
	 * If the parent isn't full, shift things to accommodate our insertions
	 * and return.
	 */
	if (par_hdr->bth_count != BTREE_CORE_ELEMS) {
		zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf);
		return;
	}

	/*
	 * We need to split this core node into two. Currently there are
	 * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for
	 * BTREE_CORE_ELEMS + 2. Some of the children will be part of the
	 * current node, and the others will be moved to the new core node.
	 * There are BTREE_CORE_ELEMS + 1 elements including the new one. One
	 * will be used as the new separator in our parent, and the others
	 * will be split among the two core nodes.
	 *
	 * Usually we will split the node in half evenly, with
	 * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we
	 * instead move only about a quarter of the elements (and children) to
	 * the new node. Since the average state after a long time is a 3/4
	 * full node, shortcutting directly to that state improves efficiency.
	 *
	 * We do this in two stages: first we split into two nodes, and then we
	 * reuse our existing logic to insert the new element and child.
	 */
	uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ?
	    2 : 4)) - 1, 2);
	uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1;
	ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2);
	tree->bt_num_nodes++;
	zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) +
	    BTREE_CORE_ELEMS * size, KM_SLEEP);
	zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr;
	new_par_hdr->bth_parent = par_hdr->bth_parent;
	new_par_hdr->bth_first = -1;
	new_par_hdr->bth_count = move_count;
	zfs_btree_poison_node(tree, new_par_hdr);

	par_hdr->bth_count = keep_count;

	/* A trapezoid transfer carries move_count + 1 children. */
	bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent,
	    0, BSS_TRAPEZOID);

	/* Store the new separator in a buffer. */
	uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP);
	bcpy(parent->btc_elems + keep_count * size, tmp_buf,
	    size);
	zfs_btree_poison_node(tree, par_hdr);

	if (offset < keep_count) {
		/* Insert the new node into the left half */
		zfs_btree_insert_core_impl(tree, parent, offset, new_node,
		    buf);

		/*
		 * Move the new separator to the existing buffer.
		 */
		bcpy(tmp_buf, buf, size);
	} else if (offset > keep_count) {
		/* Insert the new node into the right half */
		new_node->bth_parent = new_parent;
		zfs_btree_insert_core_impl(tree, new_parent,
		    offset - keep_count - 1, new_node, buf);

		/*
		 * Move the new separator to the existing buffer.
		 */
		bcpy(tmp_buf, buf, size);
	} else {
		/*
		 * Move the new separator into the right half, and replace it
		 * with buf. We also need to shift back the elements in the
		 * right half to accommodate new_node.
		 */
		bt_shift_core_right(tree, new_parent, 0, move_count,
		    BSS_TRAPEZOID);
		new_parent->btc_children[0] = new_node;
		bcpy(tmp_buf, new_parent->btc_elems, size);
		new_par_hdr->bth_count++;
	}
	kmem_free(tmp_buf, size);
	zfs_btree_poison_node(tree, par_hdr);

	/* The transferred children now belong to new_parent. */
	for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++)
		new_parent->btc_children[i]->bth_parent = new_parent;

	for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++)
		ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent);

	/*
	 * Now that the node is split, we need to insert the new node into its
	 * parent. This may cause further splitting.
	 */
	zfs_btree_insert_into_parent(tree, &parent->btc_hdr,
	    &new_parent->btc_hdr, buf);
}
788
789 /* Insert an element into a leaf node at the given offset. */
790 static void
zfs_btree_insert_leaf_impl(zfs_btree_t * tree,zfs_btree_leaf_t * leaf,uint32_t idx,const void * value)791 zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
792 uint32_t idx, const void *value)
793 {
794 size_t size = tree->bt_elem_size;
795 zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
796 ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap);
797
798 if (zfs_btree_verify_intensity >= 5) {
799 zfs_btree_verify_poison_at(tree, &leaf->btl_hdr,
800 leaf->btl_hdr.bth_count);
801 }
802
803 bt_grow_leaf(tree, leaf, idx, 1);
804 uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size;
805 bcpy(value, start, size);
806 }
807
808 static void
809 zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr);
810
/*
 * Helper function for inserting a new value into leaf at the given index.
 * If the leaf is full it is split, and the resulting new leaf is inserted
 * into the parent (which may split further up the tree).
 */
static void
zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
    const void *value, uint32_t idx)
{
	size_t size = tree->bt_elem_size;
	uint32_t capacity = tree->bt_leaf_cap;

	/*
	 * If the leaf isn't full, shift the elements after idx and insert
	 * value.
	 */
	if (leaf->btl_hdr.bth_count != capacity) {
		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
		return;
	}

	/*
	 * Otherwise, we split the leaf node into two nodes. If we're not bulk
	 * inserting, each is of size (capacity / 2). If we are bulk
	 * inserting, we move a quarter of the elements to the new node so
	 * inserts into the old node don't cause immediate splitting but the
	 * tree stays relatively dense. Since the average state after a long
	 * time is a 3/4 full node, shortcutting directly to that state
	 * improves efficiency. At the end of the bulk insertion process
	 * we'll need to go through and fix up any nodes (the last leaf and
	 * its ancestors, potentially) that are below the minimum.
	 *
	 * In either case, we're left with one extra element. The leftover
	 * element will become the new dividing element between the two nodes.
	 */
	uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1;
	uint32_t keep_count = capacity - move_count - 1;
	ASSERT3U(keep_count, >=, 1);
	/* If we insert on the left, move one more to keep leaves balanced. */
	if (idx < keep_count) {
		keep_count--;
		move_count++;
	}
	tree->bt_num_nodes++;
	zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree);
	zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
	new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
	/* Leave headroom on the left of the new leaf unless bulk loading. */
	new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) +
	    (idx >= keep_count && idx <= keep_count + move_count / 2);
	new_hdr->bth_count = move_count;
	zfs_btree_poison_node(tree, new_hdr);

	/* The rightmost leaf, used for bulk inserts, is now the new leaf. */
	if (tree->bt_bulk != NULL && leaf == tree->bt_bulk)
		tree->bt_bulk = new_leaf;

	/* Copy the back part to the new leaf. */
	bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0);

	/* We store the new separator in a buffer we control for simplicity. */
	uint8_t *buf = kmem_alloc(size, KM_SLEEP);
	bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size,
	    buf, size);

	/* Drop the separator and the moved elements from the old leaf. */
	bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count);

	if (idx < keep_count) {
		/* Insert into the existing leaf. */
		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
	} else if (idx > keep_count) {
		/* Insert into the new leaf. */
		zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count -
		    1, value);
	} else {
		/*
		 * Insert planned separator into the new leaf, and use
		 * the new value as the new separator.
		 */
		zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf);
		bcpy(value, buf, size);
	}

	/*
	 * Now that the node is split, we need to insert the new node into its
	 * parent. This may cause further splitting, but only of core nodes.
	 */
	zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr,
	    buf);
	kmem_free(buf, size);
}
896
/*
 * Return the index in hdr's parent's btc_children array where hdr is
 * stored. We search the parent for hdr's first element; since that element
 * is never present in the parent itself, the search must report "before"
 * some separator, and the resulting offset is exactly hdr's child slot
 * (verified by the ASSERT3P below).
 */
static uint32_t
zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	void *buf;
	if (zfs_btree_is_core(hdr)) {
		buf = ((zfs_btree_core_t *)hdr)->btc_elems;
	} else {
		/* Leaf elements start at bth_first, not at offset 0. */
		buf = ((zfs_btree_leaf_t *)hdr)->btl_elems +
		    hdr->bth_first * tree->bt_elem_size;
	}
	zfs_btree_index_t idx;
	zfs_btree_core_t *parent = hdr->bth_parent;
	VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
	    parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
	ASSERT(idx.bti_before);
	ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
	ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr);
	return (idx.bti_offset);
}
916
/*
 * Take the b-tree out of bulk insert mode. During bulk-insert mode, some
 * nodes may violate the invariant that non-root nodes must be at least half
 * full. All nodes violating this invariant should be the last node in their
 * particular level. To correct the invariant, we take values from their left
 * neighbor until they are half full. They must have a left neighbor at their
 * level because the last node at a level is not the first node unless it's
 * the root.
 */
static void
zfs_btree_bulk_finish(zfs_btree_t *tree)
{
	ASSERT3P(tree->bt_bulk, !=, NULL);
	ASSERT3P(tree->bt_root, !=, NULL);
	zfs_btree_leaf_t *leaf = tree->bt_bulk;
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	zfs_btree_core_t *parent = hdr->bth_parent;
	size_t size = tree->bt_elem_size;
	uint32_t capacity = tree->bt_leaf_cap;

	/*
	 * The invariant doesn't apply to the root node, if that's the only
	 * node in the tree we're done.
	 */
	if (parent == NULL) {
		tree->bt_bulk = NULL;
		return;
	}

	/* First, take elements to rebalance the leaf node. */
	if (hdr->bth_count < capacity / 2) {
		/*
		 * First, find the left neighbor. The simplest way to do this
		 * is to call zfs_btree_prev twice; the first time finds some
		 * ancestor of this node, and the second time finds the left
		 * neighbor. The ancestor found is the lowest common ancestor
		 * of leaf and the neighbor.
		 */
		zfs_btree_index_t idx = {
			.bti_node = hdr,
			.bti_offset = 0
		};
		VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
		ASSERT(zfs_btree_is_core(idx.bti_node));
		zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node;
		uint32_t common_idx = idx.bti_offset;

		VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
		ASSERT(!zfs_btree_is_core(idx.bti_node));
		zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node;
		zfs_btree_hdr_t *l_hdr = idx.bti_node;
		uint32_t move_count = (capacity / 2) - hdr->bth_count;
		/* The neighbor must stay at least half full after donating. */
		ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=,
		    capacity / 2);

		if (zfs_btree_verify_intensity >= 5) {
			for (uint32_t i = 0; i < move_count; i++) {
				zfs_btree_verify_poison_at(tree, hdr,
				    leaf->btl_hdr.bth_count + i);
			}
		}

		/* First, shift elements in leaf back. */
		bt_grow_leaf(tree, leaf, 0, move_count);

		/* Next, move the separator from the common ancestor to leaf. */
		uint8_t *separator = common->btc_elems + common_idx * size;
		uint8_t *out = leaf->btl_elems +
		    (hdr->bth_first + move_count - 1) * size;
		bcpy(separator, out, size);

		/*
		 * Now we move elements from the tail of the left neighbor to
		 * fill the remaining spots in leaf.
		 */
		bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count -
		    (move_count - 1), move_count - 1, leaf, 0);

		/*
		 * Finally, move the new last element in the left neighbor to
		 * the separator.
		 */
		bcpy(l_neighbor->btl_elems + (l_hdr->bth_first +
		    l_hdr->bth_count - move_count) * size, separator, size);

		/* Adjust the node's counts, and we're done. */
		bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count - move_count,
		    move_count);

		ASSERT3U(l_hdr->bth_count, >=, capacity / 2);
		ASSERT3U(hdr->bth_count, >=, capacity / 2);
	}

	/*
	 * Now we have to rebalance any ancestors of leaf that may also
	 * violate the invariant.
	 */
	capacity = BTREE_CORE_ELEMS;
	while (parent->btc_hdr.bth_parent != NULL) {
		zfs_btree_core_t *cur = parent;
		/* Note: this hdr intentionally shadows the leaf's hdr. */
		zfs_btree_hdr_t *hdr = &cur->btc_hdr;
		parent = hdr->bth_parent;
		/*
		 * If the invariant isn't violated, move on to the next
		 * ancestor.
		 */
		if (hdr->bth_count >= capacity / 2)
			continue;

		/*
		 * Because the smallest number of nodes we can move when
		 * splitting is 2, we never need to worry about not having a
		 * left sibling (a sibling is a neighbor with the same parent).
		 */
		uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
		ASSERT3U(parent_idx, >, 0);
		zfs_btree_core_t *l_neighbor =
		    (zfs_btree_core_t *)parent->btc_children[parent_idx - 1];
		uint32_t move_count = (capacity / 2) - hdr->bth_count;
		ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=,
		    capacity / 2);

		if (zfs_btree_verify_intensity >= 5) {
			for (uint32_t i = 0; i < move_count; i++) {
				zfs_btree_verify_poison_at(tree, hdr,
				    hdr->bth_count + i);
			}
		}
		/* First, shift things in the right node back. */
		bt_shift_core(tree, cur, 0, hdr->bth_count, move_count,
		    BSS_TRAPEZOID, BSD_RIGHT);

		/* Next, move the separator to the right node. */
		uint8_t *separator = parent->btc_elems + ((parent_idx - 1) *
		    size);
		uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size);
		bcpy(separator, e_out, size);

		/*
		 * Now, move elements and children from the left node to the
		 * right. We move one more child than elements.
		 */
		move_count--;
		uint32_t move_idx = l_neighbor->btc_hdr.bth_count - move_count;
		bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0,
		    BSS_TRAPEZOID);

		/*
		 * Finally, move the last element in the left node to the
		 * separator's position.
		 */
		move_idx--;
		bcpy(l_neighbor->btc_elems + move_idx * size, separator, size);

		/* +1 accounts for the element promoted to the separator. */
		l_neighbor->btc_hdr.bth_count -= move_count + 1;
		hdr->bth_count += move_count + 1;

		ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2);
		ASSERT3U(hdr->bth_count, >=, capacity / 2);

		zfs_btree_poison_node(tree, &l_neighbor->btc_hdr);

		/* The transferred children now belong to cur. */
		for (uint32_t i = 0; i <= hdr->bth_count; i++)
			cur->btc_children[i]->bth_parent = cur;
	}

	tree->bt_bulk = NULL;
	zfs_btree_verify(tree);
}
1086
/*
 * Insert value into tree at the location specified by where.
 */
void
zfs_btree_add_idx(zfs_btree_t *tree, const void *value,
    const zfs_btree_index_t *where)
{
	zfs_btree_index_t idx = {0};

	/* If we're not inserting in the last leaf, end bulk insert mode. */
	if (tree->bt_bulk != NULL) {
		if (where->bti_node != &tree->bt_bulk->btl_hdr) {
			/*
			 * Ending bulk mode rebalances the tree, which
			 * invalidates where; re-find the insertion point.
			 */
			zfs_btree_bulk_finish(tree);
			VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL);
			where = &idx;
		}
	}

	tree->bt_num_elems++;
	/*
	 * If this is the first element in the tree, create a leaf root node
	 * and add the value to it.
	 */
	if (where->bti_node == NULL) {
		ASSERT3U(tree->bt_num_elems, ==, 1);
		ASSERT3S(tree->bt_height, ==, -1);
		ASSERT3P(tree->bt_root, ==, NULL);
		ASSERT0(where->bti_offset);

		tree->bt_num_nodes++;
		zfs_btree_leaf_t *leaf = zfs_btree_leaf_alloc(tree);
		tree->bt_root = &leaf->btl_hdr;
		tree->bt_height++;

		zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
		hdr->bth_parent = NULL;
		hdr->bth_first = 0;
		hdr->bth_count = 0;
		zfs_btree_poison_node(tree, hdr);

		zfs_btree_insert_into_leaf(tree, leaf, value, 0);
		/* A brand-new tree starts out in bulk insert mode. */
		tree->bt_bulk = leaf;
	} else if (!zfs_btree_is_core(where->bti_node)) {
		/*
		 * If we're inserting into a leaf, go directly to the helper
		 * function.
		 */
		zfs_btree_insert_into_leaf(tree,
		    (zfs_btree_leaf_t *)where->bti_node, value,
		    where->bti_offset);
	} else {
		/*
		 * If we're inserting into a core node, we can't just shift
		 * the existing element in that slot in the same node without
		 * breaking our ordering invariants. Instead we place the new
		 * value in the node at that spot and then insert the old
		 * separator into the first slot in the subtree to the right.
		 */
		zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node;

		/*
		 * We can ignore bti_before, because either way the value
		 * should end up in bti_offset.
		 */
		uint32_t off = where->bti_offset;
		zfs_btree_hdr_t *subtree = node->btc_children[off + 1];
		size_t size = tree->bt_elem_size;
		uint8_t *buf = kmem_alloc(size, KM_SLEEP);
		/* Save the old separator, then overwrite it with value. */
		bcpy(node->btc_elems + off * size, buf, size);
		bcpy(value, node->btc_elems + off * size, size);

		/*
		 * Find the first slot in the subtree to the right, insert
		 * there.
		 */
		zfs_btree_index_t new_idx;
		VERIFY3P(zfs_btree_first_helper(tree, subtree, &new_idx), !=,
		    NULL);
		ASSERT0(new_idx.bti_offset);
		ASSERT(!zfs_btree_is_core(new_idx.bti_node));
		zfs_btree_insert_into_leaf(tree,
		    (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0);
		kmem_free(buf, size);
	}
	zfs_btree_verify(tree);
}
1173
1174 /*
1175 * Return the first element in the tree, and put its location in where if
1176 * non-null.
1177 */
1178 void *
zfs_btree_first(zfs_btree_t * tree,zfs_btree_index_t * where)1179 zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where)
1180 {
1181 if (tree->bt_height == -1) {
1182 ASSERT0(tree->bt_num_elems);
1183 return (NULL);
1184 }
1185 return (zfs_btree_first_helper(tree, tree->bt_root, where));
1186 }
1187
1188 /*
1189 * Find the last element in the subtree rooted at hdr, return its value and
1190 * put its location in where if non-null.
1191 */
1192 static void *
zfs_btree_last_helper(zfs_btree_t * btree,zfs_btree_hdr_t * hdr,zfs_btree_index_t * where)1193 zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
1194 zfs_btree_index_t *where)
1195 {
1196 zfs_btree_hdr_t *node;
1197
1198 for (node = hdr; zfs_btree_is_core(node); node =
1199 ((zfs_btree_core_t *)node)->btc_children[node->bth_count])
1200 ;
1201
1202 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
1203 if (where != NULL) {
1204 where->bti_node = node;
1205 where->bti_offset = node->bth_count - 1;
1206 where->bti_before = B_FALSE;
1207 }
1208 return (leaf->btl_elems + (node->bth_first + node->bth_count - 1) *
1209 btree->bt_elem_size);
1210 }
1211
1212 /*
1213 * Return the last element in the tree, and put its location in where if
1214 * non-null.
1215 */
1216 void *
zfs_btree_last(zfs_btree_t * tree,zfs_btree_index_t * where)1217 zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where)
1218 {
1219 if (tree->bt_height == -1) {
1220 ASSERT0(tree->bt_num_elems);
1221 return (NULL);
1222 }
1223 return (zfs_btree_last_helper(tree, tree->bt_root, where));
1224 }
1225
/*
 * This function contains the logic to find the next node in the tree. A
 * helper function is used because there are multiple internal consumers of
 * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each
 * node after we've finished with it.
 */
static void *
zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx,
    void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *))
{
	if (idx->bti_node == NULL) {
		/* The only valid index into an empty tree has no node. */
		ASSERT3S(tree->bt_height, ==, -1);
		return (NULL);
	}

	uint32_t offset = idx->bti_offset;
	if (!zfs_btree_is_core(idx->bti_node)) {
		/*
		 * When finding the next element of an element in a leaf,
		 * there are two cases. If the element isn't the last one in
		 * the leaf, in which case we just return the next element in
		 * the leaf. Otherwise, we need to traverse up our parents
		 * until we find one where our ancestor isn't the last child
		 * of its parent. Once we do, the next element is the
		 * separator after our ancestor in its parent.
		 */
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		uint32_t new_off = offset + (idx->bti_before ? 0 : 1);
		if (leaf->btl_hdr.bth_count > new_off) {
			out_idx->bti_node = &leaf->btl_hdr;
			out_idx->bti_offset = new_off;
			out_idx->bti_before = B_FALSE;
			return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
			    new_off) * tree->bt_elem_size);
		}

		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
		    node != NULL; node = node->btc_hdr.bth_parent) {
			zfs_btree_hdr_t *hdr = &node->btc_hdr;
			ASSERT(zfs_btree_is_core(hdr));
			uint32_t i = zfs_btree_find_parent_idx(tree, prev);
			if (done_func != NULL)
				done_func(tree, prev);
			if (i == hdr->bth_count) {
				/* Last child of this node; keep climbing. */
				prev = hdr;
				continue;
			}
			out_idx->bti_node = hdr;
			out_idx->bti_offset = i;
			out_idx->bti_before = B_FALSE;
			return (node->btc_elems + i * tree->bt_elem_size);
		}
		if (done_func != NULL)
			done_func(tree, prev);
		/*
		 * We've traversed all the way up and been at the end of the
		 * node every time, so this was the last element in the tree.
		 */
		return (NULL);
	}

	/*
	 * If we were before an element in a core node, return that element.
	 * NOTE(review): only bti_before is updated here; this appears to
	 * assume out_idx aliases idx (callers may pass the same pointer for
	 * both) so bti_node/bti_offset are already correct — confirm callers.
	 */
	ASSERT(zfs_btree_is_core(idx->bti_node));
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	if (idx->bti_before) {
		out_idx->bti_before = B_FALSE;
		return (node->btc_elems + offset * tree->bt_elem_size);
	}

	/*
	 * The next element from one in a core node is the first element in
	 * the subtree just to the right of the separator.
	 */
	zfs_btree_hdr_t *child = node->btc_children[offset + 1];
	return (zfs_btree_first_helper(tree, child, out_idx));
}
1304
1305 /*
1306 * Return the next valued node in the tree. The same address can be safely
1307 * passed for idx and out_idx.
1308 */
1309 void *
zfs_btree_next(zfs_btree_t * tree,const zfs_btree_index_t * idx,zfs_btree_index_t * out_idx)1310 zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx,
1311 zfs_btree_index_t *out_idx)
1312 {
1313 return (zfs_btree_next_helper(tree, idx, out_idx, NULL));
1314 }
1315
/*
 * Return the previous valued node in the tree. The same value can be safely
 * passed for idx and out_idx.
 */
void *
zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx)
{
	if (idx->bti_node == NULL) {
		/* The only valid index into an empty tree has no node. */
		ASSERT3S(tree->bt_height, ==, -1);
		return (NULL);
	}

	uint32_t offset = idx->bti_offset;
	if (!zfs_btree_is_core(idx->bti_node)) {
		/*
		 * When finding the previous element of an element in a leaf,
		 * there are two cases. If the element isn't the first one in
		 * the leaf, in which case we just return the previous element
		 * in the leaf. Otherwise, we need to traverse up our parents
		 * until we find one where our previous ancestor isn't the
		 * first child. Once we do, the previous element is the
		 * separator to the left of our previous ancestor in its
		 * parent.
		 */
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		if (offset != 0) {
			out_idx->bti_node = &leaf->btl_hdr;
			out_idx->bti_offset = offset - 1;
			out_idx->bti_before = B_FALSE;
			return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
			    offset - 1) * tree->bt_elem_size);
		}
		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
		    node != NULL; node = node->btc_hdr.bth_parent) {
			zfs_btree_hdr_t *hdr = &node->btc_hdr;
			ASSERT(zfs_btree_is_core(hdr));
			uint32_t i = zfs_btree_find_parent_idx(tree, prev);
			if (i == 0) {
				/* First child of this node; keep climbing. */
				prev = hdr;
				continue;
			}
			out_idx->bti_node = hdr;
			out_idx->bti_offset = i - 1;
			out_idx->bti_before = B_FALSE;
			return (node->btc_elems + (i - 1) * tree->bt_elem_size);
		}
		/*
		 * We've traversed all the way up and been at the start of the
		 * node every time, so this was the first node in the tree.
		 */
		return (NULL);
	}

	/*
	 * The previous element from one in a core node is the last element in
	 * the subtree just to the left of the separator.
	 */
	ASSERT(zfs_btree_is_core(idx->bti_node));
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	zfs_btree_hdr_t *child = node->btc_children[offset];
	return (zfs_btree_last_helper(tree, child, out_idx));
}
1379
1380 /*
1381 * Get the value at the provided index in the tree.
1382 *
1383 * Note that the value returned from this function can be mutated, but only
1384 * if it will not change the ordering of the element with respect to any other
1385 * elements that could be in the tree.
1386 */
1387 void *
zfs_btree_get(zfs_btree_t * tree,zfs_btree_index_t * idx)1388 zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx)
1389 {
1390 ASSERT(!idx->bti_before);
1391 size_t size = tree->bt_elem_size;
1392 if (!zfs_btree_is_core(idx->bti_node)) {
1393 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
1394 return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
1395 idx->bti_offset) * size);
1396 }
1397 zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
1398 return (node->btc_elems + idx->bti_offset * size);
1399 }
1400
1401 /* Add the given value to the tree. Must not already be in the tree. */
1402 void
zfs_btree_add(zfs_btree_t * tree,const void * node)1403 zfs_btree_add(zfs_btree_t *tree, const void *node)
1404 {
1405 zfs_btree_index_t where = {0};
1406 VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL);
1407 zfs_btree_add_idx(tree, node, &where);
1408 }
1409
1410 /* Helper function to free a tree node. */
1411 static void
zfs_btree_node_destroy(zfs_btree_t * tree,zfs_btree_hdr_t * node)1412 zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node)
1413 {
1414 tree->bt_num_nodes--;
1415 if (!zfs_btree_is_core(node)) {
1416 zfs_btree_leaf_free(tree, node);
1417 } else {
1418 kmem_free(node, sizeof (zfs_btree_core_t) +
1419 BTREE_CORE_ELEMS * tree->bt_elem_size);
1420 }
1421 }
1422
/*
 * Remove the rm_hdr and the separator to its left from the parent node. The
 * buffer that rm_hdr was stored in may already be freed, so its contents
 * cannot be accessed.
 */
static void
zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
    zfs_btree_hdr_t *rm_hdr)
{
	size_t size = tree->bt_elem_size;
	uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1;
	zfs_btree_hdr_t *hdr = &node->btc_hdr;
	/*
	 * If the node is the root node and rm_hdr is one of two children,
	 * promote the other child to the root.
	 */
	if (hdr->bth_parent == NULL && hdr->bth_count <= 1) {
		ASSERT3U(hdr->bth_count, ==, 1);
		ASSERT3P(tree->bt_root, ==, node);
		ASSERT3P(node->btc_children[1], ==, rm_hdr);
		tree->bt_root = node->btc_children[0];
		node->btc_children[0]->bth_parent = NULL;
		zfs_btree_node_destroy(tree, hdr);
		tree->bt_height--;
		return;
	}

	/* Locate rm_hdr's slot among our children. */
	uint32_t idx;
	for (idx = 0; idx <= hdr->bth_count; idx++) {
		if (node->btc_children[idx] == rm_hdr)
			break;
	}
	ASSERT3U(idx, <=, hdr->bth_count);

	/*
	 * If the node is the root or it has more than the minimum number of
	 * children, just remove the child and separator, and return.
	 */
	if (hdr->bth_parent == NULL ||
	    hdr->bth_count > min_count) {
		/*
		 * Shift the element and children to the right of rm_hdr to
		 * the left by one spot.
		 */
		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
		    BSS_PARALLELOGRAM);
		hdr->bth_count--;
		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1);
		return;
	}

	ASSERT3U(hdr->bth_count, ==, min_count);

	/*
	 * Now we try to take a node from a neighbor. We check left, then
	 * right. If the neighbor exists and has more than the minimum number
	 * of elements, we move the separator between us and them to our
	 * node, move their closest element (last for left, first for right)
	 * to the separator, and move their closest child to our node. Along
	 * the way we need to collapse the gap made by idx, and (for our right
	 * neighbor) the gap made by removing their first element and child.
	 *
	 * Note: this logic currently doesn't support taking from a neighbor
	 * that isn't a sibling (i.e. a neighbor with a different
	 * parent). This isn't critical functionality, but may be worth
	 * implementing in the future for completeness' sake.
	 */
	zfs_btree_core_t *parent = hdr->bth_parent;
	uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);

	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
	    parent->btc_children[parent_idx - 1]);
	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
		/* We can take a node from the left neighbor. */
		ASSERT(zfs_btree_is_core(l_hdr));
		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr;

		/*
		 * Start by shifting the elements and children in the current
		 * node to the right by one spot.
		 */
		bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID);

		/*
		 * Move the separator between node and neighbor to the first
		 * element slot in the current node.
		 */
		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
		    size;
		bcpy(separator, node->btc_elems, size);

		/* Move the last child of neighbor to our first child slot. */
		node->btc_children[0] =
		    neighbor->btc_children[l_hdr->bth_count];
		node->btc_children[0]->bth_parent = node;

		/* Move the last element of neighbor to the separator spot. */
		uint8_t *take_elem = neighbor->btc_elems +
		    (l_hdr->bth_count - 1) * size;
		bcpy(take_elem, separator, size);
		l_hdr->bth_count--;
		zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1);
		return;
	}

	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
	    NULL : parent->btc_children[parent_idx + 1]);
	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
		/* We can take a node from the right neighbor. */
		ASSERT(zfs_btree_is_core(r_hdr));
		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr;

		/*
		 * Shift elements in node left by one spot to overwrite rm_hdr
		 * and the separator before it.
		 */
		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
		    BSS_PARALLELOGRAM);

		/*
		 * Move the separator between node and neighbor to the last
		 * element spot in node.
		 */
		uint8_t *separator = parent->btc_elems + parent_idx * size;
		bcpy(separator, node->btc_elems + (hdr->bth_count - 1) * size,
		    size);

		/*
		 * Move the first child of neighbor to the last child spot in
		 * node.
		 */
		node->btc_children[hdr->bth_count] = neighbor->btc_children[0];
		node->btc_children[hdr->bth_count]->bth_parent = node;

		/* Move the first element of neighbor to the separator spot. */
		uint8_t *take_elem = neighbor->btc_elems;
		bcpy(take_elem, separator, size);
		r_hdr->bth_count--;

		/*
		 * Shift the elements and children of neighbor to cover the
		 * stolen elements.
		 */
		bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count,
		    BSS_TRAPEZOID);
		zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1);
		return;
	}

	/*
	 * In this case, neither of our neighbors can spare an element, so we
	 * need to merge with one of them. We prefer the left one,
	 * arbitrarily. Move the separator into the leftmost merging node
	 * (which may be us or the left neighbor), and then move the right
	 * merging node's elements. Once that's done, we go back and delete
	 * the element we're removing. Finally, go into the parent and delete
	 * the right merging node and the separator. This may cause further
	 * merging.
	 */
	zfs_btree_hdr_t *new_rm_hdr, *keep_hdr;
	uint32_t new_idx = idx;
	if (l_hdr != NULL) {
		/* Merge into the left sibling; we become the node removed. */
		keep_hdr = l_hdr;
		new_rm_hdr = hdr;
		new_idx += keep_hdr->bth_count + 1;
	} else {
		/* No left sibling; merge the right sibling into us. */
		ASSERT3P(r_hdr, !=, NULL);
		keep_hdr = hdr;
		new_rm_hdr = r_hdr;
		parent_idx++;
	}

	ASSERT(zfs_btree_is_core(keep_hdr));
	ASSERT(zfs_btree_is_core(new_rm_hdr));

	zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr;
	zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr;

	if (zfs_btree_verify_intensity >= 5) {
		for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) {
			zfs_btree_verify_poison_at(tree, keep_hdr,
			    keep_hdr->bth_count + i);
		}
	}

	/* Move the separator into the left node. */
	uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size;
	uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
	    size;
	bcpy(separator, e_out, size);
	keep_hdr->bth_count++;

	/* Move all our elements and children into the left node. */
	bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep,
	    keep_hdr->bth_count, BSS_TRAPEZOID);

	uint32_t old_count = keep_hdr->bth_count;

	/* Update bookkeeping */
	keep_hdr->bth_count += new_rm_hdr->bth_count;
	ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1);

	/*
	 * Shift the element and children to the right of rm_hdr to
	 * the left by one spot.
	 */
	ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr);
	bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx,
	    BSS_PARALLELOGRAM);
	keep_hdr->bth_count--;

	/* Reparent all our children to point to the left node. */
	zfs_btree_hdr_t **new_start = keep->btc_children +
	    old_count - 1;
	for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++)
		new_start[i]->bth_parent = keep;
	for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) {
		ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep);
		ASSERT3P(keep->btc_children[i], !=, rm_hdr);
	}
	zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1);

	/*
	 * Remove the merged-away node from the parent; this recursion may
	 * cascade further merges up the tree.
	 */
	new_rm_hdr->bth_count = 0;
	zfs_btree_remove_from_node(tree, parent, new_rm_hdr);
	zfs_btree_node_destroy(tree, new_rm_hdr);
}
1649
1650 /* Remove the element at the specific location. */
1651 void
zfs_btree_remove_idx(zfs_btree_t * tree,zfs_btree_index_t * where)1652 zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
1653 {
1654 size_t size = tree->bt_elem_size;
1655 zfs_btree_hdr_t *hdr = where->bti_node;
1656 uint32_t idx = where->bti_offset;
1657
1658 ASSERT(!where->bti_before);
1659 if (tree->bt_bulk != NULL) {
1660 /*
1661 * Leave bulk insert mode. Note that our index would be
1662 * invalid after we correct the tree, so we copy the value
1663 * we're planning to remove and find it again after
1664 * bulk_finish.
1665 */
1666 uint8_t *value = zfs_btree_get(tree, where);
1667 uint8_t *tmp = kmem_alloc(size, KM_SLEEP);
1668 bcpy(value, tmp, size);
1669 zfs_btree_bulk_finish(tree);
1670 VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL);
1671 kmem_free(tmp, size);
1672 hdr = where->bti_node;
1673 idx = where->bti_offset;
1674 }
1675
1676 tree->bt_num_elems--;
1677 /*
1678 * If the element happens to be in a core node, we move a leaf node's
1679 * element into its place and then remove the leaf node element. This
1680 * makes the rebalance logic not need to be recursive both upwards and
1681 * downwards.
1682 */
1683 if (zfs_btree_is_core(hdr)) {
1684 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
1685 zfs_btree_hdr_t *left_subtree = node->btc_children[idx];
1686 void *new_value = zfs_btree_last_helper(tree, left_subtree,
1687 where);
1688 ASSERT3P(new_value, !=, NULL);
1689
1690 bcpy(new_value, node->btc_elems + idx * size, size);
1691
1692 hdr = where->bti_node;
1693 idx = where->bti_offset;
1694 ASSERT(!where->bti_before);
1695 }
1696
1697 /*
1698 * First, we'll update the leaf's metadata. Then, we shift any
1699 * elements after the idx to the left. After that, we rebalance if
1700 * needed.
1701 */
1702 ASSERT(!zfs_btree_is_core(hdr));
1703 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
1704 ASSERT3U(hdr->bth_count, >, 0);
1705
1706 uint32_t min_count = (tree->bt_leaf_cap / 2) - 1;
1707
1708 /*
1709 * If we're over the minimum size or this is the root, just overwrite
1710 * the value and return.
1711 */
1712 if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
1713 bt_shrink_leaf(tree, leaf, idx, 1);
1714 if (hdr->bth_parent == NULL) {
1715 ASSERT0(tree->bt_height);
1716 if (hdr->bth_count == 0) {
1717 tree->bt_root = NULL;
1718 tree->bt_height--;
1719 zfs_btree_node_destroy(tree, &leaf->btl_hdr);
1720 }
1721 }
1722 zfs_btree_verify(tree);
1723 return;
1724 }
1725 ASSERT3U(hdr->bth_count, ==, min_count);
1726
1727 /*
1728 * Now we try to take a node from a sibling. We check left, then
1729 * right. If they exist and have more than the minimum number of
1730 * elements, we move the separator between us and them to our node
1731 * and move their closest element (last for left, first for right) to
1732 * the separator. Along the way we need to collapse the gap made by
1733 * idx, and (for our right neighbor) the gap made by removing their
1734 * first element.
1735 *
1736 * Note: this logic currently doesn't support taking from a neighbor
1737 * that isn't a sibling. This isn't critical functionality, but may be
1738 * worth implementing in the future for completeness' sake.
1739 */
1740 zfs_btree_core_t *parent = hdr->bth_parent;
1741 uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
1742
1743 zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
1744 parent->btc_children[parent_idx - 1]);
1745 if (l_hdr != NULL && l_hdr->bth_count > min_count) {
1746 /* We can take a node from the left neighbor. */
1747 ASSERT(!zfs_btree_is_core(l_hdr));
1748 zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr;
1749
1750 /*
1751 * Move our elements back by one spot to make room for the
1752 * stolen element and overwrite the element being removed.
1753 */
1754 bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT);
1755
1756 /* Move the separator to our first spot. */
1757 uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
1758 size;
1759 bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size);
1760
1761 /* Move our neighbor's last element to the separator. */
1762 uint8_t *take_elem = neighbor->btl_elems +
1763 (l_hdr->bth_first + l_hdr->bth_count - 1) * size;
1764 bcpy(take_elem, separator, size);
1765
1766 /* Delete our neighbor's last element. */
1767 bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1);
1768 zfs_btree_verify(tree);
1769 return;
1770 }
1771
1772 zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
1773 NULL : parent->btc_children[parent_idx + 1]);
1774 if (r_hdr != NULL && r_hdr->bth_count > min_count) {
1775 /* We can take a node from the right neighbor. */
1776 ASSERT(!zfs_btree_is_core(r_hdr));
1777 zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr;
1778
1779 /*
1780 * Move our elements after the element being removed forwards
1781 * by one spot to make room for the stolen element and
1782 * overwrite the element being removed.
1783 */
1784 bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1,
1785 1, BSD_LEFT);
1786
1787 /* Move the separator between us to our last spot. */
1788 uint8_t *separator = parent->btc_elems + parent_idx * size;
1789 bcpy(separator, leaf->btl_elems + (hdr->bth_first +
1790 hdr->bth_count - 1) * size, size);
1791
1792 /* Move our neighbor's first element to the separator. */
1793 uint8_t *take_elem = neighbor->btl_elems +
1794 r_hdr->bth_first * size;
1795 bcpy(take_elem, separator, size);
1796
1797 /* Delete our neighbor's first element. */
1798 bt_shrink_leaf(tree, neighbor, 0, 1);
1799 zfs_btree_verify(tree);
1800 return;
1801 }
1802
1803 /*
1804 * In this case, neither of our neighbors can spare an element, so we
1805 * need to merge with one of them. We prefer the left one, arbitrarily.
1806 * After remove we move the separator into the leftmost merging node
1807 * (which may be us or the left neighbor), and then move the right
1808 * merging node's elements. Once that's done, we go back and delete
1809 * the element we're removing. Finally, go into the parent and delete
1810 * the right merging node and the separator. This may cause further
1811 * merging.
1812 */
1813 zfs_btree_hdr_t *rm_hdr, *k_hdr;
1814 if (l_hdr != NULL) {
1815 k_hdr = l_hdr;
1816 rm_hdr = hdr;
1817 } else {
1818 ASSERT3P(r_hdr, !=, NULL);
1819 k_hdr = hdr;
1820 rm_hdr = r_hdr;
1821 parent_idx++;
1822 }
1823 ASSERT(!zfs_btree_is_core(k_hdr));
1824 ASSERT(!zfs_btree_is_core(rm_hdr));
1825 ASSERT3U(k_hdr->bth_count, ==, min_count);
1826 ASSERT3U(rm_hdr->bth_count, ==, min_count);
1827 zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr;
1828 zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr;
1829
1830 if (zfs_btree_verify_intensity >= 5) {
1831 for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) {
1832 zfs_btree_verify_poison_at(tree, k_hdr,
1833 k_hdr->bth_count + i);
1834 }
1835 }
1836
1837 /*
1838 * Remove the value from the node. It will go below the minimum,
1839 * but we'll fix it in no time.
1840 */
1841 bt_shrink_leaf(tree, leaf, idx, 1);
1842
1843 /* Prepare space for elements to be moved from the right. */
1844 uint32_t k_count = k_hdr->bth_count;
1845 bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count);
1846 ASSERT3U(k_hdr->bth_count, ==, min_count * 2);
1847
1848 /* Move the separator into the first open spot. */
1849 uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size;
1850 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size;
1851 bcpy(separator, out, size);
1852
1853 /* Move our elements to the left neighbor. */
1854 bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1);
1855
1856 /* Remove the emptied node from the parent. */
1857 zfs_btree_remove_from_node(tree, parent, rm_hdr);
1858 zfs_btree_node_destroy(tree, rm_hdr);
1859 zfs_btree_verify(tree);
1860 }
1861
1862 /* Remove the given value from the tree. */
1863 void
zfs_btree_remove(zfs_btree_t * tree,const void * value)1864 zfs_btree_remove(zfs_btree_t *tree, const void *value)
1865 {
1866 zfs_btree_index_t where = {0};
1867 VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL);
1868 zfs_btree_remove_idx(tree, &where);
1869 }
1870
/*
 * Return the number of elements in the tree.
 *
 * NOTE: despite the "numnodes" name, this returns the element count
 * (bt_num_elems), not the number of btree nodes.
 */
ulong_t
zfs_btree_numnodes(zfs_btree_t *tree)
{
	return (tree->bt_num_elems);
}
1877
1878 /*
1879 * This function is used to visit all the elements in the tree before
1880 * destroying the tree. This allows the calling code to perform any cleanup it
1881 * needs to do. This is more efficient than just removing the first element
1882 * over and over, because it removes all rebalancing. Once the destroy_nodes()
1883 * function has been called, no other btree operations are valid until it
1884 * returns NULL, which point the only valid operation is zfs_btree_destroy().
1885 *
1886 * example:
1887 *
1888 * zfs_btree_index_t *cookie = NULL;
1889 * my_data_t *node;
1890 *
1891 * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
1892 * free(node->ptr);
1893 * zfs_btree_destroy(tree);
1894 *
1895 */
1896 void *
zfs_btree_destroy_nodes(zfs_btree_t * tree,zfs_btree_index_t ** cookie)1897 zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie)
1898 {
1899 if (*cookie == NULL) {
1900 if (tree->bt_height == -1)
1901 return (NULL);
1902 *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP);
1903 return (zfs_btree_first(tree, *cookie));
1904 }
1905
1906 void *rval = zfs_btree_next_helper(tree, *cookie, *cookie,
1907 zfs_btree_node_destroy);
1908 if (rval == NULL) {
1909 tree->bt_root = NULL;
1910 tree->bt_height = -1;
1911 tree->bt_num_elems = 0;
1912 kmem_free(*cookie, sizeof (**cookie));
1913 tree->bt_bulk = NULL;
1914 }
1915 return (rval);
1916 }
1917
1918 static void
zfs_btree_clear_helper(zfs_btree_t * tree,zfs_btree_hdr_t * hdr)1919 zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
1920 {
1921 if (zfs_btree_is_core(hdr)) {
1922 zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr;
1923 for (uint32_t i = 0; i <= hdr->bth_count; i++)
1924 zfs_btree_clear_helper(tree, btc->btc_children[i]);
1925 }
1926
1927 zfs_btree_node_destroy(tree, hdr);
1928 }
1929
1930 void
zfs_btree_clear(zfs_btree_t * tree)1931 zfs_btree_clear(zfs_btree_t *tree)
1932 {
1933 if (tree->bt_root == NULL) {
1934 ASSERT0(tree->bt_num_elems);
1935 return;
1936 }
1937
1938 zfs_btree_clear_helper(tree, tree->bt_root);
1939 tree->bt_num_elems = 0;
1940 tree->bt_root = NULL;
1941 tree->bt_num_nodes = 0;
1942 tree->bt_height = -1;
1943 tree->bt_bulk = NULL;
1944 }
1945
/*
 * Destroy the tree. The caller must have already emptied it, either with
 * zfs_btree_clear() or by draining zfs_btree_destroy_nodes(); the asserts
 * below enforce that.
 */
void
zfs_btree_destroy(zfs_btree_t *tree)
{
	ASSERT0(tree->bt_num_elems);
	ASSERT3P(tree->bt_root, ==, NULL);
}
1952
1953 /* Verify that every child of this node has the correct parent pointer. */
1954 static void
zfs_btree_verify_pointers_helper(zfs_btree_t * tree,zfs_btree_hdr_t * hdr)1955 zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
1956 {
1957 if (!zfs_btree_is_core(hdr))
1958 return;
1959
1960 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
1961 for (uint32_t i = 0; i <= hdr->bth_count; i++) {
1962 VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr);
1963 zfs_btree_verify_pointers_helper(tree, node->btc_children[i]);
1964 }
1965 }
1966
1967 /* Verify that every node has the correct parent pointer. */
1968 static void
zfs_btree_verify_pointers(zfs_btree_t * tree)1969 zfs_btree_verify_pointers(zfs_btree_t *tree)
1970 {
1971 if (tree->bt_height == -1) {
1972 VERIFY3P(tree->bt_root, ==, NULL);
1973 return;
1974 }
1975 VERIFY3P(tree->bt_root->bth_parent, ==, NULL);
1976 zfs_btree_verify_pointers_helper(tree, tree->bt_root);
1977 }
1978
1979 /*
1980 * Verify that all the current node and its children satisfy the count
1981 * invariants, and return the total count in the subtree rooted in this node.
1982 */
1983 static uint64_t
zfs_btree_verify_counts_helper(zfs_btree_t * tree,zfs_btree_hdr_t * hdr)1984 zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
1985 {
1986 if (!zfs_btree_is_core(hdr)) {
1987 if (tree->bt_root != hdr && tree->bt_bulk &&
1988 hdr != &tree->bt_bulk->btl_hdr) {
1989 VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1);
1990 }
1991
1992 return (hdr->bth_count);
1993 } else {
1994
1995 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
1996 uint64_t ret = hdr->bth_count;
1997 if (tree->bt_root != hdr && tree->bt_bulk == NULL)
1998 VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1);
1999 for (uint32_t i = 0; i <= hdr->bth_count; i++) {
2000 ret += zfs_btree_verify_counts_helper(tree,
2001 node->btc_children[i]);
2002 }
2003
2004 return (ret);
2005 }
2006 }
2007
2008 /*
2009 * Verify that all nodes satisfy the invariants and that the total number of
2010 * elements is correct.
2011 */
2012 static void
zfs_btree_verify_counts(zfs_btree_t * tree)2013 zfs_btree_verify_counts(zfs_btree_t *tree)
2014 {
2015 EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1);
2016 if (tree->bt_height == -1) {
2017 return;
2018 }
2019 VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==,
2020 tree->bt_num_elems);
2021 }
2022
2023 /*
2024 * Check that the subtree rooted at this node has a uniform height. Returns
2025 * the number of nodes under this node, to help verify bt_num_nodes.
2026 */
2027 static uint64_t
zfs_btree_verify_height_helper(zfs_btree_t * tree,zfs_btree_hdr_t * hdr,int32_t height)2028 zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
2029 int32_t height)
2030 {
2031 if (!zfs_btree_is_core(hdr)) {
2032 VERIFY0(height);
2033 return (1);
2034 }
2035
2036 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
2037 uint64_t ret = 1;
2038 for (uint32_t i = 0; i <= hdr->bth_count; i++) {
2039 ret += zfs_btree_verify_height_helper(tree,
2040 node->btc_children[i], height - 1);
2041 }
2042 return (ret);
2043 }
2044
2045 /*
2046 * Check that the tree rooted at this node has a uniform height, and that the
2047 * bt_height in the tree is correct.
2048 */
2049 static void
zfs_btree_verify_height(zfs_btree_t * tree)2050 zfs_btree_verify_height(zfs_btree_t *tree)
2051 {
2052 EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
2053 if (tree->bt_height == -1) {
2054 return;
2055 }
2056
2057 VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root,
2058 tree->bt_height), ==, tree->bt_num_nodes);
2059 }
2060
2061 /*
2062 * Check that the elements in this node are sorted, and that if this is a core
2063 * node, the separators are properly between the subtrees they separaate and
2064 * that the children also satisfy this requirement.
2065 */
2066 static void
zfs_btree_verify_order_helper(zfs_btree_t * tree,zfs_btree_hdr_t * hdr)2067 zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
2068 {
2069 size_t size = tree->bt_elem_size;
2070 if (!zfs_btree_is_core(hdr)) {
2071 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
2072 for (uint32_t i = 1; i < hdr->bth_count; i++) {
2073 VERIFY3S(tree->bt_compar(leaf->btl_elems +
2074 (hdr->bth_first + i - 1) * size,
2075 leaf->btl_elems +
2076 (hdr->bth_first + i) * size), ==, -1);
2077 }
2078 return;
2079 }
2080
2081 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
2082 for (uint32_t i = 1; i < hdr->bth_count; i++) {
2083 VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size,
2084 node->btc_elems + i * size), ==, -1);
2085 }
2086 for (uint32_t i = 0; i < hdr->bth_count; i++) {
2087 uint8_t *left_child_last = NULL;
2088 zfs_btree_hdr_t *left_child_hdr = node->btc_children[i];
2089 if (zfs_btree_is_core(left_child_hdr)) {
2090 zfs_btree_core_t *left_child =
2091 (zfs_btree_core_t *)left_child_hdr;
2092 left_child_last = left_child->btc_elems +
2093 (left_child_hdr->bth_count - 1) * size;
2094 } else {
2095 zfs_btree_leaf_t *left_child =
2096 (zfs_btree_leaf_t *)left_child_hdr;
2097 left_child_last = left_child->btl_elems +
2098 (left_child_hdr->bth_first +
2099 left_child_hdr->bth_count - 1) * size;
2100 }
2101 int comp = tree->bt_compar(node->btc_elems + i * size,
2102 left_child_last);
2103 if (comp <= 0) {
2104 panic("btree: compar returned %d (expected 1) at "
2105 "%px %d: compar(%px, %px)", comp, node, i,
2106 node->btc_elems + i * size, left_child_last);
2107 }
2108
2109 uint8_t *right_child_first = NULL;
2110 zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1];
2111 if (zfs_btree_is_core(right_child_hdr)) {
2112 zfs_btree_core_t *right_child =
2113 (zfs_btree_core_t *)right_child_hdr;
2114 right_child_first = right_child->btc_elems;
2115 } else {
2116 zfs_btree_leaf_t *right_child =
2117 (zfs_btree_leaf_t *)right_child_hdr;
2118 right_child_first = right_child->btl_elems +
2119 right_child_hdr->bth_first * size;
2120 }
2121 comp = tree->bt_compar(node->btc_elems + i * size,
2122 right_child_first);
2123 if (comp >= 0) {
2124 panic("btree: compar returned %d (expected -1) at "
2125 "%px %d: compar(%px, %px)", comp, node, i,
2126 node->btc_elems + i * size, right_child_first);
2127 }
2128 }
2129 for (uint32_t i = 0; i <= hdr->bth_count; i++)
2130 zfs_btree_verify_order_helper(tree, node->btc_children[i]);
2131 }
2132
2133 /* Check that all elements in the tree are in sorted order. */
2134 static void
zfs_btree_verify_order(zfs_btree_t * tree)2135 zfs_btree_verify_order(zfs_btree_t *tree)
2136 {
2137 EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
2138 if (tree->bt_height == -1) {
2139 return;
2140 }
2141
2142 zfs_btree_verify_order_helper(tree, tree->bt_root);
2143 }
2144
#ifdef ZFS_DEBUG
/*
 * Check that all unused memory is poisoned correctly: unused element bytes
 * must hold the 0x0f poison byte, and unused child pointers in core nodes
 * must hold BTREE_POISON.
 */
static void
zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	size_t size = tree->bt_elem_size;
	if (!zfs_btree_is_core(hdr)) {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		/* Bytes before the first in-use element (below bth_first). */
		for (size_t i = 0; i < hdr->bth_first * size; i++)
			VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
		/* esize is the usable element-area size of a leaf node. */
		size_t esize = tree->bt_leaf_size -
		    offsetof(zfs_btree_leaf_t, btl_elems);
		/* Bytes after the last in-use element. */
		for (size_t i = (hdr->bth_first + hdr->bth_count) * size;
		    i < esize; i++)
			VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
	} else {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		/* Unused separator bytes past the last in-use element. */
		for (size_t i = hdr->bth_count * size;
		    i < BTREE_CORE_ELEMS * size; i++)
			VERIFY3U(node->btc_elems[i], ==, 0x0f);

		/* Unused child slots (slots 0..bth_count are in use). */
		for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
		    i++) {
			VERIFY3P(node->btc_children[i], ==,
			    (zfs_btree_hdr_t *)BTREE_POISON);
		}

		/* Recurse into all in-use children. */
		for (uint32_t i = 0; i <= hdr->bth_count; i++) {
			zfs_btree_verify_poison_helper(tree,
			    node->btc_children[i]);
		}
	}
}
#endif
2179
2180 /* Check that unused memory in the tree is still poisoned. */
2181 static void
zfs_btree_verify_poison(zfs_btree_t * tree)2182 zfs_btree_verify_poison(zfs_btree_t *tree)
2183 {
2184 #ifdef ZFS_DEBUG
2185 if (tree->bt_height == -1)
2186 return;
2187 zfs_btree_verify_poison_helper(tree, tree->bt_root);
2188 #endif
2189 }
2190
2191 void
zfs_btree_verify(zfs_btree_t * tree)2192 zfs_btree_verify(zfs_btree_t *tree)
2193 {
2194 if (zfs_btree_verify_intensity == 0)
2195 return;
2196 zfs_btree_verify_height(tree);
2197 if (zfs_btree_verify_intensity == 1)
2198 return;
2199 zfs_btree_verify_pointers(tree);
2200 if (zfs_btree_verify_intensity == 2)
2201 return;
2202 zfs_btree_verify_counts(tree);
2203 if (zfs_btree_verify_intensity == 3)
2204 return;
2205 zfs_btree_verify_order(tree);
2206
2207 if (zfs_btree_verify_intensity == 4)
2208 return;
2209 zfs_btree_verify_poison(tree);
2210 }
2211
/*
 * Tunable: verification intensity level (see the description at the top of
 * this file). Levels 1-4 are available in all builds; level 5 requires
 * ZFS_DEBUG.
 */
ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW,
	"Enable btree verification. Levels above 4 require ZFS be built "
	"with debugging");
2215