// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2019 by Delphix. All rights reserved.
 */

#include	<sys/btree.h>
#include	<sys/bitops.h>
#include	<sys/zfs_context.h>

kmem_cache_t *zfs_btree_leaf_cache;

/*
 * Control the extent of the verification that occurs when zfs_btree_verify is
 * called. Primarily used for debugging when extending the btree logic and
 * functionality. As the intensity is increased, new verification steps are
 * added. These steps are cumulative; intensity = 3 includes the intensity = 1
 * and intensity = 2 steps as well.
 *
 * Intensity 1: Verify that the tree's height is consistent throughout.
 * Intensity 2: Verify that a core node's children's parent pointers point
 * to the core node.
 * Intensity 3: Verify that the total number of elements in the tree matches the
 * sum of the number of elements in each node. Also verifies that each node's
 * count obeys the invariants (less than or equal to maximum value, greater than
 * or equal to half the maximum minus one).
 * Intensity 4: Verify that each element compares less than the element
 * immediately after it and greater than the one immediately before it using the
 * comparator function. For core nodes, also checks that each element is greater
 * than the last element in the first of the two nodes it separates, and less
 * than the first element in the second of the two nodes.
 * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside
 * of each node is poisoned appropriately. Note that poisoning always occurs if
 * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal
 * operation.
 *
 * Intensity 4 and 5 are particularly expensive to perform; the previous levels
 * are a few memory operations per node, while these levels require multiple
 * operations per element. In addition, when creating large btrees, these
 * operations are called at every step, resulting in extremely slow operation
 * (while the asymptotic complexity of the other steps is the same, the
 * importance of the constant factors cannot be denied).
 */
uint_t zfs_btree_verify_intensity = 0;
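
/*
 * A minimal debugging sketch (illustrative only; assumes test code in the
 * same module can set this variable directly):
 *
 *	zfs_btree_verify_intensity = 5;
 *	... perform the btree operations under test ...
 *	zfs_btree_verify(tree);
 *	zfs_btree_verify_intensity = 0;
 */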

/*
 * Convenience functions to silence warnings from memcpy/memmove's
 * return values and change argument order to src, dest.
 */
static void
bcpy(const void *src, void *dest, size_t size)
{
	(void) memcpy(dest, src, size);
}

static void
bmov(const void *src, void *dest, size_t size)
{
	(void) memmove(dest, src, size);
}

static boolean_t
zfs_btree_is_core(struct zfs_btree_hdr *hdr)
{
	return (hdr->bth_first == -1);
}

#ifdef _ILP32
#define	BTREE_POISON 0xabadb10c
#else
#define	BTREE_POISON 0xabadb10cdeadbeef
#endif

static void
zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
		    i++) {
			node->btc_children[i] =
			    (zfs_btree_hdr_t *)BTREE_POISON;
		}
		(void) memset(node->btc_elems + hdr->bth_count * size, 0x0f,
		    (BTREE_CORE_ELEMS - hdr->bth_count) * size);
	} else {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		(void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size);
		(void) memset(leaf->btl_elems +
		    (hdr->bth_first + hdr->bth_count) * size, 0x0f,
		    tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) -
		    (hdr->bth_first + hdr->bth_count) * size);
	}
#endif
}

static inline void
zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    uint32_t idx, uint32_t count)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		ASSERT3U(idx, >=, hdr->bth_count);
		ASSERT3U(idx, <=, BTREE_CORE_ELEMS);
		ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS);
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		for (uint32_t i = 1; i <= count; i++) {
			node->btc_children[idx + i] =
			    (zfs_btree_hdr_t *)BTREE_POISON;
		}
		(void) memset(node->btc_elems + idx * size, 0x0f, count * size);
	} else {
		ASSERT3U(idx, <=, tree->bt_leaf_cap);
		ASSERT3U(idx + count, <=, tree->bt_leaf_cap);
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		(void) memset(leaf->btl_elems +
		    (hdr->bth_first + idx) * size, 0x0f, count * size);
	}
#endif
}

static inline void
zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    uint32_t idx)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		ASSERT3U(idx, <, BTREE_CORE_ELEMS);
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON;
		VERIFY3P(node->btc_children[idx + 1], ==, cval);
		for (size_t i = 0; i < size; i++)
			VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f);
	} else {
		ASSERT3U(idx, <, tree->bt_leaf_cap);
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		if (idx >= tree->bt_leaf_cap - hdr->bth_first)
			return;
		for (size_t i = 0; i < size; i++) {
			VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx)
			    * size + i], ==, 0x0f);
		}
	}
#endif
}

void
zfs_btree_init(void)
{
	zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache",
	    BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
zfs_btree_fini(void)
{
	kmem_cache_destroy(zfs_btree_leaf_cache);
}

static void *
zfs_btree_leaf_alloc(zfs_btree_t *tree)
{
	if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
		return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP));
	else
		return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP));
}

static void
zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr)
{
	if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
		return (kmem_cache_free(zfs_btree_leaf_cache, ptr));
	else
		return (kmem_free(ptr, tree->bt_leaf_size));
}

void
zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
    bt_find_in_buf_f bt_find_in_buf, size_t size)
{
	zfs_btree_create_custom(tree, compar, bt_find_in_buf, size,
	    BTREE_LEAF_SIZE);
}

static void *
zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
    const void *value, zfs_btree_index_t *where);

void
zfs_btree_create_custom(zfs_btree_t *tree,
    int (*compar) (const void *, const void *),
    bt_find_in_buf_f bt_find_in_buf,
    size_t size, size_t lsize)
{
	size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems);

	ASSERT3U(size, <=, esize / 2);
	memset(tree, 0, sizeof (*tree));
	tree->bt_compar = compar;
	tree->bt_find_in_buf = (bt_find_in_buf == NULL) ?
	    zfs_btree_find_in_buf : bt_find_in_buf;
	tree->bt_elem_size = size;
	tree->bt_leaf_size = lsize;
	tree->bt_leaf_cap = P2ALIGN_TYPED(esize / size, 2, size_t);
	tree->bt_height = -1;
	tree->bt_bulk = NULL;
}
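
/*
 * A minimal usage sketch (illustrative only; u64_compare is a hypothetical
 * consumer-supplied comparator): create a tree of uint64_t keys using the
 * default leaf size and the default binary search.
 *
 *	static int
 *	u64_compare(const void *a, const void *b)
 *	{
 *		uint64_t l = *(const uint64_t *)a;
 *		uint64_t r = *(const uint64_t *)b;
 *		return (l < r ? -1 : l > r ? 1 : 0);
 *	}
 *
 *	zfs_btree_t tree;
 *	zfs_btree_create(&tree, u64_compare, NULL, sizeof (uint64_t));
 */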

/*
 * Find value in the array of elements provided. Uses a simple binary search.
 * Returns a pointer to the matching element on success, or NULL on failure,
 * in which case where is set to the position the value would occupy.
 */
static void *
zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
    const void *value, zfs_btree_index_t *where)
{
	uint32_t max = nelems;
	uint32_t min = 0;
	while (max > min) {
		uint32_t idx = (min + max) / 2;
		uint8_t *cur = buf + idx * tree->bt_elem_size;
		int comp = tree->bt_compar(cur, value);
		if (comp < 0) {
			min = idx + 1;
		} else if (comp > 0) {
			max = idx;
		} else {
			where->bti_offset = idx;
			where->bti_before = B_FALSE;
			return (cur);
		}
	}

	where->bti_offset = max;
	where->bti_before = B_TRUE;
	return (NULL);
}

/*
 * Find the given value in the tree. where may be passed as null when the
 * caller only needs a membership test, or when the btree is being used as
 * a map.
 */
void *
zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
{
	if (tree->bt_height == -1) {
		if (where != NULL) {
			where->bti_node = NULL;
			where->bti_offset = 0;
		}
		ASSERT0(tree->bt_num_elems);
		return (NULL);
	}

	/*
	 * If we're in bulk-insert mode, we check the last spot in the tree
	 * and the last leaf in the tree before doing the normal search,
	 * because for most workloads the vast majority of finds in
	 * bulk-insert mode are to insert new elements.
	 */
	zfs_btree_index_t idx;
	size_t size = tree->bt_elem_size;
	if (tree->bt_bulk != NULL) {
		zfs_btree_leaf_t *last_leaf = tree->bt_bulk;
		int comp = tree->bt_compar(last_leaf->btl_elems +
		    (last_leaf->btl_hdr.bth_first +
		    last_leaf->btl_hdr.bth_count - 1) * size, value);
		if (comp < 0) {
			/*
			 * If what they're looking for is after the last
			 * element, it's not in the tree.
			 */
			if (where != NULL) {
				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
				where->bti_offset =
				    last_leaf->btl_hdr.bth_count;
				where->bti_before = B_TRUE;
			}
			return (NULL);
		} else if (comp == 0) {
			if (where != NULL) {
				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
				where->bti_offset =
				    last_leaf->btl_hdr.bth_count - 1;
				where->bti_before = B_FALSE;
			}
			return (last_leaf->btl_elems +
			    (last_leaf->btl_hdr.bth_first +
			    last_leaf->btl_hdr.bth_count - 1) * size);
		}
		if (tree->bt_compar(last_leaf->btl_elems +
		    last_leaf->btl_hdr.bth_first * size, value) <= 0) {
			/*
			 * If what they're looking for is after the first
			 * element in the last leaf, it's in the last leaf or
			 * it's not in the tree.
			 */
			void *d = tree->bt_find_in_buf(tree,
			    last_leaf->btl_elems +
			    last_leaf->btl_hdr.bth_first * size,
			    last_leaf->btl_hdr.bth_count, value, &idx);

			if (where != NULL) {
				idx.bti_node = (zfs_btree_hdr_t *)last_leaf;
				*where = idx;
			}
			return (d);
		}
	}

	zfs_btree_core_t *node = NULL;
	uint32_t child = 0;
	uint32_t depth = 0;

	/*
	 * Iterate down the tree, finding which child the value should be in
	 * by comparing with the separators.
	 */
	for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
	    node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
		ASSERT3P(node, !=, NULL);
		void *d = tree->bt_find_in_buf(tree, node->btc_elems,
		    node->btc_hdr.bth_count, value, &idx);
		EQUIV(d != NULL, !idx.bti_before);
		if (d != NULL) {
			if (where != NULL) {
				idx.bti_node = (zfs_btree_hdr_t *)node;
				*where = idx;
			}
			return (d);
		}
		ASSERT(idx.bti_before);
		child = idx.bti_offset;
	}

	/*
	 * The value is in this leaf, or it would be if it were in the
	 * tree. Find its proper location and return it.
	 */
	zfs_btree_leaf_t *leaf = (depth == 0 ?
	    (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
	void *d = tree->bt_find_in_buf(tree, leaf->btl_elems +
	    leaf->btl_hdr.bth_first * size,
	    leaf->btl_hdr.bth_count, value, &idx);

	if (where != NULL) {
		idx.bti_node = (zfs_btree_hdr_t *)leaf;
		*where = idx;
	}

	return (d);
}
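
/*
 * A typical find-then-insert pattern (illustrative; key is a hypothetical
 * element, and this is the same pattern zfs_btree_add uses below):
 *
 *	zfs_btree_index_t where;
 *	if (zfs_btree_find(tree, &key, &where) == NULL)
 *		zfs_btree_add_idx(tree, &key, &where);
 */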

/*
 * To explain the following functions, it is useful to understand the four
 * kinds of shifts used in btree operation. First, a shift is a movement of
 * elements within a node. It is used to create gaps for inserting new
 * elements and children, or cover gaps created when things are removed. A
 * shift has two fundamental properties, each of which can be one of two
 * values, making four types of shifts.  There is the direction of the shift
 * (left or right) and the shape of the shift (parallelogram or isosceles
 * trapezoid (shortened to trapezoid hereafter)). The shape distinction only
 * applies to shifts of core nodes.
 *
 * The names derive from the following imagining of the layout of a node:
 *
 *  Elements:       *   *   *   *   *   *   *   ...   *   *   *
 *  Children:     *   *   *   *   *   *   *   *   ...   *   *   *
 *
 * This layout follows from the fact that the elements act as separators
 * between pairs of children, and that children root subtrees "below" the
 * current node. A left and right shift are fairly self-explanatory; a left
 * shift moves things to the left, while a right shift moves things to the
 * right. A parallelogram shift is a shift with the same number of elements
 * and children being moved, while a trapezoid shift is a shift that moves
 * one more child than elements. An example follows:
 *
 * A parallelogram shift could contain the following:
 *      _______________
 *      \*   *   *   * \ *   *   *   ...   *   *   *
 *     * \ *   *   *   *\  *   *   *   ...   *   *   *
 *        ---------------
 * A trapezoid shift could contain the following:
 *          ___________
 *       * / *   *   * \ *   *   *   ...   *   *   *
 *     *  / *  *   *   *\  *   *   *   ...   *   *   *
 *        ---------------
 *
 * Note that a parallelogram shift is always shaped like a "left-leaning"
 * parallelogram, where the starting index of the children being moved is
 * always one higher than the starting index of the elements being moved. No
 * "right-leaning" parallelogram shifts are needed (shifts where the starting
 * element index and starting child index being moved are the same) to achieve
 * any btree operations, so we ignore them.
 */
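
/*
 * As a concrete example, a left parallelogram shift by one of count 2
 * starting at index 3 moves elements 3 and 4 into slots 2 and 3, and
 * children 4 and 5 into slots 3 and 4. The trapezoid version of the same
 * shift also moves child 3 (one more child than elements, starting at the
 * element index rather than one past it).
 */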

enum bt_shift_shape {
	BSS_TRAPEZOID,
	BSS_PARALLELOGRAM
};

enum bt_shift_direction {
	BSD_LEFT,
	BSD_RIGHT
};

/*
 * Shift elements and children in the provided core node by off spots.  The
 * first element moved is idx, and count elements are moved. The shape of the
 * shift is determined by shape. The direction is determined by dir.
 */
static inline void
bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, uint32_t off, enum bt_shift_shape shape,
    enum bt_shift_direction dir)
{
	size_t size = tree->bt_elem_size;
	ASSERT(zfs_btree_is_core(&node->btc_hdr));

	uint8_t *e_start = node->btc_elems + idx * size;
	uint8_t *e_out = (dir == BSD_LEFT ? e_start - off * size :
	    e_start + off * size);
	bmov(e_start, e_out, count * size);

	zfs_btree_hdr_t **c_start = node->btc_children + idx +
	    (shape == BSS_TRAPEZOID ? 0 : 1);
	zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off :
	    c_start + off);
	uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
	bmov(c_start, c_out, c_count * sizeof (*c_start));
}

/*
 * Shift elements and children in the provided core node left by one spot.
 * The first element moved is idx, and count elements are moved. The shape
 * of the shift is determined by shape.
 */
static inline void
bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, enum bt_shift_shape shape)
{
	bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT);
}

/*
 * Shift elements and children in the provided core node right by one spot.
 * The first element moved is idx, and count elements are moved. The shape
 * of the shift is determined by shape.
 */
static inline void
bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, enum bt_shift_shape shape)
{
	bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT);
}

/*
 * Shift elements in the provided leaf node by off spots. The first element
 * moved is idx, and count elements are moved. The direction is determined
 * by dir.
 */
static inline void
bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx,
    uint32_t count, uint32_t off, enum bt_shift_direction dir)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = &node->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));

	if (count == 0)
		return;
	uint8_t *start = node->btl_elems + (hdr->bth_first + idx) * size;
	uint8_t *out = (dir == BSD_LEFT ? start - off * size :
	    start + off * size);
	bmov(start, out, count * size);
}

/*
 * Grow leaf for n new elements before idx.
 */
static void
bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
    uint32_t n)
{
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));
	ASSERT3U(idx, <=, hdr->bth_count);
	uint32_t capacity = tree->bt_leaf_cap;
	ASSERT3U(hdr->bth_count + n, <=, capacity);
	boolean_t cl = (hdr->bth_first >= n);
	boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity);

	if (cl && (!cr || idx <= hdr->bth_count / 2)) {
		/* Grow left. */
		hdr->bth_first -= n;
		bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT);
	} else if (cr) {
		/* Grow right. */
		bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n,
		    BSD_RIGHT);
	} else {
		/* Grow both ways. */
		uint32_t fn = hdr->bth_first -
		    (capacity - (hdr->bth_count + n)) / 2;
		hdr->bth_first -= fn;
		bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT);
		bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx,
		    n - fn, BSD_RIGHT);
	}
	hdr->bth_count += n;
}
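
/*
 * For example, a leaf with capacity 8 holding 4 elements at bth_first == 2
 * (slots 2-5) that grows by n = 2 before idx 1 grows left: bth_first drops
 * to 0, the one element before idx shifts left by two into slot 0, and
 * slots 1-2 become the gap for the incoming elements.
 */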

/*
 * Shrink leaf by n elements starting from idx.
 */
static void
bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
    uint32_t n)
{
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));
	ASSERT3U(idx, <=, hdr->bth_count);
	ASSERT3U(idx + n, <=, hdr->bth_count);

	if (idx <= (hdr->bth_count - n) / 2) {
		bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT);
		zfs_btree_poison_node_at(tree, hdr, 0, n);
		hdr->bth_first += n;
	} else {
		bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n,
		    BSD_LEFT);
		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n);
	}
	hdr->bth_count -= n;
}

/*
 * Move children and elements from one core node to another. The shape
 * parameter behaves the same as it does in the shift logic.
 */
static inline void
bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx,
    uint32_t count, zfs_btree_core_t *dest, uint32_t didx,
    enum bt_shift_shape shape)
{
	size_t size = tree->bt_elem_size;
	ASSERT(zfs_btree_is_core(&source->btc_hdr));
	ASSERT(zfs_btree_is_core(&dest->btc_hdr));

	bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size,
	    count * size);

	uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
	bcpy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1),
	    dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1),
	    c_count * sizeof (*source->btc_children));
}

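/*
 * Move count elements from one leaf node to another, accounting for each
 * leaf's bth_first offset into its element array.
 */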
static inline void
bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx,
    uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx)
{
	size_t size = tree->bt_elem_size;
	ASSERT(!zfs_btree_is_core(&source->btl_hdr));
	ASSERT(!zfs_btree_is_core(&dest->btl_hdr));

	bcpy(source->btl_elems + (source->btl_hdr.bth_first + sidx) * size,
	    dest->btl_elems + (dest->btl_hdr.bth_first + didx) * size,
	    count * size);
}

/*
 * Find the first element in the subtree rooted at hdr, return its value and
 * put its location in where if non-null.
 */
static void *
zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    zfs_btree_index_t *where)
{
	zfs_btree_hdr_t *node;

	for (node = hdr; zfs_btree_is_core(node);
	    node = ((zfs_btree_core_t *)node)->btc_children[0])
		;

	ASSERT(!zfs_btree_is_core(node));
	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
	if (where != NULL) {
		where->bti_node = node;
		where->bti_offset = 0;
		where->bti_before = B_FALSE;
	}
	return (&leaf->btl_elems[node->bth_first * tree->bt_elem_size]);
}

/* Insert an element and a child into a core node at the given offset. */
static void
zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent,
    uint32_t offset, zfs_btree_hdr_t *new_node, void *buf)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
	ASSERT3P(par_hdr, ==, new_node->bth_parent);
	ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS);

	if (zfs_btree_verify_intensity >= 5) {
		zfs_btree_verify_poison_at(tree, par_hdr,
		    par_hdr->bth_count);
	}
	/* Shift existing elements and children */
	uint32_t count = par_hdr->bth_count - offset;
	bt_shift_core_right(tree, parent, offset, count,
	    BSS_PARALLELOGRAM);

	/* Insert new values */
	parent->btc_children[offset + 1] = new_node;
	bcpy(buf, parent->btc_elems + offset * size, size);
	par_hdr->bth_count++;
}

/*
 * Insert new_node into the parent of old_node directly after old_node, with
 * buf as the dividing element between the two.
 */
static void
zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
    zfs_btree_hdr_t *new_node, void *buf)
{
	ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent);
	size_t size = tree->bt_elem_size;
	zfs_btree_core_t *parent = old_node->bth_parent;

	/*
	 * If this is the root node we were splitting, we create a new root
	 * and increase the height of the tree.
	 */
	if (parent == NULL) {
		ASSERT3P(old_node, ==, tree->bt_root);
		tree->bt_num_nodes++;
		zfs_btree_core_t *new_root =
		    kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS *
		    size, KM_SLEEP);
		zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr;
		new_root_hdr->bth_parent = NULL;
		new_root_hdr->bth_first = -1;
		new_root_hdr->bth_count = 1;

		old_node->bth_parent = new_node->bth_parent = new_root;
		new_root->btc_children[0] = old_node;
		new_root->btc_children[1] = new_node;
		bcpy(buf, new_root->btc_elems, size);

		tree->bt_height++;
		tree->bt_root = new_root_hdr;
		zfs_btree_poison_node(tree, new_root_hdr);
		return;
	}

	/*
	 * Since we have the new separator, binary search for where to put
	 * new_node.
	 */
	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
	zfs_btree_index_t idx;
	ASSERT(zfs_btree_is_core(par_hdr));
	VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
	    par_hdr->bth_count, buf, &idx), ==, NULL);
	ASSERT(idx.bti_before);
	uint32_t offset = idx.bti_offset;
	ASSERT3U(offset, <=, par_hdr->bth_count);
	ASSERT3P(parent->btc_children[offset], ==, old_node);

	/*
	 * If the parent isn't full, shift things to accommodate our insertions
	 * and return.
	 */
	if (par_hdr->bth_count != BTREE_CORE_ELEMS) {
		zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf);
		return;
	}

	/*
	 * We need to split this core node into two. Currently there are
	 * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for
	 * BTREE_CORE_ELEMS + 2. Some of the children will be part of the
	 * current node, and the others will be moved to the new core node.
	 * There are BTREE_CORE_ELEMS + 1 elements including the new one. One
	 * will be used as the new separator in our parent, and the others
	 * will be split among the two core nodes.
	 *
	 * Usually we will split the node in half evenly, with
	 * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we
	 * instead move only about a quarter of the elements (and children) to
	 * the new node. Since the average state after a long time is a 3/4
	 * full node, shortcutting directly to that state improves efficiency.
	 *
	 * We do this in two stages: first we split into two nodes, and then we
	 * reuse our existing logic to insert the new element and child.
	 */
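	/*
	 * For example, with a hypothetical BTREE_CORE_ELEMS of 128, a
	 * non-bulk split would move 63 elements and keep 64, with the one
	 * remaining element promoted as the separator; a bulk-mode split
	 * would move 31 and keep 96.
	 */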
	uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ?
	    2 : 4)) - 1, 2);
	uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1;
	ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2);
	tree->bt_num_nodes++;
	zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) +
	    BTREE_CORE_ELEMS * size, KM_SLEEP);
	zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr;
	new_par_hdr->bth_parent = par_hdr->bth_parent;
	new_par_hdr->bth_first = -1;
	new_par_hdr->bth_count = move_count;
	zfs_btree_poison_node(tree, new_par_hdr);

	par_hdr->bth_count = keep_count;

	bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent,
	    0, BSS_TRAPEZOID);

	/* Store the new separator in a buffer. */
	uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP);
	bcpy(parent->btc_elems + keep_count * size, tmp_buf,
	    size);
	zfs_btree_poison_node(tree, par_hdr);

	if (offset < keep_count) {
		/* Insert the new node into the left half */
		zfs_btree_insert_core_impl(tree, parent, offset, new_node,
		    buf);

		/*
		 * Move the new separator to the existing buffer.
		 */
		bcpy(tmp_buf, buf, size);
	} else if (offset > keep_count) {
		/* Insert the new node into the right half */
		new_node->bth_parent = new_parent;
		zfs_btree_insert_core_impl(tree, new_parent,
		    offset - keep_count - 1, new_node, buf);

		/*
		 * Move the new separator to the existing buffer.
		 */
		bcpy(tmp_buf, buf, size);
	} else {
		/*
		 * Move the new separator into the right half, and replace it
		 * with buf. We also need to shift back the elements in the
		 * right half to accommodate new_node.
		 */
		bt_shift_core_right(tree, new_parent, 0, move_count,
		    BSS_TRAPEZOID);
		new_parent->btc_children[0] = new_node;
		bcpy(tmp_buf, new_parent->btc_elems, size);
		new_par_hdr->bth_count++;
	}
	kmem_free(tmp_buf, size);
	zfs_btree_poison_node(tree, par_hdr);

	for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++)
		new_parent->btc_children[i]->bth_parent = new_parent;

	for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++)
		ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent);

	/*
	 * Now that the node is split, we need to insert the new node into its
	 * parent. This may cause further splitting.
	 */
	zfs_btree_insert_into_parent(tree, &parent->btc_hdr,
	    &new_parent->btc_hdr, buf);
}

/* Insert an element into a leaf node at the given offset. */
static void
zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
    uint32_t idx, const void *value)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap);

	if (zfs_btree_verify_intensity >= 5) {
		zfs_btree_verify_poison_at(tree, &leaf->btl_hdr,
		    leaf->btl_hdr.bth_count);
	}

	bt_grow_leaf(tree, leaf, idx, 1);
	uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size;
	bcpy(value, start, size);
}

static void
zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr);

/* Helper function for inserting a new value into leaf at the given index. */
static void
zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
    const void *value, uint32_t idx)
{
	size_t size = tree->bt_elem_size;
	uint32_t capacity = tree->bt_leaf_cap;

	/*
	 * If the leaf isn't full, shift the elements after idx and insert
	 * value.
	 */
	if (leaf->btl_hdr.bth_count != capacity) {
		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
		return;
	}

	/*
	 * Otherwise, we split the leaf node into two nodes. If we're not bulk
	 * inserting, each is of size (capacity / 2).  If we are bulk
	 * inserting, we move a quarter of the elements to the new node so
	 * inserts into the old node don't cause immediate splitting but the
	 * tree stays relatively dense. Since the average state after a long
	 * time is a 3/4 full node, shortcutting directly to that state
	 * improves efficiency.  At the end of the bulk insertion process
	 * we'll need to go through and fix up any nodes (the last leaf and
	 * its ancestors, potentially) that are below the minimum.
	 *
	 * In either case, we're left with one extra element. The leftover
	 * element will become the new dividing element between the two nodes.
	 */
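	/*
	 * For example, with a hypothetical capacity of 100, a non-bulk split
	 * initially computes move_count = 49 and keep_count = 50, with the
	 * one remaining element becoming the separator; a bulk-mode split
	 * computes move_count = 24 and keep_count = 75.
	 */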
	uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1;
	uint32_t keep_count = capacity - move_count - 1;
	ASSERT3U(keep_count, >=, 1);
	/* If we insert on the left, move one more to keep leaves balanced. */
	if (idx < keep_count) {
		keep_count--;
		move_count++;
	}
	tree->bt_num_nodes++;
	zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree);
	zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
	new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
	new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) +
	    (idx >= keep_count && idx <= keep_count + move_count / 2);
	new_hdr->bth_count = move_count;
	zfs_btree_poison_node(tree, new_hdr);

	if (tree->bt_bulk != NULL && leaf == tree->bt_bulk)
		tree->bt_bulk = new_leaf;

	/* Copy the back part to the new leaf. */
	bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0);

	/* We store the new separator in a buffer we control for simplicity. */
	uint8_t *buf = kmem_alloc(size, KM_SLEEP);
	bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size,
	    buf, size);

	bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count);

	if (idx < keep_count) {
		/* Insert into the existing leaf. */
		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
	} else if (idx > keep_count) {
		/* Insert into the new leaf. */
		zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count -
		    1, value);
	} else {
		/*
		 * Insert the planned separator into the new leaf, and use
		 * the new value as the new separator.
		 */
		zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf);
		bcpy(value, buf, size);
	}

	/*
	 * Now that the node is split, we need to insert the new node into its
	 * parent. This may cause further splitting, but only of core nodes.
	 */
	zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr,
	    buf);
	kmem_free(buf, size);
}

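/*
 * Return the index of hdr within its parent's child array. The index is
 * found by searching the parent for hdr's first element; the search must
 * miss, and the resulting insertion offset is hdr's child slot.
 */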
static uint32_t
zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	void *buf;
	if (zfs_btree_is_core(hdr)) {
		buf = ((zfs_btree_core_t *)hdr)->btc_elems;
	} else {
		buf = ((zfs_btree_leaf_t *)hdr)->btl_elems +
		    hdr->bth_first * tree->bt_elem_size;
	}
	zfs_btree_index_t idx;
	zfs_btree_core_t *parent = hdr->bth_parent;
	VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
	    parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
	ASSERT(idx.bti_before);
	ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
	ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr);
	return (idx.bti_offset);
}

/*
 * Take the b-tree out of bulk insert mode. During bulk-insert mode, some
 * nodes may violate the invariant that non-root nodes must be at least half
 * full. All nodes violating this invariant should be the last node in their
 * particular level. To correct the invariant, we take values from their left
 * neighbor until they are half full. They must have a left neighbor at their
 * level because the last node at a level is not the first node unless it's
 * the root.
 */
static void
zfs_btree_bulk_finish(zfs_btree_t *tree)
{
	ASSERT3P(tree->bt_bulk, !=, NULL);
	ASSERT3P(tree->bt_root, !=, NULL);
	zfs_btree_leaf_t *leaf = tree->bt_bulk;
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	zfs_btree_core_t *parent = hdr->bth_parent;
	size_t size = tree->bt_elem_size;
	uint32_t capacity = tree->bt_leaf_cap;

	/*
	 * The invariant doesn't apply to the root node; if that's the only
	 * node in the tree, we're done.
	 */
	if (parent == NULL) {
		tree->bt_bulk = NULL;
		return;
	}

	/* First, take elements to rebalance the leaf node. */
	if (hdr->bth_count < capacity / 2) {
		/*
		 * First, find the left neighbor. The simplest way to do this
		 * is to call zfs_btree_prev twice; the first time finds some
		 * ancestor of this node, and the second time finds the left
		 * neighbor. The ancestor found is the lowest common ancestor
		 * of leaf and the neighbor.
		 */
		zfs_btree_index_t idx = {
			.bti_node = hdr,
			.bti_offset = 0
		};
		VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
		ASSERT(zfs_btree_is_core(idx.bti_node));
		zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node;
		uint32_t common_idx = idx.bti_offset;

		VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
		ASSERT(!zfs_btree_is_core(idx.bti_node));
		zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node;
		zfs_btree_hdr_t *l_hdr = idx.bti_node;
		uint32_t move_count = (capacity / 2) - hdr->bth_count;
		ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=,
		    capacity / 2);

		if (zfs_btree_verify_intensity >= 5) {
			for (uint32_t i = 0; i < move_count; i++) {
				zfs_btree_verify_poison_at(tree, hdr,
				    leaf->btl_hdr.bth_count + i);
			}
		}

		/* First, shift elements in leaf back. */
		bt_grow_leaf(tree, leaf, 0, move_count);

		/* Next, move the separator from the common ancestor to leaf. */
		uint8_t *separator = common->btc_elems + common_idx * size;
		uint8_t *out = leaf->btl_elems +
		    (hdr->bth_first + move_count - 1) * size;
		bcpy(separator, out, size);

		/*
		 * Now we move elements from the tail of the left neighbor to
		 * fill the remaining spots in leaf.
		 */
		bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count -
		    (move_count - 1), move_count - 1, leaf, 0);

		/*
		 * Finally, move the new last element in the left neighbor to
		 * the separator.
		 */
		bcpy(l_neighbor->btl_elems + (l_hdr->bth_first +
		    l_hdr->bth_count - move_count) * size, separator, size);

		/* Adjust the node's counts, and we're done. */
		bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count - move_count,
		    move_count);

		ASSERT3U(l_hdr->bth_count, >=, capacity / 2);
		ASSERT3U(hdr->bth_count, >=, capacity / 2);
	}

	/*
	 * Now we have to rebalance any ancestors of leaf that may also
	 * violate the invariant.
	 */
	capacity = BTREE_CORE_ELEMS;
	while (parent->btc_hdr.bth_parent != NULL) {
		zfs_btree_core_t *cur = parent;
		zfs_btree_hdr_t *hdr = &cur->btc_hdr;
		parent = hdr->bth_parent;
		/*
		 * If the invariant isn't violated, move on to the next
		 * ancestor.
		 */
		if (hdr->bth_count >= capacity / 2)
			continue;

		/*
		 * Because the smallest number of nodes we can move when
		 * splitting is 2, we never need to worry about not having a
		 * left sibling (a sibling is a neighbor with the same parent).
		 */
		uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
		ASSERT3U(parent_idx, >, 0);
		zfs_btree_core_t *l_neighbor =
		    (zfs_btree_core_t *)parent->btc_children[parent_idx - 1];
		uint32_t move_count = (capacity / 2) - hdr->bth_count;
		ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=,
		    capacity / 2);

		if (zfs_btree_verify_intensity >= 5) {
			for (uint32_t i = 0; i < move_count; i++) {
				zfs_btree_verify_poison_at(tree, hdr,
				    hdr->bth_count + i);
			}
		}
		/* First, shift things in the right node back. */
		bt_shift_core(tree, cur, 0, hdr->bth_count, move_count,
		    BSS_TRAPEZOID, BSD_RIGHT);

		/* Next, move the separator to the right node. */
		uint8_t *separator = parent->btc_elems + ((parent_idx - 1) *
		    size);
		uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size);
		bcpy(separator, e_out, size);

		/*
		 * Now, move elements and children from the left node to the
		 * right.  We move one more child than elements.
		 */
		move_count--;
		uint32_t move_idx = l_neighbor->btc_hdr.bth_count - move_count;
		bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0,
		    BSS_TRAPEZOID);

		/*
		 * Finally, move the last element in the left node to the
		 * separator's position.
		 */
		move_idx--;
		bcpy(l_neighbor->btc_elems + move_idx * size, separator, size);

		l_neighbor->btc_hdr.bth_count -= move_count + 1;
		hdr->bth_count += move_count + 1;

		ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2);
		ASSERT3U(hdr->bth_count, >=, capacity / 2);

		zfs_btree_poison_node(tree, &l_neighbor->btc_hdr);

		for (uint32_t i = 0; i <= hdr->bth_count; i++)
			cur->btc_children[i]->bth_parent = cur;
	}

	tree->bt_bulk = NULL;
	zfs_btree_verify(tree);
}

/*
 * Insert value into tree at the location specified by where.
 */
void
zfs_btree_add_idx(zfs_btree_t *tree, const void *value,
    const zfs_btree_index_t *where)
{
	zfs_btree_index_t idx = {0};

	/* If we're not inserting in the last leaf, end bulk insert mode. */
	if (tree->bt_bulk != NULL) {
		if (where->bti_node != &tree->bt_bulk->btl_hdr) {
			zfs_btree_bulk_finish(tree);
			VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL);
			where = &idx;
		}
	}

	tree->bt_num_elems++;
	/*
	 * If this is the first element in the tree, create a leaf root node
	 * and add the value to it.
	 */
	if (where->bti_node == NULL) {
		ASSERT3U(tree->bt_num_elems, ==, 1);
		ASSERT3S(tree->bt_height, ==, -1);
		ASSERT3P(tree->bt_root, ==, NULL);
		ASSERT0(where->bti_offset);

		tree->bt_num_nodes++;
		zfs_btree_leaf_t *leaf = zfs_btree_leaf_alloc(tree);
		tree->bt_root = &leaf->btl_hdr;
		tree->bt_height++;

		zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
		hdr->bth_parent = NULL;
		hdr->bth_first = 0;
		hdr->bth_count = 0;
		zfs_btree_poison_node(tree, hdr);

		zfs_btree_insert_into_leaf(tree, leaf, value, 0);
		tree->bt_bulk = leaf;
	} else if (!zfs_btree_is_core(where->bti_node)) {
		/*
		 * If we're inserting into a leaf, go directly to the helper
		 * function.
		 */
		zfs_btree_insert_into_leaf(tree,
		    (zfs_btree_leaf_t *)where->bti_node, value,
		    where->bti_offset);
	} else {
		/*
		 * If we're inserting into a core node, we can't just shift
		 * the existing element in that slot in the same node without
		 * breaking our ordering invariants. Instead we place the new
		 * value in the node at that spot and then insert the old
		 * separator into the first slot in the subtree to the right.
		 */
		zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node;

		/*
		 * We can ignore bti_before, because either way the value
		 * should end up in bti_offset.
		 */
		uint32_t off = where->bti_offset;
		zfs_btree_hdr_t *subtree = node->btc_children[off + 1];
		size_t size = tree->bt_elem_size;
		uint8_t *buf = kmem_alloc(size, KM_SLEEP);
		bcpy(node->btc_elems + off * size, buf, size);
		bcpy(value, node->btc_elems + off * size, size);

		/*
		 * Find the first slot in the subtree to the right, insert
		 * there.
		 */
		zfs_btree_index_t new_idx;
		VERIFY3P(zfs_btree_first_helper(tree, subtree, &new_idx), !=,
		    NULL);
		ASSERT0(new_idx.bti_offset);
		ASSERT(!zfs_btree_is_core(new_idx.bti_node));
		zfs_btree_insert_into_leaf(tree,
		    (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0);
		kmem_free(buf, size);
	}
	zfs_btree_verify(tree);
}

/*
 * Return the first element in the tree, and put its location in where if
 * non-null.
 */
void *
zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where)
{
	if (tree->bt_height == -1) {
		ASSERT0(tree->bt_num_elems);
		return (NULL);
	}
	return (zfs_btree_first_helper(tree, tree->bt_root, where));
}

/*
 * Find the last element in the subtree rooted at hdr, return its value and
 * put its location in where if non-null.
 */
static void *
zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
    zfs_btree_index_t *where)
{
	zfs_btree_hdr_t *node;

	for (node = hdr; zfs_btree_is_core(node); node =
	    ((zfs_btree_core_t *)node)->btc_children[node->bth_count])
		;

	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
	if (where != NULL) {
		where->bti_node = node;
		where->bti_offset = node->bth_count - 1;
		where->bti_before = B_FALSE;
	}
	return (leaf->btl_elems + (node->bth_first + node->bth_count - 1) *
	    btree->bt_elem_size);
}

/*
 * Return the last element in the tree, and put its location in where if
 * non-null.
 */
void *
zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where)
{
	if (tree->bt_height == -1) {
		ASSERT0(tree->bt_num_elems);
		return (NULL);
	}
	return (zfs_btree_last_helper(tree, tree->bt_root, where));
}

/*
 * This function contains the logic to find the next element in the tree. A
 * helper function is used because there are multiple internal consumers of
 * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each
 * node after we've finished with it.
 */
static void *
zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx,
    void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *))
{
	if (idx->bti_node == NULL) {
		ASSERT3S(tree->bt_height, ==, -1);
		return (NULL);
	}

	uint32_t offset = idx->bti_offset;
	if (!zfs_btree_is_core(idx->bti_node)) {
		/*
		 * When finding the next element of an element in a leaf,
		 * there are two cases. If the element isn't the last one in
		 * the leaf, we just return the next element in the leaf.
		 * Otherwise, we need to traverse up our parents until we
		 * find one where our ancestor isn't the last child of its
		 * parent. Once we do, the next element is the separator
		 * after our ancestor in its parent.
		 */
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		uint32_t new_off = offset + (idx->bti_before ? 0 : 1);
		if (leaf->btl_hdr.bth_count > new_off) {
			out_idx->bti_node = &leaf->btl_hdr;
			out_idx->bti_offset = new_off;
			out_idx->bti_before = B_FALSE;
			return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
			    new_off) * tree->bt_elem_size);
		}

		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
		    node != NULL; node = node->btc_hdr.bth_parent) {
			zfs_btree_hdr_t *hdr = &node->btc_hdr;
			ASSERT(zfs_btree_is_core(hdr));
			uint32_t i = zfs_btree_find_parent_idx(tree, prev);
			if (done_func != NULL)
				done_func(tree, prev);
			if (i == hdr->bth_count) {
				prev = hdr;
				continue;
			}
			out_idx->bti_node = hdr;
			out_idx->bti_offset = i;
			out_idx->bti_before = B_FALSE;
			return (node->btc_elems + i * tree->bt_elem_size);
		}
		if (done_func != NULL)
			done_func(tree, prev);
		/*
		 * We've traversed all the way up and been at the end of the
		 * node every time, so this was the last element in the tree.
		 */
		return (NULL);
	}

	/* If we were before an element in a core node, return that element. */
	ASSERT(zfs_btree_is_core(idx->bti_node));
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	if (idx->bti_before) {
		out_idx->bti_before = B_FALSE;
		return (node->btc_elems + offset * tree->bt_elem_size);
	}

	/*
	 * The next element from one in a core node is the first element in
	 * the subtree just to the right of the separator.
	 */
	zfs_btree_hdr_t *child = node->btc_children[offset + 1];
	return (zfs_btree_first_helper(tree, child, out_idx));
}

/*
 * Return the next element in the tree.  The same address can be safely
 * passed for idx and out_idx.
 */
void *
zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx)
{
	return (zfs_btree_next_helper(tree, idx, out_idx, NULL));
}

/*
 * Return the previous element in the tree.  The same address can be safely
 * passed for idx and out_idx.
 */
void *
zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx)
{
	if (idx->bti_node == NULL) {
		ASSERT3S(tree->bt_height, ==, -1);
		return (NULL);
	}

	uint32_t offset = idx->bti_offset;
	if (!zfs_btree_is_core(idx->bti_node)) {
		/*
		 * When finding the previous element of an element in a leaf,
		 * there are two cases. If the element isn't the first one in
		 * the leaf, we just return the previous element in the leaf.
		 * Otherwise, we need to traverse up our parents until we
		 * find one where our previous ancestor isn't the first
		 * child. Once we do, the previous element is the separator
		 * after our previous ancestor.
		 */
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		if (offset != 0) {
			out_idx->bti_node = &leaf->btl_hdr;
			out_idx->bti_offset = offset - 1;
			out_idx->bti_before = B_FALSE;
			return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
			    offset - 1) * tree->bt_elem_size);
		}
		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
		    node != NULL; node = node->btc_hdr.bth_parent) {
			zfs_btree_hdr_t *hdr = &node->btc_hdr;
			ASSERT(zfs_btree_is_core(hdr));
			uint32_t i = zfs_btree_find_parent_idx(tree, prev);
			if (i == 0) {
				prev = hdr;
				continue;
			}
			out_idx->bti_node = hdr;
			out_idx->bti_offset = i - 1;
			out_idx->bti_before = B_FALSE;
			return (node->btc_elems + (i - 1) * tree->bt_elem_size);
		}
		/*
		 * We've traversed all the way up and been at the start of the
		 * node every time, so this was the first element in the tree.
		 */
		return (NULL);
	}

	/*
	 * The previous element from one in a core node is the last element in
	 * the subtree just to the left of the separator.
	 */
	ASSERT(zfs_btree_is_core(idx->bti_node));
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	zfs_btree_hdr_t *child = node->btc_children[offset];
	return (zfs_btree_last_helper(tree, child, out_idx));
}

/*
 * Get the value at the provided index in the tree.
 *
 * Note that the value returned from this function can be mutated, but only
 * if it will not change the ordering of the element with respect to any other
 * elements that could be in the tree.
 */
void *
zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx)
{
	ASSERT(!idx->bti_before);
	size_t size = tree->bt_elem_size;
	if (!zfs_btree_is_core(idx->bti_node)) {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
		    idx->bti_offset) * size);
	}
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	return (node->btc_elems + idx->bti_offset * size);
}

/* Add the given value to the tree. Must not already be in the tree. */
void
zfs_btree_add(zfs_btree_t *tree, const void *node)
{
	zfs_btree_index_t where = {0};
	VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL);
	zfs_btree_add_idx(tree, node, &where);
}

/* Helper function to free a tree node. */
static void
zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node)
{
	tree->bt_num_nodes--;
	if (!zfs_btree_is_core(node)) {
		zfs_btree_leaf_free(tree, node);
	} else {
		kmem_free(node, sizeof (zfs_btree_core_t) +
		    BTREE_CORE_ELEMS * tree->bt_elem_size);
	}
}
1422 
1423 /*
1424  * Remove the rm_hdr and the separator to its left from the parent node. The
1425  * buffer that rm_hdr was stored in may already be freed, so its contents
1426  * cannot be accessed.
1427  */
1428 static void
zfs_btree_remove_from_node(zfs_btree_t * tree,zfs_btree_core_t * node,zfs_btree_hdr_t * rm_hdr)1429 zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
1430     zfs_btree_hdr_t *rm_hdr)
1431 {
1432 	size_t size = tree->bt_elem_size;
1433 	uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1;
1434 	zfs_btree_hdr_t *hdr = &node->btc_hdr;
1435 	/*
1436 	 * If the node is the root node and rm_hdr is one of two children,
1437 	 * promote the other child to the root.
1438 	 */
1439 	if (hdr->bth_parent == NULL && hdr->bth_count <= 1) {
1440 		ASSERT3U(hdr->bth_count, ==, 1);
1441 		ASSERT3P(tree->bt_root, ==, node);
1442 		ASSERT3P(node->btc_children[1], ==, rm_hdr);
1443 		tree->bt_root = node->btc_children[0];
1444 		node->btc_children[0]->bth_parent = NULL;
1445 		zfs_btree_node_destroy(tree, hdr);
1446 		tree->bt_height--;
1447 		return;
1448 	}
1449 
1450 	uint32_t idx;
1451 	for (idx = 0; idx <= hdr->bth_count; idx++) {
1452 		if (node->btc_children[idx] == rm_hdr)
1453 			break;
1454 	}
1455 	ASSERT3U(idx, <=, hdr->bth_count);
1456 
1457 	/*
1458 	 * If the node is the root or it has more than the minimum number of
1459 	 * children, just remove the child and separator, and return.
1460 	 */
1461 	if (hdr->bth_parent == NULL ||
1462 	    hdr->bth_count > min_count) {
1463 		/*
1464 		 * Shift the element and children to the right of rm_hdr to
1465 		 * the left by one spot.
1466 		 */
1467 		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
1468 		    BSS_PARALLELOGRAM);
1469 		hdr->bth_count--;
1470 		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1);
1471 		return;
1472 	}
1473 
1474 	ASSERT3U(hdr->bth_count, ==, min_count);
1475 
1476 	/*
1477 	 * Now we try to take a node from a neighbor. We check left, then
1478 	 * right. If the neighbor exists and has more than the minimum number
1479 	 * of elements, we move the separator between us and them to our
1480 	 * node, move their closest element (last for left, first for right)
1481 	 * to the separator, and move their closest child to our node. Along
1482 	 * the way we need to collapse the gap made by idx, and (for our right
1483 	 * neighbor) the gap made by removing their first element and child.
1484 	 *
1485 	 * Note: this logic currently doesn't support taking from a neighbor
1486 	 * that isn't a sibling (i.e. a neighbor with a different
1487 	 * parent). This isn't critical functionality, but may be worth
1488 	 * implementing in the future for completeness' sake.
1489 	 */
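	/*
	 * Rough sketch of taking from the left neighbor (keys are
	 * arbitrary; children are omitted):
	 *
	 *	parent:  [.. 20 ..]                [.. 15 ..]
	 *	         /        \       ==>      /        \
	 *	    [5 10 15]    [30]          [5 10]     [20 30]
	 *
	 * The separator (20) moves down into our node, and the neighbor's
	 * last element (15) moves up to replace it.
	 */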
1490 	zfs_btree_core_t *parent = hdr->bth_parent;
1491 	uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
1492 
1493 	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
1494 	    parent->btc_children[parent_idx - 1]);
1495 	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
1496 		/* We can take a node from the left neighbor. */
1497 		ASSERT(zfs_btree_is_core(l_hdr));
1498 		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr;
1499 
1500 		/*
1501 		 * Start by shifting the elements and children in the current
1502 		 * node to the right by one spot.
1503 		 */
1504 		bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID);
1505 
1506 		/*
1507 		 * Move the separator between node and neighbor to the first
1508 		 * element slot in the current node.
1509 		 */
1510 		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
1511 		    size;
1512 		bcpy(separator, node->btc_elems, size);
1513 
1514 		/* Move the last child of neighbor to our first child slot. */
1515 		node->btc_children[0] =
1516 		    neighbor->btc_children[l_hdr->bth_count];
1517 		node->btc_children[0]->bth_parent = node;
1518 
1519 		/* Move the last element of neighbor to the separator spot. */
1520 		uint8_t *take_elem = neighbor->btc_elems +
1521 		    (l_hdr->bth_count - 1) * size;
1522 		bcpy(take_elem, separator, size);
1523 		l_hdr->bth_count--;
1524 		zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1);
1525 		return;
1526 	}
1527 
1528 	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
1529 	    NULL : parent->btc_children[parent_idx + 1]);
1530 	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
1531 		/* We can take a node from the right neighbor. */
1532 		ASSERT(zfs_btree_is_core(r_hdr));
1533 		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr;
1534 
1535 		/*
1536 		 * Shift elements in node left by one spot to overwrite rm_hdr
1537 		 * and the separator before it.
1538 		 */
1539 		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
1540 		    BSS_PARALLELOGRAM);
1541 
1542 		/*
1543 		 * Move the separator between node and neighbor to the last
1544 		 * element spot in node.
1545 		 */
1546 		uint8_t *separator = parent->btc_elems + parent_idx * size;
1547 		bcpy(separator, node->btc_elems + (hdr->bth_count - 1) * size,
1548 		    size);
1549 
1550 		/*
1551 		 * Move the first child of neighbor to the last child spot in
1552 		 * node.
1553 		 */
1554 		node->btc_children[hdr->bth_count] = neighbor->btc_children[0];
1555 		node->btc_children[hdr->bth_count]->bth_parent = node;
1556 
1557 		/* Move the first element of neighbor to the separator spot. */
1558 		uint8_t *take_elem = neighbor->btc_elems;
1559 		bcpy(take_elem, separator, size);
1560 		r_hdr->bth_count--;
1561 
1562 		/*
1563 		 * Shift the elements and children of neighbor to cover the
1564 		 * stolen elements.
1565 		 */
1566 		bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count,
1567 		    BSS_TRAPEZOID);
1568 		zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1);
1569 		return;
1570 	}
1571 
1572 	/*
1573 	 * In this case, neither of our neighbors can spare an element, so we
1574 	 * need to merge with one of them. We prefer the left one,
1575 	 * arbitrarily. Move the separator into the leftmost merging node
1576 	 * (which may be us or the left neighbor), and then move the right
1577 	 * merging node's elements. Once that's done, we go back and delete
1578 	 * the element we're removing. Finally, go into the parent and delete
1579 	 * the right merging node and the separator. This may cause further
1580 	 * merging.
1581 	 */
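	/*
	 * Rough sketch of the merge below (keys are arbitrary; children
	 * are omitted):
	 *
	 *	parent:  [.. 20 ..]
	 *	         /        \       ==>    parent:  [.. ..]
	 *	     [5 10]      [30]               [5 10 20 30]
	 *
	 * The parent ends up with one fewer separator and child, which is
	 * why the removal may cascade upwards.
	 */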
1582 	zfs_btree_hdr_t *new_rm_hdr, *keep_hdr;
1583 	uint32_t new_idx = idx;
1584 	if (l_hdr != NULL) {
1585 		keep_hdr = l_hdr;
1586 		new_rm_hdr = hdr;
1587 		new_idx += keep_hdr->bth_count + 1;
1588 	} else {
1589 		ASSERT3P(r_hdr, !=, NULL);
1590 		keep_hdr = hdr;
1591 		new_rm_hdr = r_hdr;
1592 		parent_idx++;
1593 	}
1594 
1595 	ASSERT(zfs_btree_is_core(keep_hdr));
1596 	ASSERT(zfs_btree_is_core(new_rm_hdr));
1597 
1598 	zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr;
1599 	zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr;
1600 
1601 	if (zfs_btree_verify_intensity >= 5) {
1602 		for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) {
1603 			zfs_btree_verify_poison_at(tree, keep_hdr,
1604 			    keep_hdr->bth_count + i);
1605 		}
1606 	}
1607 
1608 	/* Move the separator into the left node. */
1609 	uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size;
1610 	uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
1611 	    size;
1612 	bcpy(separator, e_out, size);
1613 	keep_hdr->bth_count++;
1614 
1615 	/* Move all our elements and children into the left node. */
1616 	bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep,
1617 	    keep_hdr->bth_count, BSS_TRAPEZOID);
1618 
1619 	uint32_t old_count = keep_hdr->bth_count;
1620 
1621 	/* Update bookkeeping */
1622 	keep_hdr->bth_count += new_rm_hdr->bth_count;
1623 	ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1);
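	/*
	 * The assertion above holds because keep contributed min_count
	 * elements, the separator moved down added one more, and rm
	 * contributed another min_count:
	 * min_count + 1 + min_count == (min_count * 2) + 1.
	 */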
1624 
1625 	/*
1626 	 * Shift the elements and children to the right of rm_hdr to
1627 	 * the left by one spot.
1628 	 */
1629 	ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr);
1630 	bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx,
1631 	    BSS_PARALLELOGRAM);
1632 	keep_hdr->bth_count--;
1633 
1634 	/* Reparent all our children to point to the left node. */
1635 	zfs_btree_hdr_t **new_start = keep->btc_children +
1636 	    old_count - 1;
1637 	for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++)
1638 		new_start[i]->bth_parent = keep;
1639 	for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) {
1640 		ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep);
1641 		ASSERT3P(keep->btc_children[i], !=, rm_hdr);
1642 	}
1643 	zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1);
1644 
1645 	new_rm_hdr->bth_count = 0;
1646 	zfs_btree_remove_from_node(tree, parent, new_rm_hdr);
1647 	zfs_btree_node_destroy(tree, new_rm_hdr);
1648 }
1649 
1650 /* Remove the element at the specific location. */
1651 void
1652 zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
1653 {
1654 	size_t size = tree->bt_elem_size;
1655 	zfs_btree_hdr_t *hdr = where->bti_node;
1656 	uint32_t idx = where->bti_offset;
1657 
1658 	ASSERT(!where->bti_before);
1659 	if (tree->bt_bulk != NULL) {
1660 		/*
1661 		 * Leave bulk insert mode. Note that our index would be
1662 		 * invalid after we correct the tree, so we copy the value
1663 		 * we're planning to remove and find it again after
1664 		 * bulk_finish.
1665 		 */
1666 		uint8_t *value = zfs_btree_get(tree, where);
1667 		uint8_t *tmp = kmem_alloc(size, KM_SLEEP);
1668 		bcpy(value, tmp, size);
1669 		zfs_btree_bulk_finish(tree);
1670 		VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL);
1671 		kmem_free(tmp, size);
1672 		hdr = where->bti_node;
1673 		idx = where->bti_offset;
1674 	}
1675 
1676 	tree->bt_num_elems--;
1677 	/*
1678 	 * If the element happens to be in a core node, we move a leaf node's
1679 	 * element into its place and then remove the leaf node element. This
1680 	 * makes the rebalance logic not need to be recursive both upwards and
1681 	 * downwards.
1682 	 */
1683 	if (zfs_btree_is_core(hdr)) {
1684 		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
1685 		zfs_btree_hdr_t *left_subtree = node->btc_children[idx];
1686 		void *new_value = zfs_btree_last_helper(tree, left_subtree,
1687 		    where);
1688 		ASSERT3P(new_value, !=, NULL);
1689 
1690 		bcpy(new_value, node->btc_elems + idx * size, size);
1691 
1692 		hdr = where->bti_node;
1693 		idx = where->bti_offset;
1694 		ASSERT(!where->bti_before);
1695 	}
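	/*
	 * Rough sketch of the swap above, removing separator 30 from a
	 * core node (keys are arbitrary):
	 *
	 *	    [.. 30 ..]                 [.. 25 ..]
	 *	    /        \        ==>      /        \
	 *	[.. 25]    [40 ..]         [.. 25]    [40 ..]
	 *
	 * The last element of the left subtree (25) overwrites the removed
	 * separator; the actual deletion then happens down in that leaf.
	 */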
1696 
1697 	/*
1698 	 * First, we'll update the leaf's metadata. Then, we shift any
1699 	 * elements after idx to the left. After that, we rebalance if
1700 	 * needed.
1701 	 */
1702 	ASSERT(!zfs_btree_is_core(hdr));
1703 	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
1704 	ASSERT3U(hdr->bth_count, >, 0);
1705 
1706 	uint32_t min_count = (tree->bt_leaf_cap / 2) - 1;
1707 
1708 	/*
1709 	 * If we're over the minimum size or this is the root, just remove
1710 	 * the value and return.
1711 	 */
1712 	if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
1713 		bt_shrink_leaf(tree, leaf, idx, 1);
1714 		if (hdr->bth_parent == NULL) {
1715 			ASSERT0(tree->bt_height);
1716 			if (hdr->bth_count == 0) {
1717 				tree->bt_root = NULL;
1718 				tree->bt_height--;
1719 				zfs_btree_node_destroy(tree, &leaf->btl_hdr);
1720 			}
1721 		}
1722 		zfs_btree_verify(tree);
1723 		return;
1724 	}
1725 	ASSERT3U(hdr->bth_count, ==, min_count);
1726 
1727 	/*
1728 	 * Now we try to take a node from a sibling. We check left, then
1729 	 * right. If they exist and have more than the minimum number of
1730 	 * elements, we move the separator between us and them to our node
1731 	 * and move their closest element (last for left, first for right) to
1732 	 * the separator. Along the way we need to collapse the gap made by
1733 	 * idx, and (for our right neighbor) the gap made by removing their
1734 	 * first element.
1735 	 *
1736 	 * Note: this logic currently doesn't support taking from a neighbor
1737 	 * that isn't a sibling. This isn't critical functionality, but may be
1738 	 * worth implementing in the future for completeness' sake.
1739 	 */
1740 	zfs_btree_core_t *parent = hdr->bth_parent;
1741 	uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
1742 
1743 	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
1744 	    parent->btc_children[parent_idx - 1]);
1745 	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
1746 		/* We can take a node from the left neighbor. */
1747 		ASSERT(!zfs_btree_is_core(l_hdr));
1748 		zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr;
1749 
1750 		/*
1751 		 * Move our elements back by one spot to make room for the
1752 		 * stolen element and overwrite the element being removed.
1753 		 */
1754 		bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT);
1755 
1756 		/* Move the separator to our first spot. */
1757 		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
1758 		    size;
1759 		bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size);
1760 
1761 		/* Move our neighbor's last element to the separator. */
1762 		uint8_t *take_elem = neighbor->btl_elems +
1763 		    (l_hdr->bth_first + l_hdr->bth_count - 1) * size;
1764 		bcpy(take_elem, separator, size);
1765 
1766 		/* Delete our neighbor's last element. */
1767 		bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1);
1768 		zfs_btree_verify(tree);
1769 		return;
1770 	}
1771 
1772 	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
1773 	    NULL : parent->btc_children[parent_idx + 1]);
1774 	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
1775 		/* We can take a node from the right neighbor. */
1776 		ASSERT(!zfs_btree_is_core(r_hdr));
1777 		zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr;
1778 
1779 		/*
1780 		 * Move our elements after the element being removed forwards
1781 		 * by one spot to make room for the stolen element and
1782 		 * overwrite the element being removed.
1783 		 */
1784 		bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1,
1785 		    1, BSD_LEFT);
1786 
1787 		/* Move the separator between us and our neighbor to our last spot. */
1788 		uint8_t *separator = parent->btc_elems + parent_idx * size;
1789 		bcpy(separator, leaf->btl_elems + (hdr->bth_first +
1790 		    hdr->bth_count - 1) * size, size);
1791 
1792 		/* Move our neighbor's first element to the separator. */
1793 		uint8_t *take_elem = neighbor->btl_elems +
1794 		    r_hdr->bth_first * size;
1795 		bcpy(take_elem, separator, size);
1796 
1797 		/* Delete our neighbor's first element. */
1798 		bt_shrink_leaf(tree, neighbor, 0, 1);
1799 		zfs_btree_verify(tree);
1800 		return;
1801 	}
1802 
1803 	/*
1804 	 * In this case, neither of our neighbors can spare an element, so we
1805 	 * need to merge with one of them. We prefer the left one, arbitrarily.
1806 	 * After the removal we move the separator into the leftmost merging node
1807 	 * (which may be us or the left neighbor), and then move the right
1808 	 * merging node's elements. Once that's done, we go back and delete
1809 	 * the element we're removing. Finally, go into the parent and delete
1810 	 * the right merging node and the separator. This may cause further
1811 	 * merging.
1812 	 */
1813 	zfs_btree_hdr_t *rm_hdr, *k_hdr;
1814 	if (l_hdr != NULL) {
1815 		k_hdr = l_hdr;
1816 		rm_hdr = hdr;
1817 	} else {
1818 		ASSERT3P(r_hdr, !=, NULL);
1819 		k_hdr = hdr;
1820 		rm_hdr = r_hdr;
1821 		parent_idx++;
1822 	}
1823 	ASSERT(!zfs_btree_is_core(k_hdr));
1824 	ASSERT(!zfs_btree_is_core(rm_hdr));
1825 	ASSERT3U(k_hdr->bth_count, ==, min_count);
1826 	ASSERT3U(rm_hdr->bth_count, ==, min_count);
1827 	zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr;
1828 	zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr;
1829 
1830 	if (zfs_btree_verify_intensity >= 5) {
1831 		for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) {
1832 			zfs_btree_verify_poison_at(tree, k_hdr,
1833 			    k_hdr->bth_count + i);
1834 		}
1835 	}
1836 
1837 	/*
1838 	 * Remove the value from the node.  It will go below the minimum,
1839 	 * but we'll fix it in no time.
1840 	 */
1841 	bt_shrink_leaf(tree, leaf, idx, 1);
1842 
1843 	/* Prepare space for elements to be moved from the right. */
1844 	uint32_t k_count = k_hdr->bth_count;
1845 	bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count);
1846 	ASSERT3U(k_hdr->bth_count, ==, min_count * 2);
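	/*
	 * The assertion above holds because whichever of the two leaves was
	 * shrunk above now holds min_count - 1 elements, the other still
	 * holds min_count, and the separator adds one:
	 * (min_count - 1) + min_count + 1 == min_count * 2.
	 */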
1847 
1848 	/* Move the separator into the first open spot. */
1849 	uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size;
1850 	uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size;
1851 	bcpy(separator, out, size);
1852 
1853 	/* Move our elements to the left neighbor. */
1854 	bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1);
1855 
1856 	/* Remove the emptied node from the parent. */
1857 	zfs_btree_remove_from_node(tree, parent, rm_hdr);
1858 	zfs_btree_node_destroy(tree, rm_hdr);
1859 	zfs_btree_verify(tree);
1860 }
1861 
1862 /* Remove the given value from the tree. */
1863 void
1864 zfs_btree_remove(zfs_btree_t *tree, const void *value)
1865 {
1866 	zfs_btree_index_t where = {0};
1867 	VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL);
1868 	zfs_btree_remove_idx(tree, &where);
1869 }
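/*
 * Illustrative sketch (my_node_t is hypothetical; only the fields consulted
 * by the comparator need to be filled in):
 *
 *	my_node_t search = { .mn_key = 42 };
 *
 *	zfs_btree_remove(&tree, &search);	// VERIFYs the key is present
 */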
1870 
1871 /* Return the number of elements (despite the name, not nodes) in the tree. */
1872 ulong_t
1873 zfs_btree_numnodes(zfs_btree_t *tree)
1874 {
1875 	return (tree->bt_num_elems);
1876 }
1877 
1878 /*
1879  * This function is used to visit all the elements in the tree before
1880  * destroying the tree. This allows the calling code to perform any cleanup it
1881  * needs to do. This is more efficient than just removing the first element
1882  * over and over, because it skips all rebalancing. Once destroy_nodes()
1883  * has been called, no other btree operations are valid until it returns
1884  * NULL, at which point the only valid operation is zfs_btree_destroy().
1885  *
1886  * example:
1887  *
1888  *      zfs_btree_index_t *cookie = NULL;
1889  *      my_data_t *node;
1890  *
1891  *      while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
1892  *              free(node->ptr);
1893  *      zfs_btree_destroy(tree);
1894  *
1895  */
1896 void *
1897 zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie)
1898 {
1899 	if (*cookie == NULL) {
1900 		if (tree->bt_height == -1)
1901 			return (NULL);
1902 		*cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP);
1903 		return (zfs_btree_first(tree, *cookie));
1904 	}
1905 
1906 	void *rval = zfs_btree_next_helper(tree, *cookie, *cookie,
1907 	    zfs_btree_node_destroy);
1908 	if (rval == NULL) {
1909 		tree->bt_root = NULL;
1910 		tree->bt_height = -1;
1911 		tree->bt_num_elems = 0;
1912 		kmem_free(*cookie, sizeof (**cookie));
1913 		tree->bt_bulk = NULL;
1914 	}
1915 	return (rval);
1916 }
1917 
1918 static void
1919 zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
1920 {
1921 	if (zfs_btree_is_core(hdr)) {
1922 		zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr;
1923 		for (uint32_t i = 0; i <= hdr->bth_count; i++)
1924 			zfs_btree_clear_helper(tree, btc->btc_children[i]);
1925 	}
1926 
1927 	zfs_btree_node_destroy(tree, hdr);
1928 }
1929 
1930 void
1931 zfs_btree_clear(zfs_btree_t *tree)
1932 {
1933 	if (tree->bt_root == NULL) {
1934 		ASSERT0(tree->bt_num_elems);
1935 		return;
1936 	}
1937 
1938 	zfs_btree_clear_helper(tree, tree->bt_root);
1939 	tree->bt_num_elems = 0;
1940 	tree->bt_root = NULL;
1941 	tree->bt_num_nodes = 0;
1942 	tree->bt_height = -1;
1943 	tree->bt_bulk = NULL;
1944 }
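/*
 * Illustrative sketch: when the elements need no per-element cleanup,
 * zfs_btree_clear() is a simpler alternative to the destroy_nodes() loop
 * shown earlier:
 *
 *	zfs_btree_clear(&tree);		// frees every node in one pass
 *	zfs_btree_destroy(&tree);	// asserts that the tree is empty
 */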
1945 
1946 void
1947 zfs_btree_destroy(zfs_btree_t *tree)
1948 {
1949 	ASSERT0(tree->bt_num_elems);
1950 	ASSERT3P(tree->bt_root, ==, NULL);
1951 }
1952 
1953 /* Verify that every child of this node has the correct parent pointer. */
1954 static void
1955 zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
1956 {
1957 	if (!zfs_btree_is_core(hdr))
1958 		return;
1959 
1960 	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
1961 	for (uint32_t i = 0; i <= hdr->bth_count; i++) {
1962 		VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr);
1963 		zfs_btree_verify_pointers_helper(tree, node->btc_children[i]);
1964 	}
1965 }
1966 
1967 /* Verify that every node has the correct parent pointer. */
1968 static void
1969 zfs_btree_verify_pointers(zfs_btree_t *tree)
1970 {
1971 	if (tree->bt_height == -1) {
1972 		VERIFY3P(tree->bt_root, ==, NULL);
1973 		return;
1974 	}
1975 	VERIFY3P(tree->bt_root->bth_parent, ==, NULL);
1976 	zfs_btree_verify_pointers_helper(tree, tree->bt_root);
1977 }
1978 
1979 /*
1980  * Verify that the current node and all its children satisfy the count
1981  * invariants, and return the total count in the subtree rooted in this node.
1982  */
1983 static uint64_t
1984 zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
1985 {
1986 	if (!zfs_btree_is_core(hdr)) {
1987 		if (tree->bt_root != hdr && tree->bt_bulk &&
1988 		    hdr != &tree->bt_bulk->btl_hdr) {
1989 			VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1);
1990 		}
1991 
1992 		return (hdr->bth_count);
1993 	} else {
1994 
1995 		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
1996 		uint64_t ret = hdr->bth_count;
1997 		if (tree->bt_root != hdr && tree->bt_bulk == NULL)
1998 			VERIFY3U(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1);
1999 		for (uint32_t i = 0; i <= hdr->bth_count; i++) {
2000 			ret += zfs_btree_verify_counts_helper(tree,
2001 			    node->btc_children[i]);
2002 		}
2003 
2004 		return (ret);
2005 	}
2006 }
2007 
2008 /*
2009  * Verify that all nodes satisfy the invariants and that the total number of
2010  * elements is correct.
2011  */
2012 static void
2013 zfs_btree_verify_counts(zfs_btree_t *tree)
2014 {
2015 	EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1);
2016 	if (tree->bt_height == -1) {
2017 		return;
2018 	}
2019 	VERIFY3U(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==,
2020 	    tree->bt_num_elems);
2021 }
2022 
2023 /*
2024  * Check that the subtree rooted at this node has a uniform height. Returns
2025  * the number of nodes under this node, to help verify bt_num_nodes.
2026  */
2027 static uint64_t
2028 zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
2029     int32_t height)
2030 {
2031 	if (!zfs_btree_is_core(hdr)) {
2032 		VERIFY0(height);
2033 		return (1);
2034 	}
2035 
2036 	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
2037 	uint64_t ret = 1;
2038 	for (uint32_t i = 0; i <= hdr->bth_count; i++) {
2039 		ret += zfs_btree_verify_height_helper(tree,
2040 		    node->btc_children[i], height - 1);
2041 	}
2042 	return (ret);
2043 }
2044 
2045 /*
2046  * Check that the tree rooted at this node has a uniform height, and that the
2047  * bt_height in the tree is correct.
2048  */
2049 static void
2050 zfs_btree_verify_height(zfs_btree_t *tree)
2051 {
2052 	EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
2053 	if (tree->bt_height == -1) {
2054 		return;
2055 	}
2056 
2057 	VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root,
2058 	    tree->bt_height), ==, tree->bt_num_nodes);
2059 }
2060 
2061 /*
2062  * Check that the elements in this node are sorted, and that if this is a core
2063  * node, the separators are properly between the subtrees they separate and
2064  * that the children also satisfy this requirement.
2065  */
2066 static void
2067 zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
2068 {
2069 	size_t size = tree->bt_elem_size;
2070 	if (!zfs_btree_is_core(hdr)) {
2071 		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
2072 		for (uint32_t i = 1; i < hdr->bth_count; i++) {
2073 			VERIFY3S(tree->bt_compar(leaf->btl_elems +
2074 			    (hdr->bth_first + i - 1) * size,
2075 			    leaf->btl_elems +
2076 			    (hdr->bth_first + i) * size), ==, -1);
2077 		}
2078 		return;
2079 	}
2080 
2081 	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
2082 	for (uint32_t i = 1; i < hdr->bth_count; i++) {
2083 		VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size,
2084 		    node->btc_elems + i * size), ==, -1);
2085 	}
2086 	for (uint32_t i = 0; i < hdr->bth_count; i++) {
2087 		uint8_t *left_child_last = NULL;
2088 		zfs_btree_hdr_t *left_child_hdr = node->btc_children[i];
2089 		if (zfs_btree_is_core(left_child_hdr)) {
2090 			zfs_btree_core_t *left_child =
2091 			    (zfs_btree_core_t *)left_child_hdr;
2092 			left_child_last = left_child->btc_elems +
2093 			    (left_child_hdr->bth_count - 1) * size;
2094 		} else {
2095 			zfs_btree_leaf_t *left_child =
2096 			    (zfs_btree_leaf_t *)left_child_hdr;
2097 			left_child_last = left_child->btl_elems +
2098 			    (left_child_hdr->bth_first +
2099 			    left_child_hdr->bth_count - 1) * size;
2100 		}
2101 		int comp = tree->bt_compar(node->btc_elems + i * size,
2102 		    left_child_last);
2103 		if (comp <= 0) {
2104 			panic("btree: compar returned %d (expected 1) at "
2105 			    "%px %d: compar(%px,  %px)", comp, node, i,
2106 			    node->btc_elems + i * size, left_child_last);
2107 		}
2108 
2109 		uint8_t *right_child_first = NULL;
2110 		zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1];
2111 		if (zfs_btree_is_core(right_child_hdr)) {
2112 			zfs_btree_core_t *right_child =
2113 			    (zfs_btree_core_t *)right_child_hdr;
2114 			right_child_first = right_child->btc_elems;
2115 		} else {
2116 			zfs_btree_leaf_t *right_child =
2117 			    (zfs_btree_leaf_t *)right_child_hdr;
2118 			right_child_first = right_child->btl_elems +
2119 			    right_child_hdr->bth_first * size;
2120 		}
2121 		comp = tree->bt_compar(node->btc_elems + i * size,
2122 		    right_child_first);
2123 		if (comp >= 0) {
2124 			panic("btree: compar returned %d (expected -1) at "
2125 			    "%px %d: compar(%px,  %px)", comp, node, i,
2126 			    node->btc_elems + i * size, right_child_first);
2127 		}
2128 	}
2129 	for (uint32_t i = 0; i <= hdr->bth_count; i++)
2130 		zfs_btree_verify_order_helper(tree, node->btc_children[i]);
2131 }
2132 
2133 /* Check that all elements in the tree are in sorted order. */
2134 static void
2135 zfs_btree_verify_order(zfs_btree_t *tree)
2136 {
2137 	EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
2138 	if (tree->bt_height == -1) {
2139 		return;
2140 	}
2141 
2142 	zfs_btree_verify_order_helper(tree, tree->bt_root);
2143 }
2144 
2145 #ifdef ZFS_DEBUG
2146 /* Check that all unused memory is poisoned correctly. */
2147 static void
2148 zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
2149 {
2150 	size_t size = tree->bt_elem_size;
2151 	if (!zfs_btree_is_core(hdr)) {
2152 		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
2153 		for (size_t i = 0; i < hdr->bth_first * size; i++)
2154 			VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
2155 		size_t esize = tree->bt_leaf_size -
2156 		    offsetof(zfs_btree_leaf_t, btl_elems);
2157 		for (size_t i = (hdr->bth_first + hdr->bth_count) * size;
2158 		    i < esize; i++)
2159 			VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
2160 	} else {
2161 		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
2162 		for (size_t i = hdr->bth_count * size;
2163 		    i < BTREE_CORE_ELEMS * size; i++)
2164 			VERIFY3U(node->btc_elems[i], ==, 0x0f);
2165 
2166 		for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
2167 		    i++) {
2168 			VERIFY3P(node->btc_children[i], ==,
2169 			    (zfs_btree_hdr_t *)BTREE_POISON);
2170 		}
2171 
2172 		for (uint32_t i = 0; i <= hdr->bth_count; i++) {
2173 			zfs_btree_verify_poison_helper(tree,
2174 			    node->btc_children[i]);
2175 		}
2176 	}
2177 }
2178 #endif
2179 
2180 /* Check that unused memory in the tree is still poisoned. */
2181 static void
2182 zfs_btree_verify_poison(zfs_btree_t *tree)
2183 {
2184 #ifdef ZFS_DEBUG
2185 	if (tree->bt_height == -1)
2186 		return;
2187 	zfs_btree_verify_poison_helper(tree, tree->bt_root);
2188 #endif
2189 }
2190 
2191 void
2192 zfs_btree_verify(zfs_btree_t *tree)
2193 {
2194 	if (zfs_btree_verify_intensity == 0)
2195 		return;
2196 	zfs_btree_verify_height(tree);
2197 	if (zfs_btree_verify_intensity == 1)
2198 		return;
2199 	zfs_btree_verify_pointers(tree);
2200 	if (zfs_btree_verify_intensity == 2)
2201 		return;
2202 	zfs_btree_verify_counts(tree);
2203 	if (zfs_btree_verify_intensity == 3)
2204 		return;
2205 	zfs_btree_verify_order(tree);
2206 
2207 	if (zfs_btree_verify_intensity == 4)
2208 		return;
2209 	zfs_btree_verify_poison(tree);
2210 }
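/*
 * The verification intensity can be raised at runtime through the module
 * parameter declared below; on Linux this is typically done via sysfs
 * (path shown for illustration):
 *
 *	# echo 3 > /sys/module/zfs/parameters/zfs_btree_verify_intensity
 */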
2211 
2212 ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW,
2213 	"Enable btree verification. Levels above 4 require ZFS be built "
2214 	"with debugging");
2215