xref: /freebsd/sys/vm/vm_radix.c (revision b077aed33b7b6aefca7b17ddb250cf521f938613)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013 EMC Corp.
 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

/*
 * Path-compressed radix trie implementation.
 * The following code is not generalized into a general-purpose library
 * because there are too many embedded parameters that should really
 * be decided by the library consumers.  At the same time, consumers
 * of this code must achieve the highest possible performance.
 *
 * The implementation takes into account the following rationale:
 * - The size of the nodes should be as small as possible but still big
 *   enough to avoid a large maximum depth for the trie.  This is a balance
 *   between the need not to wire too much physical memory for the nodes
 *   and the need to avoid too much cache pollution during trie operations.
 * - There is not a huge bias toward the number of lookup operations over
 *   the number of insert and remove operations.  This implies that
 *   optimizations supposedly helping one operation but hurting the
 *   other must be carefully evaluated.
 * - On average not many nodes are expected to be fully populated, hence
 *   level compression may just complicate things.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/smr.h>
#include <sys/smr_types.h>

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_radix.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

/*
 * These widths should allow the pointers to a node's children to fit within
 * a single cache line.  The extra levels from a narrow width should not be
 * a problem thanks to path compression.
 */
#ifdef __LP64__
#define	VM_RADIX_WIDTH	4
#else
#define	VM_RADIX_WIDTH	3
#endif

#define	VM_RADIX_COUNT	(1 << VM_RADIX_WIDTH)
#define	VM_RADIX_MASK	(VM_RADIX_COUNT - 1)
#define	VM_RADIX_LIMIT							\
	(howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1)
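
/*
 * For example, assuming a 64-bit vm_pindex_t, these definitions give 16
 * children per node and VM_RADIX_LIMIT == 15 (64 / 4 - 1) on LP64, or 8
 * children and VM_RADIX_LIMIT == 21 with the 3-bit width.  Levels are
 * numbered from 0 at the least significant bits of the key up to
 * VM_RADIX_LIMIT at the most significant bits.
 */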

/* Flag bits stored in node pointers. */
#define	VM_RADIX_ISLEAF	0x1
#define	VM_RADIX_FLAGS	0x1
#define	VM_RADIX_PAD	VM_RADIX_FLAGS

/* Returns one unit associated with the specified level. */
#define	VM_RADIX_UNITLEVEL(lev)						\
	((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH))

enum vm_radix_access { SMR, LOCKED, UNSERIALIZED };

struct vm_radix_node;
typedef SMR_POINTER(struct vm_radix_node *) smrnode_t;

struct vm_radix_node {
	vm_pindex_t	rn_owner;			/* Owner of record. */
	uint16_t	rn_count;			/* Valid children. */
	uint8_t		rn_clev;			/* Current level. */
	int8_t		rn_last;			/* zero last ptr. */
	smrnode_t	rn_child[VM_RADIX_COUNT];	/* Child nodes. */
};
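
/*
 * Note that rn_owner holds the key prefix shared by every key stored below
 * this node, with the bits below level rn_clev + 1 cleared; see
 * vm_radix_keybarr() for how it is used to reject keys that cannot be
 * present in the subtree.
 */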

static uma_zone_t vm_radix_node_zone;
static smr_t vm_radix_smr;

static void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
    enum vm_radix_access access);

/*
 * Allocate a radix node.
 */
static struct vm_radix_node *
vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
{
	struct vm_radix_node *rnode;

	rnode = uma_zalloc_smr(vm_radix_node_zone, M_NOWAIT);
	if (rnode == NULL)
		return (NULL);

	/*
	 * We want to clear the last child pointer after the final section
	 * has exited so that lookup cannot return false negatives.  It is
	 * done here because it will be cache-cold in the dtor callback.
	 */
	if (rnode->rn_last != 0) {
		vm_radix_node_store(&rnode->rn_child[rnode->rn_last - 1],
		    NULL, UNSERIALIZED);
		rnode->rn_last = 0;
	}
	rnode->rn_owner = owner;
	rnode->rn_count = count;
	rnode->rn_clev = clevel;
	return (rnode);
}

/*
 * Free radix node.
 */
static __inline void
vm_radix_node_put(struct vm_radix_node *rnode, int8_t last)
{
#ifdef INVARIANTS
	int slot;

	KASSERT(rnode->rn_count == 0,
	    ("vm_radix_node_put: rnode %p has %d children", rnode,
	    rnode->rn_count));
	for (slot = 0; slot < VM_RADIX_COUNT; slot++) {
		if (slot == last)
			continue;
		KASSERT(smr_unserialized_load(&rnode->rn_child[slot], true) ==
		    NULL, ("vm_radix_node_put: rnode %p has a child", rnode));
	}
#endif
	/* Off by one so a freshly zero'd node is not assigned to. */
	rnode->rn_last = last + 1;
	uma_zfree_smr(vm_radix_node_zone, rnode);
}

/*
 * Return the position in the array for a given level.
 */
static __inline int
vm_radix_slot(vm_pindex_t index, uint16_t level)
{

	return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK);
}

/* Trims the key after the specified level. */
static __inline vm_pindex_t
vm_radix_trimkey(vm_pindex_t index, uint16_t level)
{
	vm_pindex_t ret;

	ret = index;
	if (level > 0) {
		ret >>= level * VM_RADIX_WIDTH;
		ret <<= level * VM_RADIX_WIDTH;
	}
	return (ret);
}
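
/*
 * For example, with VM_RADIX_WIDTH == 4, vm_radix_slot(0x1234, 0) is 0x4 and
 * vm_radix_slot(0x1234, 1) is 0x3, while vm_radix_trimkey(0x1234, 1) yields
 * 0x1230, i.e. the key with everything below level 1 cleared.
 */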

/*
 * Fetch a node pointer from a slot in another node.
 */
static __inline struct vm_radix_node *
vm_radix_node_load(smrnode_t *p, enum vm_radix_access access)
{

	switch (access) {
	case UNSERIALIZED:
		return (smr_unserialized_load(p, true));
	case LOCKED:
		return (smr_serialized_load(p, true));
	case SMR:
		return (smr_entered_load(p, vm_radix_smr));
	}
	__assert_unreachable();
}

static __inline void
vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
    enum vm_radix_access access)
{

	switch (access) {
	case UNSERIALIZED:
		smr_unserialized_store(p, v, true);
		break;
	case LOCKED:
		smr_serialized_store(p, v, true);
		break;
	case SMR:
		panic("vm_radix_node_store: Not supported in smr section.");
	}
}

/*
 * Get the root node for a radix tree.
 */
static __inline struct vm_radix_node *
vm_radix_root_load(struct vm_radix *rtree, enum vm_radix_access access)
{

	return (vm_radix_node_load((smrnode_t *)&rtree->rt_root, access));
}

/*
 * Set the root node for a radix tree.
 */
static __inline void
vm_radix_root_store(struct vm_radix *rtree, struct vm_radix_node *rnode,
    enum vm_radix_access access)
{

	vm_radix_node_store((smrnode_t *)&rtree->rt_root, rnode, access);
}

/*
 * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
 */
static __inline bool
vm_radix_isleaf(struct vm_radix_node *rnode)
{

	return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
}

/*
 * Returns the associated page extracted from rnode.
 */
static __inline vm_page_t
vm_radix_topage(struct vm_radix_node *rnode)
{

	return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
}

/*
 * Adds the page as a child of the provided node.
 */
static __inline void
vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
    vm_page_t page, enum vm_radix_access access)
{
	int slot;

	slot = vm_radix_slot(index, clev);
	vm_radix_node_store(&rnode->rn_child[slot],
	    (struct vm_radix_node *)((uintptr_t)page | VM_RADIX_ISLEAF), access);
}
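
/*
 * Leaf entries are vm_page_t pointers tagged with VM_RADIX_ISLEAF in the
 * low bit; pointer alignment keeps that bit clear otherwise, and
 * vm_radix_topage() masks it off again when extracting the page.
 */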

/*
 * Returns the level where two keys differ.
 * It cannot accept two equal keys.
 */
static __inline uint16_t
vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
{

	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
	    __func__, (uintmax_t)index1));
	CTASSERT(sizeof(long long) >= sizeof(vm_pindex_t));

	/*
	 * From the highest-order bit where the indexes differ,
	 * compute the highest level in the trie where they differ.
	 */
	return ((flsll(index1 ^ index2) - 1) / VM_RADIX_WIDTH);
}
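
/*
 * For example, with VM_RADIX_WIDTH == 4, keys 0x1234 and 0x1434 XOR to
 * 0x0600, flsll() returns 11, and (11 - 1) / 4 == 2, so the keys first
 * differ at level 2.
 */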

/*
 * Returns TRUE if it can be determined that key does not belong to the
 * specified rnode.  Otherwise, returns FALSE.
 */
static __inline bool
vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
{

	if (rnode->rn_clev < VM_RADIX_LIMIT) {
		idx = vm_radix_trimkey(idx, rnode->rn_clev + 1);
		return (idx != rnode->rn_owner);
	}
	return (false);
}

/*
 * Internal helper for vm_radix_reclaim_allnodes().
 * This function is recursive.
 */
static void
vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
{
	struct vm_radix_node *child;
	int slot;

	KASSERT(rnode->rn_count <= VM_RADIX_COUNT,
	    ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode));
	for (slot = 0; rnode->rn_count != 0; slot++) {
		child = vm_radix_node_load(&rnode->rn_child[slot], UNSERIALIZED);
		if (child == NULL)
			continue;
		if (!vm_radix_isleaf(child))
			vm_radix_reclaim_allnodes_int(child);
		vm_radix_node_store(&rnode->rn_child[slot], NULL, UNSERIALIZED);
		rnode->rn_count--;
	}
	vm_radix_node_put(rnode, -1);
}

#ifndef UMA_MD_SMALL_ALLOC
void vm_radix_reserve_kva(void);
/*
 * Reserve the KVA necessary to satisfy the node allocation.
 * This is mandatory on architectures that do not support a direct mapping,
 * as they would otherwise need to carve into the kernel maps for every node
 * allocation, resulting in deadlocks for consumers already working with
 * kernel maps.
 */
void
vm_radix_reserve_kva(void)
{

	/*
	 * Calculate the number of reserved nodes, discounting the pages that
	 * are needed to store them.
	 */
	if (!uma_zone_reserve_kva(vm_radix_node_zone,
	    ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
	    sizeof(struct vm_radix_node))))
		panic("%s: unable to reserve KVA", __func__);
}
#endif

/*
 * Initialize the UMA slab zone.
 */
void
vm_radix_zinit(void)
{

	vm_radix_node_zone = uma_zcreate("RADIX NODE",
	    sizeof(struct vm_radix_node), NULL, NULL, NULL, NULL,
	    VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_SMR | UMA_ZONE_ZINIT);
	vm_radix_smr = uma_zone_get_smr(vm_radix_node_zone);
}
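
/*
 * The zone is created with UMA_ZONE_SMR, so uma_zfree_smr() defers reuse of
 * a freed node until readers that entered the vm_radix_smr section before
 * the free have exited; this is what makes the lock-free lookups safe.
 */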

/*
 * Inserts the key-value pair into the trie.
 * Panics if the key already exists.
 */
int
vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
{
	vm_pindex_t index, newind;
	struct vm_radix_node *rnode, *tmp;
	smrnode_t *parentp;
	vm_page_t m;
	int slot;
	uint16_t clev;

	index = page->pindex;

	/*
	 * The owner of record for the root is not really important because
	 * it will never be used.
	 */
	rnode = vm_radix_root_load(rtree, LOCKED);
	if (rnode == NULL) {
		rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF;
		return (0);
	}
	parentp = (smrnode_t *)&rtree->rt_root;
	for (;;) {
		if (vm_radix_isleaf(rnode)) {
			m = vm_radix_topage(rnode);
			if (m->pindex == index)
				panic("%s: key %jx is already present",
				    __func__, (uintmax_t)index);
			clev = vm_radix_keydiff(m->pindex, index);
			tmp = vm_radix_node_get(vm_radix_trimkey(index,
			    clev + 1), 2, clev);
			if (tmp == NULL)
				return (ENOMEM);
			/* These writes are not yet visible due to ordering. */
			vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
			vm_radix_addpage(tmp, m->pindex, clev, m, UNSERIALIZED);
			/* Synchronize to make leaf visible. */
			vm_radix_node_store(parentp, tmp, LOCKED);
			return (0);
		} else if (vm_radix_keybarr(rnode, index))
			break;
		slot = vm_radix_slot(index, rnode->rn_clev);
		parentp = &rnode->rn_child[slot];
		tmp = vm_radix_node_load(parentp, LOCKED);
		if (tmp == NULL) {
			rnode->rn_count++;
			vm_radix_addpage(rnode, index, rnode->rn_clev, page,
			    LOCKED);
			return (0);
		}
		rnode = tmp;
	}

	/*
	 * A new node is needed because the right insertion level has been
	 * reached.  Set up the new intermediate node and add the two
	 * children: the new object and the older edge.
	 */
	newind = rnode->rn_owner;
	clev = vm_radix_keydiff(newind, index);
	tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
	if (tmp == NULL)
		return (ENOMEM);
	slot = vm_radix_slot(newind, clev);
	/* These writes are not yet visible due to ordering. */
	vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
	vm_radix_node_store(&tmp->rn_child[slot], rnode, UNSERIALIZED);
	/* Serializing write to make the above visible. */
	vm_radix_node_store(parentp, tmp, LOCKED);

	return (0);
}
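
/*
 * Insertions are expected to run with the lock that serializes updates to
 * the trie held (the VM object lock for the resident-page trie).  New nodes
 * are filled in with UNSERIALIZED stores and only then published with a
 * single serialized store to the parent slot, so concurrent SMR lookups
 * never observe a partially initialized node.
 */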

/*
 * Returns the value stored at the index.  If the index is not present,
 * NULL is returned.
 */
static __always_inline vm_page_t
_vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index,
    enum vm_radix_access access)
{
	struct vm_radix_node *rnode;
	vm_page_t m;
	int slot;

	rnode = vm_radix_root_load(rtree, access);
	while (rnode != NULL) {
		if (vm_radix_isleaf(rnode)) {
			m = vm_radix_topage(rnode);
			if (m->pindex == index)
				return (m);
			break;
		}
		if (vm_radix_keybarr(rnode, index))
			break;
		slot = vm_radix_slot(index, rnode->rn_clev);
		rnode = vm_radix_node_load(&rnode->rn_child[slot], access);
	}
	return (NULL);
}

/*
 * Returns the value stored at the index assuming there is an external lock.
 *
 * If the index is not present, NULL is returned.
 */
vm_page_t
vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
{

	return _vm_radix_lookup(rtree, index, LOCKED);
}

/*
 * Returns the value stored at the index without requiring an external lock.
 *
 * If the index is not present, NULL is returned.
 */
vm_page_t
vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index)
{
	vm_page_t m;

	smr_enter(vm_radix_smr);
	m = _vm_radix_lookup(rtree, index, SMR);
	smr_exit(vm_radix_smr);

	return (m);
}
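
/*
 * Because the trie may be modified concurrently, callers of the unlocked
 * lookup typically revalidate the returned page (for example by busying it
 * and rechecking its identity) before relying on it.
 */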

/*
 * Look up the nearest entry at a position greater than or equal to index.
 */
vm_page_t
vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
{
	struct vm_radix_node *stack[VM_RADIX_LIMIT];
	vm_pindex_t inc;
	vm_page_t m;
	struct vm_radix_node *child, *rnode;
#ifdef INVARIANTS
	int loops = 0;
#endif
	int slot, tos;

	rnode = vm_radix_root_load(rtree, LOCKED);
	if (rnode == NULL)
		return (NULL);
	else if (vm_radix_isleaf(rnode)) {
		m = vm_radix_topage(rnode);
		if (m->pindex >= index)
			return (m);
		else
			return (NULL);
	}
	tos = 0;
	for (;;) {
		/*
		 * If the keys differ before the current bisection node,
		 * then the search key may roll back to the earliest
		 * available bisection node or to the smallest key
		 * in the current node (if the owner is greater than the
		 * search key).
		 */
		if (vm_radix_keybarr(rnode, index)) {
			if (index > rnode->rn_owner) {
ascend:
				KASSERT(++loops < 1000,
				    ("vm_radix_lookup_ge: too many loops"));

				/*
				 * Pop nodes from the stack until either the
				 * stack is empty or a node that could have a
				 * matching descendant is found.
				 */
				do {
					if (tos == 0)
						return (NULL);
					rnode = stack[--tos];
				} while (vm_radix_slot(index,
				    rnode->rn_clev) == (VM_RADIX_COUNT - 1));

				/*
				 * The following computation cannot overflow
				 * because index's slot at the current level
				 * is less than VM_RADIX_COUNT - 1.
				 */
				index = vm_radix_trimkey(index,
				    rnode->rn_clev);
				index += VM_RADIX_UNITLEVEL(rnode->rn_clev);
			} else
				index = rnode->rn_owner;
			KASSERT(!vm_radix_keybarr(rnode, index),
			    ("vm_radix_lookup_ge: keybarr failed"));
		}
		slot = vm_radix_slot(index, rnode->rn_clev);
		child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
		if (vm_radix_isleaf(child)) {
			m = vm_radix_topage(child);
			if (m->pindex >= index)
				return (m);
		} else if (child != NULL)
			goto descend;

		/*
		 * Look for an available edge or page within the current
		 * bisection node.
		 */
		if (slot < (VM_RADIX_COUNT - 1)) {
			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
			index = vm_radix_trimkey(index, rnode->rn_clev);
			do {
				index += inc;
				slot++;
				child = vm_radix_node_load(&rnode->rn_child[slot],
				    LOCKED);
				if (vm_radix_isleaf(child)) {
					m = vm_radix_topage(child);
					if (m->pindex >= index)
						return (m);
				} else if (child != NULL)
					goto descend;
			} while (slot < (VM_RADIX_COUNT - 1));
		}
		KASSERT(child == NULL || vm_radix_isleaf(child),
		    ("vm_radix_lookup_ge: child is radix node"));

		/*
		 * If a page or edge greater than the search slot is not found
		 * in the current node, ascend to the next higher-level node.
		 */
		goto ascend;
descend:
		KASSERT(rnode->rn_clev > 0,
		    ("vm_radix_lookup_ge: pushing leaf's parent"));
		KASSERT(tos < VM_RADIX_LIMIT,
		    ("vm_radix_lookup_ge: stack overflow"));
		stack[tos++] = rnode;
		rnode = child;
	}
}

/*
 * Look up the nearest entry at a position less than or equal to index.
 */
vm_page_t
vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
{
	struct vm_radix_node *stack[VM_RADIX_LIMIT];
	vm_pindex_t inc;
	vm_page_t m;
	struct vm_radix_node *child, *rnode;
#ifdef INVARIANTS
	int loops = 0;
#endif
	int slot, tos;

	rnode = vm_radix_root_load(rtree, LOCKED);
	if (rnode == NULL)
		return (NULL);
	else if (vm_radix_isleaf(rnode)) {
		m = vm_radix_topage(rnode);
		if (m->pindex <= index)
			return (m);
		else
			return (NULL);
	}
	tos = 0;
	for (;;) {
		/*
		 * If the keys differ before the current bisection node,
		 * then the search key may roll back to the earliest
		 * available bisection node or to the largest key
		 * in the current node (if the owner is smaller than the
		 * search key).
		 */
		if (vm_radix_keybarr(rnode, index)) {
			if (index > rnode->rn_owner) {
				index = rnode->rn_owner + VM_RADIX_COUNT *
				    VM_RADIX_UNITLEVEL(rnode->rn_clev);
			} else {
ascend:
				KASSERT(++loops < 1000,
				    ("vm_radix_lookup_le: too many loops"));

				/*
				 * Pop nodes from the stack until either the
				 * stack is empty or a node that could have a
				 * matching descendant is found.
				 */
				do {
					if (tos == 0)
						return (NULL);
					rnode = stack[--tos];
				} while (vm_radix_slot(index,
				    rnode->rn_clev) == 0);

				/*
				 * The following computation cannot overflow
				 * because index's slot at the current level
				 * is greater than 0.
				 */
				index = vm_radix_trimkey(index,
				    rnode->rn_clev);
			}
			index--;
			KASSERT(!vm_radix_keybarr(rnode, index),
			    ("vm_radix_lookup_le: keybarr failed"));
		}
		slot = vm_radix_slot(index, rnode->rn_clev);
		child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
		if (vm_radix_isleaf(child)) {
			m = vm_radix_topage(child);
			if (m->pindex <= index)
				return (m);
		} else if (child != NULL)
			goto descend;

		/*
		 * Look for an available edge or page within the current
		 * bisection node.
		 */
		if (slot > 0) {
			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
			index |= inc - 1;
			do {
				index -= inc;
				slot--;
				child = vm_radix_node_load(&rnode->rn_child[slot],
				    LOCKED);
				if (vm_radix_isleaf(child)) {
					m = vm_radix_topage(child);
					if (m->pindex <= index)
						return (m);
				} else if (child != NULL)
					goto descend;
			} while (slot > 0);
		}
		KASSERT(child == NULL || vm_radix_isleaf(child),
		    ("vm_radix_lookup_le: child is radix node"));

		/*
		 * If a page or edge smaller than the search slot is not found
		 * in the current node, ascend to the next higher-level node.
		 */
		goto ascend;
descend:
		KASSERT(rnode->rn_clev > 0,
		    ("vm_radix_lookup_le: pushing leaf's parent"));
		KASSERT(tos < VM_RADIX_LIMIT,
		    ("vm_radix_lookup_le: stack overflow"));
		stack[tos++] = rnode;
		rnode = child;
	}
}

/*
 * Remove the specified index from the trie, and return the value stored at
 * that index.  If the index is not present, return NULL.
 */
vm_page_t
vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
{
	struct vm_radix_node *rnode, *parent, *tmp;
	vm_page_t m;
	int i, slot;

	rnode = vm_radix_root_load(rtree, LOCKED);
	if (vm_radix_isleaf(rnode)) {
		m = vm_radix_topage(rnode);
		if (m->pindex != index)
			return (NULL);
		vm_radix_root_store(rtree, NULL, LOCKED);
		return (m);
	}
	parent = NULL;
	for (;;) {
		if (rnode == NULL)
			return (NULL);
		slot = vm_radix_slot(index, rnode->rn_clev);
		tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
		if (vm_radix_isleaf(tmp)) {
			m = vm_radix_topage(tmp);
			if (m->pindex != index)
				return (NULL);
			vm_radix_node_store(&rnode->rn_child[slot], NULL, LOCKED);
			rnode->rn_count--;
			if (rnode->rn_count > 1)
				return (m);
			for (i = 0; i < VM_RADIX_COUNT; i++)
				if (vm_radix_node_load(&rnode->rn_child[i],
				    LOCKED) != NULL)
					break;
			KASSERT(i != VM_RADIX_COUNT,
			    ("%s: invalid node configuration", __func__));
			tmp = vm_radix_node_load(&rnode->rn_child[i], LOCKED);
			if (parent == NULL)
				vm_radix_root_store(rtree, tmp, LOCKED);
			else {
				slot = vm_radix_slot(index, parent->rn_clev);
				KASSERT(vm_radix_node_load(
				    &parent->rn_child[slot], LOCKED) == rnode,
				    ("%s: invalid child value", __func__));
				vm_radix_node_store(&parent->rn_child[slot],
				    tmp, LOCKED);
			}
			/*
			 * The child is still valid and we cannot zero the
			 * pointer until all SMR references are gone.
			 */
			rnode->rn_count--;
			vm_radix_node_put(rnode, i);
			return (m);
		}
		parent = rnode;
		rnode = tmp;
	}
}

/*
 * Remove and free all the nodes from the radix tree.
 * This function is recursive, but the recursion is tightly bounded because
 * the maximum depth of the tree is fixed.
 */
void
vm_radix_reclaim_allnodes(struct vm_radix *rtree)
{
	struct vm_radix_node *root;

	root = vm_radix_root_load(rtree, LOCKED);
	if (root == NULL)
		return;
	vm_radix_root_store(rtree, NULL, UNSERIALIZED);
	if (!vm_radix_isleaf(root))
		vm_radix_reclaim_allnodes_int(root);
}

/*
 * Replace an existing page in the trie with another one.
 * Panics if there is not an old page in the trie at the new page's index.
 */
vm_page_t
vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage)
{
	struct vm_radix_node *rnode, *tmp;
	vm_page_t m;
	vm_pindex_t index;
	int slot;

	index = newpage->pindex;
	rnode = vm_radix_root_load(rtree, LOCKED);
	if (rnode == NULL)
		panic("%s: replacing page on an empty trie", __func__);
	if (vm_radix_isleaf(rnode)) {
		m = vm_radix_topage(rnode);
		if (m->pindex != index)
			panic("%s: original replacing root key not found",
			    __func__);
		rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF;
		return (m);
	}
	for (;;) {
		slot = vm_radix_slot(index, rnode->rn_clev);
		tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
		if (vm_radix_isleaf(tmp)) {
			m = vm_radix_topage(tmp);
			if (m->pindex == index) {
				vm_radix_node_store(&rnode->rn_child[slot],
				    (struct vm_radix_node *)((uintptr_t)newpage |
				    VM_RADIX_ISLEAF), LOCKED);
				return (m);
			} else
				break;
		} else if (tmp == NULL || vm_radix_keybarr(tmp, index))
			break;
		rnode = tmp;
	}
	panic("%s: original replacing page not found", __func__);
}

void
vm_radix_wait(void)
{
	uma_zwait(vm_radix_node_zone);
}

#ifdef DDB
/*
 * Show details about the given radix node.
 */
DB_SHOW_COMMAND(radixnode, db_show_radixnode)
{
	struct vm_radix_node *rnode, *tmp;
	int i;

	if (!have_addr)
		return;
	rnode = (struct vm_radix_node *)addr;
	db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
	    (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
	    rnode->rn_clev);
	for (i = 0; i < VM_RADIX_COUNT; i++) {
		tmp = vm_radix_node_load(&rnode->rn_child[i], UNSERIALIZED);
		if (tmp != NULL)
			db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
			    i, (void *)tmp,
			    vm_radix_isleaf(tmp) ?  vm_radix_topage(tmp) : NULL,
			    rnode->rn_clev);
	}
}
#endif /* DDB */