xref: /freebsd/sys/vm/vm_radix.c (revision 22cf89c938886d14f5796fc49f9f020c23ea8eaf)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2013 EMC Corp.
5  * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
6  * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  */
31 
32 /*
33  * Path-compressed radix trie implementation.
34  * The following code is not generalized into a general-purpose library
35  * because it embeds far too many parameters that should really be
36  * decided by the library consumers.  At the same time, consumers of
37  * this code must achieve the highest possible performance.
38  *
39  * The implementation takes into account the following rationale:
40  * - The size of a node should be as small as possible but still big
41  *   enough to avoid a large maximum depth for the trie.  This is a
42  *   balance between the need to avoid wiring too much physical memory
43  *   for the nodes and the need to avoid too much cache pollution
44  *   during trie operations.
45  * - There is not a huge bias toward the number of lookup operations
46  *   over the number of insert and remove operations.  This implies
47  *   that optimizations which help one operation but hurt the other
48  *   must be carefully evaluated.
49  * - On average not many nodes are expected to be fully populated, hence
50  *   level compression may just complicate things.
51  */
52 
53 #include <sys/cdefs.h>
54 #include "opt_ddb.h"
55 
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/kernel.h>
59 #include <sys/libkern.h>
60 #include <sys/proc.h>
61 #include <sys/vmmeter.h>
62 #include <sys/smr.h>
63 #include <sys/smr_types.h>
64 
65 #include <vm/uma.h>
66 #include <vm/vm.h>
67 #include <vm/vm_param.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_radix.h>
71 
72 #ifdef DDB
73 #include <ddb/ddb.h>
74 #endif
75 
76 /*
77  * These widths should allow the pointers to a node's children to fit within
78  * a single cache line.  The extra levels from a narrow width should not be
79  * a problem thanks to path compression.
80  */
81 #ifdef __LP64__
82 #define	VM_RADIX_WIDTH	4
83 #else
84 #define	VM_RADIX_WIDTH	3
85 #endif
86 
87 #define	VM_RADIX_COUNT	(1 << VM_RADIX_WIDTH)
88 #define	VM_RADIX_MASK	(VM_RADIX_COUNT - 1)
89 #define	VM_RADIX_LIMIT							\
90 	(howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1)
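
/*
 * For illustration, assuming vm_pindex_t is 64 bits wide: on __LP64__,
 * VM_RADIX_WIDTH is 4, so VM_RADIX_COUNT is 16, VM_RADIX_MASK is 0xf and
 * VM_RADIX_LIMIT is howmany(64, 4) - 1 = 15, i.e. at most 16 node levels
 * (rn_clev values 0, 4, 8, ..., 60) on any path from the root to a leaf.
 */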
91 
92 #if VM_RADIX_WIDTH == 3
93 typedef uint8_t rn_popmap_t;
94 #elif VM_RADIX_WIDTH == 4
95 typedef uint16_t rn_popmap_t;
96 #elif VM_RADIX_WIDTH == 5
97 typedef uint32_t rn_popmap_t;
98 #else
99 #error Unsupported width
100 #endif
101 _Static_assert(sizeof(rn_popmap_t) <= sizeof(int),
102     "rn_popmap_t too wide");
103 
104 /* Set of all flag bits stored in node pointers. */
105 #define	VM_RADIX_FLAGS	(VM_RADIX_ISLEAF)
106 #define	VM_RADIX_PAD	VM_RADIX_FLAGS
107 
108 enum vm_radix_access { SMR, LOCKED, UNSERIALIZED };
109 
110 struct vm_radix_node;
111 typedef SMR_POINTER(struct vm_radix_node *) smrnode_t;
112 
113 struct vm_radix_node {
114 	vm_pindex_t	rn_owner;			/* Owner of record. */
115 	rn_popmap_t	rn_popmap;			/* Valid children. */
116 	uint8_t		rn_clev;			/* Level * WIDTH. */
117 	smrnode_t	rn_child[VM_RADIX_COUNT];	/* Child nodes. */
118 };
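
/*
 * A rough size sketch, assuming an LP64 ABI with natural alignment and
 * VM_RADIX_WIDTH == 4: 8 bytes of rn_owner, 2 bytes of rn_popmap, 1 byte
 * of rn_clev, 5 bytes of padding and 16 * 8 = 128 bytes of rn_child, for
 * 144 bytes per node.
 */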
119 
120 static uma_zone_t vm_radix_node_zone;
121 static smr_t vm_radix_smr;
122 
123 static void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
124     enum vm_radix_access access);
125 
126 /*
127  * Map index to an array position for the children of rnode.
128  */
129 static __inline int
130 vm_radix_slot(struct vm_radix_node *rnode, vm_pindex_t index)
131 {
132 	return ((index >> rnode->rn_clev) & VM_RADIX_MASK);
133 }
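
/*
 * A worked example, assuming VM_RADIX_WIDTH == 4: for a node with
 * rn_clev == 8, index 0x1234 maps to slot (0x1234 >> 8) & 0xf == 0x2.
 */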
134 
135 /*
136  * Returns true if index does not belong to the specified rnode.  Otherwise,
137  * sets *slot to the child slot for index and returns false.
138  */
139 static __inline bool
140 vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t index, int *slot)
141 {
142 	index = (index - rnode->rn_owner) >> rnode->rn_clev;
143 	if (index >= VM_RADIX_COUNT)
144 		return (true);
145 	*slot = index;
146 	return (false);
147 }
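
/*
 * A worked example, assuming VM_RADIX_WIDTH == 4: for a node with
 * rn_owner == 0x1200 and rn_clev == 4, index 0x1230 belongs to the node
 * ((0x1230 - 0x1200) >> 4 == 3 < 16, so *slot becomes 3 and false is
 * returned), while index 0x1300 does not ((0x1300 - 0x1200) >> 4 == 16,
 * so true is returned).
 */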
148 
149 /*
150  * Allocate a radix node.
151  */
152 static struct vm_radix_node *
153 vm_radix_node_get(vm_pindex_t index, vm_pindex_t newind)
154 {
155 	struct vm_radix_node *rnode;
156 
157 	rnode = uma_zalloc_smr(vm_radix_node_zone, M_NOWAIT);
158 	if (rnode == NULL)
159 		return (NULL);
160 
161 	/*
162 	 * We want to clear the last child pointer only after the final SMR
163 	 * read section has exited, so lookups cannot return false negatives.  It
164 	 * is done here because the node would be cache-cold in the dtor callback.
165 	 */
166 	if (rnode->rn_popmap != 0) {
167 		vm_radix_node_store(&rnode->rn_child[ffs(rnode->rn_popmap) - 1],
168 		    VM_RADIX_NULL, UNSERIALIZED);
169 		rnode->rn_popmap = 0;
170 	}
171 
172 	/*
173 	 * From the highest-order bit where the indexes differ,
174 	 * compute the highest level in the trie where they differ.  Then,
175 	 * compute the least index of this subtrie.
176 	 */
177 	KASSERT(index != newind, ("%s: passing the same key value %jx",
178 	    __func__, (uintmax_t)index));
179 	_Static_assert(sizeof(long long) >= sizeof(vm_pindex_t),
180 	    "vm_pindex_t too wide");
181 	_Static_assert(sizeof(vm_pindex_t) * NBBY <=
182 	    (1 << (sizeof(rnode->rn_clev) * NBBY)), "rn_clev too narrow");
183 	rnode->rn_clev = rounddown(flsll(index ^ newind) - 1, VM_RADIX_WIDTH);
184 	rnode->rn_owner = VM_RADIX_COUNT;
185 	rnode->rn_owner = index & -(rnode->rn_owner << rnode->rn_clev);
186 	return (rnode);
187 }
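
/*
 * A worked example of the level and owner computation above, assuming
 * VM_RADIX_WIDTH == 4: for index == 0x1234 and newind == 0x1f00 the keys
 * first differ at bit 11 (flsll(0x1234 ^ 0x1f00) - 1 == 11), so rn_clev is
 * rounded down to 8 and rn_owner becomes 0x1234 & -(16 << 8) == 0x1000,
 * the least index of the new subtrie.  The two-step rn_owner assignment
 * merely widens VM_RADIX_COUNT to vm_pindex_t so that the shift is
 * performed in vm_pindex_t arithmetic.
 */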
188 
189 /*
190  * Free radix node.
191  */
192 static __inline void
193 vm_radix_node_put(struct vm_radix_node *rnode)
194 {
195 #ifdef INVARIANTS
196 	int slot;
197 
198 	KASSERT(powerof2(rnode->rn_popmap),
199 	    ("vm_radix_node_put: rnode %p has too many children %04x", rnode,
200 	    rnode->rn_popmap));
201 	for (slot = 0; slot < VM_RADIX_COUNT; slot++) {
202 		if ((rnode->rn_popmap & (1 << slot)) != 0)
203 			continue;
204 		KASSERT(smr_unserialized_load(&rnode->rn_child[slot], true) ==
205 		    VM_RADIX_NULL,
206 		    ("vm_radix_node_put: rnode %p has a child", rnode));
207 	}
208 #endif
209 	uma_zfree_smr(vm_radix_node_zone, rnode);
210 }
211 
212 /*
213  * Fetch a node pointer from a slot in another node.
214  */
215 static __inline struct vm_radix_node *
216 vm_radix_node_load(smrnode_t *p, enum vm_radix_access access)
217 {
218 
219 	switch (access) {
220 	case UNSERIALIZED:
221 		return (smr_unserialized_load(p, true));
222 	case LOCKED:
223 		return (smr_serialized_load(p, true));
224 	case SMR:
225 		return (smr_entered_load(p, vm_radix_smr));
226 	}
227 	__assert_unreachable();
228 }
229 
230 static __inline void
231 vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
232     enum vm_radix_access access)
233 {
234 
235 	switch (access) {
236 	case UNSERIALIZED:
237 		smr_unserialized_store(p, v, true);
238 		break;
239 	case LOCKED:
240 		smr_serialized_store(p, v, true);
241 		break;
242 	case SMR:
243 		panic("vm_radix_node_store: Not supported in smr section.");
244 	}
245 }
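
/*
 * A brief note on the access argument used by the helpers above: SMR is
 * for lock-free readers running between smr_enter() and smr_exit(), LOCKED
 * is for writers holding whatever external lock serializes updates to the
 * trie, and UNSERIALIZED is for nodes that no other thread can reach (just
 * allocated or being torn down), where no ordering is required.
 */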
246 
247 /*
248  * Get the root node for a radix tree.
249  */
250 static __inline struct vm_radix_node *
251 vm_radix_root_load(struct vm_radix *rtree, enum vm_radix_access access)
252 {
253 
254 	return (vm_radix_node_load((smrnode_t *)&rtree->rt_root, access));
255 }
256 
257 /*
258  * Set the root node for a radix tree.
259  */
260 static __inline void
261 vm_radix_root_store(struct vm_radix *rtree, struct vm_radix_node *rnode,
262     enum vm_radix_access access)
263 {
264 
265 	vm_radix_node_store((smrnode_t *)&rtree->rt_root, rnode, access);
266 }
267 
268 /*
269  * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
270  */
271 static __inline bool
272 vm_radix_isleaf(struct vm_radix_node *rnode)
273 {
274 
275 	return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
276 }
277 
278 /*
279  * Returns page cast to radix node with leaf bit set.
280  */
281 static __inline struct vm_radix_node *
282 vm_radix_toleaf(vm_page_t page)
283 {
284 	return ((struct vm_radix_node *)((uintptr_t)page | VM_RADIX_ISLEAF));
285 }
286 
287 /*
288  * Returns the associated page extracted from rnode.
289  */
290 static __inline vm_page_t
291 vm_radix_topage(struct vm_radix_node *rnode)
292 {
293 
294 	return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
295 }
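
/*
 * A small sketch of the pointer tagging used by the three helpers above,
 * which relies on vm_page structures being aligned so that the low bit of
 * their addresses is zero:
 *
 *	leaf = vm_radix_toleaf(m);	leaf == ((uintptr_t)m | VM_RADIX_ISLEAF)
 *	vm_radix_isleaf(leaf);		true
 *	vm_radix_topage(leaf);		m
 */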
296 
297 /*
298  * Make 'child' a child of 'rnode'.
299  */
300 static __inline void
301 vm_radix_addnode(struct vm_radix_node *rnode, vm_pindex_t index,
302     struct vm_radix_node *child, enum vm_radix_access access)
303 {
304 	int slot;
305 
306 	slot = vm_radix_slot(rnode, index);
307 	vm_radix_node_store(&rnode->rn_child[slot], child, access);
308 	rnode->rn_popmap ^= 1 << slot;
309 	KASSERT((rnode->rn_popmap & (1 << slot)) != 0,
310 	    ("%s: bad popmap slot %d in rnode %p", __func__, slot, rnode));
311 }
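
/*
 * For example, adding a child at slot 3 to a node whose rn_popmap is
 * 0x0041 flips bit 3 and yields 0x0049; using XOR rather than OR lets the
 * KASSERT above catch an insertion into an already populated slot.
 */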
312 
313 /*
314  * Internal helper for vm_radix_reclaim_allnodes().
315  * This function is recursive.
316  */
317 static void
318 vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
319 {
320 	struct vm_radix_node *child;
321 	int slot;
322 
323 	while (rnode->rn_popmap != 0) {
324 		slot = ffs(rnode->rn_popmap) - 1;
325 		child = vm_radix_node_load(&rnode->rn_child[slot],
326 		    UNSERIALIZED);
327 		KASSERT(child != VM_RADIX_NULL,
328 		    ("%s: bad popmap slot %d in rnode %p",
329 		    __func__, slot, rnode));
330 		if (!vm_radix_isleaf(child))
331 			vm_radix_reclaim_allnodes_int(child);
332 		rnode->rn_popmap ^= 1 << slot;
333 		vm_radix_node_store(&rnode->rn_child[slot], VM_RADIX_NULL,
334 		    UNSERIALIZED);
335 	}
336 	vm_radix_node_put(rnode);
337 }
338 
339 /*
340  * radix node zone initializer.
341  */
342 static int
343 vm_radix_zone_init(void *mem, int size, int flags)
344 {
345 	struct vm_radix_node *rnode;
346 
347 	rnode = mem;
348 	rnode->rn_popmap = 0;
349 	for (int i = 0; i < nitems(rnode->rn_child); i++)
350 		vm_radix_node_store(&rnode->rn_child[i], VM_RADIX_NULL,
351 		    UNSERIALIZED);
352 	return (0);
353 }
354 
355 #ifndef UMA_MD_SMALL_ALLOC
356 void vm_radix_reserve_kva(void);
357 /*
358  * Reserve the KVA necessary to satisfy the node allocation.
359  * This is mandatory on architectures without a direct map, which would
360  * otherwise need to carve into the kernel maps for every node
361  * allocation, resulting in deadlocks for consumers already working
362  * with the kernel maps.
363  */
364 void
365 vm_radix_reserve_kva(void)
366 {
367 
368 	/*
369 	 * Calculate the number of reserved nodes, discounting the pages that
370 	 * are needed to store them.
371 	 */
372 	if (!uma_zone_reserve_kva(vm_radix_node_zone,
373 	    ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
374 	    sizeof(struct vm_radix_node))))
375 		panic("%s: unable to reserve KVA", __func__);
376 }
377 #endif
378 
379 /*
380  * Initialize the UMA slab zone.
381  */
382 void
383 vm_radix_zinit(void)
384 {
385 
386 	vm_radix_node_zone = uma_zcreate("RADIX NODE",
387 	    sizeof(struct vm_radix_node), NULL, NULL, vm_radix_zone_init, NULL,
388 	    VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_SMR);
389 	vm_radix_smr = uma_zone_get_smr(vm_radix_node_zone);
390 }
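
/*
 * Passing VM_RADIX_PAD as the UMA alignment argument above guarantees that
 * node addresses have the VM_RADIX_ISLEAF bit clear, so leaf and internal
 * node pointers can be distinguished by that bit alone.
 */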
391 
392 /*
393  * Inserts the key-value pair into the trie.
394  * Panics if the key already exists.
395  */
396 int
397 vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
398 {
399 	vm_pindex_t index, newind;
400 	struct vm_radix_node *leaf, *parent, *rnode;
401 	smrnode_t *parentp;
402 	int slot;
403 
404 	index = page->pindex;
405 	leaf = vm_radix_toleaf(page);
406 
407 	/*
408 	 * The owner of record for root is not really important because it
409 	 * will never be used.
410 	 */
411 	rnode = vm_radix_root_load(rtree, LOCKED);
412 	parent = NULL;
413 	for (;;) {
414 		if (vm_radix_isleaf(rnode)) {
415 			if (rnode == VM_RADIX_NULL) {
416 				if (parent == NULL)
417 					rtree->rt_root = leaf;
418 				else
419 					vm_radix_addnode(parent, index, leaf,
420 					    LOCKED);
421 				return (0);
422 			}
423 			newind = vm_radix_topage(rnode)->pindex;
424 			if (newind == index)
425 				panic("%s: key %jx is already present",
426 				    __func__, (uintmax_t)index);
427 			break;
428 		}
429 		if (vm_radix_keybarr(rnode, index, &slot)) {
430 			newind = rnode->rn_owner;
431 			break;
432 		}
433 		parent = rnode;
434 		rnode = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
435 	}
436 
437 	/*
438 	 * A new node is needed because the correct insertion level has been
439 	 * reached.  Set up the new intermediate node and add the two children:
440 	 * the new object and the older edge or object.
441 	 */
442 	parentp = (parent != NULL) ? &parent->rn_child[slot]:
443 	    (smrnode_t *)&rtree->rt_root;
444 	parent = vm_radix_node_get(index, newind);
445 	if (parent == NULL)
446 		return (ENOMEM);
447 	/* These writes are not yet visible due to ordering. */
448 	vm_radix_addnode(parent, index, leaf, UNSERIALIZED);
449 	vm_radix_addnode(parent, newind, rnode, UNSERIALIZED);
450 	/* Serializing write to make the above visible. */
451 	vm_radix_node_store(parentp, parent, LOCKED);
452 	return (0);
453 }
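
/*
 * A minimal usage sketch (hypothetical "tree" and page "m"; insertions
 * must be serialized by the consumer's own lock):
 *
 *	if (vm_radix_insert(tree, m) != 0) {
 *		... ENOMEM: a node allocation failed; the caller can
 *		... vm_radix_wait() for a node and retry the insertion.
 *	}
 */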
454 
455 /*
456  * Returns the value stored at the index.  If the index is not present,
457  * NULL is returned.
458  */
459 static __always_inline vm_page_t
460 _vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index,
461     enum vm_radix_access access)
462 {
463 	struct vm_radix_node *rnode;
464 	vm_page_t m;
465 	int slot;
466 
467 	rnode = vm_radix_root_load(rtree, access);
468 	for (;;) {
469 		if (vm_radix_isleaf(rnode)) {
470 			if ((m = vm_radix_topage(rnode)) != NULL &&
471 			    m->pindex == index)
472 				return (m);
473 			break;
474 		}
475 		if (vm_radix_keybarr(rnode, index, &slot))
476 			break;
477 		rnode = vm_radix_node_load(&rnode->rn_child[slot], access);
478 	}
479 	return (NULL);
480 }
481 
482 /*
483  * Returns the value stored at the index assuming there is an external lock.
484  *
485  * If the index is not present, NULL is returned.
486  */
487 vm_page_t
488 vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
489 {
490 
491 	return _vm_radix_lookup(rtree, index, LOCKED);
492 }
493 
494 /*
495  * Returns the value stored at the index without requiring an external lock.
496  *
497  * If the index is not present, NULL is returned.
498  */
499 vm_page_t
500 vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index)
501 {
502 	vm_page_t m;
503 
504 	smr_enter(vm_radix_smr);
505 	m = _vm_radix_lookup(rtree, index, SMR);
506 	smr_exit(vm_radix_smr);
507 
508 	return (m);
509 }
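
/*
 * A usage note: because the lookup above runs inside an SMR read section,
 * the returned page is only known to have been in the trie at some instant
 * during the call and may be concurrently removed.  Callers that need a
 * stable result must revalidate under their own synchronization, e.g.
 * (with "revalidate" standing in for the caller's own check):
 *
 *	m = vm_radix_lookup_unlocked(rtree, index);
 *	if (m != NULL && !revalidate(m))
 *		m = NULL;
 */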
510 
511 /*
512  * Returns the page with the least pindex that is greater than or equal to the
513  * specified pindex, or NULL if there are no such pages.
514  *
515  * Requires that access be externally synchronized by a lock.
516  */
517 vm_page_t
518 vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
519 {
520 	struct vm_radix_node *rnode, *succ;
521 	vm_page_t m;
522 	int slot;
523 
524 	/*
525 	 * Descend the trie as if performing an ordinary lookup for the page
526 	 * with the specified pindex.  However, unlike an ordinary lookup, as we
527 	 * descend the trie, we use "succ" to remember the last branching-off
528 	 * point, that is, the interior node under which the page with the least
529 	 * pindex that is both outside our current path down the trie and greater
530 	 * than the specified pindex resides.  (The node's popmap makes it fast
531 	 * and easy to recognize a branching-off point.)  If our ordinary lookup
532 	 * fails to yield a page with a pindex that is greater than or equal to
533 	 * the specified pindex, then we will exit this loop and perform a
534 	 * lookup starting from "succ".  If "succ" is not NULL, then that lookup
535 	 * is guaranteed to succeed.
536 	 */
537 	rnode = vm_radix_root_load(rtree, LOCKED);
538 	succ = NULL;
539 	for (;;) {
540 		if (vm_radix_isleaf(rnode)) {
541 			if ((m = vm_radix_topage(rnode)) != NULL &&
542 			    m->pindex >= index)
543 				return (m);
544 			break;
545 		}
546 		if (vm_radix_keybarr(rnode, index, &slot)) {
547 			/*
548 			 * If all pages in this subtree have pindex > index,
549 			 * then the page in this subtree with the least pindex
550 			 * is the answer.
551 			 */
552 			if (rnode->rn_owner > index)
553 				succ = rnode;
554 			break;
555 		}
556 
557 		/*
558 		 * Just in case the next search step leads to a subtree of all
559 		 * pages with pindex < index, check popmap to see if a next
560 		 * bigger step, to a subtree of all pages with pindex > index,
561 		 * is available.  If so, remember to restart the search here.
562 		 */
563 		if ((rnode->rn_popmap >> slot) > 1)
564 			succ = rnode;
565 		rnode = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
566 	}
567 
568 	/*
569 	 * Restart the search from the last place visited in the subtree that
570 	 * included some pages with pindex > index, if there was such a place.
571 	 */
572 	if (succ == NULL)
573 		return (NULL);
574 	if (succ != rnode) {
575 		/*
576 		 * Take a step to the next bigger sibling of the node chosen
577 		 * last time.  In that subtree, all pages have pindex > index.
578 		 */
579 		slot = vm_radix_slot(succ, index) + 1;
580 		KASSERT((succ->rn_popmap >> slot) != 0,
581 		    ("%s: no popmap siblings past slot %d in node %p",
582 		    __func__, slot, succ));
583 		slot += ffs(succ->rn_popmap >> slot) - 1;
584 		succ = vm_radix_node_load(&succ->rn_child[slot], LOCKED);
585 	}
586 
587 	/*
588 	 * Find the page in the subtree rooted at "succ" with the least pindex.
589 	 */
590 	while (!vm_radix_isleaf(succ)) {
591 		KASSERT(succ->rn_popmap != 0,
592 		    ("%s: no popmap children in node %p",  __func__, succ));
593 		slot = ffs(succ->rn_popmap) - 1;
594 		succ = vm_radix_node_load(&succ->rn_child[slot], LOCKED);
595 	}
596 	return (vm_radix_topage(succ));
597 }
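
/*
 * A small worked example, assuming VM_RADIX_WIDTH == 4: with pages at
 * pindexes 0x10, 0x11 and 0x25, vm_radix_lookup_ge(rtree, 0x12) descends
 * toward 0x12, remembers the root node (owner 0x00, clev 4) as "succ"
 * because a slot greater than (0x12 >> 4) & 0xf == 1 is populated, finds
 * nothing in the subtree holding 0x10 and 0x11, then steps to the next
 * populated slot (slot 2) and returns the least page there, 0x25.
 */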
598 
599 /*
600  * Returns the page with the greatest pindex that is less than or equal to the
601  * specified pindex, or NULL if there are no such pages.
602  *
603  * Requires that access be externally synchronized by a lock.
604  */
605 vm_page_t
606 vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
607 {
608 	struct vm_radix_node *pred, *rnode;
609 	vm_page_t m;
610 	int slot;
611 
612 	/*
613 	 * Mirror the implementation of vm_radix_lookup_ge, described above.
614 	 */
615 	rnode = vm_radix_root_load(rtree, LOCKED);
616 	pred = NULL;
617 	for (;;) {
618 		if (vm_radix_isleaf(rnode)) {
619 			if ((m = vm_radix_topage(rnode)) != NULL &&
620 			    m->pindex <= index)
621 				return (m);
622 			break;
623 		}
624 		if (vm_radix_keybarr(rnode, index, &slot)) {
625 			if (rnode->rn_owner < index)
626 				pred = rnode;
627 			break;
628 		}
629 		if ((rnode->rn_popmap & ((1 << slot) - 1)) != 0)
630 			pred = rnode;
631 		rnode = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
632 	}
633 	if (pred == NULL)
634 		return (NULL);
635 	if (pred != rnode) {
636 		slot = vm_radix_slot(pred, index);
637 		KASSERT((pred->rn_popmap & ((1 << slot) - 1)) != 0,
638 		    ("%s: no popmap siblings before slot %d in node %p",
639 		    __func__, slot, pred));
640 		slot = fls(pred->rn_popmap & ((1 << slot) - 1)) - 1;
641 		pred = vm_radix_node_load(&pred->rn_child[slot], LOCKED);
642 	}
643 	while (!vm_radix_isleaf(pred)) {
644 		KASSERT(pred->rn_popmap != 0,
645 		    ("%s: no popmap children in node %p",  __func__, pred));
646 		slot = fls(pred->rn_popmap) - 1;
647 		pred = vm_radix_node_load(&pred->rn_child[slot], LOCKED);
648 	}
649 	return (vm_radix_topage(pred));
650 }
651 
652 /*
653  * Remove the specified index from the trie, and return the value stored at
654  * that index.  If the index is not present, return NULL.
655  */
656 vm_page_t
657 vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
658 {
659 	struct vm_radix_node *child, *parent, *rnode;
660 	vm_page_t m;
661 	int slot;
662 
663 	rnode = NULL;
664 	child = vm_radix_root_load(rtree, LOCKED);
665 	for (;;) {
666 		if (vm_radix_isleaf(child))
667 			break;
668 		parent = rnode;
669 		rnode = child;
670 		slot = vm_radix_slot(rnode, index);
671 		child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
672 	}
673 	if ((m = vm_radix_topage(child)) == NULL || m->pindex != index)
674 		return (NULL);
675 	if (rnode == NULL) {
676 		vm_radix_root_store(rtree, VM_RADIX_NULL, LOCKED);
677 		return (m);
678 	}
679 	KASSERT((rnode->rn_popmap & (1 << slot)) != 0,
680 	    ("%s: bad popmap slot %d in rnode %p", __func__, slot, rnode));
681 	rnode->rn_popmap ^= 1 << slot;
682 	vm_radix_node_store(&rnode->rn_child[slot], VM_RADIX_NULL, LOCKED);
683 	if (!powerof2(rnode->rn_popmap))
684 		return (m);
685 	KASSERT(rnode->rn_popmap != 0, ("%s: bad popmap all zeroes", __func__));
686 	slot = ffs(rnode->rn_popmap) - 1;
687 	child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
688 	KASSERT(child != VM_RADIX_NULL,
689 	    ("%s: bad popmap slot %d in rnode %p", __func__, slot, rnode));
690 	if (parent == NULL)
691 		vm_radix_root_store(rtree, child, LOCKED);
692 	else {
693 		slot = vm_radix_slot(parent, index);
694 		KASSERT(rnode ==
695 		    vm_radix_node_load(&parent->rn_child[slot], LOCKED),
696 		    ("%s: invalid child value", __func__));
697 		vm_radix_node_store(&parent->rn_child[slot], child, LOCKED);
698 	}
699 	/*
700 	 * The child is still valid and we cannot zero the pointer until
701 	 * all SMR references are gone.
702 	 */
703 	vm_radix_node_put(rnode);
704 	return (m);
705 }
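
/*
 * For example, removing 0x11 from a node that holds only the leaves 0x10
 * and 0x11 leaves a single child (rn_popmap becomes a power of two), so
 * the node is spliced out: the remaining child is hung directly from the
 * parent and the node itself goes back to the SMR zone, to be reused only
 * after concurrent readers are done with it.
 */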
706 
707 /*
708  * Remove and free all the nodes from the radix tree.
709  * This function is recursive, but the recursion is tightly bounded
710  * because the maximum depth of the tree is fixed.
711  */
712 void
713 vm_radix_reclaim_allnodes(struct vm_radix *rtree)
714 {
715 	struct vm_radix_node *root;
716 
717 	root = vm_radix_root_load(rtree, LOCKED);
718 	if (root == VM_RADIX_NULL)
719 		return;
720 	vm_radix_root_store(rtree, VM_RADIX_NULL, UNSERIALIZED);
721 	if (!vm_radix_isleaf(root))
722 		vm_radix_reclaim_allnodes_int(root);
723 }
724 
725 /*
726  * Replace an existing page in the trie with another one.
727  * Panics if there is not an old page in the trie at the new page's index.
728  */
729 vm_page_t
730 vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage)
731 {
732 	struct vm_radix_node *leaf, *parent, *rnode;
733 	vm_page_t m;
734 	vm_pindex_t index;
735 	int slot;
736 
737 	leaf = vm_radix_toleaf(newpage);
738 	index = newpage->pindex;
739 	rnode = vm_radix_root_load(rtree, LOCKED);
740 	parent = NULL;
741 	for (;;) {
742 		if (vm_radix_isleaf(rnode)) {
743 			if ((m = vm_radix_topage(rnode)) != NULL &&
744 			    m->pindex == index) {
745 				if (parent == NULL)
746 					rtree->rt_root = leaf;
747 				else
748 					vm_radix_node_store(
749 					    &parent->rn_child[slot], leaf,
750 					    LOCKED);
751 				return (m);
752 			}
753 			break;
754 		}
755 		if (vm_radix_keybarr(rnode, index, &slot))
756 			break;
757 		parent = rnode;
758 		rnode = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
759 	}
760 	panic("%s: original replacing page not found", __func__);
761 }
762 
763 void
764 vm_radix_wait(void)
765 {
766 	uma_zwait(vm_radix_node_zone);
767 }
768 
769 #ifdef DDB
770 /*
771  * Show details about the given radix node.
772  */
773 DB_SHOW_COMMAND(radixnode, db_show_radixnode)
774 {
775 	struct vm_radix_node *rnode, *tmp;
776 	int slot;
777 	rn_popmap_t popmap;
778 
779 	if (!have_addr)
780 		return;
781 	rnode = (struct vm_radix_node *)addr;
782 	db_printf("radixnode %p, owner %jx, children popmap %04x, level %u:\n",
783 	    (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_popmap,
784 	    rnode->rn_clev / VM_RADIX_WIDTH);
785 	for (popmap = rnode->rn_popmap; popmap != 0; popmap ^= 1 << slot) {
786 		slot = ffs(popmap) - 1;
787 		tmp = vm_radix_node_load(&rnode->rn_child[slot], UNSERIALIZED);
788 		db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
789 		    slot, (void *)tmp,
790 		    vm_radix_isleaf(tmp) ?  vm_radix_topage(tmp) : NULL,
791 		    rnode->rn_clev / VM_RADIX_WIDTH);
792 	}
793 }
794 #endif /* DDB */
795