xref: /freebsd/sys/vm/vm_radix.c (revision e9a994639b2af232f994ba2ad23ca45a17718d2b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2013 EMC Corp.
5  * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
6  * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  */
31 
32 /*
33  * Path-compressed radix trie implementation.
34  * The following code is not generalized into a general purpose library
35  * because there are way too many parameters embedded that should really
36  * be decided by the library consumers.  At the same time, consumers
37  * of this code must achieve highest possible performance.
38  *
39  * The implementation takes into account the following rationale:
40  * - Size of the nodes should be as small as possible but still big enough
41  *   to avoid a large maximum depth for the trie.  This is a balance
42  *   between the necessity to not wire too much physical memory for the nodes
43  *   and the necessity to avoid too much cache pollution during the trie
44  *   operations.
45  * - There is not a huge bias toward the number of lookup operations over
46  *   the number of insert and remove operations.  This basically implies
47  *   that optimizations supposedly helping one operation but hurting the
48  *   other might be carefully evaluated.
49  * - On average not many nodes are expected to be fully populated, hence
50  *   level compression may just complicate things.
51  */
52 
53 #include <sys/cdefs.h>
54 __FBSDID("$FreeBSD$");
55 
56 #include "opt_ddb.h"
57 
58 #include <sys/param.h>
59 #include <sys/systm.h>
60 #include <sys/kernel.h>
61 #include <sys/proc.h>
62 #include <sys/vmmeter.h>
63 #include <sys/smr.h>
64 #include <sys/smr_types.h>
65 
66 #include <vm/uma.h>
67 #include <vm/vm.h>
68 #include <vm/vm_param.h>
69 #include <vm/vm_object.h>
70 #include <vm/vm_page.h>
71 #include <vm/vm_radix.h>
72 
73 #ifdef DDB
74 #include <ddb/ddb.h>
75 #endif
76 
77 /*
78  * These widths should allow the pointers to a node's children to fit within
79  * a single cache line.  The extra levels from a narrow width should not be
80  * a problem thanks to path compression.
81  */
82 #ifdef __LP64__
83 #define	VM_RADIX_WIDTH	4
84 #else
85 #define	VM_RADIX_WIDTH	3
86 #endif
87 
88 #define	VM_RADIX_COUNT	(1 << VM_RADIX_WIDTH)
89 #define	VM_RADIX_MASK	(VM_RADIX_COUNT - 1)
90 #define	VM_RADIX_LIMIT							\
91 	(howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1)
92 
93 /* Flag bits stored in node pointers. */
94 #define	VM_RADIX_ISLEAF	0x1
95 #define	VM_RADIX_FLAGS	0x1
96 #define	VM_RADIX_PAD	VM_RADIX_FLAGS
97 
98 /* Returns one unit associated with specified level. */
99 #define	VM_RADIX_UNITLEVEL(lev)						\
100 	((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH))
101 
102 enum vm_radix_access { SMR, LOCKED, UNSERIALIZED };
103 
104 struct vm_radix_node;
105 typedef SMR_POINTER(struct vm_radix_node *) smrnode_t;
106 
107 struct vm_radix_node {
108 	vm_pindex_t	rn_owner;			/* Owner of record. */
109 	uint16_t	rn_count;			/* Valid children. */
110 	uint8_t		rn_clev;			/* Current level. */
111 	int8_t		rn_last;			/* zero last ptr. */
112 	smrnode_t	rn_child[VM_RADIX_COUNT];	/* Child nodes. */
113 };
114 
115 static uma_zone_t vm_radix_node_zone;
116 static smr_t vm_radix_smr;
117 
118 static void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
119     enum vm_radix_access access);
120 
121 /*
122  * Allocate a radix node.
123  */
124 static struct vm_radix_node *
125 vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
126 {
127 	struct vm_radix_node *rnode;
128 
129 	rnode = uma_zalloc_smr(vm_radix_node_zone, M_NOWAIT);
130 	if (rnode == NULL)
131 		return (NULL);
132 
133 	/*
134 	 * We want to clear the last child pointer after the final section
135 	 * has exited so lookup can not return false negatives.  It is done
136 	 * here because it will be cache-cold in the dtor callback.
137 	 */
138 	if (rnode->rn_last != 0) {
139 		vm_radix_node_store(&rnode->rn_child[rnode->rn_last - 1],
140 		    NULL, UNSERIALIZED);
141 		rnode->rn_last = 0;
142 	}
143 	rnode->rn_owner = owner;
144 	rnode->rn_count = count;
145 	rnode->rn_clev = clevel;
146 	return (rnode);
147 }
148 
149 /*
150  * Free radix node.
151  */
152 static __inline void
153 vm_radix_node_put(struct vm_radix_node *rnode, int8_t last)
154 {
155 #ifdef INVARIANTS
156 	int slot;
157 
158 	KASSERT(rnode->rn_count == 0,
159 	    ("vm_radix_node_put: rnode %p has %d children", rnode,
160 	    rnode->rn_count));
161 	for (slot = 0; slot < VM_RADIX_COUNT; slot++) {
162 		if (slot == last)
163 			continue;
164 		KASSERT(smr_unserialized_load(&rnode->rn_child[slot], true) ==
165 		    NULL, ("vm_radix_node_put: rnode %p has a child", rnode));
166 	}
167 #endif
168 	/* Off by one so a freshly zero'd node is not assigned to. */
169 	rnode->rn_last = last + 1;
170 	uma_zfree_smr(vm_radix_node_zone, rnode);
171 }
172 
173 /*
174  * Return the position in the array for a given level.
175  */
176 static __inline int
177 vm_radix_slot(vm_pindex_t index, uint16_t level)
178 {
179 
180 	return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK);
181 }
182 
183 /* Trims the key after the specified level. */
184 static __inline vm_pindex_t
185 vm_radix_trimkey(vm_pindex_t index, uint16_t level)
186 {
187 	vm_pindex_t ret;
188 
189 	ret = index;
190 	if (level > 0) {
191 		ret >>= level * VM_RADIX_WIDTH;
192 		ret <<= level * VM_RADIX_WIDTH;
193 	}
194 	return (ret);
195 }
196 
197 /*
198  * Fetch a node pointer from a slot in another node.
199  */
200 static __inline struct vm_radix_node *
201 vm_radix_node_load(smrnode_t *p, enum vm_radix_access access)
202 {
203 
204 	switch (access) {
205 	case UNSERIALIZED:
206 		return (smr_unserialized_load(p, true));
207 	case LOCKED:
208 		return (smr_serialized_load(p, true));
209 	case SMR:
210 		return (smr_entered_load(p, vm_radix_smr));
211 	}
212 	__assert_unreachable();
213 }
214 
215 static __inline void
216 vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
217     enum vm_radix_access access)
218 {
219 
220 	switch (access) {
221 	case UNSERIALIZED:
222 		smr_unserialized_store(p, v, true);
223 		break;
224 	case LOCKED:
225 		smr_serialized_store(p, v, true);
226 		break;
227 	case SMR:
228 		panic("vm_radix_node_store: Not supported in smr section.");
229 	}
230 }
231 
232 /*
233  * Get the root node for a radix tree.
234  */
235 static __inline struct vm_radix_node *
236 vm_radix_root_load(struct vm_radix *rtree, enum vm_radix_access access)
237 {
238 
239 	return (vm_radix_node_load((smrnode_t *)&rtree->rt_root, access));
240 }
241 
242 /*
243  * Set the root node for a radix tree.
244  */
245 static __inline void
246 vm_radix_root_store(struct vm_radix *rtree, struct vm_radix_node *rnode,
247     enum vm_radix_access access)
248 {
249 
250 	vm_radix_node_store((smrnode_t *)&rtree->rt_root, rnode, access);
251 }
252 
253 /*
254  * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
255  */
256 static __inline boolean_t
257 vm_radix_isleaf(struct vm_radix_node *rnode)
258 {
259 
260 	return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
261 }
262 
263 /*
264  * Returns the associated page extracted from rnode.
265  */
266 static __inline vm_page_t
267 vm_radix_topage(struct vm_radix_node *rnode)
268 {
269 
270 	return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
271 }
272 
273 /*
274  * Adds the page as a child of the provided node.
275  */
276 static __inline void
277 vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
278     vm_page_t page, enum vm_radix_access access)
279 {
280 	int slot;
281 
282 	slot = vm_radix_slot(index, clev);
283 	vm_radix_node_store(&rnode->rn_child[slot],
284 	    (struct vm_radix_node *)((uintptr_t)page | VM_RADIX_ISLEAF), access);
285 }
286 
287 /*
288  * Returns the slot where two keys differ.
289  * It cannot accept 2 equal keys.
290  */
291 static __inline uint16_t
292 vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
293 {
294 	uint16_t clev;
295 
296 	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
297 	    __func__, (uintmax_t)index1));
298 
299 	index1 ^= index2;
300 	for (clev = VM_RADIX_LIMIT;; clev--)
301 		if (vm_radix_slot(index1, clev) != 0)
302 			return (clev);
303 }
304 
305 /*
306  * Returns TRUE if it can be determined that key does not belong to the
307  * specified rnode.  Otherwise, returns FALSE.
308  */
309 static __inline boolean_t
310 vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
311 {
312 
313 	if (rnode->rn_clev < VM_RADIX_LIMIT) {
314 		idx = vm_radix_trimkey(idx, rnode->rn_clev + 1);
315 		return (idx != rnode->rn_owner);
316 	}
317 	return (FALSE);
318 }
319 
320 /*
321  * Internal helper for vm_radix_reclaim_allnodes().
322  * This function is recursive.
323  */
324 static void
325 vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
326 {
327 	struct vm_radix_node *child;
328 	int slot;
329 
330 	KASSERT(rnode->rn_count <= VM_RADIX_COUNT,
331 	    ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode));
332 	for (slot = 0; rnode->rn_count != 0; slot++) {
333 		child = vm_radix_node_load(&rnode->rn_child[slot], UNSERIALIZED);
334 		if (child == NULL)
335 			continue;
336 		if (!vm_radix_isleaf(child))
337 			vm_radix_reclaim_allnodes_int(child);
338 		vm_radix_node_store(&rnode->rn_child[slot], NULL, UNSERIALIZED);
339 		rnode->rn_count--;
340 	}
341 	vm_radix_node_put(rnode, -1);
342 }
343 
344 #ifndef UMA_MD_SMALL_ALLOC
345 void vm_radix_reserve_kva(void);
346 /*
347  * Reserve the KVA necessary to satisfy the node allocation.
348  * This is mandatory in architectures not supporting direct
349  * mapping as they will need otherwise to carve into the kernel maps for
350  * every node allocation, resulting into deadlocks for consumers already
351  * working with kernel maps.
352  */
353 void
354 vm_radix_reserve_kva(void)
355 {
356 
357 	/*
358 	 * Calculate the number of reserved nodes, discounting the pages that
359 	 * are needed to store them.
360 	 */
361 	if (!uma_zone_reserve_kva(vm_radix_node_zone,
362 	    ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
363 	    sizeof(struct vm_radix_node))))
364 		panic("%s: unable to reserve KVA", __func__);
365 }
366 #endif
367 
368 /*
369  * Initialize the UMA slab zone.
370  */
371 void
372 vm_radix_zinit(void)
373 {
374 
375 	vm_radix_node_zone = uma_zcreate("RADIX NODE",
376 	    sizeof(struct vm_radix_node), NULL, NULL, NULL, NULL,
377 	    VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_SMR | UMA_ZONE_ZINIT);
378 	vm_radix_smr = uma_zone_get_smr(vm_radix_node_zone);
379 }
380 
381 /*
382  * Inserts the key-value pair into the trie.
383  * Panics if the key already exists.
384  */
385 int
386 vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
387 {
388 	vm_pindex_t index, newind;
389 	struct vm_radix_node *rnode, *tmp;
390 	smrnode_t *parentp;
391 	vm_page_t m;
392 	int slot;
393 	uint16_t clev;
394 
395 	index = page->pindex;
396 
397 	/*
398 	 * The owner of record for root is not really important because it
399 	 * will never be used.
400 	 */
401 	rnode = vm_radix_root_load(rtree, LOCKED);
402 	if (rnode == NULL) {
403 		rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF;
404 		return (0);
405 	}
406 	parentp = (smrnode_t *)&rtree->rt_root;
407 	for (;;) {
408 		if (vm_radix_isleaf(rnode)) {
409 			m = vm_radix_topage(rnode);
410 			if (m->pindex == index)
411 				panic("%s: key %jx is already present",
412 				    __func__, (uintmax_t)index);
413 			clev = vm_radix_keydiff(m->pindex, index);
414 			tmp = vm_radix_node_get(vm_radix_trimkey(index,
415 			    clev + 1), 2, clev);
416 			if (tmp == NULL)
417 				return (ENOMEM);
418 			/* These writes are not yet visible due to ordering. */
419 			vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
420 			vm_radix_addpage(tmp, m->pindex, clev, m, UNSERIALIZED);
421 			/* Synchronize to make leaf visible. */
422 			vm_radix_node_store(parentp, tmp, LOCKED);
423 			return (0);
424 		} else if (vm_radix_keybarr(rnode, index))
425 			break;
426 		slot = vm_radix_slot(index, rnode->rn_clev);
427 		parentp = &rnode->rn_child[slot];
428 		tmp = vm_radix_node_load(parentp, LOCKED);
429 		if (tmp == NULL) {
430 			rnode->rn_count++;
431 			vm_radix_addpage(rnode, index, rnode->rn_clev, page,
432 			    LOCKED);
433 			return (0);
434 		}
435 		rnode = tmp;
436 	}
437 
438 	/*
439 	 * A new node is needed because the right insertion level is reached.
440 	 * Setup the new intermediate node and add the 2 children: the
441 	 * new object and the older edge.
442 	 */
443 	newind = rnode->rn_owner;
444 	clev = vm_radix_keydiff(newind, index);
445 	tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
446 	if (tmp == NULL)
447 		return (ENOMEM);
448 	slot = vm_radix_slot(newind, clev);
449 	/* These writes are not yet visible due to ordering. */
450 	vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
451 	vm_radix_node_store(&tmp->rn_child[slot], rnode, UNSERIALIZED);
452 	/* Serializing write to make the above visible. */
453 	vm_radix_node_store(parentp, tmp, LOCKED);
454 
455 	return (0);
456 }
457 
458 /*
459  * Returns TRUE if the specified radix tree contains a single leaf and FALSE
460  * otherwise.
461  */
462 boolean_t
463 vm_radix_is_singleton(struct vm_radix *rtree)
464 {
465 	struct vm_radix_node *rnode;
466 
467 	rnode = vm_radix_root_load(rtree, LOCKED);
468 	if (rnode == NULL)
469 		return (FALSE);
470 	return (vm_radix_isleaf(rnode));
471 }
472 
473 /*
474  * Returns the value stored at the index.  If the index is not present,
475  * NULL is returned.
476  */
477 static __always_inline vm_page_t
478 _vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index,
479     enum vm_radix_access access)
480 {
481 	struct vm_radix_node *rnode;
482 	vm_page_t m;
483 	int slot;
484 
485 	rnode = vm_radix_root_load(rtree, access);
486 	while (rnode != NULL) {
487 		if (vm_radix_isleaf(rnode)) {
488 			m = vm_radix_topage(rnode);
489 			if (m->pindex == index)
490 				return (m);
491 			break;
492 		}
493 		if (vm_radix_keybarr(rnode, index))
494 			break;
495 		slot = vm_radix_slot(index, rnode->rn_clev);
496 		rnode = vm_radix_node_load(&rnode->rn_child[slot], access);
497 	}
498 	return (NULL);
499 }
500 
501 /*
502  * Returns the value stored at the index assuming there is an external lock.
503  *
504  * If the index is not present, NULL is returned.
505  */
506 vm_page_t
507 vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
508 {
509 
510 	return _vm_radix_lookup(rtree, index, LOCKED);
511 }
512 
513 /*
514  * Returns the value stored at the index without requiring an external lock.
515  *
516  * If the index is not present, NULL is returned.
517  */
518 vm_page_t
519 vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index)
520 {
521 	vm_page_t m;
522 
523 	smr_enter(vm_radix_smr);
524 	m = _vm_radix_lookup(rtree, index, SMR);
525 	smr_exit(vm_radix_smr);
526 
527 	return (m);
528 }
529 
530 /*
531  * Look up the nearest entry at a position greater than or equal to index.
532  */
533 vm_page_t
534 vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
535 {
536 	struct vm_radix_node *stack[VM_RADIX_LIMIT];
537 	vm_pindex_t inc;
538 	vm_page_t m;
539 	struct vm_radix_node *child, *rnode;
540 #ifdef INVARIANTS
541 	int loops = 0;
542 #endif
543 	int slot, tos;
544 
545 	rnode = vm_radix_root_load(rtree, LOCKED);
546 	if (rnode == NULL)
547 		return (NULL);
548 	else if (vm_radix_isleaf(rnode)) {
549 		m = vm_radix_topage(rnode);
550 		if (m->pindex >= index)
551 			return (m);
552 		else
553 			return (NULL);
554 	}
555 	tos = 0;
556 	for (;;) {
557 		/*
558 		 * If the keys differ before the current bisection node,
559 		 * then the search key might rollback to the earliest
560 		 * available bisection node or to the smallest key
561 		 * in the current node (if the owner is greater than the
562 		 * search key).
563 		 */
564 		if (vm_radix_keybarr(rnode, index)) {
565 			if (index > rnode->rn_owner) {
566 ascend:
567 				KASSERT(++loops < 1000,
568 				    ("vm_radix_lookup_ge: too many loops"));
569 
570 				/*
571 				 * Pop nodes from the stack until either the
572 				 * stack is empty or a node that could have a
573 				 * matching descendant is found.
574 				 */
575 				do {
576 					if (tos == 0)
577 						return (NULL);
578 					rnode = stack[--tos];
579 				} while (vm_radix_slot(index,
580 				    rnode->rn_clev) == (VM_RADIX_COUNT - 1));
581 
582 				/*
583 				 * The following computation cannot overflow
584 				 * because index's slot at the current level
585 				 * is less than VM_RADIX_COUNT - 1.
586 				 */
587 				index = vm_radix_trimkey(index,
588 				    rnode->rn_clev);
589 				index += VM_RADIX_UNITLEVEL(rnode->rn_clev);
590 			} else
591 				index = rnode->rn_owner;
592 			KASSERT(!vm_radix_keybarr(rnode, index),
593 			    ("vm_radix_lookup_ge: keybarr failed"));
594 		}
595 		slot = vm_radix_slot(index, rnode->rn_clev);
596 		child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
597 		if (vm_radix_isleaf(child)) {
598 			m = vm_radix_topage(child);
599 			if (m->pindex >= index)
600 				return (m);
601 		} else if (child != NULL)
602 			goto descend;
603 
604 		/*
605 		 * Look for an available edge or page within the current
606 		 * bisection node.
607 		 */
608                 if (slot < (VM_RADIX_COUNT - 1)) {
609 			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
610 			index = vm_radix_trimkey(index, rnode->rn_clev);
611 			do {
612 				index += inc;
613 				slot++;
614 				child = vm_radix_node_load(&rnode->rn_child[slot],
615 				    LOCKED);
616 				if (vm_radix_isleaf(child)) {
617 					m = vm_radix_topage(child);
618 					if (m->pindex >= index)
619 						return (m);
620 				} else if (child != NULL)
621 					goto descend;
622 			} while (slot < (VM_RADIX_COUNT - 1));
623 		}
624 		KASSERT(child == NULL || vm_radix_isleaf(child),
625 		    ("vm_radix_lookup_ge: child is radix node"));
626 
627 		/*
628 		 * If a page or edge greater than the search slot is not found
629 		 * in the current node, ascend to the next higher-level node.
630 		 */
631 		goto ascend;
632 descend:
633 		KASSERT(rnode->rn_clev > 0,
634 		    ("vm_radix_lookup_ge: pushing leaf's parent"));
635 		KASSERT(tos < VM_RADIX_LIMIT,
636 		    ("vm_radix_lookup_ge: stack overflow"));
637 		stack[tos++] = rnode;
638 		rnode = child;
639 	}
640 }
641 
642 /*
643  * Look up the nearest entry at a position less than or equal to index.
644  */
645 vm_page_t
646 vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
647 {
648 	struct vm_radix_node *stack[VM_RADIX_LIMIT];
649 	vm_pindex_t inc;
650 	vm_page_t m;
651 	struct vm_radix_node *child, *rnode;
652 #ifdef INVARIANTS
653 	int loops = 0;
654 #endif
655 	int slot, tos;
656 
657 	rnode = vm_radix_root_load(rtree, LOCKED);
658 	if (rnode == NULL)
659 		return (NULL);
660 	else if (vm_radix_isleaf(rnode)) {
661 		m = vm_radix_topage(rnode);
662 		if (m->pindex <= index)
663 			return (m);
664 		else
665 			return (NULL);
666 	}
667 	tos = 0;
668 	for (;;) {
669 		/*
670 		 * If the keys differ before the current bisection node,
671 		 * then the search key might rollback to the earliest
672 		 * available bisection node or to the largest key
673 		 * in the current node (if the owner is smaller than the
674 		 * search key).
675 		 */
676 		if (vm_radix_keybarr(rnode, index)) {
677 			if (index > rnode->rn_owner) {
678 				index = rnode->rn_owner + VM_RADIX_COUNT *
679 				    VM_RADIX_UNITLEVEL(rnode->rn_clev);
680 			} else {
681 ascend:
682 				KASSERT(++loops < 1000,
683 				    ("vm_radix_lookup_le: too many loops"));
684 
685 				/*
686 				 * Pop nodes from the stack until either the
687 				 * stack is empty or a node that could have a
688 				 * matching descendant is found.
689 				 */
690 				do {
691 					if (tos == 0)
692 						return (NULL);
693 					rnode = stack[--tos];
694 				} while (vm_radix_slot(index,
695 				    rnode->rn_clev) == 0);
696 
697 				/*
698 				 * The following computation cannot overflow
699 				 * because index's slot at the current level
700 				 * is greater than 0.
701 				 */
702 				index = vm_radix_trimkey(index,
703 				    rnode->rn_clev);
704 			}
705 			index--;
706 			KASSERT(!vm_radix_keybarr(rnode, index),
707 			    ("vm_radix_lookup_le: keybarr failed"));
708 		}
709 		slot = vm_radix_slot(index, rnode->rn_clev);
710 		child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
711 		if (vm_radix_isleaf(child)) {
712 			m = vm_radix_topage(child);
713 			if (m->pindex <= index)
714 				return (m);
715 		} else if (child != NULL)
716 			goto descend;
717 
718 		/*
719 		 * Look for an available edge or page within the current
720 		 * bisection node.
721 		 */
722 		if (slot > 0) {
723 			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
724 			index |= inc - 1;
725 			do {
726 				index -= inc;
727 				slot--;
728 				child = vm_radix_node_load(&rnode->rn_child[slot],
729 				    LOCKED);
730 				if (vm_radix_isleaf(child)) {
731 					m = vm_radix_topage(child);
732 					if (m->pindex <= index)
733 						return (m);
734 				} else if (child != NULL)
735 					goto descend;
736 			} while (slot > 0);
737 		}
738 		KASSERT(child == NULL || vm_radix_isleaf(child),
739 		    ("vm_radix_lookup_le: child is radix node"));
740 
741 		/*
742 		 * If a page or edge smaller than the search slot is not found
743 		 * in the current node, ascend to the next higher-level node.
744 		 */
745 		goto ascend;
746 descend:
747 		KASSERT(rnode->rn_clev > 0,
748 		    ("vm_radix_lookup_le: pushing leaf's parent"));
749 		KASSERT(tos < VM_RADIX_LIMIT,
750 		    ("vm_radix_lookup_le: stack overflow"));
751 		stack[tos++] = rnode;
752 		rnode = child;
753 	}
754 }
755 
756 /*
757  * Remove the specified index from the trie, and return the value stored at
758  * that index.  If the index is not present, return NULL.
759  */
760 vm_page_t
761 vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
762 {
763 	struct vm_radix_node *rnode, *parent, *tmp;
764 	vm_page_t m;
765 	int i, slot;
766 
767 	rnode = vm_radix_root_load(rtree, LOCKED);
768 	if (vm_radix_isleaf(rnode)) {
769 		m = vm_radix_topage(rnode);
770 		if (m->pindex != index)
771 			return (NULL);
772 		vm_radix_root_store(rtree, NULL, LOCKED);
773 		return (m);
774 	}
775 	parent = NULL;
776 	for (;;) {
777 		if (rnode == NULL)
778 			return (NULL);
779 		slot = vm_radix_slot(index, rnode->rn_clev);
780 		tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
781 		if (vm_radix_isleaf(tmp)) {
782 			m = vm_radix_topage(tmp);
783 			if (m->pindex != index)
784 				return (NULL);
785 			vm_radix_node_store(&rnode->rn_child[slot], NULL, LOCKED);
786 			rnode->rn_count--;
787 			if (rnode->rn_count > 1)
788 				return (m);
789 			for (i = 0; i < VM_RADIX_COUNT; i++)
790 				if (vm_radix_node_load(&rnode->rn_child[i],
791 				    LOCKED) != NULL)
792 					break;
793 			KASSERT(i != VM_RADIX_COUNT,
794 			    ("%s: invalid node configuration", __func__));
795 			tmp = vm_radix_node_load(&rnode->rn_child[i], LOCKED);
796 			if (parent == NULL)
797 				vm_radix_root_store(rtree, tmp, LOCKED);
798 			else {
799 				slot = vm_radix_slot(index, parent->rn_clev);
800 				KASSERT(vm_radix_node_load(
801 				    &parent->rn_child[slot], LOCKED) == rnode,
802 				    ("%s: invalid child value", __func__));
803 				vm_radix_node_store(&parent->rn_child[slot],
804 				    tmp, LOCKED);
805 			}
806 			/*
807 			 * The child is still valid and we can not zero the
808 			 * pointer until all smr references are gone.
809 			 */
810 			rnode->rn_count--;
811 			vm_radix_node_put(rnode, i);
812 			return (m);
813 		}
814 		parent = rnode;
815 		rnode = tmp;
816 	}
817 }
818 
819 /*
820  * Remove and free all the nodes from the radix tree.
821  * This function is recursive but there is a tight control on it as the
822  * maximum depth of the tree is fixed.
823  */
824 void
825 vm_radix_reclaim_allnodes(struct vm_radix *rtree)
826 {
827 	struct vm_radix_node *root;
828 
829 	root = vm_radix_root_load(rtree, LOCKED);
830 	if (root == NULL)
831 		return;
832 	vm_radix_root_store(rtree, NULL, UNSERIALIZED);
833 	if (!vm_radix_isleaf(root))
834 		vm_radix_reclaim_allnodes_int(root);
835 }
836 
837 /*
838  * Replace an existing page in the trie with another one.
839  * Panics if there is not an old page in the trie at the new page's index.
840  */
841 vm_page_t
842 vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage)
843 {
844 	struct vm_radix_node *rnode, *tmp;
845 	vm_page_t m;
846 	vm_pindex_t index;
847 	int slot;
848 
849 	index = newpage->pindex;
850 	rnode = vm_radix_root_load(rtree, LOCKED);
851 	if (rnode == NULL)
852 		panic("%s: replacing page on an empty trie", __func__);
853 	if (vm_radix_isleaf(rnode)) {
854 		m = vm_radix_topage(rnode);
855 		if (m->pindex != index)
856 			panic("%s: original replacing root key not found",
857 			    __func__);
858 		rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF;
859 		return (m);
860 	}
861 	for (;;) {
862 		slot = vm_radix_slot(index, rnode->rn_clev);
863 		tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
864 		if (vm_radix_isleaf(tmp)) {
865 			m = vm_radix_topage(tmp);
866 			if (m->pindex == index) {
867 				vm_radix_node_store(&rnode->rn_child[slot],
868 				    (struct vm_radix_node *)((uintptr_t)newpage |
869 				    VM_RADIX_ISLEAF), LOCKED);
870 				return (m);
871 			} else
872 				break;
873 		} else if (tmp == NULL || vm_radix_keybarr(tmp, index))
874 			break;
875 		rnode = tmp;
876 	}
877 	panic("%s: original replacing page not found", __func__);
878 }
879 
880 void
881 vm_radix_wait(void)
882 {
883 	uma_zwait(vm_radix_node_zone);
884 }
885 
886 #ifdef DDB
887 /*
888  * Show details about the given radix node.
889  */
890 DB_SHOW_COMMAND(radixnode, db_show_radixnode)
891 {
892 	struct vm_radix_node *rnode, *tmp;
893 	int i;
894 
895         if (!have_addr)
896                 return;
897 	rnode = (struct vm_radix_node *)addr;
898 	db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
899 	    (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
900 	    rnode->rn_clev);
901 	for (i = 0; i < VM_RADIX_COUNT; i++) {
902 		tmp = vm_radix_node_load(&rnode->rn_child[i], UNSERIALIZED);
903 		if (tmp != NULL)
904 			db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
905 			    i, (void *)tmp,
906 			    vm_radix_isleaf(tmp) ?  vm_radix_topage(tmp) : NULL,
907 			    rnode->rn_clev);
908 	}
909 }
910 #endif /* DDB */
911