/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013 EMC Corp.
 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

/*
 * Path-compressed radix trie implementation.
 *
 * This code is not generalized into a general-purpose library because
 * there are too many embedded parameters that should really be decided
 * by the library consumers.  At the same time, consumers of this code
 * must achieve the highest possible performance.
 *
 * The implementation takes the following rationale into account:
 * - The node size should be as small as possible, but still big enough
 *   to avoid a large maximum trie depth.  This is a balance between not
 *   wiring too much physical memory for the nodes and avoiding too much
 *   cache pollution during trie operations.
 * - There is no strong bias toward lookup operations over insert and
 *   remove operations, which implies that optimizations helping one
 *   operation at the expense of another must be evaluated carefully.
 * - On average, nodes are not expected to be fully populated, so level
 *   compression would likely just complicate things.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/smr.h>
#include <sys/smr_types.h>

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_radix.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

/*
 * These widths should allow the pointers to a node's children to fit within
 * a single cache line.  The extra levels from a narrow width should not be
 * a problem thanks to path compression.
 */
#ifdef __LP64__
#define	VM_RADIX_WIDTH	4
#else
#define	VM_RADIX_WIDTH	3
#endif

#define	VM_RADIX_COUNT	(1 << VM_RADIX_WIDTH)
#define	VM_RADIX_MASK	(VM_RADIX_COUNT - 1)
#define	VM_RADIX_LIMIT							\
	(howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1)
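
/*
 * Derived constants, for illustration (vm_pindex_t is 64 bits wide):
 * with VM_RADIX_WIDTH == 4 (__LP64__), each node has VM_RADIX_COUNT ==
 * 16 children and VM_RADIX_LIMIT == 64 / 4 - 1 == 15 is the highest
 * node level; with VM_RADIX_WIDTH == 3, VM_RADIX_COUNT == 8 and
 * VM_RADIX_LIMIT == howmany(64, 3) - 1 == 21.
 */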

/* Flag bits stored in node pointers. */
#define	VM_RADIX_ISLEAF	0x1
#define	VM_RADIX_FLAGS	0x1
#define	VM_RADIX_PAD	VM_RADIX_FLAGS

/* Returns one unit associated with the specified level. */
#define	VM_RADIX_UNITLEVEL(lev)						\
	((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH))

enum vm_radix_access { SMR, LOCKED, UNSERIALIZED };
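
/*
 * Reader's note on the access modes above: SMR is used by lock-free
 * readers inside an SMR read section, LOCKED by writers serialized by
 * the external tree lock, and UNSERIALIZED where no concurrent access
 * is possible (e.g. reclaiming a detached subtree or recycling a node
 * inside the allocator).
 */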

struct vm_radix_node;
typedef SMR_POINTER(struct vm_radix_node *) smrnode_t;

struct vm_radix_node {
	vm_pindex_t	rn_owner;			/* Owner of record. */
	uint16_t	rn_count;			/* Valid children. */
	uint8_t		rn_clev;			/* Current level. */
	int8_t		rn_last;			/* Zero last ptr. */
	smrnode_t	rn_child[VM_RADIX_COUNT];	/* Child nodes. */
};

static uma_zone_t vm_radix_node_zone;
static smr_t vm_radix_smr;

static void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
    enum vm_radix_access access);

/*
 * Returns the slot in a node's child array for the given index and level.
 */
static __inline int
vm_radix_slot(vm_pindex_t index, uint16_t level)
{
	return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK);
}

/* Computes the key (index) with the low-order 'level' radix digits zeroed. */
static __inline vm_pindex_t
vm_radix_trimkey(vm_pindex_t index, uint16_t level)
{
	return (index & -VM_RADIX_UNITLEVEL(level));
}
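
/*
 * Worked example (illustrative only, assuming VM_RADIX_WIDTH == 4):
 * for index 0x1234, vm_radix_slot(0x1234, 1) extracts the second radix
 * digit, (0x1234 >> 4) & 0xf == 0x3, while vm_radix_trimkey(0x1234, 1)
 * zeroes the low digit, 0x1234 & ~0xf == 0x1230.
 */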

/*
 * Allocate a radix node.
 */
static struct vm_radix_node *
vm_radix_node_get(vm_pindex_t index, uint16_t clevel)
{
	struct vm_radix_node *rnode;

	rnode = uma_zalloc_smr(vm_radix_node_zone, M_NOWAIT);
	if (rnode == NULL)
		return (NULL);

	/*
	 * We want to clear the last child pointer after the last SMR read
	 * section referencing it has exited, so that lookup cannot return
	 * false negatives.  It is done here because it will be cache-cold
	 * in the dtor callback.
	 */
	if (rnode->rn_last != 0) {
		vm_radix_node_store(&rnode->rn_child[rnode->rn_last - 1],
		    NULL, UNSERIALIZED);
		rnode->rn_last = 0;
	}
	rnode->rn_owner = vm_radix_trimkey(index, clevel + 1);
	rnode->rn_count = 2;
	rnode->rn_clev = clevel;
	return (rnode);
}

/*
 * Free a radix node.
 */
static __inline void
vm_radix_node_put(struct vm_radix_node *rnode, int8_t last)
{
#ifdef INVARIANTS
	int slot;

	KASSERT(rnode->rn_count == 0,
	    ("vm_radix_node_put: rnode %p has %d children", rnode,
	    rnode->rn_count));
	for (slot = 0; slot < VM_RADIX_COUNT; slot++) {
		if (slot == last)
			continue;
		KASSERT(smr_unserialized_load(&rnode->rn_child[slot], true) ==
		    NULL, ("vm_radix_node_put: rnode %p has a child", rnode));
	}
#endif
	/* Off by one so a freshly zeroed node is not assigned to. */
	rnode->rn_last = last + 1;
	uma_zfree_smr(vm_radix_node_zone, rnode);
}

/*
 * Fetch a node pointer from a slot in another node.
 */
static __inline struct vm_radix_node *
vm_radix_node_load(smrnode_t *p, enum vm_radix_access access)
{

	switch (access) {
	case UNSERIALIZED:
		return (smr_unserialized_load(p, true));
	case LOCKED:
		return (smr_serialized_load(p, true));
	case SMR:
		return (smr_entered_load(p, vm_radix_smr));
	}
	__assert_unreachable();
}

static __inline void
vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
    enum vm_radix_access access)
{

	switch (access) {
	case UNSERIALIZED:
		smr_unserialized_store(p, v, true);
		break;
	case LOCKED:
		smr_serialized_store(p, v, true);
		break;
	case SMR:
		panic("vm_radix_node_store: Not supported in smr section.");
	}
}

/*
 * Get the root node for a radix tree.
 */
static __inline struct vm_radix_node *
vm_radix_root_load(struct vm_radix *rtree, enum vm_radix_access access)
{

	return (vm_radix_node_load((smrnode_t *)&rtree->rt_root, access));
}

/*
 * Set the root node for a radix tree.
 */
static __inline void
vm_radix_root_store(struct vm_radix *rtree, struct vm_radix_node *rnode,
    enum vm_radix_access access)
{

	vm_radix_node_store((smrnode_t *)&rtree->rt_root, rnode, access);
}

/*
 * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
 */
static __inline bool
vm_radix_isleaf(struct vm_radix_node *rnode)
{

	return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
}

/*
 * Returns the page cast to a radix node with the leaf bit set.
 */
static __inline struct vm_radix_node *
vm_radix_toleaf(vm_page_t page)
{
	return ((struct vm_radix_node *)((uintptr_t)page | VM_RADIX_ISLEAF));
}

/*
 * Returns the associated page extracted from rnode.
 */
static __inline vm_page_t
vm_radix_topage(struct vm_radix_node *rnode)
{

	return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
}
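
/*
 * Note (illustrative): leaves are told apart from internal nodes by
 * bit 0 of the pointer, which is always free because both node and
 * page structures are at least pointer-aligned, so the round trip
 * vm_radix_topage(vm_radix_toleaf(m)) == m holds for any page m.
 */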

/*
 * Adds the page as a child of the provided node.
 */
static __inline void
vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
    vm_page_t page, enum vm_radix_access access)
{
	int slot;

	slot = vm_radix_slot(index, clev);
	vm_radix_node_store(&rnode->rn_child[slot],
	    vm_radix_toleaf(page), access);
}

/*
 * Returns the level where two keys differ.
 * The two keys must not be equal.
 */
static __inline uint16_t
vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
{

	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
	    __func__, (uintmax_t)index1));
	CTASSERT(sizeof(long long) >= sizeof(vm_pindex_t));

	/*
	 * From the highest-order bit where the indexes differ,
	 * compute the highest level in the trie where they differ.
	 */
	return ((flsll(index1 ^ index2) - 1) / VM_RADIX_WIDTH);
}
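
/*
 * Worked example (illustrative only, assuming VM_RADIX_WIDTH == 4):
 * for keys 0x1234 and 0x1264, 0x1234 ^ 0x1264 == 0x50 and
 * flsll(0x50) == 7, so vm_radix_keydiff() returns (7 - 1) / 4 == 1:
 * the keys first differ in radix digit 1.
 */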

/*
 * Returns TRUE if it can be determined that key does not belong to the
 * specified rnode.  Otherwise, returns FALSE.
 */
static __inline bool
vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
{

	if (rnode->rn_clev < VM_RADIX_LIMIT) {
		idx = vm_radix_trimkey(idx, rnode->rn_clev + 1);
		return (idx != rnode->rn_owner);
	}
	return (false);
}
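
/*
 * Worked example (illustrative only, assuming VM_RADIX_WIDTH == 4):
 * a node with rn_owner == 0x1200 and rn_clev == 1 covers the keys
 * 0x1200 through 0x12ff.  For 0x1234, vm_radix_keybarr() trims the key
 * to 0x1200, which matches rn_owner, so it returns false; for 0x1334
 * the trimmed key is 0x1300, so it returns true and the key cannot be
 * under this node.
 */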

/*
 * Internal helper for vm_radix_reclaim_allnodes().
 * This function is recursive.
 */
static void
vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
{
	struct vm_radix_node *child;
	int slot;

	KASSERT(rnode->rn_count <= VM_RADIX_COUNT,
	    ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode));
	for (slot = 0; rnode->rn_count != 0; slot++) {
		child = vm_radix_node_load(&rnode->rn_child[slot],
		    UNSERIALIZED);
		if (child == NULL)
			continue;
		if (!vm_radix_isleaf(child))
			vm_radix_reclaim_allnodes_int(child);
		vm_radix_node_store(&rnode->rn_child[slot], NULL, UNSERIALIZED);
		rnode->rn_count--;
	}
	vm_radix_node_put(rnode, -1);
}

#ifndef UMA_MD_SMALL_ALLOC
void vm_radix_reserve_kva(void);
/*
 * Reserve the KVA necessary to satisfy node allocations.
 * This is mandatory on architectures without a direct map, which would
 * otherwise have to carve into the kernel maps for every node allocation,
 * resulting in deadlocks for consumers already working with kernel maps.
 */
void
vm_radix_reserve_kva(void)
{

	/*
	 * Calculate the number of reserved nodes, discounting the pages that
	 * are needed to store them.
	 */
	if (!uma_zone_reserve_kva(vm_radix_node_zone,
	    ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
	    sizeof(struct vm_radix_node))))
		panic("%s: unable to reserve KVA", __func__);
}
#endif

/*
 * Initialize the UMA slab zone.
 */
void
vm_radix_zinit(void)
{

	vm_radix_node_zone = uma_zcreate("RADIX NODE",
	    sizeof(struct vm_radix_node), NULL, NULL, NULL, NULL,
	    VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_SMR | UMA_ZONE_ZINIT);
	vm_radix_smr = uma_zone_get_smr(vm_radix_node_zone);
}

/*
 * Inserts the key-value pair into the trie.
 * Panics if the key already exists.
 */
int
vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
{
	vm_pindex_t index, newind;
	struct vm_radix_node *rnode, *tmp;
	smrnode_t *parentp;
	vm_page_t m;
	int slot;
	uint16_t clev;

	index = page->pindex;

	/*
	 * The owner of record for root is not really important because it
	 * will never be used.
	 */
	rnode = vm_radix_root_load(rtree, LOCKED);
	if (rnode == NULL) {
		rtree->rt_root = (uintptr_t)vm_radix_toleaf(page);
		return (0);
	}
	parentp = (smrnode_t *)&rtree->rt_root;
	for (;;) {
		if (vm_radix_isleaf(rnode)) {
			m = vm_radix_topage(rnode);
			if (m->pindex == index)
				panic("%s: key %jx is already present",
				    __func__, (uintmax_t)index);
			clev = vm_radix_keydiff(m->pindex, index);
			tmp = vm_radix_node_get(index, clev);
			if (tmp == NULL)
				return (ENOMEM);
			/* These writes are not yet visible due to ordering. */
			vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
			vm_radix_addpage(tmp, m->pindex, clev, m, UNSERIALIZED);
			/* Synchronize to make leaf visible. */
			vm_radix_node_store(parentp, tmp, LOCKED);
			return (0);
		} else if (vm_radix_keybarr(rnode, index))
			break;
		slot = vm_radix_slot(index, rnode->rn_clev);
		parentp = &rnode->rn_child[slot];
		tmp = vm_radix_node_load(parentp, LOCKED);
		if (tmp == NULL) {
			rnode->rn_count++;
			vm_radix_addpage(rnode, index, rnode->rn_clev, page,
			    LOCKED);
			return (0);
		}
		rnode = tmp;
	}

	/*
	 * A new node is needed because the right insertion level is reached.
	 * Set up the new intermediate node and add the two children: the
	 * new object and the older edge.
	 */
	newind = rnode->rn_owner;
	clev = vm_radix_keydiff(newind, index);
	tmp = vm_radix_node_get(index, clev);
	if (tmp == NULL)
		return (ENOMEM);
	slot = vm_radix_slot(newind, clev);
	/* These writes are not yet visible due to ordering. */
	vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
	vm_radix_node_store(&tmp->rn_child[slot], rnode, UNSERIALIZED);
	/* Serializing write to make the above visible. */
	vm_radix_node_store(parentp, tmp, LOCKED);

	return (0);
}
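
/*
 * Insertion example (illustrative only, assuming VM_RADIX_WIDTH == 4):
 * inserting pindex 0x1234 into an empty trie stores the leaf directly
 * in the root.  Inserting 0x1264 next finds the existing leaf, computes
 * vm_radix_keydiff(0x1234, 0x1264) == 1, and allocates a node with
 * rn_clev == 1 and rn_owner == 0x1200 holding the two leaves in child
 * slots 3 and 6.
 */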

/*
 * Returns the value stored at the index.  If the index is not present,
 * NULL is returned.
 */
static __always_inline vm_page_t
_vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index,
    enum vm_radix_access access)
{
	struct vm_radix_node *rnode;
	vm_page_t m;
	int slot;

	rnode = vm_radix_root_load(rtree, access);
	while (rnode != NULL) {
		if (vm_radix_isleaf(rnode)) {
			m = vm_radix_topage(rnode);
			if (m->pindex == index)
				return (m);
			break;
		}
		if (vm_radix_keybarr(rnode, index))
			break;
		slot = vm_radix_slot(index, rnode->rn_clev);
		rnode = vm_radix_node_load(&rnode->rn_child[slot], access);
	}
	return (NULL);
}

/*
 * Returns the value stored at the index assuming there is an external lock.
 *
 * If the index is not present, NULL is returned.
 */
vm_page_t
vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
{

	return (_vm_radix_lookup(rtree, index, LOCKED));
}

/*
 * Returns the value stored at the index without requiring an external lock.
 *
 * If the index is not present, NULL is returned.
 */
vm_page_t
vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index)
{
	vm_page_t m;

	smr_enter(vm_radix_smr);
	m = _vm_radix_lookup(rtree, index, SMR);
	smr_exit(vm_radix_smr);

	return (m);
}
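
/*
 * Usage sketch (illustrative; "obj" stands for a hypothetical vm_object
 * whose resident pages live in obj->rtree):
 *
 *	m = vm_radix_lookup(&obj->rtree, pindex);	(object lock held)
 *	m = vm_radix_lookup_unlocked(&obj->rtree, pindex);
 *
 * The unlocked variant relies on SMR only to make the traversal safe;
 * the caller must still validate the returned page before using it.
 */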

/*
 * Look up the nearest entry at a position greater than or equal to index.
 */
vm_page_t
vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
{
	struct vm_radix_node *stack[VM_RADIX_LIMIT];
	vm_pindex_t inc;
	vm_page_t m;
	struct vm_radix_node *child, *rnode;
#ifdef INVARIANTS
	int loops = 0;
#endif
	int slot, tos;

	rnode = vm_radix_root_load(rtree, LOCKED);
	if (rnode == NULL)
		return (NULL);
	else if (vm_radix_isleaf(rnode)) {
		m = vm_radix_topage(rnode);
		if (m->pindex >= index)
			return (m);
		else
			return (NULL);
	}
	tos = 0;
	for (;;) {
		/*
		 * If the keys differ before the current bisection node,
		 * then the search key might roll back to the earliest
		 * available bisection node or to the smallest key
		 * in the current node (if the owner is greater than the
		 * search key).
		 */
		if (vm_radix_keybarr(rnode, index)) {
			if (index > rnode->rn_owner) {
ascend:
				KASSERT(++loops < 1000,
				    ("vm_radix_lookup_ge: too many loops"));

				/*
				 * Pop nodes from the stack until either the
				 * stack is empty or a node that could have a
				 * matching descendant is found.
				 */
				do {
					if (tos == 0)
						return (NULL);
					rnode = stack[--tos];
				} while (vm_radix_slot(index,
				    rnode->rn_clev) == (VM_RADIX_COUNT - 1));

				/*
				 * The following computation cannot overflow
				 * because index's slot at the current level
				 * is less than VM_RADIX_COUNT - 1.
				 */
				index = vm_radix_trimkey(index,
				    rnode->rn_clev);
				index += VM_RADIX_UNITLEVEL(rnode->rn_clev);
			} else
				index = rnode->rn_owner;
			KASSERT(!vm_radix_keybarr(rnode, index),
			    ("vm_radix_lookup_ge: keybarr failed"));
		}
		slot = vm_radix_slot(index, rnode->rn_clev);
		child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
		if (vm_radix_isleaf(child)) {
			m = vm_radix_topage(child);
			if (m->pindex >= index)
				return (m);
		} else if (child != NULL)
			goto descend;

		/*
		 * Look for an available edge or page within the current
		 * bisection node.
		 */
		if (slot < (VM_RADIX_COUNT - 1)) {
			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
			index = vm_radix_trimkey(index, rnode->rn_clev);
			do {
				index += inc;
				slot++;
				child = vm_radix_node_load(&rnode->rn_child[slot],
				    LOCKED);
				if (vm_radix_isleaf(child)) {
					m = vm_radix_topage(child);
					KASSERT(m->pindex >= index,
					    ("vm_radix_lookup_ge: leaf<index"));
					return (m);
				} else if (child != NULL)
					goto descend;
			} while (slot < (VM_RADIX_COUNT - 1));
		}
		KASSERT(child == NULL || vm_radix_isleaf(child),
		    ("vm_radix_lookup_ge: child is radix node"));

		/*
		 * If a page or edge greater than the search slot is not found
		 * in the current node, ascend to the next higher-level node.
		 */
		goto ascend;
descend:
		KASSERT(rnode->rn_clev > 0,
		    ("vm_radix_lookup_ge: pushing leaf's parent"));
		KASSERT(tos < VM_RADIX_LIMIT,
		    ("vm_radix_lookup_ge: stack overflow"));
		stack[tos++] = rnode;
		rnode = child;
	}
}
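
/*
 * Example (illustrative only, assuming VM_RADIX_WIDTH == 4): with
 * leaves at 0x1230 and 0x1300, both pages hang off a single node with
 * rn_clev == 2, in child slots 2 and 3.  vm_radix_lookup_ge(rtree,
 * 0x1240) reaches that node, rejects the leaf 0x1230 found in slot 2,
 * then scans forward within the node and returns the page at 0x1300
 * from slot 3.
 */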

/*
 * Look up the nearest entry at a position less than or equal to index.
 */
vm_page_t
vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
{
	struct vm_radix_node *stack[VM_RADIX_LIMIT];
	vm_pindex_t inc;
	vm_page_t m;
	struct vm_radix_node *child, *rnode;
#ifdef INVARIANTS
	int loops = 0;
#endif
	int slot, tos;

	rnode = vm_radix_root_load(rtree, LOCKED);
	if (rnode == NULL)
		return (NULL);
	else if (vm_radix_isleaf(rnode)) {
		m = vm_radix_topage(rnode);
		if (m->pindex <= index)
			return (m);
		else
			return (NULL);
	}
	tos = 0;
	for (;;) {
		/*
		 * If the keys differ before the current bisection node,
		 * then the search key might roll back to the earliest
		 * available bisection node or to the largest key
		 * in the current node (if the owner is smaller than the
		 * search key).
		 */
		if (vm_radix_keybarr(rnode, index)) {
			if (index > rnode->rn_owner) {
				index = rnode->rn_owner + VM_RADIX_COUNT *
				    VM_RADIX_UNITLEVEL(rnode->rn_clev);
			} else {
ascend:
				KASSERT(++loops < 1000,
				    ("vm_radix_lookup_le: too many loops"));

				/*
				 * Pop nodes from the stack until either the
				 * stack is empty or a node that could have a
				 * matching descendant is found.
				 */
				do {
					if (tos == 0)
						return (NULL);
					rnode = stack[--tos];
				} while (vm_radix_slot(index,
				    rnode->rn_clev) == 0);

				/*
				 * The following computation cannot overflow
				 * because index's slot at the current level
				 * is greater than 0.
				 */
				index = vm_radix_trimkey(index,
				    rnode->rn_clev);
			}
			index--;
			KASSERT(!vm_radix_keybarr(rnode, index),
			    ("vm_radix_lookup_le: keybarr failed"));
		}
		slot = vm_radix_slot(index, rnode->rn_clev);
		child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
		if (vm_radix_isleaf(child)) {
			m = vm_radix_topage(child);
			if (m->pindex <= index)
				return (m);
		} else if (child != NULL)
			goto descend;

		/*
		 * Look for an available edge or page within the current
		 * bisection node.
		 */
		if (slot > 0) {
			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
			index |= inc - 1;
			do {
				index -= inc;
				slot--;
				child = vm_radix_node_load(&rnode->rn_child[slot],
				    LOCKED);
				if (vm_radix_isleaf(child)) {
					m = vm_radix_topage(child);
					KASSERT(m->pindex <= index,
					    ("vm_radix_lookup_le: leaf>index"));
					return (m);
				} else if (child != NULL)
					goto descend;
			} while (slot > 0);
		}
		KASSERT(child == NULL || vm_radix_isleaf(child),
		    ("vm_radix_lookup_le: child is radix node"));

		/*
		 * If a page or edge smaller than the search slot is not found
		 * in the current node, ascend to the next higher-level node.
		 */
		goto ascend;
descend:
		KASSERT(rnode->rn_clev > 0,
		    ("vm_radix_lookup_le: pushing leaf's parent"));
		KASSERT(tos < VM_RADIX_LIMIT,
		    ("vm_radix_lookup_le: stack overflow"));
		stack[tos++] = rnode;
		rnode = child;
	}
}

/*
 * Remove the specified index from the trie, and return the value stored at
 * that index.  If the index is not present, return NULL.
 */
vm_page_t
vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
{
	struct vm_radix_node *rnode, *parent, *tmp;
	vm_page_t m;
	int i, slot;

	rnode = vm_radix_root_load(rtree, LOCKED);
	if (vm_radix_isleaf(rnode)) {
		m = vm_radix_topage(rnode);
		if (m->pindex != index)
			return (NULL);
		vm_radix_root_store(rtree, NULL, LOCKED);
		return (m);
	}
	parent = NULL;
	for (;;) {
		if (rnode == NULL)
			return (NULL);
		slot = vm_radix_slot(index, rnode->rn_clev);
		tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
		if (vm_radix_isleaf(tmp)) {
			m = vm_radix_topage(tmp);
			if (m->pindex != index)
				return (NULL);
			vm_radix_node_store(
			    &rnode->rn_child[slot], NULL, LOCKED);
			rnode->rn_count--;
			if (rnode->rn_count > 1)
				return (m);
			for (i = 0; i < VM_RADIX_COUNT; i++) {
				tmp = vm_radix_node_load(&rnode->rn_child[i],
				    LOCKED);
				if (tmp != NULL)
					break;
			}
			KASSERT(tmp != NULL,
			    ("%s: invalid node configuration", __func__));
			if (parent == NULL)
				vm_radix_root_store(rtree, tmp, LOCKED);
			else {
				slot = vm_radix_slot(index, parent->rn_clev);
				KASSERT(vm_radix_node_load(
				    &parent->rn_child[slot], LOCKED) == rnode,
				    ("%s: invalid child value", __func__));
				vm_radix_node_store(&parent->rn_child[slot],
				    tmp, LOCKED);
			}
			/*
			 * The child is still valid and we cannot zero the
			 * pointer until all SMR references are gone.
			 */
			rnode->rn_count--;
			vm_radix_node_put(rnode, i);
			return (m);
		}
		parent = rnode;
		rnode = tmp;
	}
}
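
/*
 * Note (illustrative): when a removal leaves a node with a single
 * child, that child is promoted into the parent's slot (or the root)
 * and the node is freed.  The promoted child's old slot is not zeroed
 * here; rn_last defers that to vm_radix_node_get() on reuse, after all
 * SMR readers are done, so unlocked lookups never observe a NULL where
 * a valid child still logically exists.
 */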

/*
 * Remove and free all the nodes from the radix tree.
 * The helper is recursive, but the recursion depth is tightly bounded
 * because the maximum depth of the tree is fixed.
 */
void
vm_radix_reclaim_allnodes(struct vm_radix *rtree)
{
	struct vm_radix_node *root;

	root = vm_radix_root_load(rtree, LOCKED);
	if (root == NULL)
		return;
	vm_radix_root_store(rtree, NULL, UNSERIALIZED);
	if (!vm_radix_isleaf(root))
		vm_radix_reclaim_allnodes_int(root);
}

/*
 * Replace an existing page in the trie with another one.
 * Panics if there is not an old page in the trie at the new page's index.
 */
vm_page_t
vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage)
{
	struct vm_radix_node *rnode, *tmp;
	vm_page_t m;
	vm_pindex_t index;
	int slot;

	index = newpage->pindex;
	rnode = vm_radix_root_load(rtree, LOCKED);
	if (rnode == NULL)
		panic("%s: replacing page on an empty trie", __func__);
	if (vm_radix_isleaf(rnode)) {
		m = vm_radix_topage(rnode);
		if (m->pindex != index)
			panic("%s: original replacing root key not found",
			    __func__);
		rtree->rt_root = (uintptr_t)vm_radix_toleaf(newpage);
		return (m);
	}
	for (;;) {
		slot = vm_radix_slot(index, rnode->rn_clev);
		tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
		if (vm_radix_isleaf(tmp)) {
			m = vm_radix_topage(tmp);
			if (m->pindex != index)
				break;
			vm_radix_node_store(&rnode->rn_child[slot],
			    vm_radix_toleaf(newpage), LOCKED);
			return (m);
		} else if (tmp == NULL || vm_radix_keybarr(tmp, index))
			break;
		rnode = tmp;
	}
	panic("%s: original replacing page not found", __func__);
}

/*
 * Sleep until the node zone can satisfy an allocation.
 */
void
vm_radix_wait(void)
{
	uma_zwait(vm_radix_node_zone);
}

#ifdef DDB
/*
 * Show details about the given radix node.
 */
DB_SHOW_COMMAND(radixnode, db_show_radixnode)
{
	struct vm_radix_node *rnode, *tmp;
	int i;

	if (!have_addr)
		return;
	rnode = (struct vm_radix_node *)addr;
	db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
	    (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
	    rnode->rn_clev);
	for (i = 0; i < VM_RADIX_COUNT; i++) {
		tmp = vm_radix_node_load(&rnode->rn_child[i], UNSERIALIZED);
		if (tmp != NULL)
			db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
			    i, (void *)tmp,
			    vm_radix_isleaf(tmp) ? vm_radix_topage(tmp) : NULL,
			    rnode->rn_clev);
	}
}
#endif /* DDB */
901