xref: /freebsd/sys/kern/subr_pctrie.c (revision 9b37d84c87e69dabc69d818aa4d2fea718bd8b74)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2013 EMC Corp.
5  * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
6  * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  */
31 
32 /*
33  * Path-compressed radix trie implementation.
34  *
35  * The implementation takes into account the following rationale:
36  * - Size of the nodes should be as small as possible but still big enough
37  *   to avoid a large maximum depth for the trie.  This is a balance
38  *   between the necessity to not wire too much physical memory for the nodes
39  *   and the necessity to avoid too much cache pollution during the trie
40  *   operations.
41  * - There is not a huge bias toward the number of lookup operations over
42  *   the number of insert and remove operations.  This basically implies
43  *   that optimizations supposedly helping one operation but hurting the
44  *   other might be carefully evaluated.
45  * - On average not many nodes are expected to be fully populated, hence
46  *   level compression may just complicate things.
47  */
48 
49 #include <sys/cdefs.h>
50 #include "opt_ddb.h"
51 
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/kernel.h>
55 #include <sys/libkern.h>
56 #include <sys/pctrie.h>
57 #include <sys/proc.h>	/* smr.h depends on struct thread. */
58 #include <sys/smr.h>
59 #include <sys/smr_types.h>
60 
61 #ifdef DDB
62 #include <ddb/ddb.h>
63 #endif
64 
/*
 * The popmap holds one bit per child slot, so its width is tied to
 * PCTRIE_WIDTH: 2^WIDTH children per node require a 2^WIDTH-bit map.
 */
#if PCTRIE_WIDTH == 3
typedef uint8_t pn_popmap_t;
#elif PCTRIE_WIDTH == 4
typedef uint16_t pn_popmap_t;
#elif PCTRIE_WIDTH == 5
typedef uint32_t pn_popmap_t;
#else
#error Unsupported width
#endif
/* ffs()-style helpers below take an int; the popmap must fit in one. */
_Static_assert(sizeof(pn_popmap_t) <= sizeof(int),
    "pn_popmap_t too wide");
76 
struct pctrie_node;
typedef SMR_POINTER(struct pctrie_node *) smr_pctnode_t;

/*
 * An interior node of the trie.  Leaves are value pointers tagged with
 * PCTRIE_ISLEAF and stored directly in pn_child (or root) slots.
 */
struct pctrie_node {
	uint64_t	pn_owner;			/* Owner of record. */
	pn_popmap_t	pn_popmap;			/* Valid children. */
	uint8_t		pn_clev;			/* Level * WIDTH. */
	smr_pctnode_t	pn_parent;			/* Parent node. */
	smr_pctnode_t	pn_child[PCTRIE_COUNT];		/* Child nodes. */
};
87 
88 /*
89  * Map index to an array position for the children of node,
90  */
91 static __inline int
92 pctrie_slot(struct pctrie_node *node, uint64_t index)
93 {
94 	return ((index >> node->pn_clev) & (PCTRIE_COUNT - 1));
95 }
96 
97 /*
98  * Returns true if index does not belong to the specified node.  Otherwise,
99  * sets slot value, and returns false.
100  */
101 static __inline bool
102 pctrie_keybarr(struct pctrie_node *node, uint64_t index, int *slot)
103 {
104 	index = (index - node->pn_owner) >> node->pn_clev;
105 	if (index >= PCTRIE_COUNT)
106 		return (true);
107 	*slot = index;
108 	return (false);
109 }
110 
/*
 * How slot accesses are synchronized: inside an SMR read section, under the
 * tree lock, or on a node not yet visible to any other thread.
 */
enum pctrie_access { PCTRIE_SMR, PCTRIE_LOCKED, PCTRIE_UNSERIALIZED };
112 
113 /*
114  * Fetch a node pointer from a slot.
115  */
116 static __inline struct pctrie_node *
117 pctrie_node_load(smr_pctnode_t *p, smr_t smr, enum pctrie_access access)
118 {
119 	switch (access) {
120 	case PCTRIE_UNSERIALIZED:
121 		return (smr_unserialized_load(p, true));
122 	case PCTRIE_LOCKED:
123 		return (smr_serialized_load(p, true));
124 	case PCTRIE_SMR:
125 		return (smr_entered_load(p, smr));
126 	}
127 	__assert_unreachable();
128 }
129 
130 static __inline void
131 pctrie_node_store(smr_pctnode_t *p, void *v, enum pctrie_access access)
132 {
133 	switch (access) {
134 	case PCTRIE_UNSERIALIZED:
135 		smr_unserialized_store(p, v, true);
136 		break;
137 	case PCTRIE_LOCKED:
138 		smr_serialized_store(p, v, true);
139 		break;
140 	case PCTRIE_SMR:
141 		panic("%s: Not supported in SMR section.", __func__);
142 		break;
143 	default:
144 		__assert_unreachable();
145 		break;
146 	}
147 }
148 
/*
 * Get the root address, cast to proper type for load/store.  The root slot
 * is treated exactly like a child slot of a (nonexistent) parent node.
 */
static __inline smr_pctnode_t *
pctrie_root(struct pctrie *ptree)
{
	return ((smr_pctnode_t *)&ptree->pt_root);
}
157 
/*
 * Get the root node for a tree, using the given synchronization mode.
 */
static __inline struct pctrie_node *
pctrie_root_load(struct pctrie *ptree, smr_t smr, enum pctrie_access access)
{
	return (pctrie_node_load(pctrie_root(ptree), smr, access));
}
166 
167 /*
168  * Get the child of a node.
169  */
170 static __inline smr_pctnode_t *
171 pctrie_child(struct pctrie *ptree, struct pctrie_node *node, uint64_t index)
172 {
173 	return (node == NULL ? pctrie_root(ptree) :
174 	    &node->pn_child[pctrie_slot(node, index)]);
175 }
176 
177 /*
178  * Returns TRUE if the specified node is a leaf and FALSE otherwise.
179  */
180 static __inline bool
181 pctrie_isleaf(struct pctrie_node *node)
182 {
183 	return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
184 }
185 
186 /*
187  * Returns val with leaf bit set.
188  */
189 static __inline void *
190 pctrie_toleaf(uint64_t *val)
191 {
192 	return ((void *)((uintptr_t)val | PCTRIE_ISLEAF));
193 }
194 
195 /*
196  * Returns the associated val extracted from node.
197  */
198 static __inline uint64_t *
199 pctrie_toval(struct pctrie_node *node)
200 {
201 	return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
202 }
203 
204 /*
205  * Returns the associated pointer extracted from node and field offset.
206  */
207 static __inline void *
208 pctrie_toptr(struct pctrie_node *node, int keyoff)
209 {
210 	return ((void *)(((uintptr_t)node & ~PCTRIE_FLAGS) - keyoff));
211 }
212 
/*
 * Make 'parent' a parent of 'child'.  The store is unserialized; callers
 * publish the child with a serialized store afterward.
 */
static __inline void
pctrie_setparent(struct pctrie_node *child, struct pctrie_node *parent)
{
	pctrie_node_store(&child->pn_parent, parent, PCTRIE_UNSERIALIZED);
}
221 
/*
 * Return the parent of 'node'.
 */
static __inline struct pctrie_node *
pctrie_parent(struct pctrie_node *node)
{
	return (pctrie_node_load(&node->pn_parent, NULL, PCTRIE_UNSERIALIZED));
}
230 
/*
 * Make 'child' a child of 'node'.  The slot for 'index' must currently be
 * empty; the xor plus KASSERT below catches a double-add of the same slot.
 */
static __inline void
pctrie_addnode(struct pctrie_node *node, uint64_t index,
    struct pctrie_node *child, enum pctrie_access access)
{
	int slot;

	slot = pctrie_slot(node, index);
	/* Store the child before flagging the slot as populated. */
	pctrie_node_store(&node->pn_child[slot], child, access);
	node->pn_popmap ^= 1 << slot;
	KASSERT((node->pn_popmap & (1 << slot)) != 0,
	    ("%s: bad popmap slot %d in node %p", __func__, slot, node));
}
246 
247 /*
248  * pctrie node zone initializer.
249  */
250 int
251 pctrie_zone_init(void *mem, int size __unused, int flags __unused)
252 {
253 	struct pctrie_node *node;
254 
255 	node = mem;
256 	node->pn_popmap = 0;
257 	for (int i = 0; i < nitems(node->pn_child); i++)
258 		pctrie_node_store(&node->pn_child[i], PCTRIE_NULL,
259 		    PCTRIE_UNSERIALIZED);
260 	return (0);
261 }
262 
263 size_t
264 pctrie_node_size(void)
265 {
266 
267 	return (sizeof(struct pctrie_node));
268 }
269 
/*
 * Look for where to insert the key-value pair into the trie.  Complete the
 * insertion if it replaces a null leaf.  Return the insertion location if the
 * insertion needs to be completed by the caller; otherwise return NULL.
 *
 * If the key is already present in the trie, populate *found_out as if by
 * pctrie_lookup().
 */
static __always_inline void *
pctrie_insert_lookup_compound(struct pctrie *ptree, uint64_t *val,
    struct pctrie_node **parent_out, uint64_t **found_out)
{
	uint64_t index;
	struct pctrie_node *node, *parent;
	int slot;

	index = *val;

	/*
	 * The owner of record for root is not really important because it
	 * will never be used.
	 */
	node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
	parent = NULL;
	for (;;) {
		if (pctrie_isleaf(node)) {
			if (node == PCTRIE_NULL) {
				/* Empty slot: store the leaf here directly. */
				if (parent == NULL)
					pctrie_node_store(pctrie_root(ptree),
					    pctrie_toleaf(val), PCTRIE_LOCKED);
				else
					pctrie_addnode(parent, index,
					    pctrie_toleaf(val), PCTRIE_LOCKED);
				*parent_out = parent;
				return (NULL);
			}
			if (*pctrie_toval(node) == index) {
				/* Key already present; report it instead. */
				*found_out = pctrie_toval(node);
				*parent_out = parent;
				return (NULL);
			}
			break;
		}
		if (pctrie_keybarr(node, index, &slot))
			break;
		parent = node;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}

	/*
	 * 'node' must be replaced in the tree with a new branch node, with
	 * children 'node' and 'val'. Return the place that points to 'node'
	 * now, and will point to the new branching node later.
	 */
	*parent_out = parent;
	return ((parent == NULL) ? pctrie_root(ptree): &parent->pn_child[slot]);
}
328 
329 /*
330  * Wrap pctrie_insert_lookup_compound to implement a strict insertion.  Panic
331  * if the key already exists, and do not look for neighboring entries.
332  */
333 void *
334 pctrie_insert_lookup_strict(struct pctrie *ptree, uint64_t *val,
335     struct pctrie_node **parent_out)
336 {
337 	void *parentp;
338 	uint64_t *found;
339 
340 	found = NULL;
341 	parentp = pctrie_insert_lookup_compound(ptree, val, parent_out,
342 	    &found);
343 	if (__predict_false(found != NULL))
344 		panic("%s: key %jx is already present", __func__,
345 		    (uintmax_t)*val);
346 	return (parentp);
347 }
348 
/*
 * Wrap pctrie_insert_lookup_compound to implement find-or-insert.  Do not look
 * for neighboring entries.
 */
void *
pctrie_insert_lookup(struct pctrie *ptree, uint64_t *val,
    struct pctrie_node **parent_out, uint64_t **found_out)
{
	/* Preset *found_out so a non-NULL value means "already present". */
	*found_out = NULL;
	return (pctrie_insert_lookup_compound(ptree, val, parent_out,
	    found_out));
}
361 
/*
 * Inserts newly allocated node 'child' into trie at location 'parentp', with
 * parent 'parent' and two children, 'val' and whatever non-NULL node or leaf
 * was at 'parentp' to begin with.
 */
void
pctrie_insert_node(uint64_t *val, struct pctrie_node *parent, void *parentp,
    struct pctrie_node *child)
{
	struct pctrie_node *node;
	uint64_t index, newind;

	/*
	 * Clear the last child pointer of the newly allocated child.  We want
	 * to clear it after the final section has exited so lookup can not
	 * return false negatives.  It is done here because it will be
	 * cache-cold in the dtor callback.
	 */
	if (child->pn_popmap != 0) {
		pctrie_node_store(&child->pn_child[ffs(child->pn_popmap) - 1],
		    PCTRIE_NULL, PCTRIE_UNSERIALIZED);
		child->pn_popmap = 0;
	}

	/*
	 * Recover the values of the two children of the new child node.  If
	 * 'node' is not a leaf, this stores into 'newind' the 'owner' field,
	 * which must be first in the node.
	 */
	index = *val;
	node = pctrie_node_load(parentp, NULL, PCTRIE_UNSERIALIZED);
	pctrie_setparent(child, parent);
	if (!pctrie_isleaf(node))
		pctrie_setparent(node, child);
	newind = *pctrie_toval(node);

	/*
	 * From the highest-order bit where the indexes differ,
	 * compute the highest level in the trie where they differ.  Then,
	 * compute the least index of this subtrie.
	 */
	_Static_assert(sizeof(long long) >= sizeof(uint64_t),
	    "uint64 too wide");
	_Static_assert(sizeof(uint64_t) * NBBY <=
	    (1 << (sizeof(child->pn_clev) * NBBY)), "pn_clev too narrow");
	child->pn_clev = rounddown(ilog2(index ^ newind), PCTRIE_WIDTH);
	/* Assign first so the shift below operates on a 64-bit value. */
	child->pn_owner = PCTRIE_COUNT;
	child->pn_owner = index & -(child->pn_owner << child->pn_clev);


	/* These writes are not yet visible due to ordering. */
	pctrie_addnode(child, index, pctrie_toleaf(val), PCTRIE_UNSERIALIZED);
	pctrie_addnode(child, newind, node, PCTRIE_UNSERIALIZED);
	/* Synchronize to make the above visible. */
	pctrie_node_store(parentp, child, PCTRIE_LOCKED);
}
418 
419 /*
420  * Return the value associated with the node, if the node is a leaf that matches
421  * the index; otherwise NULL.
422  */
423 static __always_inline uint64_t *
424 pctrie_match_value(struct pctrie_node *node, uint64_t index)
425 {
426 	uint64_t *m;
427 
428 	if (!pctrie_isleaf(node) || (m = pctrie_toval(node)) == NULL ||
429 	    *m != index)
430 		m = NULL;
431 	return (m);
432 }
433 
434 /*
435  * Returns the value stored at the index.  If the index is not present,
436  * NULL is returned.
437  */
438 static __always_inline uint64_t *
439 _pctrie_lookup(struct pctrie *ptree, uint64_t index, smr_t smr,
440     enum pctrie_access access)
441 {
442 	struct pctrie_node *node;
443 	int slot;
444 
445 	node = pctrie_root_load(ptree, smr, access);
446 	/* Seek a node that matches index. */
447 	while (!pctrie_isleaf(node) && !pctrie_keybarr(node, index, &slot))
448 		node = pctrie_node_load(&node->pn_child[slot], smr, access);
449 	return (pctrie_match_value(node, index));
450 }
451 
/*
 * Returns the value stored at the index, assuming access is externally
 * synchronized by a lock.
 *
 * If the index is not present, NULL is returned.
 */
uint64_t *
pctrie_lookup(struct pctrie *ptree, uint64_t index)
{
	return (_pctrie_lookup(ptree, index, NULL, PCTRIE_LOCKED));
}
463 
/*
 * Returns the value stored at the index without requiring an external lock.
 *
 * If the index is not present, NULL is returned.
 *
 * NOTE(review): the SMR section ends before return, so the caller must be
 * prepared for the value's backing object to be reclaimed — confirm callers'
 * expectations.
 */
uint64_t *
pctrie_lookup_unlocked(struct pctrie *ptree, uint64_t index, smr_t smr)
{
	uint64_t *res;

	smr_enter(smr);
	res = _pctrie_lookup(ptree, index, smr, PCTRIE_SMR);
	smr_exit(smr);
	return (res);
}
479 
/*
 * Returns the last node examined in the search for the index, and sets the
 * parent of that node.  Starts from 'node' (a prior search path position,
 * or NULL for the root) rather than always from the root.
 */
static __always_inline struct pctrie_node *
_pctrie_lookup_node(struct pctrie *ptree, struct pctrie_node *node,
    uint64_t index, struct pctrie_node **parent_out,
    smr_t smr, enum pctrie_access access)
{
	struct pctrie_node *parent;
	int slot;

	/*
	 * Climb the search path to find the lowest node from which to start the
	 * search for a value matching 'index'.
	 */
	while (node != NULL) {
		KASSERT(access == PCTRIE_SMR || !powerof2(node->pn_popmap),
		    ("%s: freed node in iter path", __func__));
		if (!pctrie_keybarr(node, index, &slot))
			break;
		node = pctrie_parent(node);
	}

	/* If the whole path was climbed past, restart at the root. */
	if (node == NULL) {
		parent = NULL;
		node = pctrie_root_load(ptree, smr, access);
	} else {
		parent = node;
		node = pctrie_node_load(&node->pn_child[slot], smr, access);
	}

	/* Seek a node that matches index. */
	while (!pctrie_isleaf(node) && !pctrie_keybarr(node, index, &slot)) {
		parent = node;
		node = pctrie_node_load(&node->pn_child[slot], smr, access);
	}
	if (parent_out != NULL)
		*parent_out = parent;
	return (node);
}
521 
/*
 * Returns the value stored at a given index value, possibly NULL, assuming
 * access is externally synchronized by a lock.  Resumes the search from the
 * iterator's current path and records the new path in the iterator.
 */
uint64_t *
pctrie_iter_lookup(struct pctrie_iter *it, uint64_t index)
{
	struct pctrie_node *node;

	it->index = index;
	node = _pctrie_lookup_node(it->ptree, it->node, index, &it->node,
	    NULL, PCTRIE_LOCKED);
	return (pctrie_match_value(node, index));
}
536 
/*
 * Insert the val in the trie, starting search with iterator.  Return a pointer
 * to indicate where a new node must be allocated to complete insertion.
 * Assumes access is externally synchronized by a lock.
 */
void *
pctrie_iter_insert_lookup(struct pctrie_iter *it, uint64_t *val)
{
	struct pctrie_node *node;

	it->index = *val;
	node = _pctrie_lookup_node(it->ptree, it->node, *val, &it->node,
	    NULL, PCTRIE_LOCKED);
	if (node == PCTRIE_NULL) {
		/* Empty slot: complete the insertion here. */
		if (it->node == NULL)
			pctrie_node_store(pctrie_root(it->ptree),
			    pctrie_toleaf(val), PCTRIE_LOCKED);
		else
			pctrie_addnode(it->node, it->index,
			    pctrie_toleaf(val), PCTRIE_LOCKED);
		return (NULL);
	}
	if (__predict_false(pctrie_match_value(node, it->index) != NULL))
		panic("%s: key %jx is already present", __func__,
		    (uintmax_t)it->index);

	/*
	 * 'node' must be replaced in the tree with a new branch node, with
	 * children 'node' and 'val'. Return the place that points to 'node'
	 * now, and will point to the new branching node later.
	 */
	return (pctrie_child(it->ptree, it->node, it->index));
}
570 
/*
 * Returns the value stored at a fixed offset from the current index value,
 * possibly NULL.  The stride may be negative.
 */
uint64_t *
pctrie_iter_stride(struct pctrie_iter *it, int stride)
{
	uint64_t index = it->index + stride;

	/* Detect stride overflow. */
	if ((stride > 0) != (index > it->index))
		return (NULL);
	/* Detect crossing limit in either direction. */
	if ((index < it->limit) != (it->index < it->limit))
		return (NULL);

	return (pctrie_iter_lookup(it, index));
}
589 
/*
 * Returns the value stored at one more than the current index value, possibly
 * NULL, assuming access is externally synchronized by a lock.
 */
uint64_t *
pctrie_iter_next(struct pctrie_iter *it)
{
	return (pctrie_iter_stride(it, 1));
}
599 
/*
 * Returns the value stored at one less than the current index value, possibly
 * NULL, assuming access is externally synchronized by a lock.
 */
uint64_t *
pctrie_iter_prev(struct pctrie_iter *it)
{
	return (pctrie_iter_stride(it, -1));
}
609 
/*
 * Returns the number of contiguous, non-NULL entries read into the value[]
 * array, starting at index.  After the per-index lookup finds a bottom-level
 * node, its remaining children are scanned in bulk without re-walking the
 * trie.
 */
static __always_inline int
_pctrie_lookup_range(struct pctrie *ptree, struct pctrie_node *node,
    uint64_t index, uint64_t *value[], int count,
    struct pctrie_node **parent_out, smr_t smr, enum pctrie_access access)
{
	struct pctrie_node *parent;
	uint64_t *val;
	int base, end, i;

	parent = node;
	for (i = 0; i < count;) {
		node = _pctrie_lookup_node(ptree, parent, index + i, &parent,
		    smr, access);
		if ((val = pctrie_match_value(node, index + i)) == NULL)
			break;
		value[i++] = val;
		/* Bulk-scan only within a bottom-level (pn_clev == 0) node. */
		base = (index + i) % PCTRIE_COUNT;
		if (base == 0 || parent == NULL || parent->pn_clev != 0)
			continue;

		/*
		 * For PCTRIE_SMR, compute an upper bound on the number of
		 * children of this parent left to examine.  For PCTRIE_LOCKED,
		 * compute the number of non-NULL children from base up to the
		 * first NULL child, if any, using the fact that pn_popmap has
		 * bits set for only the non-NULL children.
		 *
		 * The pn_popmap field is accessed only when a lock is held.
		 * To use it for PCTRIE_SMR here would require that we know that
		 * race conditions cannot occur if the tree is modified while
		 * accessed here.  Guarantees about the visibility of changes to
		 * child pointers, enforced by memory barriers on the writing of
		 * pointers, are not present for the pn_popmap field, so that
		 * the popmap bit for a child page may, for an instant,
		 * misrepresent the nullness of the child page because an
		 * operation modifying the pctrie is in progress.
		 */
		end = (access == PCTRIE_SMR) ? PCTRIE_COUNT - base :
		    ffs((parent->pn_popmap >> base) + 1) - 1;
		end = MIN(count, i + end);
		while (i < end) {
			node = pctrie_node_load(&parent->pn_child[base++],
			    smr, access);
			val = pctrie_toval(node);
			if (access == PCTRIE_SMR && val == NULL)
				break;
			value[i++] = val;
			KASSERT(val != NULL,
			    ("%s: null child written to range", __func__));
		}
		if (access == PCTRIE_SMR) {
			/* A NULL child under SMR ends the range. */
			if (i < end)
				break;
		} else {
			/* Stopping before the last slot means a NULL follows. */
			if (base < PCTRIE_COUNT)
				break;
		}
	}
	if (parent_out != NULL)
		*parent_out = parent;
	return (i);
}
676 
/*
 * Returns the number of contiguous, non-NULL entries read into the value[]
 * array, starting at index, assuming access is externally synchronized by a
 * lock.
 */
int
pctrie_lookup_range(struct pctrie *ptree, uint64_t index,
    uint64_t *value[], int count)
{
	return (_pctrie_lookup_range(ptree, NULL, index, value, count, NULL,
	    NULL, PCTRIE_LOCKED));
}
689 
/*
 * Returns the number of contiguous, non-NULL entries read into the value[]
 * array, starting at index, without requiring an external lock.  These entries
 * *may* never have been in the pctrie all at one time, but for a series of
 * times t0, t1, t2, ..., with ti <= t(i+1), value[i] was in the trie at time
 * ti.
 */
int
pctrie_lookup_range_unlocked(struct pctrie *ptree, uint64_t index,
    uint64_t *value[], int count, smr_t smr)
{
	int res;

	smr_enter(smr);
	res = _pctrie_lookup_range(ptree, NULL, index, value, count, NULL,
	    smr, PCTRIE_SMR);
	smr_exit(smr);
	return (res);
}
709 
/*
 * Returns the number of contiguous, non-NULL entries read into the value[]
 * array, starting at index, assuming access is externally synchronized by a
 * lock.  Uses an iterator, resuming from and updating its search path.
 */
int
pctrie_iter_lookup_range(struct pctrie_iter *it, uint64_t index,
    uint64_t *value[], int count)
{
	return (_pctrie_lookup_range(it->ptree, it->node, index, value, count,
	    &it->node, NULL, PCTRIE_LOCKED));
}
722 
/*
 * Find first leaf >= index, and fill iter with the path to the parent of that
 * leaf.  Return NULL if there is no such leaf less than limit.  A limit of 0
 * means no limit.
 */
static __inline uint64_t *
_pctrie_lookup_ge(struct pctrie *ptree, struct pctrie_node *node,
    uint64_t index, struct pctrie_node **parent_out, uint64_t limit)
{
	struct pctrie_node *parent;
	uint64_t *m;
	int slot;

	/* Seek a node that matches index. */
	node = _pctrie_lookup_node(ptree, node, index, &parent,
	    NULL, PCTRIE_LOCKED);

	/*
	 * If no such node was found, and instead this path leads only to nodes
	 * < index, back up to find a subtrie with the least value > index.
	 */
	if (node == PCTRIE_NULL || *pctrie_toval(node) < index) {
		/* Climb the path to find a node with a descendant > index. */
		for (node = parent; node != NULL; node = pctrie_parent(node)) {
			slot = pctrie_slot(node, index) + 1;
			if ((node->pn_popmap >> slot) != 0)
				break;
		}
		if (node == NULL) {
			/* Nothing in the trie is greater than index. */
			if (parent_out != NULL)
				*parent_out = NULL;
			return (NULL);
		}

		/* Step to the least child with a descendant > index. */
		slot += ffs(node->pn_popmap >> slot) - 1;
		parent = node;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	/* Descend to the least leaf of the subtrie. */
	while (!pctrie_isleaf(node)) {
		/* Prune subtries that lie entirely at or beyond the limit. */
		if (limit != 0 && node->pn_owner >= limit)
			return (NULL);
		slot = ffs(node->pn_popmap) - 1;
		parent = node;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	if (parent_out != NULL)
		*parent_out = parent;
	m = pctrie_toval(node);
	if (limit != 0 && *m >= limit)
		return (NULL);
	return (m);
}
778 
/*
 * Returns the least value >= index, or NULL if there is none.  Assumes access
 * is externally synchronized by a lock.
 */
uint64_t *
pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
{
	return (_pctrie_lookup_ge(ptree, NULL, index, NULL, 0));
}
784 
/*
 * Find first leaf >= index, and fill iter with the path to the parent of that
 * leaf.  Return NULL if there is no such leaf less than limit.
 */
uint64_t *
pctrie_iter_lookup_ge(struct pctrie_iter *it, uint64_t index)
{
	uint64_t *m;

	m = _pctrie_lookup_ge(it->ptree, it->node, index, &it->node, it->limit);
	/* Track the found value so relative moves resume from it. */
	if (m != NULL)
		it->index = *m;
	return (m);
}
799 
/*
 * Find the first leaf with value at least 'jump' greater than the previous
 * leaf.  Return NULL if that value is >= limit.
 */
uint64_t *
pctrie_iter_jump_ge(struct pctrie_iter *it, int64_t jump)
{
	uint64_t index = it->index + jump;

	/* Detect jump overflow. */
	if ((jump > 0) != (index > it->index))
		return (NULL);
	if (it->limit != 0 && index >= it->limit)
		return (NULL);
	return (pctrie_iter_lookup_ge(it, index));
}
816 
/*
 * Find first leaf <= index, and fill iter with the path to the parent of that
 * leaf.  Return NULL if there is no such leaf greater than limit.  A limit of
 * 0 means no limit.
 */
static __inline uint64_t *
_pctrie_lookup_le(struct pctrie *ptree, struct pctrie_node *node,
    uint64_t index, struct pctrie_node **parent_out, uint64_t limit)
{
	struct pctrie_node *parent;
	uint64_t *m;
	int slot;

	/* Seek a node that matches index. */
	node = _pctrie_lookup_node(ptree, node, index, &parent, NULL,
	    PCTRIE_LOCKED);

	/*
	 * If no such node was found, and instead this path leads only to nodes
	 * > index, back up to find a subtrie with the greatest value < index.
	 */
	if (node == PCTRIE_NULL || *pctrie_toval(node) > index) {
		/* Climb the path to find a node with a descendant < index. */
		for (node = parent; node != NULL; node = pctrie_parent(node)) {
			slot = pctrie_slot(node, index);
			if ((node->pn_popmap & ((1 << slot) - 1)) != 0)
				break;
		}
		if (node == NULL) {
			/* Nothing in the trie is less than index. */
			if (parent_out != NULL)
				*parent_out = NULL;
			return (NULL);
		}

		/* Step to the greatest child with a descendant < index. */
		slot = ilog2(node->pn_popmap & ((1 << slot) - 1));
		parent = node;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	/* Descend to the greatest leaf of the subtrie. */
	while (!pctrie_isleaf(node)) {
		/* Prune subtries that lie entirely at or below the limit. */
		if (limit != 0 && limit >= node->pn_owner +
		    ((uint64_t)PCTRIE_COUNT << node->pn_clev) - 1)
			return (NULL);
		slot = ilog2(node->pn_popmap);
		parent = node;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	if (parent_out != NULL)
		*parent_out = parent;
	m = pctrie_toval(node);
	if (limit != 0 && *m <= limit)
		return (NULL);
	return (m);
}
873 
/*
 * Returns the greatest value <= index, or NULL if there is none.  Assumes
 * access is externally synchronized by a lock.
 */
uint64_t *
pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
{
	return (_pctrie_lookup_le(ptree, NULL, index, NULL, 0));
}
879 
/*
 * Returns the greatest value strictly less than index, starting the search
 * at 'node', or NULL if there is none (including when index is 0).
 */
uint64_t *
pctrie_subtree_lookup_lt(struct pctrie *ptree, struct pctrie_node *node,
    uint64_t index)
{
	if (index == 0)
		return (NULL);
	return (_pctrie_lookup_le(ptree, node, index - 1, NULL, 0));
}
888 
/*
 * Find first leaf <= index, and fill iter with the path to the parent of that
 * leaf.  Return NULL if there is no such leaf greater than limit.
 */
uint64_t *
pctrie_iter_lookup_le(struct pctrie_iter *it, uint64_t index)
{
	uint64_t *m;

	m = _pctrie_lookup_le(it->ptree, it->node, index, &it->node, it->limit);
	/* Track the found value so relative moves resume from it. */
	if (m != NULL)
		it->index = *m;
	return (m);
}
903 
/*
 * Find the first leaf with value at most 'jump' less than the previous
 * leaf.  Return NULL if that value is <= limit.
 */
uint64_t *
pctrie_iter_jump_le(struct pctrie_iter *it, int64_t jump)
{
	uint64_t index = it->index - jump;

	/* Detect jump overflow. */
	if ((jump > 0) != (index < it->index))
		return (NULL);
	if (it->limit != 0 && index <= it->limit)
		return (NULL);
	return (pctrie_iter_lookup_le(it, index));
}
920 
/*
 * Remove the non-NULL child identified by 'index' from the set of children of
 * 'node'.  If doing so causes 'node' to have only one child, purge it from the
 * pctrie and save it in *freenode for later disposal.
 */
static void
pctrie_remove(struct pctrie *ptree, struct pctrie_node *node, uint64_t index,
    struct pctrie_node **freenode)
{
	smr_pctnode_t *parentp;
	struct pctrie_node *child;
	int slot;

	*freenode = NULL;
	parentp = pctrie_child(ptree, node, index);
	if (node == NULL) {
		/* The removed leaf was the root; empty the tree. */
		pctrie_node_store(parentp, PCTRIE_NULL, PCTRIE_LOCKED);
		return;
	}
	slot = pctrie_slot(node, index);
	KASSERT((node->pn_popmap & (1 << slot)) != 0,
	    ("%s: bad popmap slot %d in node %p",
	    __func__, slot, node));
	node->pn_popmap ^= 1 << slot;
	if (!powerof2(node->pn_popmap)) {
		/* 2 or more children remain; just clear the removed slot. */
		pctrie_node_store(parentp, PCTRIE_NULL, PCTRIE_LOCKED);
		return;
	}
	/*
	 * One child remains; replace 'node' in its parent with that child
	 * and hand 'node' back to the caller for disposal.
	 */
	pctrie_node_store(parentp, PCTRIE_NULL, PCTRIE_UNSERIALIZED);
	KASSERT(node->pn_popmap != 0, ("%s: bad popmap all zeroes", __func__));
	slot = ffs(node->pn_popmap) - 1;
	*freenode = node;
	child = pctrie_node_load(&node->pn_child[slot], NULL, PCTRIE_LOCKED);
	KASSERT(child != PCTRIE_NULL,
	    ("%s: bad popmap slot %d in node %p", __func__, slot, node));
	node = pctrie_parent(node);
	if (!pctrie_isleaf(child))
		pctrie_setparent(child, node);
	parentp = pctrie_child(ptree, node, index);
	pctrie_node_store(parentp, child, PCTRIE_LOCKED);
}
962 
/*
 * Remove the specified index from the tree, and return the value stored at
 * that index.  If the index is not present, return NULL.
 */
uint64_t *
pctrie_remove_lookup(struct pctrie *ptree, uint64_t index,
    struct pctrie_node **freenode)
{
	struct pctrie_node *child, *node;
	uint64_t *m;
	int slot;

	/* Walk to the bottom, remembering the parent of the final child. */
	node = NULL;
	child = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
	while (!pctrie_isleaf(child)) {
		node = child;
		slot = pctrie_slot(node, index);
		child = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	if ((m = pctrie_match_value(child, index)) != NULL)
		pctrie_remove(ptree, node, index, freenode);
	else
		*freenode = NULL;
	return (m);
}
989 
/*
 * Remove from the trie the leaf last chosen by the iterator, and
 * adjust the path if its last member is to be freed.
 */
void
pctrie_iter_remove(struct pctrie_iter *it, struct pctrie_node **freenode)
{
	KASSERT(NULL != pctrie_match_value(pctrie_node_load(pctrie_child(
	    it->ptree, it->node, it->index), NULL, PCTRIE_LOCKED), it->index),
	    ("%s: removing value %jx not at iter", __func__,
	    (uintmax_t)it->index));
	pctrie_remove(it->ptree, it->node, it->index, freenode);
	/* The purged node was the end of the iter path; step up one level. */
	if (*freenode != NULL)
		it->node = pctrie_parent(it->node);
}
1005 
1006 /*
1007  * Return the current leaf, assuming access is externally synchronized by a
1008  * lock.
1009  */
1010 uint64_t *
1011 pctrie_iter_value(struct pctrie_iter *it)
1012 {
1013 	struct pctrie_node *node;
1014 
1015 	node = pctrie_node_load(pctrie_child(it->ptree, it->node, it->index),
1016 	    NULL, PCTRIE_LOCKED);
1017 	return (pctrie_toval(node));
1018 }
1019 
1020 /*
1021  * Walk the subtrie rooted at *pnode in order, invoking callback on leaves,
1022  * until an interior node is stripped of all children, and returned for
1023  * deallocation, with *pnode left pointing to the parent of that node.
1024  */
1025 static __always_inline struct pctrie_node *
1026 pctrie_reclaim_prune(struct pctrie_node **pnode, struct pctrie_node *parent,
1027     pctrie_cb_t callback, int keyoff, void *arg)
1028 {
1029 	struct pctrie_node *child, *node;
1030 	int slot;
1031 
1032 	node = *pnode;
1033 	while (node->pn_popmap != 0) {
1034 		slot = ffs(node->pn_popmap) - 1;
1035 		node->pn_popmap ^= 1 << slot;
1036 		child = pctrie_node_load(&node->pn_child[slot], NULL,
1037 		    PCTRIE_UNSERIALIZED);
1038 		pctrie_node_store(&node->pn_child[slot], PCTRIE_NULL,
1039 		    PCTRIE_UNSERIALIZED);
1040 		if (pctrie_isleaf(child)) {
1041 			if (callback != NULL)
1042 				callback(pctrie_toptr(child, keyoff), arg);
1043 			continue;
1044 		}
1045 		/* Climb one level down the trie. */
1046 		parent = node;
1047 		node = child;
1048 	}
1049 	*pnode = parent;
1050 	return (node);
1051 }
1052 
1053 /*
1054  * Recover the node parent from its first child and continue pruning.
1055  */
1056 static __always_inline struct pctrie_node *
1057 pctrie_reclaim_resume_compound(struct pctrie_node **pnode,
1058     pctrie_cb_t callback, int keyoff, void *arg)
1059 {
1060 	if (*pnode == NULL)
1061 		return (NULL);
1062 	/* Climb one level up the trie. */
1063 	return (pctrie_reclaim_prune(pnode, pctrie_parent(*pnode), callback,
1064 	    keyoff, arg));
1065 }
1066 
1067 /*
1068  * Find the trie root, and start pruning with a NULL parent.
1069  */
1070 static __always_inline struct pctrie_node *
1071 pctrie_reclaim_begin_compound(struct pctrie_node **pnode,
1072     struct pctrie *ptree,
1073     pctrie_cb_t callback, int keyoff, void *arg)
1074 {
1075 	struct pctrie_node *node;
1076 
1077 	node = pctrie_root_load(ptree, NULL, PCTRIE_UNSERIALIZED);
1078 	pctrie_node_store(pctrie_root(ptree), PCTRIE_NULL, PCTRIE_UNSERIALIZED);
1079 	if (pctrie_isleaf(node)) {
1080 		if (callback != NULL && node != PCTRIE_NULL)
1081 			callback(pctrie_toptr(node, keyoff), arg);
1082 		return (NULL);
1083 	}
1084 	*pnode = node;
1085 	return (pctrie_reclaim_prune(pnode, NULL, callback, keyoff, arg));
1086 }
1087 
/*
 * Continue a trie reclaim begun with pctrie_reclaim_begin(), with no
 * per-leaf callback.  Returns the next stripped node for deallocation,
 * or NULL when there is nothing left to prune.
 */
struct pctrie_node *
pctrie_reclaim_resume(struct pctrie_node **pnode)
{
	return (pctrie_reclaim_resume_compound(pnode, NULL, 0, NULL));
}
1093 
/*
 * Begin reclaiming all nodes of the trie rooted at 'ptree', with no
 * per-leaf callback.  Returns the first stripped node for deallocation,
 * or NULL if the root was empty or a single leaf.
 */
struct pctrie_node *
pctrie_reclaim_begin(struct pctrie_node **pnode, struct pctrie *ptree)
{
	return (pctrie_reclaim_begin_compound(pnode, ptree, NULL, 0, NULL));
}
1099 
/*
 * As pctrie_reclaim_resume(), but additionally invokes 'callback' on each
 * reclaimed leaf, passing the leaf pointer adjusted by 'keyoff' and 'arg'.
 */
struct pctrie_node *
pctrie_reclaim_resume_cb(struct pctrie_node **pnode,
    pctrie_cb_t callback, int keyoff, void *arg)
{
	return (pctrie_reclaim_resume_compound(pnode, callback, keyoff, arg));
}
1106 
/*
 * As pctrie_reclaim_begin(), but additionally invokes 'callback' on each
 * reclaimed leaf, passing the leaf pointer adjusted by 'keyoff' and 'arg'.
 */
struct pctrie_node *
pctrie_reclaim_begin_cb(struct pctrie_node **pnode, struct pctrie *ptree,
    pctrie_cb_t callback, int keyoff, void *arg)
{
	return (pctrie_reclaim_begin_compound(pnode, ptree,
	    callback, keyoff, arg));
}
1114 
1115 /*
1116  * Replace an existing value in the trie with another one.
1117  * Panics if there is not an old value in the trie at the new value's index.
1118  */
uint64_t *
pctrie_replace(struct pctrie *ptree, uint64_t *newval)
{
	struct pctrie_node *leaf, *parent, *node;
	uint64_t *m;
	uint64_t index;
	int slot;

	leaf = pctrie_toleaf(newval);
	/* The index of a value is the value itself. */
	index = *newval;
	node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
	parent = NULL;
	for (;;) {
		if (pctrie_isleaf(node)) {
			if ((m = pctrie_toval(node)) != NULL && *m == index) {
				/*
				 * Overwrite the slot pointing at the old
				 * leaf.  When parent != NULL, 'slot' was set
				 * by pctrie_keybarr() on the iteration that
				 * loaded this leaf from 'parent'.
				 */
				if (parent == NULL)
					pctrie_node_store(pctrie_root(ptree),
					    leaf, PCTRIE_LOCKED);
				else
					pctrie_node_store(
					    &parent->pn_child[slot], leaf,
					    PCTRIE_LOCKED);
				return (m);
			}
			break;
		}
		/* A key-prefix mismatch means the index is not present. */
		if (pctrie_keybarr(node, index, &slot))
			break;
		parent = node;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	panic("%s: original replacing value not found", __func__);
}
1153 
1154 #ifdef DDB
1155 /*
1156  * Show details about the given node.
1157  */
1158 DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
1159 {
1160 	struct pctrie_node *node, *tmp;
1161 	int slot;
1162 	pn_popmap_t popmap;
1163 
1164         if (!have_addr)
1165                 return;
1166 	node = (struct pctrie_node *)addr;
1167 	db_printf("node %p, owner %jx, children popmap %04x, level %u:\n",
1168 	    (void *)node, (uintmax_t)node->pn_owner, node->pn_popmap,
1169 	    node->pn_clev / PCTRIE_WIDTH);
1170 	for (popmap = node->pn_popmap; popmap != 0; popmap ^= 1 << slot) {
1171 		slot = ffs(popmap) - 1;
1172 		tmp = pctrie_node_load(&node->pn_child[slot], NULL,
1173 		    PCTRIE_UNSERIALIZED);
1174 		db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
1175 		    slot, (void *)tmp,
1176 		    pctrie_isleaf(tmp) ? pctrie_toval(tmp) : NULL,
1177 		    node->pn_clev / PCTRIE_WIDTH);
1178 	}
1179 }
1180 #endif /* DDB */
1181