xref: /freebsd/sys/kern/subr_pctrie.c (revision 3806950135d2c8633ec0764e8807eacc87cf3e10)
1 /*
2  * Copyright (c) 2013 EMC Corp.
3  * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
4  * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  */
29 
30 /*
31  * Path-compressed radix trie implementation.
32  *
33  * The implementation takes into account the following rationale:
34  * - Size of the nodes should be as small as possible but still big enough
35  *   to avoid a large maximum depth for the trie.  This is a balance
36  *   between the necessity to not wire too much physical memory for the nodes
37  *   and the necessity to avoid too much cache pollution during the trie
38  *   operations.
39  * - There is not a huge bias toward the number of lookup operations over
40  *   the number of insert and remove operations.  This basically implies
41  *   that optimizations supposedly helping one operation but hurting the
42  *   other might be carefully evaluated.
43  * - On average not many nodes are expected to be fully populated, hence
44  *   level compression may just complicate things.
45  */
46 
47 #include <sys/cdefs.h>
48 __FBSDID("$FreeBSD$");
49 
50 #include "opt_ddb.h"
51 
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/kernel.h>
55 #include <sys/pctrie.h>
56 
57 #ifdef DDB
58 #include <ddb/ddb.h>
59 #endif
60 
61 #define	PCTRIE_MASK	(PCTRIE_COUNT - 1)
62 #define	PCTRIE_LIMIT	(howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1)
63 
64 /* Flag bits stored in node pointers. */
65 #define	PCTRIE_ISLEAF	0x1
66 #define	PCTRIE_FLAGS	0x1
67 #define	PCTRIE_PAD	PCTRIE_FLAGS
68 
69 /* Returns one unit associated with specified level. */
70 #define	PCTRIE_UNITLEVEL(lev)						\
71 	((uint64_t)1 << ((lev) * PCTRIE_WIDTH))
72 
73 struct pctrie_node {
74 	uint64_t	 pn_owner;			/* Owner of record. */
75 	uint16_t	 pn_count;			/* Valid children. */
76 	uint16_t	 pn_clev;			/* Current level. */
77 	void		*pn_child[PCTRIE_COUNT];	/* Child nodes. */
78 };
79 
80 /*
81  * Allocate a node.  Pre-allocation should ensure that the request
82  * will always be satisfied.
83  */
84 static __inline struct pctrie_node *
85 pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner,
86     uint16_t count, uint16_t clevel)
87 {
88 	struct pctrie_node *node;
89 
90 	node = allocfn(ptree);
91 	if (node == NULL)
92 		return (NULL);
93 	node->pn_owner = owner;
94 	node->pn_count = count;
95 	node->pn_clev = clevel;
96 
97 	return (node);
98 }
99 
100 /*
101  * Free radix node.
102  */
103 static __inline void
104 pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node,
105     pctrie_free_t freefn)
106 {
107 #ifdef INVARIANTS
108 	int slot;
109 
110 	KASSERT(node->pn_count == 0,
111 	    ("pctrie_node_put: node %p has %d children", node,
112 	    node->pn_count));
113 	for (slot = 0; slot < PCTRIE_COUNT; slot++)
114 		KASSERT(node->pn_child[slot] == NULL,
115 		    ("pctrie_node_put: node %p has a child", node));
116 #endif
117 	freefn(ptree, node);
118 }
119 
120 /*
121  * Return the position in the array for a given level.
122  */
123 static __inline int
124 pctrie_slot(uint64_t index, uint16_t level)
125 {
126 
127 	return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK);
128 }
129 
130 /* Trims the key after the specified level. */
131 static __inline uint64_t
132 pctrie_trimkey(uint64_t index, uint16_t level)
133 {
134 	uint64_t ret;
135 
136 	ret = index;
137 	if (level > 0) {
138 		ret >>= level * PCTRIE_WIDTH;
139 		ret <<= level * PCTRIE_WIDTH;
140 	}
141 	return (ret);
142 }
143 
144 /*
145  * Get the root node for a tree.
146  */
147 static __inline struct pctrie_node *
148 pctrie_getroot(struct pctrie *ptree)
149 {
150 
151 	return ((struct pctrie_node *)ptree->pt_root);
152 }
153 
154 /*
155  * Set the root node for a tree.
156  */
157 static __inline void
158 pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node)
159 {
160 
161 	ptree->pt_root = (uintptr_t)node;
162 }
163 
164 /*
165  * Returns TRUE if the specified node is a leaf and FALSE otherwise.
166  */
167 static __inline boolean_t
168 pctrie_isleaf(struct pctrie_node *node)
169 {
170 
171 	return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
172 }
173 
174 /*
175  * Returns the associated val extracted from node.
176  */
177 static __inline uint64_t *
178 pctrie_toval(struct pctrie_node *node)
179 {
180 
181 	return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
182 }
183 
184 /*
185  * Adds the val as a child of the provided node.
186  */
187 static __inline void
188 pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev,
189     uint64_t *val)
190 {
191 	int slot;
192 
193 	slot = pctrie_slot(index, clev);
194 	node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF);
195 }
196 
197 /*
198  * Returns the slot where two keys differ.
199  * It cannot accept 2 equal keys.
200  */
201 static __inline uint16_t
202 pctrie_keydiff(uint64_t index1, uint64_t index2)
203 {
204 	uint16_t clev;
205 
206 	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
207 	    __func__, (uintmax_t)index1));
208 
209 	index1 ^= index2;
210 	for (clev = PCTRIE_LIMIT;; clev--)
211 		if (pctrie_slot(index1, clev) != 0)
212 			return (clev);
213 }
214 
215 /*
216  * Returns TRUE if it can be determined that key does not belong to the
217  * specified node.  Otherwise, returns FALSE.
218  */
219 static __inline boolean_t
220 pctrie_keybarr(struct pctrie_node *node, uint64_t idx)
221 {
222 
223 	if (node->pn_clev < PCTRIE_LIMIT) {
224 		idx = pctrie_trimkey(idx, node->pn_clev + 1);
225 		return (idx != node->pn_owner);
226 	}
227 	return (FALSE);
228 }
229 
230 /*
231  * Internal helper for pctrie_reclaim_allnodes().
232  * This function is recursive.
233  */
234 static void
235 pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node,
236     pctrie_free_t freefn)
237 {
238 	int slot;
239 
240 	KASSERT(node->pn_count <= PCTRIE_COUNT,
241 	    ("pctrie_reclaim_allnodes_int: bad count in node %p", node));
242 	for (slot = 0; node->pn_count != 0; slot++) {
243 		if (node->pn_child[slot] == NULL)
244 			continue;
245 		if (!pctrie_isleaf(node->pn_child[slot]))
246 			pctrie_reclaim_allnodes_int(ptree,
247 			    node->pn_child[slot], freefn);
248 		node->pn_child[slot] = NULL;
249 		node->pn_count--;
250 	}
251 	pctrie_node_put(ptree, node, freefn);
252 }
253 
254 /*
255  * pctrie node zone initializer.
256  */
257 int
258 pctrie_zone_init(void *mem, int size __unused, int flags __unused)
259 {
260 	struct pctrie_node *node;
261 
262 	node = mem;
263 	memset(node->pn_child, 0, sizeof(node->pn_child));
264 	return (0);
265 }
266 
267 size_t
268 pctrie_node_size(void)
269 {
270 
271 	return (sizeof(struct pctrie_node));
272 }
273 
274 /*
275  * Inserts the key-value pair into the trie.
276  * Panics if the key already exists.
277  */
278 int
279 pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn)
280 {
281 	uint64_t index, newind;
282 	void **parentp;
283 	struct pctrie_node *node, *tmp;
284 	uint64_t *m;
285 	int slot;
286 	uint16_t clev;
287 
288 	index = *val;
289 
290 	/*
291 	 * The owner of record for root is not really important because it
292 	 * will never be used.
293 	 */
294 	node = pctrie_getroot(ptree);
295 	if (node == NULL) {
296 		ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF;
297 		return (0);
298 	}
299 	parentp = (void **)&ptree->pt_root;
300 	for (;;) {
301 		if (pctrie_isleaf(node)) {
302 			m = pctrie_toval(node);
303 			if (*m == index)
304 				panic("%s: key %jx is already present",
305 				    __func__, (uintmax_t)index);
306 			clev = pctrie_keydiff(*m, index);
307 			tmp = pctrie_node_get(ptree, allocfn,
308 			    pctrie_trimkey(index, clev + 1), 2, clev);
309 			if (tmp == NULL)
310 				return (ENOMEM);
311 			*parentp = tmp;
312 			pctrie_addval(tmp, index, clev, val);
313 			pctrie_addval(tmp, *m, clev, m);
314 			return (0);
315 		} else if (pctrie_keybarr(node, index))
316 			break;
317 		slot = pctrie_slot(index, node->pn_clev);
318 		if (node->pn_child[slot] == NULL) {
319 			node->pn_count++;
320 			pctrie_addval(node, index, node->pn_clev, val);
321 			return (0);
322 		}
323 		parentp = &node->pn_child[slot];
324 		node = node->pn_child[slot];
325 	}
326 
327 	/*
328 	 * A new node is needed because the right insertion level is reached.
329 	 * Setup the new intermediate node and add the 2 children: the
330 	 * new object and the older edge.
331 	 */
332 	newind = node->pn_owner;
333 	clev = pctrie_keydiff(newind, index);
334 	tmp = pctrie_node_get(ptree, allocfn,
335 	    pctrie_trimkey(index, clev + 1), 2, clev);
336 	if (tmp == NULL)
337 		return (ENOMEM);
338 	*parentp = tmp;
339 	pctrie_addval(tmp, index, clev, val);
340 	slot = pctrie_slot(newind, clev);
341 	tmp->pn_child[slot] = node;
342 
343 	return (0);
344 }
345 
346 /*
347  * Returns the value stored at the index.  If the index is not present,
348  * NULL is returned.
349  */
350 uint64_t *
351 pctrie_lookup(struct pctrie *ptree, uint64_t index)
352 {
353 	struct pctrie_node *node;
354 	uint64_t *m;
355 	int slot;
356 
357 	node = pctrie_getroot(ptree);
358 	while (node != NULL) {
359 		if (pctrie_isleaf(node)) {
360 			m = pctrie_toval(node);
361 			if (*m == index)
362 				return (m);
363 			else
364 				break;
365 		} else if (pctrie_keybarr(node, index))
366 			break;
367 		slot = pctrie_slot(index, node->pn_clev);
368 		node = node->pn_child[slot];
369 	}
370 	return (NULL);
371 }
372 
373 /*
374  * Look up the nearest entry at a position bigger than or equal to index.
375  */
376 uint64_t *
377 pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
378 {
379 	struct pctrie_node *stack[PCTRIE_LIMIT];
380 	uint64_t inc;
381 	uint64_t *m;
382 	struct pctrie_node *child, *node;
383 #ifdef INVARIANTS
384 	int loops = 0;
385 #endif
386 	int slot, tos;
387 
388 	node = pctrie_getroot(ptree);
389 	if (node == NULL)
390 		return (NULL);
391 	else if (pctrie_isleaf(node)) {
392 		m = pctrie_toval(node);
393 		if (*m >= index)
394 			return (m);
395 		else
396 			return (NULL);
397 	}
398 	tos = 0;
399 	for (;;) {
400 		/*
401 		 * If the keys differ before the current bisection node,
402 		 * then the search key might rollback to the earliest
403 		 * available bisection node or to the smallest key
404 		 * in the current node (if the owner is bigger than the
405 		 * search key).
406 		 */
407 		if (pctrie_keybarr(node, index)) {
408 			if (index > node->pn_owner) {
409 ascend:
410 				KASSERT(++loops < 1000,
411 				    ("pctrie_lookup_ge: too many loops"));
412 
413 				/*
414 				 * Pop nodes from the stack until either the
415 				 * stack is empty or a node that could have a
416 				 * matching descendant is found.
417 				 */
418 				do {
419 					if (tos == 0)
420 						return (NULL);
421 					node = stack[--tos];
422 				} while (pctrie_slot(index,
423 				    node->pn_clev) == (PCTRIE_COUNT - 1));
424 
425 				/*
426 				 * The following computation cannot overflow
427 				 * because index's slot at the current level
428 				 * is less than PCTRIE_COUNT - 1.
429 				 */
430 				index = pctrie_trimkey(index,
431 				    node->pn_clev);
432 				index += PCTRIE_UNITLEVEL(node->pn_clev);
433 			} else
434 				index = node->pn_owner;
435 			KASSERT(!pctrie_keybarr(node, index),
436 			    ("pctrie_lookup_ge: keybarr failed"));
437 		}
438 		slot = pctrie_slot(index, node->pn_clev);
439 		child = node->pn_child[slot];
440 		if (pctrie_isleaf(child)) {
441 			m = pctrie_toval(child);
442 			if (*m >= index)
443 				return (m);
444 		} else if (child != NULL)
445 			goto descend;
446 
447 		/*
448 		 * Look for an available edge or val within the current
449 		 * bisection node.
450 		 */
451                 if (slot < (PCTRIE_COUNT - 1)) {
452 			inc = PCTRIE_UNITLEVEL(node->pn_clev);
453 			index = pctrie_trimkey(index, node->pn_clev);
454 			do {
455 				index += inc;
456 				slot++;
457 				child = node->pn_child[slot];
458 				if (pctrie_isleaf(child)) {
459 					m = pctrie_toval(child);
460 					if (*m >= index)
461 						return (m);
462 				} else if (child != NULL)
463 					goto descend;
464 			} while (slot < (PCTRIE_COUNT - 1));
465 		}
466 		KASSERT(child == NULL || pctrie_isleaf(child),
467 		    ("pctrie_lookup_ge: child is radix node"));
468 
469 		/*
470 		 * If a value or edge bigger than the search slot is not found
471 		 * in the current node, ascend to the next higher-level node.
472 		 */
473 		goto ascend;
474 descend:
475 		KASSERT(node->pn_clev > 0,
476 		    ("pctrie_lookup_ge: pushing leaf's parent"));
477 		KASSERT(tos < PCTRIE_LIMIT,
478 		    ("pctrie_lookup_ge: stack overflow"));
479 		stack[tos++] = node;
480 		node = child;
481 	}
482 }
483 
484 /*
485  * Look up the nearest entry at a position less than or equal to index.
486  */
487 uint64_t *
488 pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
489 {
490 	struct pctrie_node *stack[PCTRIE_LIMIT];
491 	uint64_t inc;
492 	uint64_t *m;
493 	struct pctrie_node *child, *node;
494 #ifdef INVARIANTS
495 	int loops = 0;
496 #endif
497 	int slot, tos;
498 
499 	node = pctrie_getroot(ptree);
500 	if (node == NULL)
501 		return (NULL);
502 	else if (pctrie_isleaf(node)) {
503 		m = pctrie_toval(node);
504 		if (*m <= index)
505 			return (m);
506 		else
507 			return (NULL);
508 	}
509 	tos = 0;
510 	for (;;) {
511 		/*
512 		 * If the keys differ before the current bisection node,
513 		 * then the search key might rollback to the earliest
514 		 * available bisection node or to the largest key
515 		 * in the current node (if the owner is smaller than the
516 		 * search key).
517 		 */
518 		if (pctrie_keybarr(node, index)) {
519 			if (index > node->pn_owner) {
520 				index = node->pn_owner + PCTRIE_COUNT *
521 				    PCTRIE_UNITLEVEL(node->pn_clev);
522 			} else {
523 ascend:
524 				KASSERT(++loops < 1000,
525 				    ("pctrie_lookup_le: too many loops"));
526 
527 				/*
528 				 * Pop nodes from the stack until either the
529 				 * stack is empty or a node that could have a
530 				 * matching descendant is found.
531 				 */
532 				do {
533 					if (tos == 0)
534 						return (NULL);
535 					node = stack[--tos];
536 				} while (pctrie_slot(index,
537 				    node->pn_clev) == 0);
538 
539 				/*
540 				 * The following computation cannot overflow
541 				 * because index's slot at the current level
542 				 * is greater than 0.
543 				 */
544 				index = pctrie_trimkey(index,
545 				    node->pn_clev);
546 			}
547 			index--;
548 			KASSERT(!pctrie_keybarr(node, index),
549 			    ("pctrie_lookup_le: keybarr failed"));
550 		}
551 		slot = pctrie_slot(index, node->pn_clev);
552 		child = node->pn_child[slot];
553 		if (pctrie_isleaf(child)) {
554 			m = pctrie_toval(child);
555 			if (*m <= index)
556 				return (m);
557 		} else if (child != NULL)
558 			goto descend;
559 
560 		/*
561 		 * Look for an available edge or value within the current
562 		 * bisection node.
563 		 */
564 		if (slot > 0) {
565 			inc = PCTRIE_UNITLEVEL(node->pn_clev);
566 			index |= inc - 1;
567 			do {
568 				index -= inc;
569 				slot--;
570 				child = node->pn_child[slot];
571 				if (pctrie_isleaf(child)) {
572 					m = pctrie_toval(child);
573 					if (*m <= index)
574 						return (m);
575 				} else if (child != NULL)
576 					goto descend;
577 			} while (slot > 0);
578 		}
579 		KASSERT(child == NULL || pctrie_isleaf(child),
580 		    ("pctrie_lookup_le: child is radix node"));
581 
582 		/*
583 		 * If a value or edge smaller than the search slot is not found
584 		 * in the current node, ascend to the next higher-level node.
585 		 */
586 		goto ascend;
587 descend:
588 		KASSERT(node->pn_clev > 0,
589 		    ("pctrie_lookup_le: pushing leaf's parent"));
590 		KASSERT(tos < PCTRIE_LIMIT,
591 		    ("pctrie_lookup_le: stack overflow"));
592 		stack[tos++] = node;
593 		node = child;
594 	}
595 }
596 
597 /*
598  * Remove the specified index from the tree.
599  * Panics if the key is not present.
600  */
601 void
602 pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn)
603 {
604 	struct pctrie_node *node, *parent;
605 	uint64_t *m;
606 	int i, slot;
607 
608 	node = pctrie_getroot(ptree);
609 	if (pctrie_isleaf(node)) {
610 		m = pctrie_toval(node);
611 		if (*m != index)
612 			panic("%s: invalid key found", __func__);
613 		pctrie_setroot(ptree, NULL);
614 		return;
615 	}
616 	parent = NULL;
617 	for (;;) {
618 		if (node == NULL)
619 			panic("pctrie_remove: impossible to locate the key");
620 		slot = pctrie_slot(index, node->pn_clev);
621 		if (pctrie_isleaf(node->pn_child[slot])) {
622 			m = pctrie_toval(node->pn_child[slot]);
623 			if (*m != index)
624 				panic("%s: invalid key found", __func__);
625 			node->pn_child[slot] = NULL;
626 			node->pn_count--;
627 			if (node->pn_count > 1)
628 				break;
629 			for (i = 0; i < PCTRIE_COUNT; i++)
630 				if (node->pn_child[i] != NULL)
631 					break;
632 			KASSERT(i != PCTRIE_COUNT,
633 			    ("%s: invalid node configuration", __func__));
634 			if (parent == NULL)
635 				pctrie_setroot(ptree, node->pn_child[i]);
636 			else {
637 				slot = pctrie_slot(index, parent->pn_clev);
638 				KASSERT(parent->pn_child[slot] == node,
639 				    ("%s: invalid child value", __func__));
640 				parent->pn_child[slot] = node->pn_child[i];
641 			}
642 			node->pn_count--;
643 			node->pn_child[i] = NULL;
644 			pctrie_node_put(ptree, node, freefn);
645 			break;
646 		}
647 		parent = node;
648 		node = node->pn_child[slot];
649 	}
650 }
651 
652 /*
653  * Remove and free all the nodes from the tree.
654  * This function is recursive but there is a tight control on it as the
655  * maximum depth of the tree is fixed.
656  */
657 void
658 pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn)
659 {
660 	struct pctrie_node *root;
661 
662 	root = pctrie_getroot(ptree);
663 	if (root == NULL)
664 		return;
665 	pctrie_setroot(ptree, NULL);
666 	if (!pctrie_isleaf(root))
667 		pctrie_reclaim_allnodes_int(ptree, root, freefn);
668 }
669 
670 #ifdef DDB
671 /*
672  * Show details about the given node.
673  */
674 DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
675 {
676 	struct pctrie_node *node;
677 	int i;
678 
679         if (!have_addr)
680                 return;
681 	node = (struct pctrie_node *)addr;
682 	db_printf("node %p, owner %jx, children count %u, level %u:\n",
683 	    (void *)node, (uintmax_t)node->pn_owner, node->pn_count,
684 	    node->pn_clev);
685 	for (i = 0; i < PCTRIE_COUNT; i++)
686 		if (node->pn_child[i] != NULL)
687 			db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
688 			    i, (void *)node->pn_child[i],
689 			    pctrie_isleaf(node->pn_child[i]) ?
690 			    pctrie_toval(node->pn_child[i]) : NULL,
691 			    node->pn_clev);
692 }
693 #endif /* DDB */
694