xref: /freebsd/sys/kern/subr_pctrie.c (revision 7447ca0eb235974642312b9555caec00b57d8fc1)
1 /*
2  * Copyright (c) 2013 EMC Corp.
3  * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
4  * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  */
29 
30 /*
31  * Path-compressed radix trie implementation.
32  *
33  * The implementation takes into account the following rationale:
34  * - Size of the nodes should be as small as possible but still big enough
35  *   to avoid a large maximum depth for the trie.  This is a balance
36  *   between the necessity to not wire too much physical memory for the nodes
37  *   and the necessity to avoid too much cache pollution during the trie
38  *   operations.
39  * - There is not a huge bias toward the number of lookup operations over
40  *   the number of insert and remove operations.  This basically implies
41  *   that optimizations supposedly helping one operation but hurting the
42  *   other might be carefully evaluated.
43  * - On average not many nodes are expected to be fully populated, hence
44  *   level compression may just complicate things.
45  */
46 
47 #include <sys/cdefs.h>
48 __FBSDID("$FreeBSD$");
49 
50 #include "opt_ddb.h"
51 
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/kernel.h>
55 #include <sys/pctrie.h>
56 
57 #ifdef DDB
58 #include <ddb/ddb.h>
59 #endif
60 
61 /*
62  * These widths should allow the pointers to a node's children to fit within
63  * a single cache line.  The extra levels from a narrow width should not be
64  * a problem thanks to path compression.
65  */
66 #ifdef __LP64__
67 #define	PCTRIE_WIDTH	4
68 #else
69 #define	PCTRIE_WIDTH	3
70 #endif
71 
72 #define	PCTRIE_COUNT	(1 << PCTRIE_WIDTH)
73 #define	PCTRIE_MASK	(PCTRIE_COUNT - 1)
74 #define	PCTRIE_LIMIT	(howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1)
75 
76 /* Flag bits stored in node pointers. */
77 #define	PCTRIE_ISLEAF	0x1
78 #define	PCTRIE_FLAGS	0x1
79 #define	PCTRIE_PAD	PCTRIE_FLAGS
80 
81 /* Returns one unit associated with specified level. */
82 #define	PCTRIE_UNITLEVEL(lev)						\
83 	((uint64_t)1 << ((lev) * PCTRIE_WIDTH))
84 
85 struct pctrie_node {
86 	uint64_t	 pn_owner;			/* Owner of record. */
87 	uint16_t	 pn_count;			/* Valid children. */
88 	uint16_t	 pn_clev;			/* Current level. */
89 	void		*pn_child[PCTRIE_COUNT];	/* Child nodes. */
90 };
91 
92 /*
93  * Allocate a node.  Pre-allocation should ensure that the request
94  * will always be satisfied.
95  */
96 static __inline struct pctrie_node *
97 pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner,
98     uint16_t count, uint16_t clevel)
99 {
100 	struct pctrie_node *node;
101 
102 	node = allocfn(ptree);
103 	if (node == NULL)
104 		return (NULL);
105 	node->pn_owner = owner;
106 	node->pn_count = count;
107 	node->pn_clev = clevel;
108 
109 	return (node);
110 }
111 
112 /*
113  * Free radix node.
114  */
115 static __inline void
116 pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node,
117     pctrie_free_t freefn)
118 {
119 #ifdef INVARIANTS
120 	int slot;
121 
122 	KASSERT(node->pn_count == 0,
123 	    ("pctrie_node_put: node %p has %d children", node,
124 	    node->pn_count));
125 	for (slot = 0; slot < PCTRIE_COUNT; slot++)
126 		KASSERT(node->pn_child[slot] == NULL,
127 		    ("pctrie_node_put: node %p has a child", node));
128 #endif
129 	freefn(ptree, node);
130 }
131 
132 /*
133  * Return the position in the array for a given level.
134  */
135 static __inline int
136 pctrie_slot(uint64_t index, uint16_t level)
137 {
138 
139 	return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK);
140 }
141 
142 /* Trims the key after the specified level. */
143 static __inline uint64_t
144 pctrie_trimkey(uint64_t index, uint16_t level)
145 {
146 	uint64_t ret;
147 
148 	ret = index;
149 	if (level > 0) {
150 		ret >>= level * PCTRIE_WIDTH;
151 		ret <<= level * PCTRIE_WIDTH;
152 	}
153 	return (ret);
154 }
155 
156 /*
157  * Get the root node for a tree.
158  */
159 static __inline struct pctrie_node *
160 pctrie_getroot(struct pctrie *ptree)
161 {
162 
163 	return ((struct pctrie_node *)ptree->pt_root);
164 }
165 
166 /*
167  * Set the root node for a tree.
168  */
169 static __inline void
170 pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node)
171 {
172 
173 	ptree->pt_root = (uintptr_t)node;
174 }
175 
176 /*
177  * Returns TRUE if the specified node is a leaf and FALSE otherwise.
178  */
179 static __inline boolean_t
180 pctrie_isleaf(struct pctrie_node *node)
181 {
182 
183 	return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
184 }
185 
186 /*
187  * Returns the associated val extracted from node.
188  */
189 static __inline uint64_t *
190 pctrie_toval(struct pctrie_node *node)
191 {
192 
193 	return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
194 }
195 
196 /*
197  * Adds the val as a child of the provided node.
198  */
199 static __inline void
200 pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev,
201     uint64_t *val)
202 {
203 	int slot;
204 
205 	slot = pctrie_slot(index, clev);
206 	node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF);
207 }
208 
209 /*
210  * Returns the slot where two keys differ.
211  * It cannot accept 2 equal keys.
212  */
213 static __inline uint16_t
214 pctrie_keydiff(uint64_t index1, uint64_t index2)
215 {
216 	uint16_t clev;
217 
218 	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
219 	    __func__, (uintmax_t)index1));
220 
221 	index1 ^= index2;
222 	for (clev = PCTRIE_LIMIT;; clev--)
223 		if (pctrie_slot(index1, clev) != 0)
224 			return (clev);
225 }
226 
227 /*
228  * Returns TRUE if it can be determined that key does not belong to the
229  * specified node.  Otherwise, returns FALSE.
230  */
231 static __inline boolean_t
232 pctrie_keybarr(struct pctrie_node *node, uint64_t idx)
233 {
234 
235 	if (node->pn_clev < PCTRIE_LIMIT) {
236 		idx = pctrie_trimkey(idx, node->pn_clev + 1);
237 		return (idx != node->pn_owner);
238 	}
239 	return (FALSE);
240 }
241 
242 /*
243  * Internal helper for pctrie_reclaim_allnodes().
244  * This function is recursive.
245  */
246 static void
247 pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node,
248     pctrie_free_t freefn)
249 {
250 	int slot;
251 
252 	KASSERT(node->pn_count <= PCTRIE_COUNT,
253 	    ("pctrie_reclaim_allnodes_int: bad count in node %p", node));
254 	for (slot = 0; node->pn_count != 0; slot++) {
255 		if (node->pn_child[slot] == NULL)
256 			continue;
257 		if (!pctrie_isleaf(node->pn_child[slot]))
258 			pctrie_reclaim_allnodes_int(ptree,
259 			    node->pn_child[slot], freefn);
260 		node->pn_child[slot] = NULL;
261 		node->pn_count--;
262 	}
263 	pctrie_node_put(ptree, node, freefn);
264 }
265 
266 /*
267  * pctrie node zone initializer.
268  */
269 int
270 pctrie_zone_init(void *mem, int size __unused, int flags __unused)
271 {
272 	struct pctrie_node *node;
273 
274 	node = mem;
275 	memset(node->pn_child, 0, sizeof(node->pn_child));
276 	return (0);
277 }
278 
279 size_t
280 pctrie_node_size(void)
281 {
282 
283 	return (sizeof(struct pctrie_node));
284 }
285 
286 /*
287  * Inserts the key-value pair into the trie.
288  * Panics if the key already exists.
289  */
290 int
291 pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn)
292 {
293 	uint64_t index, newind;
294 	void **parentp;
295 	struct pctrie_node *node, *tmp;
296 	uint64_t *m;
297 	int slot;
298 	uint16_t clev;
299 
300 	index = *val;
301 
302 	/*
303 	 * The owner of record for root is not really important because it
304 	 * will never be used.
305 	 */
306 	node = pctrie_getroot(ptree);
307 	if (node == NULL) {
308 		ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF;
309 		return (0);
310 	}
311 	parentp = (void **)&ptree->pt_root;
312 	for (;;) {
313 		if (pctrie_isleaf(node)) {
314 			m = pctrie_toval(node);
315 			if (*m == index)
316 				panic("%s: key %jx is already present",
317 				    __func__, (uintmax_t)index);
318 			clev = pctrie_keydiff(*m, index);
319 			tmp = pctrie_node_get(ptree, allocfn,
320 			    pctrie_trimkey(index, clev + 1), 2, clev);
321 			if (tmp == NULL)
322 				return (ENOMEM);
323 			*parentp = tmp;
324 			pctrie_addval(tmp, index, clev, val);
325 			pctrie_addval(tmp, *m, clev, m);
326 			return (0);
327 		} else if (pctrie_keybarr(node, index))
328 			break;
329 		slot = pctrie_slot(index, node->pn_clev);
330 		if (node->pn_child[slot] == NULL) {
331 			node->pn_count++;
332 			pctrie_addval(node, index, node->pn_clev, val);
333 			return (0);
334 		}
335 		parentp = &node->pn_child[slot];
336 		node = node->pn_child[slot];
337 	}
338 
339 	/*
340 	 * A new node is needed because the right insertion level is reached.
341 	 * Setup the new intermediate node and add the 2 children: the
342 	 * new object and the older edge.
343 	 */
344 	newind = node->pn_owner;
345 	clev = pctrie_keydiff(newind, index);
346 	tmp = pctrie_node_get(ptree, allocfn,
347 	    pctrie_trimkey(index, clev + 1), 2, clev);
348 	if (tmp == NULL)
349 		return (ENOMEM);
350 	*parentp = tmp;
351 	pctrie_addval(tmp, index, clev, val);
352 	slot = pctrie_slot(newind, clev);
353 	tmp->pn_child[slot] = node;
354 
355 	return (0);
356 }
357 
358 /*
359  * Returns the value stored at the index.  If the index is not present,
360  * NULL is returned.
361  */
362 uint64_t *
363 pctrie_lookup(struct pctrie *ptree, uint64_t index)
364 {
365 	struct pctrie_node *node;
366 	uint64_t *m;
367 	int slot;
368 
369 	node = pctrie_getroot(ptree);
370 	while (node != NULL) {
371 		if (pctrie_isleaf(node)) {
372 			m = pctrie_toval(node);
373 			if (*m == index)
374 				return (m);
375 			else
376 				break;
377 		} else if (pctrie_keybarr(node, index))
378 			break;
379 		slot = pctrie_slot(index, node->pn_clev);
380 		node = node->pn_child[slot];
381 	}
382 	return (NULL);
383 }
384 
385 /*
386  * Look up the nearest entry at a position bigger than or equal to index.
387  */
388 uint64_t *
389 pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
390 {
391 	struct pctrie_node *stack[PCTRIE_LIMIT];
392 	uint64_t inc;
393 	uint64_t *m;
394 	struct pctrie_node *child, *node;
395 #ifdef INVARIANTS
396 	int loops = 0;
397 #endif
398 	int slot, tos;
399 
400 	node = pctrie_getroot(ptree);
401 	if (node == NULL)
402 		return (NULL);
403 	else if (pctrie_isleaf(node)) {
404 		m = pctrie_toval(node);
405 		if (*m >= index)
406 			return (m);
407 		else
408 			return (NULL);
409 	}
410 	tos = 0;
411 	for (;;) {
412 		/*
413 		 * If the keys differ before the current bisection node,
414 		 * then the search key might rollback to the earliest
415 		 * available bisection node or to the smallest key
416 		 * in the current node (if the owner is bigger than the
417 		 * search key).
418 		 */
419 		if (pctrie_keybarr(node, index)) {
420 			if (index > node->pn_owner) {
421 ascend:
422 				KASSERT(++loops < 1000,
423 				    ("pctrie_lookup_ge: too many loops"));
424 
425 				/*
426 				 * Pop nodes from the stack until either the
427 				 * stack is empty or a node that could have a
428 				 * matching descendant is found.
429 				 */
430 				do {
431 					if (tos == 0)
432 						return (NULL);
433 					node = stack[--tos];
434 				} while (pctrie_slot(index,
435 				    node->pn_clev) == (PCTRIE_COUNT - 1));
436 
437 				/*
438 				 * The following computation cannot overflow
439 				 * because index's slot at the current level
440 				 * is less than PCTRIE_COUNT - 1.
441 				 */
442 				index = pctrie_trimkey(index,
443 				    node->pn_clev);
444 				index += PCTRIE_UNITLEVEL(node->pn_clev);
445 			} else
446 				index = node->pn_owner;
447 			KASSERT(!pctrie_keybarr(node, index),
448 			    ("pctrie_lookup_ge: keybarr failed"));
449 		}
450 		slot = pctrie_slot(index, node->pn_clev);
451 		child = node->pn_child[slot];
452 		if (pctrie_isleaf(child)) {
453 			m = pctrie_toval(child);
454 			if (*m >= index)
455 				return (m);
456 		} else if (child != NULL)
457 			goto descend;
458 
459 		/*
460 		 * Look for an available edge or val within the current
461 		 * bisection node.
462 		 */
463                 if (slot < (PCTRIE_COUNT - 1)) {
464 			inc = PCTRIE_UNITLEVEL(node->pn_clev);
465 			index = pctrie_trimkey(index, node->pn_clev);
466 			do {
467 				index += inc;
468 				slot++;
469 				child = node->pn_child[slot];
470 				if (pctrie_isleaf(child)) {
471 					m = pctrie_toval(child);
472 					if (*m >= index)
473 						return (m);
474 				} else if (child != NULL)
475 					goto descend;
476 			} while (slot < (PCTRIE_COUNT - 1));
477 		}
478 		KASSERT(child == NULL || pctrie_isleaf(child),
479 		    ("pctrie_lookup_ge: child is radix node"));
480 
481 		/*
482 		 * If a value or edge bigger than the search slot is not found
483 		 * in the current node, ascend to the next higher-level node.
484 		 */
485 		goto ascend;
486 descend:
487 		KASSERT(node->pn_clev > 0,
488 		    ("pctrie_lookup_ge: pushing leaf's parent"));
489 		KASSERT(tos < PCTRIE_LIMIT,
490 		    ("pctrie_lookup_ge: stack overflow"));
491 		stack[tos++] = node;
492 		node = child;
493 	}
494 }
495 
496 /*
497  * Look up the nearest entry at a position less than or equal to index.
498  */
499 uint64_t *
500 pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
501 {
502 	struct pctrie_node *stack[PCTRIE_LIMIT];
503 	uint64_t inc;
504 	uint64_t *m;
505 	struct pctrie_node *child, *node;
506 #ifdef INVARIANTS
507 	int loops = 0;
508 #endif
509 	int slot, tos;
510 
511 	node = pctrie_getroot(ptree);
512 	if (node == NULL)
513 		return (NULL);
514 	else if (pctrie_isleaf(node)) {
515 		m = pctrie_toval(node);
516 		if (*m <= index)
517 			return (m);
518 		else
519 			return (NULL);
520 	}
521 	tos = 0;
522 	for (;;) {
523 		/*
524 		 * If the keys differ before the current bisection node,
525 		 * then the search key might rollback to the earliest
526 		 * available bisection node or to the largest key
527 		 * in the current node (if the owner is smaller than the
528 		 * search key).
529 		 */
530 		if (pctrie_keybarr(node, index)) {
531 			if (index > node->pn_owner) {
532 				index = node->pn_owner + PCTRIE_COUNT *
533 				    PCTRIE_UNITLEVEL(node->pn_clev);
534 			} else {
535 ascend:
536 				KASSERT(++loops < 1000,
537 				    ("pctrie_lookup_le: too many loops"));
538 
539 				/*
540 				 * Pop nodes from the stack until either the
541 				 * stack is empty or a node that could have a
542 				 * matching descendant is found.
543 				 */
544 				do {
545 					if (tos == 0)
546 						return (NULL);
547 					node = stack[--tos];
548 				} while (pctrie_slot(index,
549 				    node->pn_clev) == 0);
550 
551 				/*
552 				 * The following computation cannot overflow
553 				 * because index's slot at the current level
554 				 * is greater than 0.
555 				 */
556 				index = pctrie_trimkey(index,
557 				    node->pn_clev);
558 			}
559 			index--;
560 			KASSERT(!pctrie_keybarr(node, index),
561 			    ("pctrie_lookup_le: keybarr failed"));
562 		}
563 		slot = pctrie_slot(index, node->pn_clev);
564 		child = node->pn_child[slot];
565 		if (pctrie_isleaf(child)) {
566 			m = pctrie_toval(child);
567 			if (*m <= index)
568 				return (m);
569 		} else if (child != NULL)
570 			goto descend;
571 
572 		/*
573 		 * Look for an available edge or value within the current
574 		 * bisection node.
575 		 */
576 		if (slot > 0) {
577 			inc = PCTRIE_UNITLEVEL(node->pn_clev);
578 			index |= inc - 1;
579 			do {
580 				index -= inc;
581 				slot--;
582 				child = node->pn_child[slot];
583 				if (pctrie_isleaf(child)) {
584 					m = pctrie_toval(child);
585 					if (*m <= index)
586 						return (m);
587 				} else if (child != NULL)
588 					goto descend;
589 			} while (slot > 0);
590 		}
591 		KASSERT(child == NULL || pctrie_isleaf(child),
592 		    ("pctrie_lookup_le: child is radix node"));
593 
594 		/*
595 		 * If a value or edge smaller than the search slot is not found
596 		 * in the current node, ascend to the next higher-level node.
597 		 */
598 		goto ascend;
599 descend:
600 		KASSERT(node->pn_clev > 0,
601 		    ("pctrie_lookup_le: pushing leaf's parent"));
602 		KASSERT(tos < PCTRIE_LIMIT,
603 		    ("pctrie_lookup_le: stack overflow"));
604 		stack[tos++] = node;
605 		node = child;
606 	}
607 }
608 
609 /*
610  * Remove the specified index from the tree.
611  * Panics if the key is not present.
612  */
613 void
614 pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn)
615 {
616 	struct pctrie_node *node, *parent;
617 	uint64_t *m;
618 	int i, slot;
619 
620 	node = pctrie_getroot(ptree);
621 	if (pctrie_isleaf(node)) {
622 		m = pctrie_toval(node);
623 		if (*m != index)
624 			panic("%s: invalid key found", __func__);
625 		pctrie_setroot(ptree, NULL);
626 		return;
627 	}
628 	parent = NULL;
629 	for (;;) {
630 		if (node == NULL)
631 			panic("pctrie_remove: impossible to locate the key");
632 		slot = pctrie_slot(index, node->pn_clev);
633 		if (pctrie_isleaf(node->pn_child[slot])) {
634 			m = pctrie_toval(node->pn_child[slot]);
635 			if (*m != index)
636 				panic("%s: invalid key found", __func__);
637 			node->pn_child[slot] = NULL;
638 			node->pn_count--;
639 			if (node->pn_count > 1)
640 				break;
641 			for (i = 0; i < PCTRIE_COUNT; i++)
642 				if (node->pn_child[i] != NULL)
643 					break;
644 			KASSERT(i != PCTRIE_COUNT,
645 			    ("%s: invalid node configuration", __func__));
646 			if (parent == NULL)
647 				pctrie_setroot(ptree, node->pn_child[i]);
648 			else {
649 				slot = pctrie_slot(index, parent->pn_clev);
650 				KASSERT(parent->pn_child[slot] == node,
651 				    ("%s: invalid child value", __func__));
652 				parent->pn_child[slot] = node->pn_child[i];
653 			}
654 			node->pn_count--;
655 			node->pn_child[i] = NULL;
656 			pctrie_node_put(ptree, node, freefn);
657 			break;
658 		}
659 		parent = node;
660 		node = node->pn_child[slot];
661 	}
662 }
663 
664 /*
665  * Remove and free all the nodes from the tree.
666  * This function is recursive but there is a tight control on it as the
667  * maximum depth of the tree is fixed.
668  */
669 void
670 pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn)
671 {
672 	struct pctrie_node *root;
673 
674 	root = pctrie_getroot(ptree);
675 	if (root == NULL)
676 		return;
677 	pctrie_setroot(ptree, NULL);
678 	if (!pctrie_isleaf(root))
679 		pctrie_reclaim_allnodes_int(ptree, root, freefn);
680 }
681 
682 #ifdef DDB
683 /*
684  * Show details about the given node.
685  */
686 DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
687 {
688 	struct pctrie_node *node;
689 	int i;
690 
691         if (!have_addr)
692                 return;
693 	node = (struct pctrie_node *)addr;
694 	db_printf("node %p, owner %jx, children count %u, level %u:\n",
695 	    (void *)node, (uintmax_t)node->pn_owner, node->pn_count,
696 	    node->pn_clev);
697 	for (i = 0; i < PCTRIE_COUNT; i++)
698 		if (node->pn_child[i] != NULL)
699 			db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
700 			    i, (void *)node->pn_child[i],
701 			    pctrie_isleaf(node->pn_child[i]) ?
702 			    pctrie_toval(node->pn_child[i]) : NULL,
703 			    node->pn_clev);
704 }
705 #endif /* DDB */
706