1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2013 EMC Corp. 5 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org> 6 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com> 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 * 30 */ 31 32 /* 33 * Path-compressed radix trie implementation. 34 * 35 * The implementation takes into account the following rationale: 36 * - Size of the nodes should be as small as possible but still big enough 37 * to avoid a large maximum depth for the trie. This is a balance 38 * between the necessity to not wire too much physical memory for the nodes 39 * and the necessity to avoid too much cache pollution during the trie 40 * operations. 41 * - There is not a huge bias toward the number of lookup operations over 42 * the number of insert and remove operations. This basically implies 43 * that optimizations supposedly helping one operation but hurting the 44 * other might be carefully evaluated. 45 * - On average not many nodes are expected to be fully populated, hence 46 * level compression may just complicate things. 47 */ 48 49 #include <sys/cdefs.h> 50 __FBSDID("$FreeBSD$"); 51 52 #include "opt_ddb.h" 53 54 #include <sys/param.h> 55 #include <sys/systm.h> 56 #include <sys/kernel.h> 57 #include <sys/pctrie.h> 58 59 #ifdef DDB 60 #include <ddb/ddb.h> 61 #endif 62 63 #define PCTRIE_MASK (PCTRIE_COUNT - 1) 64 #define PCTRIE_LIMIT (howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1) 65 66 /* Flag bits stored in node pointers. */ 67 #define PCTRIE_ISLEAF 0x1 68 #define PCTRIE_FLAGS 0x1 69 #define PCTRIE_PAD PCTRIE_FLAGS 70 71 /* Returns one unit associated with specified level. */ 72 #define PCTRIE_UNITLEVEL(lev) \ 73 ((uint64_t)1 << ((lev) * PCTRIE_WIDTH)) 74 75 struct pctrie_node { 76 uint64_t pn_owner; /* Owner of record. */ 77 uint16_t pn_count; /* Valid children. */ 78 uint16_t pn_clev; /* Current level. */ 79 void *pn_child[PCTRIE_COUNT]; /* Child nodes. */ 80 }; 81 82 /* 83 * Allocate a node. Pre-allocation should ensure that the request 84 * will always be satisfied. 85 */ 86 static __inline struct pctrie_node * 87 pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner, 88 uint16_t count, uint16_t clevel) 89 { 90 struct pctrie_node *node; 91 92 node = allocfn(ptree); 93 if (node == NULL) 94 return (NULL); 95 node->pn_owner = owner; 96 node->pn_count = count; 97 node->pn_clev = clevel; 98 99 return (node); 100 } 101 102 /* 103 * Free radix node. 104 */ 105 static __inline void 106 pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node, 107 pctrie_free_t freefn) 108 { 109 #ifdef INVARIANTS 110 int slot; 111 112 KASSERT(node->pn_count == 0, 113 ("pctrie_node_put: node %p has %d children", node, 114 node->pn_count)); 115 for (slot = 0; slot < PCTRIE_COUNT; slot++) 116 KASSERT(node->pn_child[slot] == NULL, 117 ("pctrie_node_put: node %p has a child", node)); 118 #endif 119 freefn(ptree, node); 120 } 121 122 /* 123 * Return the position in the array for a given level. 124 */ 125 static __inline int 126 pctrie_slot(uint64_t index, uint16_t level) 127 { 128 129 return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK); 130 } 131 132 /* Trims the key after the specified level. */ 133 static __inline uint64_t 134 pctrie_trimkey(uint64_t index, uint16_t level) 135 { 136 uint64_t ret; 137 138 ret = index; 139 if (level > 0) { 140 ret >>= level * PCTRIE_WIDTH; 141 ret <<= level * PCTRIE_WIDTH; 142 } 143 return (ret); 144 } 145 146 /* 147 * Get the root node for a tree. 148 */ 149 static __inline struct pctrie_node * 150 pctrie_getroot(struct pctrie *ptree) 151 { 152 153 return ((struct pctrie_node *)ptree->pt_root); 154 } 155 156 /* 157 * Set the root node for a tree. 158 */ 159 static __inline void 160 pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node) 161 { 162 163 ptree->pt_root = (uintptr_t)node; 164 } 165 166 /* 167 * Returns TRUE if the specified node is a leaf and FALSE otherwise. 168 */ 169 static __inline bool 170 pctrie_isleaf(struct pctrie_node *node) 171 { 172 173 return (((uintptr_t)node & PCTRIE_ISLEAF) != 0); 174 } 175 176 /* 177 * Returns the associated val extracted from node. 178 */ 179 static __inline uint64_t * 180 pctrie_toval(struct pctrie_node *node) 181 { 182 183 return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS)); 184 } 185 186 /* 187 * Adds the val as a child of the provided node. 188 */ 189 static __inline void 190 pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev, 191 uint64_t *val) 192 { 193 int slot; 194 195 slot = pctrie_slot(index, clev); 196 node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF); 197 } 198 199 /* 200 * Returns the slot where two keys differ. 201 * It cannot accept 2 equal keys. 202 */ 203 static __inline uint16_t 204 pctrie_keydiff(uint64_t index1, uint64_t index2) 205 { 206 uint16_t clev; 207 208 KASSERT(index1 != index2, ("%s: passing the same key value %jx", 209 __func__, (uintmax_t)index1)); 210 211 index1 ^= index2; 212 for (clev = PCTRIE_LIMIT;; clev--) 213 if (pctrie_slot(index1, clev) != 0) 214 return (clev); 215 } 216 217 /* 218 * Returns TRUE if it can be determined that key does not belong to the 219 * specified node. Otherwise, returns FALSE. 220 */ 221 static __inline bool 222 pctrie_keybarr(struct pctrie_node *node, uint64_t idx) 223 { 224 225 if (node->pn_clev < PCTRIE_LIMIT) { 226 idx = pctrie_trimkey(idx, node->pn_clev + 1); 227 return (idx != node->pn_owner); 228 } 229 return (false); 230 } 231 232 /* 233 * Internal helper for pctrie_reclaim_allnodes(). 234 * This function is recursive. 235 */ 236 static void 237 pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node, 238 pctrie_free_t freefn) 239 { 240 int slot; 241 242 KASSERT(node->pn_count <= PCTRIE_COUNT, 243 ("pctrie_reclaim_allnodes_int: bad count in node %p", node)); 244 for (slot = 0; node->pn_count != 0; slot++) { 245 if (node->pn_child[slot] == NULL) 246 continue; 247 if (!pctrie_isleaf(node->pn_child[slot])) 248 pctrie_reclaim_allnodes_int(ptree, 249 node->pn_child[slot], freefn); 250 node->pn_child[slot] = NULL; 251 node->pn_count--; 252 } 253 pctrie_node_put(ptree, node, freefn); 254 } 255 256 /* 257 * pctrie node zone initializer. 258 */ 259 int 260 pctrie_zone_init(void *mem, int size __unused, int flags __unused) 261 { 262 struct pctrie_node *node; 263 264 node = mem; 265 memset(node->pn_child, 0, sizeof(node->pn_child)); 266 return (0); 267 } 268 269 size_t 270 pctrie_node_size(void) 271 { 272 273 return (sizeof(struct pctrie_node)); 274 } 275 276 /* 277 * Inserts the key-value pair into the trie. 278 * Panics if the key already exists. 279 */ 280 int 281 pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn) 282 { 283 uint64_t index, newind; 284 void **parentp; 285 struct pctrie_node *node, *tmp; 286 uint64_t *m; 287 int slot; 288 uint16_t clev; 289 290 index = *val; 291 292 /* 293 * The owner of record for root is not really important because it 294 * will never be used. 295 */ 296 node = pctrie_getroot(ptree); 297 if (node == NULL) { 298 ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF; 299 return (0); 300 } 301 parentp = (void **)&ptree->pt_root; 302 for (;;) { 303 if (pctrie_isleaf(node)) { 304 m = pctrie_toval(node); 305 if (*m == index) 306 panic("%s: key %jx is already present", 307 __func__, (uintmax_t)index); 308 clev = pctrie_keydiff(*m, index); 309 tmp = pctrie_node_get(ptree, allocfn, 310 pctrie_trimkey(index, clev + 1), 2, clev); 311 if (tmp == NULL) 312 return (ENOMEM); 313 *parentp = tmp; 314 pctrie_addval(tmp, index, clev, val); 315 pctrie_addval(tmp, *m, clev, m); 316 return (0); 317 } else if (pctrie_keybarr(node, index)) 318 break; 319 slot = pctrie_slot(index, node->pn_clev); 320 if (node->pn_child[slot] == NULL) { 321 node->pn_count++; 322 pctrie_addval(node, index, node->pn_clev, val); 323 return (0); 324 } 325 parentp = &node->pn_child[slot]; 326 node = node->pn_child[slot]; 327 } 328 329 /* 330 * A new node is needed because the right insertion level is reached. 331 * Setup the new intermediate node and add the 2 children: the 332 * new object and the older edge. 333 */ 334 newind = node->pn_owner; 335 clev = pctrie_keydiff(newind, index); 336 tmp = pctrie_node_get(ptree, allocfn, 337 pctrie_trimkey(index, clev + 1), 2, clev); 338 if (tmp == NULL) 339 return (ENOMEM); 340 *parentp = tmp; 341 pctrie_addval(tmp, index, clev, val); 342 slot = pctrie_slot(newind, clev); 343 tmp->pn_child[slot] = node; 344 345 return (0); 346 } 347 348 /* 349 * Returns the value stored at the index. If the index is not present, 350 * NULL is returned. 351 */ 352 uint64_t * 353 pctrie_lookup(struct pctrie *ptree, uint64_t index) 354 { 355 struct pctrie_node *node; 356 uint64_t *m; 357 int slot; 358 359 node = pctrie_getroot(ptree); 360 while (node != NULL) { 361 if (pctrie_isleaf(node)) { 362 m = pctrie_toval(node); 363 if (*m == index) 364 return (m); 365 else 366 break; 367 } else if (pctrie_keybarr(node, index)) 368 break; 369 slot = pctrie_slot(index, node->pn_clev); 370 node = node->pn_child[slot]; 371 } 372 return (NULL); 373 } 374 375 /* 376 * Look up the nearest entry at a position bigger than or equal to index. 377 */ 378 uint64_t * 379 pctrie_lookup_ge(struct pctrie *ptree, uint64_t index) 380 { 381 struct pctrie_node *stack[PCTRIE_LIMIT]; 382 uint64_t inc; 383 uint64_t *m; 384 struct pctrie_node *child, *node; 385 #ifdef INVARIANTS 386 int loops = 0; 387 #endif 388 unsigned tos; 389 int slot; 390 391 node = pctrie_getroot(ptree); 392 if (node == NULL) 393 return (NULL); 394 else if (pctrie_isleaf(node)) { 395 m = pctrie_toval(node); 396 if (*m >= index) 397 return (m); 398 else 399 return (NULL); 400 } 401 tos = 0; 402 for (;;) { 403 /* 404 * If the keys differ before the current bisection node, 405 * then the search key might rollback to the earliest 406 * available bisection node or to the smallest key 407 * in the current node (if the owner is bigger than the 408 * search key). 409 */ 410 if (pctrie_keybarr(node, index)) { 411 if (index > node->pn_owner) { 412 ascend: 413 KASSERT(++loops < 1000, 414 ("pctrie_lookup_ge: too many loops")); 415 416 /* 417 * Pop nodes from the stack until either the 418 * stack is empty or a node that could have a 419 * matching descendant is found. 420 */ 421 do { 422 if (tos == 0) 423 return (NULL); 424 node = stack[--tos]; 425 } while (pctrie_slot(index, 426 node->pn_clev) == (PCTRIE_COUNT - 1)); 427 428 /* 429 * The following computation cannot overflow 430 * because index's slot at the current level 431 * is less than PCTRIE_COUNT - 1. 432 */ 433 index = pctrie_trimkey(index, 434 node->pn_clev); 435 index += PCTRIE_UNITLEVEL(node->pn_clev); 436 } else 437 index = node->pn_owner; 438 KASSERT(!pctrie_keybarr(node, index), 439 ("pctrie_lookup_ge: keybarr failed")); 440 } 441 slot = pctrie_slot(index, node->pn_clev); 442 child = node->pn_child[slot]; 443 if (pctrie_isleaf(child)) { 444 m = pctrie_toval(child); 445 if (*m >= index) 446 return (m); 447 } else if (child != NULL) 448 goto descend; 449 450 /* 451 * Look for an available edge or val within the current 452 * bisection node. 453 */ 454 if (slot < (PCTRIE_COUNT - 1)) { 455 inc = PCTRIE_UNITLEVEL(node->pn_clev); 456 index = pctrie_trimkey(index, node->pn_clev); 457 do { 458 index += inc; 459 slot++; 460 child = node->pn_child[slot]; 461 if (pctrie_isleaf(child)) { 462 m = pctrie_toval(child); 463 if (*m >= index) 464 return (m); 465 } else if (child != NULL) 466 goto descend; 467 } while (slot < (PCTRIE_COUNT - 1)); 468 } 469 KASSERT(child == NULL || pctrie_isleaf(child), 470 ("pctrie_lookup_ge: child is radix node")); 471 472 /* 473 * If a value or edge bigger than the search slot is not found 474 * in the current node, ascend to the next higher-level node. 475 */ 476 goto ascend; 477 descend: 478 KASSERT(node->pn_clev > 0, 479 ("pctrie_lookup_ge: pushing leaf's parent")); 480 KASSERT(tos < PCTRIE_LIMIT, 481 ("pctrie_lookup_ge: stack overflow")); 482 stack[tos++] = node; 483 node = child; 484 } 485 } 486 487 /* 488 * Look up the nearest entry at a position less than or equal to index. 489 */ 490 uint64_t * 491 pctrie_lookup_le(struct pctrie *ptree, uint64_t index) 492 { 493 struct pctrie_node *stack[PCTRIE_LIMIT]; 494 uint64_t inc; 495 uint64_t *m; 496 struct pctrie_node *child, *node; 497 #ifdef INVARIANTS 498 int loops = 0; 499 #endif 500 unsigned tos; 501 int slot; 502 503 node = pctrie_getroot(ptree); 504 if (node == NULL) 505 return (NULL); 506 else if (pctrie_isleaf(node)) { 507 m = pctrie_toval(node); 508 if (*m <= index) 509 return (m); 510 else 511 return (NULL); 512 } 513 tos = 0; 514 for (;;) { 515 /* 516 * If the keys differ before the current bisection node, 517 * then the search key might rollback to the earliest 518 * available bisection node or to the largest key 519 * in the current node (if the owner is smaller than the 520 * search key). 521 */ 522 if (pctrie_keybarr(node, index)) { 523 if (index > node->pn_owner) { 524 index = node->pn_owner + PCTRIE_COUNT * 525 PCTRIE_UNITLEVEL(node->pn_clev); 526 } else { 527 ascend: 528 KASSERT(++loops < 1000, 529 ("pctrie_lookup_le: too many loops")); 530 531 /* 532 * Pop nodes from the stack until either the 533 * stack is empty or a node that could have a 534 * matching descendant is found. 535 */ 536 do { 537 if (tos == 0) 538 return (NULL); 539 node = stack[--tos]; 540 } while (pctrie_slot(index, 541 node->pn_clev) == 0); 542 543 /* 544 * The following computation cannot overflow 545 * because index's slot at the current level 546 * is greater than 0. 547 */ 548 index = pctrie_trimkey(index, 549 node->pn_clev); 550 } 551 index--; 552 KASSERT(!pctrie_keybarr(node, index), 553 ("pctrie_lookup_le: keybarr failed")); 554 } 555 slot = pctrie_slot(index, node->pn_clev); 556 child = node->pn_child[slot]; 557 if (pctrie_isleaf(child)) { 558 m = pctrie_toval(child); 559 if (*m <= index) 560 return (m); 561 } else if (child != NULL) 562 goto descend; 563 564 /* 565 * Look for an available edge or value within the current 566 * bisection node. 567 */ 568 if (slot > 0) { 569 inc = PCTRIE_UNITLEVEL(node->pn_clev); 570 index |= inc - 1; 571 do { 572 index -= inc; 573 slot--; 574 child = node->pn_child[slot]; 575 if (pctrie_isleaf(child)) { 576 m = pctrie_toval(child); 577 if (*m <= index) 578 return (m); 579 } else if (child != NULL) 580 goto descend; 581 } while (slot > 0); 582 } 583 KASSERT(child == NULL || pctrie_isleaf(child), 584 ("pctrie_lookup_le: child is radix node")); 585 586 /* 587 * If a value or edge smaller than the search slot is not found 588 * in the current node, ascend to the next higher-level node. 589 */ 590 goto ascend; 591 descend: 592 KASSERT(node->pn_clev > 0, 593 ("pctrie_lookup_le: pushing leaf's parent")); 594 KASSERT(tos < PCTRIE_LIMIT, 595 ("pctrie_lookup_le: stack overflow")); 596 stack[tos++] = node; 597 node = child; 598 } 599 } 600 601 /* 602 * Remove the specified index from the tree. 603 * Panics if the key is not present. 604 */ 605 void 606 pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn) 607 { 608 struct pctrie_node *node, *parent; 609 uint64_t *m; 610 int i, slot; 611 612 node = pctrie_getroot(ptree); 613 if (pctrie_isleaf(node)) { 614 m = pctrie_toval(node); 615 if (*m != index) 616 panic("%s: invalid key found", __func__); 617 pctrie_setroot(ptree, NULL); 618 return; 619 } 620 parent = NULL; 621 for (;;) { 622 if (node == NULL) 623 panic("pctrie_remove: impossible to locate the key"); 624 slot = pctrie_slot(index, node->pn_clev); 625 if (pctrie_isleaf(node->pn_child[slot])) { 626 m = pctrie_toval(node->pn_child[slot]); 627 if (*m != index) 628 panic("%s: invalid key found", __func__); 629 node->pn_child[slot] = NULL; 630 node->pn_count--; 631 if (node->pn_count > 1) 632 break; 633 for (i = 0; i < PCTRIE_COUNT; i++) 634 if (node->pn_child[i] != NULL) 635 break; 636 KASSERT(i != PCTRIE_COUNT, 637 ("%s: invalid node configuration", __func__)); 638 if (parent == NULL) 639 pctrie_setroot(ptree, node->pn_child[i]); 640 else { 641 slot = pctrie_slot(index, parent->pn_clev); 642 KASSERT(parent->pn_child[slot] == node, 643 ("%s: invalid child value", __func__)); 644 parent->pn_child[slot] = node->pn_child[i]; 645 } 646 node->pn_count--; 647 node->pn_child[i] = NULL; 648 pctrie_node_put(ptree, node, freefn); 649 break; 650 } 651 parent = node; 652 node = node->pn_child[slot]; 653 } 654 } 655 656 /* 657 * Remove and free all the nodes from the tree. 658 * This function is recursive but there is a tight control on it as the 659 * maximum depth of the tree is fixed. 660 */ 661 void 662 pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn) 663 { 664 struct pctrie_node *root; 665 666 root = pctrie_getroot(ptree); 667 if (root == NULL) 668 return; 669 pctrie_setroot(ptree, NULL); 670 if (!pctrie_isleaf(root)) 671 pctrie_reclaim_allnodes_int(ptree, root, freefn); 672 } 673 674 #ifdef DDB 675 /* 676 * Show details about the given node. 677 */ 678 DB_SHOW_COMMAND(pctrienode, db_show_pctrienode) 679 { 680 struct pctrie_node *node; 681 int i; 682 683 if (!have_addr) 684 return; 685 node = (struct pctrie_node *)addr; 686 db_printf("node %p, owner %jx, children count %u, level %u:\n", 687 (void *)node, (uintmax_t)node->pn_owner, node->pn_count, 688 node->pn_clev); 689 for (i = 0; i < PCTRIE_COUNT; i++) 690 if (node->pn_child[i] != NULL) 691 db_printf("slot: %d, val: %p, value: %p, clev: %d\n", 692 i, (void *)node->pn_child[i], 693 pctrie_isleaf(node->pn_child[i]) ? 694 pctrie_toval(node->pn_child[i]) : NULL, 695 node->pn_clev); 696 } 697 #endif /* DDB */ 698