1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2013 EMC Corp. 5 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org> 6 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com> 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 * 30 */ 31 32 /* 33 * Path-compressed radix trie implementation. 34 * 35 * The implementation takes into account the following rationale: 36 * - Size of the nodes should be as small as possible but still big enough 37 * to avoid a large maximum depth for the trie. This is a balance 38 * between the necessity to not wire too much physical memory for the nodes 39 * and the necessity to avoid too much cache pollution during the trie 40 * operations. 41 * - There is not a huge bias toward the number of lookup operations over 42 * the number of insert and remove operations. This basically implies 43 * that optimizations supposedly helping one operation but hurting the 44 * other might be carefully evaluated. 45 * - On average not many nodes are expected to be fully populated, hence 46 * level compression may just complicate things. 47 */ 48 49 #include <sys/cdefs.h> 50 __FBSDID("$FreeBSD$"); 51 52 #include "opt_ddb.h" 53 54 #include <sys/param.h> 55 #include <sys/systm.h> 56 #include <sys/kernel.h> 57 #include <sys/pctrie.h> 58 59 #ifdef DDB 60 #include <ddb/ddb.h> 61 #endif 62 63 #define PCTRIE_MASK (PCTRIE_COUNT - 1) 64 #define PCTRIE_LIMIT (howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1) 65 66 /* Flag bits stored in node pointers. */ 67 #define PCTRIE_ISLEAF 0x1 68 #define PCTRIE_FLAGS 0x1 69 #define PCTRIE_PAD PCTRIE_FLAGS 70 71 /* Returns one unit associated with specified level. */ 72 #define PCTRIE_UNITLEVEL(lev) \ 73 ((uint64_t)1 << ((lev) * PCTRIE_WIDTH)) 74 75 struct pctrie_node { 76 uint64_t pn_owner; /* Owner of record. */ 77 uint16_t pn_count; /* Valid children. */ 78 uint16_t pn_clev; /* Current level. */ 79 void *pn_child[PCTRIE_COUNT]; /* Child nodes. */ 80 }; 81 82 /* 83 * Allocate a node. Pre-allocation should ensure that the request 84 * will always be satisfied. 85 */ 86 static __inline struct pctrie_node * 87 pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner, 88 uint16_t count, uint16_t clevel) 89 { 90 struct pctrie_node *node; 91 92 node = allocfn(ptree); 93 if (node == NULL) 94 return (NULL); 95 node->pn_owner = owner; 96 node->pn_count = count; 97 node->pn_clev = clevel; 98 99 return (node); 100 } 101 102 /* 103 * Free radix node. 104 */ 105 static __inline void 106 pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node, 107 pctrie_free_t freefn) 108 { 109 #ifdef INVARIANTS 110 int slot; 111 112 KASSERT(node->pn_count == 0, 113 ("pctrie_node_put: node %p has %d children", node, 114 node->pn_count)); 115 for (slot = 0; slot < PCTRIE_COUNT; slot++) 116 KASSERT(node->pn_child[slot] == NULL, 117 ("pctrie_node_put: node %p has a child", node)); 118 #endif 119 freefn(ptree, node); 120 } 121 122 /* 123 * Return the position in the array for a given level. 124 */ 125 static __inline int 126 pctrie_slot(uint64_t index, uint16_t level) 127 { 128 129 return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK); 130 } 131 132 /* Trims the key after the specified level. */ 133 static __inline uint64_t 134 pctrie_trimkey(uint64_t index, uint16_t level) 135 { 136 uint64_t ret; 137 138 ret = index; 139 if (level > 0) { 140 ret >>= level * PCTRIE_WIDTH; 141 ret <<= level * PCTRIE_WIDTH; 142 } 143 return (ret); 144 } 145 146 /* 147 * Get the root node for a tree. 148 */ 149 static __inline struct pctrie_node * 150 pctrie_getroot(struct pctrie *ptree) 151 { 152 153 return ((struct pctrie_node *)ptree->pt_root); 154 } 155 156 /* 157 * Set the root node for a tree. 158 */ 159 static __inline void 160 pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node) 161 { 162 163 ptree->pt_root = (uintptr_t)node; 164 } 165 166 /* 167 * Returns TRUE if the specified node is a leaf and FALSE otherwise. 168 */ 169 static __inline boolean_t 170 pctrie_isleaf(struct pctrie_node *node) 171 { 172 173 return (((uintptr_t)node & PCTRIE_ISLEAF) != 0); 174 } 175 176 /* 177 * Returns the associated val extracted from node. 178 */ 179 static __inline uint64_t * 180 pctrie_toval(struct pctrie_node *node) 181 { 182 183 return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS)); 184 } 185 186 /* 187 * Adds the val as a child of the provided node. 188 */ 189 static __inline void 190 pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev, 191 uint64_t *val) 192 { 193 int slot; 194 195 slot = pctrie_slot(index, clev); 196 node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF); 197 } 198 199 /* 200 * Returns the slot where two keys differ. 201 * It cannot accept 2 equal keys. 202 */ 203 static __inline uint16_t 204 pctrie_keydiff(uint64_t index1, uint64_t index2) 205 { 206 uint16_t clev; 207 208 KASSERT(index1 != index2, ("%s: passing the same key value %jx", 209 __func__, (uintmax_t)index1)); 210 211 index1 ^= index2; 212 for (clev = PCTRIE_LIMIT;; clev--) 213 if (pctrie_slot(index1, clev) != 0) 214 return (clev); 215 } 216 217 /* 218 * Returns TRUE if it can be determined that key does not belong to the 219 * specified node. Otherwise, returns FALSE. 220 */ 221 static __inline boolean_t 222 pctrie_keybarr(struct pctrie_node *node, uint64_t idx) 223 { 224 225 if (node->pn_clev < PCTRIE_LIMIT) { 226 idx = pctrie_trimkey(idx, node->pn_clev + 1); 227 return (idx != node->pn_owner); 228 } 229 return (FALSE); 230 } 231 232 /* 233 * Internal helper for pctrie_reclaim_allnodes(). 234 * This function is recursive. 235 */ 236 static void 237 pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node, 238 pctrie_free_t freefn) 239 { 240 int slot; 241 242 KASSERT(node->pn_count <= PCTRIE_COUNT, 243 ("pctrie_reclaim_allnodes_int: bad count in node %p", node)); 244 for (slot = 0; node->pn_count != 0; slot++) { 245 if (node->pn_child[slot] == NULL) 246 continue; 247 if (!pctrie_isleaf(node->pn_child[slot])) 248 pctrie_reclaim_allnodes_int(ptree, 249 node->pn_child[slot], freefn); 250 node->pn_child[slot] = NULL; 251 node->pn_count--; 252 } 253 pctrie_node_put(ptree, node, freefn); 254 } 255 256 /* 257 * pctrie node zone initializer. 258 */ 259 int 260 pctrie_zone_init(void *mem, int size __unused, int flags __unused) 261 { 262 struct pctrie_node *node; 263 264 node = mem; 265 memset(node->pn_child, 0, sizeof(node->pn_child)); 266 return (0); 267 } 268 269 size_t 270 pctrie_node_size(void) 271 { 272 273 return (sizeof(struct pctrie_node)); 274 } 275 276 /* 277 * Inserts the key-value pair into the trie. 278 * Panics if the key already exists. 279 */ 280 int 281 pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn) 282 { 283 uint64_t index, newind; 284 void **parentp; 285 struct pctrie_node *node, *tmp; 286 uint64_t *m; 287 int slot; 288 uint16_t clev; 289 290 index = *val; 291 292 /* 293 * The owner of record for root is not really important because it 294 * will never be used. 295 */ 296 node = pctrie_getroot(ptree); 297 if (node == NULL) { 298 ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF; 299 return (0); 300 } 301 parentp = (void **)&ptree->pt_root; 302 for (;;) { 303 if (pctrie_isleaf(node)) { 304 m = pctrie_toval(node); 305 if (*m == index) 306 panic("%s: key %jx is already present", 307 __func__, (uintmax_t)index); 308 clev = pctrie_keydiff(*m, index); 309 tmp = pctrie_node_get(ptree, allocfn, 310 pctrie_trimkey(index, clev + 1), 2, clev); 311 if (tmp == NULL) 312 return (ENOMEM); 313 *parentp = tmp; 314 pctrie_addval(tmp, index, clev, val); 315 pctrie_addval(tmp, *m, clev, m); 316 return (0); 317 } else if (pctrie_keybarr(node, index)) 318 break; 319 slot = pctrie_slot(index, node->pn_clev); 320 if (node->pn_child[slot] == NULL) { 321 node->pn_count++; 322 pctrie_addval(node, index, node->pn_clev, val); 323 return (0); 324 } 325 parentp = &node->pn_child[slot]; 326 node = node->pn_child[slot]; 327 } 328 329 /* 330 * A new node is needed because the right insertion level is reached. 331 * Setup the new intermediate node and add the 2 children: the 332 * new object and the older edge. 333 */ 334 newind = node->pn_owner; 335 clev = pctrie_keydiff(newind, index); 336 tmp = pctrie_node_get(ptree, allocfn, 337 pctrie_trimkey(index, clev + 1), 2, clev); 338 if (tmp == NULL) 339 return (ENOMEM); 340 *parentp = tmp; 341 pctrie_addval(tmp, index, clev, val); 342 slot = pctrie_slot(newind, clev); 343 tmp->pn_child[slot] = node; 344 345 return (0); 346 } 347 348 /* 349 * Returns the value stored at the index. If the index is not present, 350 * NULL is returned. 351 */ 352 uint64_t * 353 pctrie_lookup(struct pctrie *ptree, uint64_t index) 354 { 355 struct pctrie_node *node; 356 uint64_t *m; 357 int slot; 358 359 node = pctrie_getroot(ptree); 360 while (node != NULL) { 361 if (pctrie_isleaf(node)) { 362 m = pctrie_toval(node); 363 if (*m == index) 364 return (m); 365 else 366 break; 367 } else if (pctrie_keybarr(node, index)) 368 break; 369 slot = pctrie_slot(index, node->pn_clev); 370 node = node->pn_child[slot]; 371 } 372 return (NULL); 373 } 374 375 /* 376 * Look up the nearest entry at a position bigger than or equal to index. 377 */ 378 uint64_t * 379 pctrie_lookup_ge(struct pctrie *ptree, uint64_t index) 380 { 381 struct pctrie_node *stack[PCTRIE_LIMIT]; 382 uint64_t inc; 383 uint64_t *m; 384 struct pctrie_node *child, *node; 385 #ifdef INVARIANTS 386 int loops = 0; 387 #endif 388 int slot, tos; 389 390 node = pctrie_getroot(ptree); 391 if (node == NULL) 392 return (NULL); 393 else if (pctrie_isleaf(node)) { 394 m = pctrie_toval(node); 395 if (*m >= index) 396 return (m); 397 else 398 return (NULL); 399 } 400 tos = 0; 401 for (;;) { 402 /* 403 * If the keys differ before the current bisection node, 404 * then the search key might rollback to the earliest 405 * available bisection node or to the smallest key 406 * in the current node (if the owner is bigger than the 407 * search key). 408 */ 409 if (pctrie_keybarr(node, index)) { 410 if (index > node->pn_owner) { 411 ascend: 412 KASSERT(++loops < 1000, 413 ("pctrie_lookup_ge: too many loops")); 414 415 /* 416 * Pop nodes from the stack until either the 417 * stack is empty or a node that could have a 418 * matching descendant is found. 419 */ 420 do { 421 if (tos == 0) 422 return (NULL); 423 node = stack[--tos]; 424 } while (pctrie_slot(index, 425 node->pn_clev) == (PCTRIE_COUNT - 1)); 426 427 /* 428 * The following computation cannot overflow 429 * because index's slot at the current level 430 * is less than PCTRIE_COUNT - 1. 431 */ 432 index = pctrie_trimkey(index, 433 node->pn_clev); 434 index += PCTRIE_UNITLEVEL(node->pn_clev); 435 } else 436 index = node->pn_owner; 437 KASSERT(!pctrie_keybarr(node, index), 438 ("pctrie_lookup_ge: keybarr failed")); 439 } 440 slot = pctrie_slot(index, node->pn_clev); 441 child = node->pn_child[slot]; 442 if (pctrie_isleaf(child)) { 443 m = pctrie_toval(child); 444 if (*m >= index) 445 return (m); 446 } else if (child != NULL) 447 goto descend; 448 449 /* 450 * Look for an available edge or val within the current 451 * bisection node. 452 */ 453 if (slot < (PCTRIE_COUNT - 1)) { 454 inc = PCTRIE_UNITLEVEL(node->pn_clev); 455 index = pctrie_trimkey(index, node->pn_clev); 456 do { 457 index += inc; 458 slot++; 459 child = node->pn_child[slot]; 460 if (pctrie_isleaf(child)) { 461 m = pctrie_toval(child); 462 if (*m >= index) 463 return (m); 464 } else if (child != NULL) 465 goto descend; 466 } while (slot < (PCTRIE_COUNT - 1)); 467 } 468 KASSERT(child == NULL || pctrie_isleaf(child), 469 ("pctrie_lookup_ge: child is radix node")); 470 471 /* 472 * If a value or edge bigger than the search slot is not found 473 * in the current node, ascend to the next higher-level node. 474 */ 475 goto ascend; 476 descend: 477 KASSERT(node->pn_clev > 0, 478 ("pctrie_lookup_ge: pushing leaf's parent")); 479 KASSERT(tos < PCTRIE_LIMIT, 480 ("pctrie_lookup_ge: stack overflow")); 481 stack[tos++] = node; 482 node = child; 483 } 484 } 485 486 /* 487 * Look up the nearest entry at a position less than or equal to index. 488 */ 489 uint64_t * 490 pctrie_lookup_le(struct pctrie *ptree, uint64_t index) 491 { 492 struct pctrie_node *stack[PCTRIE_LIMIT]; 493 uint64_t inc; 494 uint64_t *m; 495 struct pctrie_node *child, *node; 496 #ifdef INVARIANTS 497 int loops = 0; 498 #endif 499 int slot, tos; 500 501 node = pctrie_getroot(ptree); 502 if (node == NULL) 503 return (NULL); 504 else if (pctrie_isleaf(node)) { 505 m = pctrie_toval(node); 506 if (*m <= index) 507 return (m); 508 else 509 return (NULL); 510 } 511 tos = 0; 512 for (;;) { 513 /* 514 * If the keys differ before the current bisection node, 515 * then the search key might rollback to the earliest 516 * available bisection node or to the largest key 517 * in the current node (if the owner is smaller than the 518 * search key). 519 */ 520 if (pctrie_keybarr(node, index)) { 521 if (index > node->pn_owner) { 522 index = node->pn_owner + PCTRIE_COUNT * 523 PCTRIE_UNITLEVEL(node->pn_clev); 524 } else { 525 ascend: 526 KASSERT(++loops < 1000, 527 ("pctrie_lookup_le: too many loops")); 528 529 /* 530 * Pop nodes from the stack until either the 531 * stack is empty or a node that could have a 532 * matching descendant is found. 533 */ 534 do { 535 if (tos == 0) 536 return (NULL); 537 node = stack[--tos]; 538 } while (pctrie_slot(index, 539 node->pn_clev) == 0); 540 541 /* 542 * The following computation cannot overflow 543 * because index's slot at the current level 544 * is greater than 0. 545 */ 546 index = pctrie_trimkey(index, 547 node->pn_clev); 548 } 549 index--; 550 KASSERT(!pctrie_keybarr(node, index), 551 ("pctrie_lookup_le: keybarr failed")); 552 } 553 slot = pctrie_slot(index, node->pn_clev); 554 child = node->pn_child[slot]; 555 if (pctrie_isleaf(child)) { 556 m = pctrie_toval(child); 557 if (*m <= index) 558 return (m); 559 } else if (child != NULL) 560 goto descend; 561 562 /* 563 * Look for an available edge or value within the current 564 * bisection node. 565 */ 566 if (slot > 0) { 567 inc = PCTRIE_UNITLEVEL(node->pn_clev); 568 index |= inc - 1; 569 do { 570 index -= inc; 571 slot--; 572 child = node->pn_child[slot]; 573 if (pctrie_isleaf(child)) { 574 m = pctrie_toval(child); 575 if (*m <= index) 576 return (m); 577 } else if (child != NULL) 578 goto descend; 579 } while (slot > 0); 580 } 581 KASSERT(child == NULL || pctrie_isleaf(child), 582 ("pctrie_lookup_le: child is radix node")); 583 584 /* 585 * If a value or edge smaller than the search slot is not found 586 * in the current node, ascend to the next higher-level node. 587 */ 588 goto ascend; 589 descend: 590 KASSERT(node->pn_clev > 0, 591 ("pctrie_lookup_le: pushing leaf's parent")); 592 KASSERT(tos < PCTRIE_LIMIT, 593 ("pctrie_lookup_le: stack overflow")); 594 stack[tos++] = node; 595 node = child; 596 } 597 } 598 599 /* 600 * Remove the specified index from the tree. 601 * Panics if the key is not present. 602 */ 603 void 604 pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn) 605 { 606 struct pctrie_node *node, *parent; 607 uint64_t *m; 608 int i, slot; 609 610 node = pctrie_getroot(ptree); 611 if (pctrie_isleaf(node)) { 612 m = pctrie_toval(node); 613 if (*m != index) 614 panic("%s: invalid key found", __func__); 615 pctrie_setroot(ptree, NULL); 616 return; 617 } 618 parent = NULL; 619 for (;;) { 620 if (node == NULL) 621 panic("pctrie_remove: impossible to locate the key"); 622 slot = pctrie_slot(index, node->pn_clev); 623 if (pctrie_isleaf(node->pn_child[slot])) { 624 m = pctrie_toval(node->pn_child[slot]); 625 if (*m != index) 626 panic("%s: invalid key found", __func__); 627 node->pn_child[slot] = NULL; 628 node->pn_count--; 629 if (node->pn_count > 1) 630 break; 631 for (i = 0; i < PCTRIE_COUNT; i++) 632 if (node->pn_child[i] != NULL) 633 break; 634 KASSERT(i != PCTRIE_COUNT, 635 ("%s: invalid node configuration", __func__)); 636 if (parent == NULL) 637 pctrie_setroot(ptree, node->pn_child[i]); 638 else { 639 slot = pctrie_slot(index, parent->pn_clev); 640 KASSERT(parent->pn_child[slot] == node, 641 ("%s: invalid child value", __func__)); 642 parent->pn_child[slot] = node->pn_child[i]; 643 } 644 node->pn_count--; 645 node->pn_child[i] = NULL; 646 pctrie_node_put(ptree, node, freefn); 647 break; 648 } 649 parent = node; 650 node = node->pn_child[slot]; 651 } 652 } 653 654 /* 655 * Remove and free all the nodes from the tree. 656 * This function is recursive but there is a tight control on it as the 657 * maximum depth of the tree is fixed. 658 */ 659 void 660 pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn) 661 { 662 struct pctrie_node *root; 663 664 root = pctrie_getroot(ptree); 665 if (root == NULL) 666 return; 667 pctrie_setroot(ptree, NULL); 668 if (!pctrie_isleaf(root)) 669 pctrie_reclaim_allnodes_int(ptree, root, freefn); 670 } 671 672 #ifdef DDB 673 /* 674 * Show details about the given node. 675 */ 676 DB_SHOW_COMMAND(pctrienode, db_show_pctrienode) 677 { 678 struct pctrie_node *node; 679 int i; 680 681 if (!have_addr) 682 return; 683 node = (struct pctrie_node *)addr; 684 db_printf("node %p, owner %jx, children count %u, level %u:\n", 685 (void *)node, (uintmax_t)node->pn_owner, node->pn_count, 686 node->pn_clev); 687 for (i = 0; i < PCTRIE_COUNT; i++) 688 if (node->pn_child[i] != NULL) 689 db_printf("slot: %d, val: %p, value: %p, clev: %d\n", 690 i, (void *)node->pn_child[i], 691 pctrie_isleaf(node->pn_child[i]) ? 692 pctrie_toval(node->pn_child[i]) : NULL, 693 node->pn_clev); 694 } 695 #endif /* DDB */ 696