1 /* 2 * Copyright (c) 2013 EMC Corp. 3 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org> 4 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com> 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 */ 29 30 /* 31 * Path-compressed radix trie implementation. 32 * 33 * The implementation takes into account the following rationale: 34 * - Size of the nodes should be as small as possible but still big enough 35 * to avoid a large maximum depth for the trie. This is a balance 36 * between the necessity to not wire too much physical memory for the nodes 37 * and the necessity to avoid too much cache pollution during the trie 38 * operations. 39 * - There is not a huge bias toward the number of lookup operations over 40 * the number of insert and remove operations. This basically implies 41 * that optimizations supposedly helping one operation but hurting the 42 * other might be carefully evaluated. 43 * - On average not many nodes are expected to be fully populated, hence 44 * level compression may just complicate things. 45 */ 46 47 #include <sys/cdefs.h> 48 __FBSDID("$FreeBSD$"); 49 50 #include "opt_ddb.h" 51 52 #include <sys/param.h> 53 #include <sys/systm.h> 54 #include <sys/kernel.h> 55 #include <sys/pctrie.h> 56 57 #ifdef DDB 58 #include <ddb/ddb.h> 59 #endif 60 61 /* 62 * These widths should allow the pointers to a node's children to fit within 63 * a single cache line. The extra levels from a narrow width should not be 64 * a problem thanks to path compression. 65 */ 66 #ifdef __LP64__ 67 #define PCTRIE_WIDTH 4 68 #else 69 #define PCTRIE_WIDTH 3 70 #endif 71 72 #define PCTRIE_COUNT (1 << PCTRIE_WIDTH) 73 #define PCTRIE_MASK (PCTRIE_COUNT - 1) 74 #define PCTRIE_LIMIT (howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1) 75 76 /* Flag bits stored in node pointers. */ 77 #define PCTRIE_ISLEAF 0x1 78 #define PCTRIE_FLAGS 0x1 79 #define PCTRIE_PAD PCTRIE_FLAGS 80 81 /* Returns one unit associated with specified level. */ 82 #define PCTRIE_UNITLEVEL(lev) \ 83 ((uint64_t)1 << ((lev) * PCTRIE_WIDTH)) 84 85 struct pctrie_node { 86 uint64_t pn_owner; /* Owner of record. */ 87 uint16_t pn_count; /* Valid children. */ 88 uint16_t pn_clev; /* Current level. */ 89 void *pn_child[PCTRIE_COUNT]; /* Child nodes. */ 90 }; 91 92 /* 93 * Allocate a node. Pre-allocation should ensure that the request 94 * will always be satisfied. 95 */ 96 static __inline struct pctrie_node * 97 pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner, 98 uint16_t count, uint16_t clevel) 99 { 100 struct pctrie_node *node; 101 102 node = allocfn(ptree); 103 if (node == NULL) 104 return (NULL); 105 node->pn_owner = owner; 106 node->pn_count = count; 107 node->pn_clev = clevel; 108 109 return (node); 110 } 111 112 /* 113 * Free radix node. 114 */ 115 static __inline void 116 pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node, 117 pctrie_free_t freefn) 118 { 119 #ifdef INVARIANTS 120 int slot; 121 122 KASSERT(node->pn_count == 0, 123 ("pctrie_node_put: node %p has %d children", node, 124 node->pn_count)); 125 for (slot = 0; slot < PCTRIE_COUNT; slot++) 126 KASSERT(node->pn_child[slot] == NULL, 127 ("pctrie_node_put: node %p has a child", node)); 128 #endif 129 freefn(ptree, node); 130 } 131 132 /* 133 * Return the position in the array for a given level. 134 */ 135 static __inline int 136 pctrie_slot(uint64_t index, uint16_t level) 137 { 138 139 return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK); 140 } 141 142 /* Trims the key after the specified level. */ 143 static __inline uint64_t 144 pctrie_trimkey(uint64_t index, uint16_t level) 145 { 146 uint64_t ret; 147 148 ret = index; 149 if (level > 0) { 150 ret >>= level * PCTRIE_WIDTH; 151 ret <<= level * PCTRIE_WIDTH; 152 } 153 return (ret); 154 } 155 156 /* 157 * Get the root node for a tree. 158 */ 159 static __inline struct pctrie_node * 160 pctrie_getroot(struct pctrie *ptree) 161 { 162 163 return ((struct pctrie_node *)ptree->pt_root); 164 } 165 166 /* 167 * Set the root node for a tree. 168 */ 169 static __inline void 170 pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node) 171 { 172 173 ptree->pt_root = (uintptr_t)node; 174 } 175 176 /* 177 * Returns TRUE if the specified node is a leaf and FALSE otherwise. 178 */ 179 static __inline boolean_t 180 pctrie_isleaf(struct pctrie_node *node) 181 { 182 183 return (((uintptr_t)node & PCTRIE_ISLEAF) != 0); 184 } 185 186 /* 187 * Returns the associated val extracted from node. 188 */ 189 static __inline uint64_t * 190 pctrie_toval(struct pctrie_node *node) 191 { 192 193 return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS)); 194 } 195 196 /* 197 * Adds the val as a child of the provided node. 198 */ 199 static __inline void 200 pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev, 201 uint64_t *val) 202 { 203 int slot; 204 205 slot = pctrie_slot(index, clev); 206 node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF); 207 } 208 209 /* 210 * Returns the slot where two keys differ. 211 * It cannot accept 2 equal keys. 212 */ 213 static __inline uint16_t 214 pctrie_keydiff(uint64_t index1, uint64_t index2) 215 { 216 uint16_t clev; 217 218 KASSERT(index1 != index2, ("%s: passing the same key value %jx", 219 __func__, (uintmax_t)index1)); 220 221 index1 ^= index2; 222 for (clev = PCTRIE_LIMIT;; clev--) 223 if (pctrie_slot(index1, clev) != 0) 224 return (clev); 225 } 226 227 /* 228 * Returns TRUE if it can be determined that key does not belong to the 229 * specified node. Otherwise, returns FALSE. 230 */ 231 static __inline boolean_t 232 pctrie_keybarr(struct pctrie_node *node, uint64_t idx) 233 { 234 235 if (node->pn_clev < PCTRIE_LIMIT) { 236 idx = pctrie_trimkey(idx, node->pn_clev + 1); 237 return (idx != node->pn_owner); 238 } 239 return (FALSE); 240 } 241 242 /* 243 * Internal helper for pctrie_reclaim_allnodes(). 244 * This function is recursive. 245 */ 246 static void 247 pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node, 248 pctrie_free_t freefn) 249 { 250 int slot; 251 252 KASSERT(node->pn_count <= PCTRIE_COUNT, 253 ("pctrie_reclaim_allnodes_int: bad count in node %p", node)); 254 for (slot = 0; node->pn_count != 0; slot++) { 255 if (node->pn_child[slot] == NULL) 256 continue; 257 if (!pctrie_isleaf(node->pn_child[slot])) 258 pctrie_reclaim_allnodes_int(ptree, 259 node->pn_child[slot], freefn); 260 node->pn_child[slot] = NULL; 261 node->pn_count--; 262 } 263 pctrie_node_put(ptree, node, freefn); 264 } 265 266 /* 267 * pctrie node zone initializer. 268 */ 269 int 270 pctrie_zone_init(void *mem, int size __unused, int flags __unused) 271 { 272 struct pctrie_node *node; 273 274 node = mem; 275 memset(node->pn_child, 0, sizeof(node->pn_child)); 276 return (0); 277 } 278 279 size_t 280 pctrie_node_size(void) 281 { 282 283 return (sizeof(struct pctrie_node)); 284 } 285 286 /* 287 * Inserts the key-value pair into the trie. 288 * Panics if the key already exists. 289 */ 290 int 291 pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn) 292 { 293 uint64_t index, newind; 294 void **parentp; 295 struct pctrie_node *node, *tmp; 296 uint64_t *m; 297 int slot; 298 uint16_t clev; 299 300 index = *val; 301 302 /* 303 * The owner of record for root is not really important because it 304 * will never be used. 305 */ 306 node = pctrie_getroot(ptree); 307 if (node == NULL) { 308 ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF; 309 return (0); 310 } 311 parentp = (void **)&ptree->pt_root; 312 for (;;) { 313 if (pctrie_isleaf(node)) { 314 m = pctrie_toval(node); 315 if (*m == index) 316 panic("%s: key %jx is already present", 317 __func__, (uintmax_t)index); 318 clev = pctrie_keydiff(*m, index); 319 tmp = pctrie_node_get(ptree, allocfn, 320 pctrie_trimkey(index, clev + 1), 2, clev); 321 if (tmp == NULL) 322 return (ENOMEM); 323 *parentp = tmp; 324 pctrie_addval(tmp, index, clev, val); 325 pctrie_addval(tmp, *m, clev, m); 326 return (0); 327 } else if (pctrie_keybarr(node, index)) 328 break; 329 slot = pctrie_slot(index, node->pn_clev); 330 if (node->pn_child[slot] == NULL) { 331 node->pn_count++; 332 pctrie_addval(node, index, node->pn_clev, val); 333 return (0); 334 } 335 parentp = &node->pn_child[slot]; 336 node = node->pn_child[slot]; 337 } 338 339 /* 340 * A new node is needed because the right insertion level is reached. 341 * Setup the new intermediate node and add the 2 children: the 342 * new object and the older edge. 343 */ 344 newind = node->pn_owner; 345 clev = pctrie_keydiff(newind, index); 346 tmp = pctrie_node_get(ptree, allocfn, 347 pctrie_trimkey(index, clev + 1), 2, clev); 348 if (tmp == NULL) 349 return (ENOMEM); 350 *parentp = tmp; 351 pctrie_addval(tmp, index, clev, val); 352 slot = pctrie_slot(newind, clev); 353 tmp->pn_child[slot] = node; 354 355 return (0); 356 } 357 358 /* 359 * Returns the value stored at the index. If the index is not present, 360 * NULL is returned. 361 */ 362 uint64_t * 363 pctrie_lookup(struct pctrie *ptree, uint64_t index) 364 { 365 struct pctrie_node *node; 366 uint64_t *m; 367 int slot; 368 369 node = pctrie_getroot(ptree); 370 while (node != NULL) { 371 if (pctrie_isleaf(node)) { 372 m = pctrie_toval(node); 373 if (*m == index) 374 return (m); 375 else 376 break; 377 } else if (pctrie_keybarr(node, index)) 378 break; 379 slot = pctrie_slot(index, node->pn_clev); 380 node = node->pn_child[slot]; 381 } 382 return (NULL); 383 } 384 385 /* 386 * Look up the nearest entry at a position bigger than or equal to index. 387 */ 388 uint64_t * 389 pctrie_lookup_ge(struct pctrie *ptree, uint64_t index) 390 { 391 struct pctrie_node *stack[PCTRIE_LIMIT]; 392 uint64_t inc; 393 uint64_t *m; 394 struct pctrie_node *child, *node; 395 #ifdef INVARIANTS 396 int loops = 0; 397 #endif 398 int slot, tos; 399 400 node = pctrie_getroot(ptree); 401 if (node == NULL) 402 return (NULL); 403 else if (pctrie_isleaf(node)) { 404 m = pctrie_toval(node); 405 if (*m >= index) 406 return (m); 407 else 408 return (NULL); 409 } 410 tos = 0; 411 for (;;) { 412 /* 413 * If the keys differ before the current bisection node, 414 * then the search key might rollback to the earliest 415 * available bisection node or to the smallest key 416 * in the current node (if the owner is bigger than the 417 * search key). 418 */ 419 if (pctrie_keybarr(node, index)) { 420 if (index > node->pn_owner) { 421 ascend: 422 KASSERT(++loops < 1000, 423 ("pctrie_lookup_ge: too many loops")); 424 425 /* 426 * Pop nodes from the stack until either the 427 * stack is empty or a node that could have a 428 * matching descendant is found. 429 */ 430 do { 431 if (tos == 0) 432 return (NULL); 433 node = stack[--tos]; 434 } while (pctrie_slot(index, 435 node->pn_clev) == (PCTRIE_COUNT - 1)); 436 437 /* 438 * The following computation cannot overflow 439 * because index's slot at the current level 440 * is less than PCTRIE_COUNT - 1. 441 */ 442 index = pctrie_trimkey(index, 443 node->pn_clev); 444 index += PCTRIE_UNITLEVEL(node->pn_clev); 445 } else 446 index = node->pn_owner; 447 KASSERT(!pctrie_keybarr(node, index), 448 ("pctrie_lookup_ge: keybarr failed")); 449 } 450 slot = pctrie_slot(index, node->pn_clev); 451 child = node->pn_child[slot]; 452 if (pctrie_isleaf(child)) { 453 m = pctrie_toval(child); 454 if (*m >= index) 455 return (m); 456 } else if (child != NULL) 457 goto descend; 458 459 /* 460 * Look for an available edge or val within the current 461 * bisection node. 462 */ 463 if (slot < (PCTRIE_COUNT - 1)) { 464 inc = PCTRIE_UNITLEVEL(node->pn_clev); 465 index = pctrie_trimkey(index, node->pn_clev); 466 do { 467 index += inc; 468 slot++; 469 child = node->pn_child[slot]; 470 if (pctrie_isleaf(child)) { 471 m = pctrie_toval(child); 472 if (*m >= index) 473 return (m); 474 } else if (child != NULL) 475 goto descend; 476 } while (slot < (PCTRIE_COUNT - 1)); 477 } 478 KASSERT(child == NULL || pctrie_isleaf(child), 479 ("pctrie_lookup_ge: child is radix node")); 480 481 /* 482 * If a value or edge bigger than the search slot is not found 483 * in the current node, ascend to the next higher-level node. 484 */ 485 goto ascend; 486 descend: 487 KASSERT(node->pn_clev > 0, 488 ("pctrie_lookup_ge: pushing leaf's parent")); 489 KASSERT(tos < PCTRIE_LIMIT, 490 ("pctrie_lookup_ge: stack overflow")); 491 stack[tos++] = node; 492 node = child; 493 } 494 } 495 496 /* 497 * Look up the nearest entry at a position less than or equal to index. 498 */ 499 uint64_t * 500 pctrie_lookup_le(struct pctrie *ptree, uint64_t index) 501 { 502 struct pctrie_node *stack[PCTRIE_LIMIT]; 503 uint64_t inc; 504 uint64_t *m; 505 struct pctrie_node *child, *node; 506 #ifdef INVARIANTS 507 int loops = 0; 508 #endif 509 int slot, tos; 510 511 node = pctrie_getroot(ptree); 512 if (node == NULL) 513 return (NULL); 514 else if (pctrie_isleaf(node)) { 515 m = pctrie_toval(node); 516 if (*m <= index) 517 return (m); 518 else 519 return (NULL); 520 } 521 tos = 0; 522 for (;;) { 523 /* 524 * If the keys differ before the current bisection node, 525 * then the search key might rollback to the earliest 526 * available bisection node or to the largest key 527 * in the current node (if the owner is smaller than the 528 * search key). 529 */ 530 if (pctrie_keybarr(node, index)) { 531 if (index > node->pn_owner) { 532 index = node->pn_owner + PCTRIE_COUNT * 533 PCTRIE_UNITLEVEL(node->pn_clev); 534 } else { 535 ascend: 536 KASSERT(++loops < 1000, 537 ("pctrie_lookup_le: too many loops")); 538 539 /* 540 * Pop nodes from the stack until either the 541 * stack is empty or a node that could have a 542 * matching descendant is found. 543 */ 544 do { 545 if (tos == 0) 546 return (NULL); 547 node = stack[--tos]; 548 } while (pctrie_slot(index, 549 node->pn_clev) == 0); 550 551 /* 552 * The following computation cannot overflow 553 * because index's slot at the current level 554 * is greater than 0. 555 */ 556 index = pctrie_trimkey(index, 557 node->pn_clev); 558 } 559 index--; 560 KASSERT(!pctrie_keybarr(node, index), 561 ("pctrie_lookup_le: keybarr failed")); 562 } 563 slot = pctrie_slot(index, node->pn_clev); 564 child = node->pn_child[slot]; 565 if (pctrie_isleaf(child)) { 566 m = pctrie_toval(child); 567 if (*m <= index) 568 return (m); 569 } else if (child != NULL) 570 goto descend; 571 572 /* 573 * Look for an available edge or value within the current 574 * bisection node. 575 */ 576 if (slot > 0) { 577 inc = PCTRIE_UNITLEVEL(node->pn_clev); 578 index |= inc - 1; 579 do { 580 index -= inc; 581 slot--; 582 child = node->pn_child[slot]; 583 if (pctrie_isleaf(child)) { 584 m = pctrie_toval(child); 585 if (*m <= index) 586 return (m); 587 } else if (child != NULL) 588 goto descend; 589 } while (slot > 0); 590 } 591 KASSERT(child == NULL || pctrie_isleaf(child), 592 ("pctrie_lookup_le: child is radix node")); 593 594 /* 595 * If a value or edge smaller than the search slot is not found 596 * in the current node, ascend to the next higher-level node. 597 */ 598 goto ascend; 599 descend: 600 KASSERT(node->pn_clev > 0, 601 ("pctrie_lookup_le: pushing leaf's parent")); 602 KASSERT(tos < PCTRIE_LIMIT, 603 ("pctrie_lookup_le: stack overflow")); 604 stack[tos++] = node; 605 node = child; 606 } 607 } 608 609 /* 610 * Remove the specified index from the tree. 611 * Panics if the key is not present. 612 */ 613 void 614 pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn) 615 { 616 struct pctrie_node *node, *parent; 617 uint64_t *m; 618 int i, slot; 619 620 node = pctrie_getroot(ptree); 621 if (pctrie_isleaf(node)) { 622 m = pctrie_toval(node); 623 if (*m != index) 624 panic("%s: invalid key found", __func__); 625 pctrie_setroot(ptree, NULL); 626 return; 627 } 628 parent = NULL; 629 for (;;) { 630 if (node == NULL) 631 panic("pctrie_remove: impossible to locate the key"); 632 slot = pctrie_slot(index, node->pn_clev); 633 if (pctrie_isleaf(node->pn_child[slot])) { 634 m = pctrie_toval(node->pn_child[slot]); 635 if (*m != index) 636 panic("%s: invalid key found", __func__); 637 node->pn_child[slot] = NULL; 638 node->pn_count--; 639 if (node->pn_count > 1) 640 break; 641 for (i = 0; i < PCTRIE_COUNT; i++) 642 if (node->pn_child[i] != NULL) 643 break; 644 KASSERT(i != PCTRIE_COUNT, 645 ("%s: invalid node configuration", __func__)); 646 if (parent == NULL) 647 pctrie_setroot(ptree, node->pn_child[i]); 648 else { 649 slot = pctrie_slot(index, parent->pn_clev); 650 KASSERT(parent->pn_child[slot] == node, 651 ("%s: invalid child value", __func__)); 652 parent->pn_child[slot] = node->pn_child[i]; 653 } 654 node->pn_count--; 655 node->pn_child[i] = NULL; 656 pctrie_node_put(ptree, node, freefn); 657 break; 658 } 659 parent = node; 660 node = node->pn_child[slot]; 661 } 662 } 663 664 /* 665 * Remove and free all the nodes from the tree. 666 * This function is recursive but there is a tight control on it as the 667 * maximum depth of the tree is fixed. 668 */ 669 void 670 pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn) 671 { 672 struct pctrie_node *root; 673 674 root = pctrie_getroot(ptree); 675 if (root == NULL) 676 return; 677 pctrie_setroot(ptree, NULL); 678 if (!pctrie_isleaf(root)) 679 pctrie_reclaim_allnodes_int(ptree, root, freefn); 680 } 681 682 #ifdef DDB 683 /* 684 * Show details about the given node. 685 */ 686 DB_SHOW_COMMAND(pctrienode, db_show_pctrienode) 687 { 688 struct pctrie_node *node; 689 int i; 690 691 if (!have_addr) 692 return; 693 node = (struct pctrie_node *)addr; 694 db_printf("node %p, owner %jx, children count %u, level %u:\n", 695 (void *)node, (uintmax_t)node->pn_owner, node->pn_count, 696 node->pn_clev); 697 for (i = 0; i < PCTRIE_COUNT; i++) 698 if (node->pn_child[i] != NULL) 699 db_printf("slot: %d, val: %p, value: %p, clev: %d\n", 700 i, (void *)node->pn_child[i], 701 pctrie_isleaf(node->pn_child[i]) ? 702 pctrie_toval(node->pn_child[i]) : NULL, 703 node->pn_clev); 704 } 705 #endif /* DDB */ 706