1 /* 2 * Copyright (c) 2013 EMC Corp. 3 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org> 4 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com> 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 */ 29 30 /* 31 * Path-compressed radix trie implementation. 32 * 33 * The implementation takes into account the following rationale: 34 * - Size of the nodes should be as small as possible but still big enough 35 * to avoid a large maximum depth for the trie. This is a balance 36 * between the necessity to not wire too much physical memory for the nodes 37 * and the necessity to avoid too much cache pollution during the trie 38 * operations. 39 * - There is not a huge bias toward the number of lookup operations over 40 * the number of insert and remove operations. This basically implies 41 * that optimizations supposedly helping one operation but hurting the 42 * other might be carefully evaluated. 43 * - On average not many nodes are expected to be fully populated, hence 44 * level compression may just complicate things. 45 */ 46 47 #include <sys/cdefs.h> 48 __FBSDID("$FreeBSD$"); 49 50 #include "opt_ddb.h" 51 52 #include <sys/param.h> 53 #include <sys/systm.h> 54 #include <sys/kernel.h> 55 #include <sys/pctrie.h> 56 57 #ifdef DDB 58 #include <ddb/ddb.h> 59 #endif 60 61 #define PCTRIE_MASK (PCTRIE_COUNT - 1) 62 #define PCTRIE_LIMIT (howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1) 63 64 /* Flag bits stored in node pointers. */ 65 #define PCTRIE_ISLEAF 0x1 66 #define PCTRIE_FLAGS 0x1 67 #define PCTRIE_PAD PCTRIE_FLAGS 68 69 /* Returns one unit associated with specified level. */ 70 #define PCTRIE_UNITLEVEL(lev) \ 71 ((uint64_t)1 << ((lev) * PCTRIE_WIDTH)) 72 73 struct pctrie_node { 74 uint64_t pn_owner; /* Owner of record. */ 75 uint16_t pn_count; /* Valid children. */ 76 uint16_t pn_clev; /* Current level. */ 77 void *pn_child[PCTRIE_COUNT]; /* Child nodes. */ 78 }; 79 80 /* 81 * Allocate a node. Pre-allocation should ensure that the request 82 * will always be satisfied. 83 */ 84 static __inline struct pctrie_node * 85 pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner, 86 uint16_t count, uint16_t clevel) 87 { 88 struct pctrie_node *node; 89 90 node = allocfn(ptree); 91 if (node == NULL) 92 return (NULL); 93 node->pn_owner = owner; 94 node->pn_count = count; 95 node->pn_clev = clevel; 96 97 return (node); 98 } 99 100 /* 101 * Free radix node. 102 */ 103 static __inline void 104 pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node, 105 pctrie_free_t freefn) 106 { 107 #ifdef INVARIANTS 108 int slot; 109 110 KASSERT(node->pn_count == 0, 111 ("pctrie_node_put: node %p has %d children", node, 112 node->pn_count)); 113 for (slot = 0; slot < PCTRIE_COUNT; slot++) 114 KASSERT(node->pn_child[slot] == NULL, 115 ("pctrie_node_put: node %p has a child", node)); 116 #endif 117 freefn(ptree, node); 118 } 119 120 /* 121 * Return the position in the array for a given level. 122 */ 123 static __inline int 124 pctrie_slot(uint64_t index, uint16_t level) 125 { 126 127 return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK); 128 } 129 130 /* Trims the key after the specified level. */ 131 static __inline uint64_t 132 pctrie_trimkey(uint64_t index, uint16_t level) 133 { 134 uint64_t ret; 135 136 ret = index; 137 if (level > 0) { 138 ret >>= level * PCTRIE_WIDTH; 139 ret <<= level * PCTRIE_WIDTH; 140 } 141 return (ret); 142 } 143 144 /* 145 * Get the root node for a tree. 146 */ 147 static __inline struct pctrie_node * 148 pctrie_getroot(struct pctrie *ptree) 149 { 150 151 return ((struct pctrie_node *)ptree->pt_root); 152 } 153 154 /* 155 * Set the root node for a tree. 156 */ 157 static __inline void 158 pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node) 159 { 160 161 ptree->pt_root = (uintptr_t)node; 162 } 163 164 /* 165 * Returns TRUE if the specified node is a leaf and FALSE otherwise. 166 */ 167 static __inline boolean_t 168 pctrie_isleaf(struct pctrie_node *node) 169 { 170 171 return (((uintptr_t)node & PCTRIE_ISLEAF) != 0); 172 } 173 174 /* 175 * Returns the associated val extracted from node. 176 */ 177 static __inline uint64_t * 178 pctrie_toval(struct pctrie_node *node) 179 { 180 181 return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS)); 182 } 183 184 /* 185 * Adds the val as a child of the provided node. 186 */ 187 static __inline void 188 pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev, 189 uint64_t *val) 190 { 191 int slot; 192 193 slot = pctrie_slot(index, clev); 194 node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF); 195 } 196 197 /* 198 * Returns the slot where two keys differ. 199 * It cannot accept 2 equal keys. 200 */ 201 static __inline uint16_t 202 pctrie_keydiff(uint64_t index1, uint64_t index2) 203 { 204 uint16_t clev; 205 206 KASSERT(index1 != index2, ("%s: passing the same key value %jx", 207 __func__, (uintmax_t)index1)); 208 209 index1 ^= index2; 210 for (clev = PCTRIE_LIMIT;; clev--) 211 if (pctrie_slot(index1, clev) != 0) 212 return (clev); 213 } 214 215 /* 216 * Returns TRUE if it can be determined that key does not belong to the 217 * specified node. Otherwise, returns FALSE. 218 */ 219 static __inline boolean_t 220 pctrie_keybarr(struct pctrie_node *node, uint64_t idx) 221 { 222 223 if (node->pn_clev < PCTRIE_LIMIT) { 224 idx = pctrie_trimkey(idx, node->pn_clev + 1); 225 return (idx != node->pn_owner); 226 } 227 return (FALSE); 228 } 229 230 /* 231 * Internal helper for pctrie_reclaim_allnodes(). 232 * This function is recursive. 233 */ 234 static void 235 pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node, 236 pctrie_free_t freefn) 237 { 238 int slot; 239 240 KASSERT(node->pn_count <= PCTRIE_COUNT, 241 ("pctrie_reclaim_allnodes_int: bad count in node %p", node)); 242 for (slot = 0; node->pn_count != 0; slot++) { 243 if (node->pn_child[slot] == NULL) 244 continue; 245 if (!pctrie_isleaf(node->pn_child[slot])) 246 pctrie_reclaim_allnodes_int(ptree, 247 node->pn_child[slot], freefn); 248 node->pn_child[slot] = NULL; 249 node->pn_count--; 250 } 251 pctrie_node_put(ptree, node, freefn); 252 } 253 254 /* 255 * pctrie node zone initializer. 256 */ 257 int 258 pctrie_zone_init(void *mem, int size __unused, int flags __unused) 259 { 260 struct pctrie_node *node; 261 262 node = mem; 263 memset(node->pn_child, 0, sizeof(node->pn_child)); 264 return (0); 265 } 266 267 size_t 268 pctrie_node_size(void) 269 { 270 271 return (sizeof(struct pctrie_node)); 272 } 273 274 /* 275 * Inserts the key-value pair into the trie. 276 * Panics if the key already exists. 277 */ 278 int 279 pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn) 280 { 281 uint64_t index, newind; 282 void **parentp; 283 struct pctrie_node *node, *tmp; 284 uint64_t *m; 285 int slot; 286 uint16_t clev; 287 288 index = *val; 289 290 /* 291 * The owner of record for root is not really important because it 292 * will never be used. 293 */ 294 node = pctrie_getroot(ptree); 295 if (node == NULL) { 296 ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF; 297 return (0); 298 } 299 parentp = (void **)&ptree->pt_root; 300 for (;;) { 301 if (pctrie_isleaf(node)) { 302 m = pctrie_toval(node); 303 if (*m == index) 304 panic("%s: key %jx is already present", 305 __func__, (uintmax_t)index); 306 clev = pctrie_keydiff(*m, index); 307 tmp = pctrie_node_get(ptree, allocfn, 308 pctrie_trimkey(index, clev + 1), 2, clev); 309 if (tmp == NULL) 310 return (ENOMEM); 311 *parentp = tmp; 312 pctrie_addval(tmp, index, clev, val); 313 pctrie_addval(tmp, *m, clev, m); 314 return (0); 315 } else if (pctrie_keybarr(node, index)) 316 break; 317 slot = pctrie_slot(index, node->pn_clev); 318 if (node->pn_child[slot] == NULL) { 319 node->pn_count++; 320 pctrie_addval(node, index, node->pn_clev, val); 321 return (0); 322 } 323 parentp = &node->pn_child[slot]; 324 node = node->pn_child[slot]; 325 } 326 327 /* 328 * A new node is needed because the right insertion level is reached. 329 * Setup the new intermediate node and add the 2 children: the 330 * new object and the older edge. 331 */ 332 newind = node->pn_owner; 333 clev = pctrie_keydiff(newind, index); 334 tmp = pctrie_node_get(ptree, allocfn, 335 pctrie_trimkey(index, clev + 1), 2, clev); 336 if (tmp == NULL) 337 return (ENOMEM); 338 *parentp = tmp; 339 pctrie_addval(tmp, index, clev, val); 340 slot = pctrie_slot(newind, clev); 341 tmp->pn_child[slot] = node; 342 343 return (0); 344 } 345 346 /* 347 * Returns the value stored at the index. If the index is not present, 348 * NULL is returned. 349 */ 350 uint64_t * 351 pctrie_lookup(struct pctrie *ptree, uint64_t index) 352 { 353 struct pctrie_node *node; 354 uint64_t *m; 355 int slot; 356 357 node = pctrie_getroot(ptree); 358 while (node != NULL) { 359 if (pctrie_isleaf(node)) { 360 m = pctrie_toval(node); 361 if (*m == index) 362 return (m); 363 else 364 break; 365 } else if (pctrie_keybarr(node, index)) 366 break; 367 slot = pctrie_slot(index, node->pn_clev); 368 node = node->pn_child[slot]; 369 } 370 return (NULL); 371 } 372 373 /* 374 * Look up the nearest entry at a position bigger than or equal to index. 375 */ 376 uint64_t * 377 pctrie_lookup_ge(struct pctrie *ptree, uint64_t index) 378 { 379 struct pctrie_node *stack[PCTRIE_LIMIT]; 380 uint64_t inc; 381 uint64_t *m; 382 struct pctrie_node *child, *node; 383 #ifdef INVARIANTS 384 int loops = 0; 385 #endif 386 int slot, tos; 387 388 node = pctrie_getroot(ptree); 389 if (node == NULL) 390 return (NULL); 391 else if (pctrie_isleaf(node)) { 392 m = pctrie_toval(node); 393 if (*m >= index) 394 return (m); 395 else 396 return (NULL); 397 } 398 tos = 0; 399 for (;;) { 400 /* 401 * If the keys differ before the current bisection node, 402 * then the search key might rollback to the earliest 403 * available bisection node or to the smallest key 404 * in the current node (if the owner is bigger than the 405 * search key). 406 */ 407 if (pctrie_keybarr(node, index)) { 408 if (index > node->pn_owner) { 409 ascend: 410 KASSERT(++loops < 1000, 411 ("pctrie_lookup_ge: too many loops")); 412 413 /* 414 * Pop nodes from the stack until either the 415 * stack is empty or a node that could have a 416 * matching descendant is found. 417 */ 418 do { 419 if (tos == 0) 420 return (NULL); 421 node = stack[--tos]; 422 } while (pctrie_slot(index, 423 node->pn_clev) == (PCTRIE_COUNT - 1)); 424 425 /* 426 * The following computation cannot overflow 427 * because index's slot at the current level 428 * is less than PCTRIE_COUNT - 1. 429 */ 430 index = pctrie_trimkey(index, 431 node->pn_clev); 432 index += PCTRIE_UNITLEVEL(node->pn_clev); 433 } else 434 index = node->pn_owner; 435 KASSERT(!pctrie_keybarr(node, index), 436 ("pctrie_lookup_ge: keybarr failed")); 437 } 438 slot = pctrie_slot(index, node->pn_clev); 439 child = node->pn_child[slot]; 440 if (pctrie_isleaf(child)) { 441 m = pctrie_toval(child); 442 if (*m >= index) 443 return (m); 444 } else if (child != NULL) 445 goto descend; 446 447 /* 448 * Look for an available edge or val within the current 449 * bisection node. 450 */ 451 if (slot < (PCTRIE_COUNT - 1)) { 452 inc = PCTRIE_UNITLEVEL(node->pn_clev); 453 index = pctrie_trimkey(index, node->pn_clev); 454 do { 455 index += inc; 456 slot++; 457 child = node->pn_child[slot]; 458 if (pctrie_isleaf(child)) { 459 m = pctrie_toval(child); 460 if (*m >= index) 461 return (m); 462 } else if (child != NULL) 463 goto descend; 464 } while (slot < (PCTRIE_COUNT - 1)); 465 } 466 KASSERT(child == NULL || pctrie_isleaf(child), 467 ("pctrie_lookup_ge: child is radix node")); 468 469 /* 470 * If a value or edge bigger than the search slot is not found 471 * in the current node, ascend to the next higher-level node. 472 */ 473 goto ascend; 474 descend: 475 KASSERT(node->pn_clev > 0, 476 ("pctrie_lookup_ge: pushing leaf's parent")); 477 KASSERT(tos < PCTRIE_LIMIT, 478 ("pctrie_lookup_ge: stack overflow")); 479 stack[tos++] = node; 480 node = child; 481 } 482 } 483 484 /* 485 * Look up the nearest entry at a position less than or equal to index. 486 */ 487 uint64_t * 488 pctrie_lookup_le(struct pctrie *ptree, uint64_t index) 489 { 490 struct pctrie_node *stack[PCTRIE_LIMIT]; 491 uint64_t inc; 492 uint64_t *m; 493 struct pctrie_node *child, *node; 494 #ifdef INVARIANTS 495 int loops = 0; 496 #endif 497 int slot, tos; 498 499 node = pctrie_getroot(ptree); 500 if (node == NULL) 501 return (NULL); 502 else if (pctrie_isleaf(node)) { 503 m = pctrie_toval(node); 504 if (*m <= index) 505 return (m); 506 else 507 return (NULL); 508 } 509 tos = 0; 510 for (;;) { 511 /* 512 * If the keys differ before the current bisection node, 513 * then the search key might rollback to the earliest 514 * available bisection node or to the largest key 515 * in the current node (if the owner is smaller than the 516 * search key). 517 */ 518 if (pctrie_keybarr(node, index)) { 519 if (index > node->pn_owner) { 520 index = node->pn_owner + PCTRIE_COUNT * 521 PCTRIE_UNITLEVEL(node->pn_clev); 522 } else { 523 ascend: 524 KASSERT(++loops < 1000, 525 ("pctrie_lookup_le: too many loops")); 526 527 /* 528 * Pop nodes from the stack until either the 529 * stack is empty or a node that could have a 530 * matching descendant is found. 531 */ 532 do { 533 if (tos == 0) 534 return (NULL); 535 node = stack[--tos]; 536 } while (pctrie_slot(index, 537 node->pn_clev) == 0); 538 539 /* 540 * The following computation cannot overflow 541 * because index's slot at the current level 542 * is greater than 0. 543 */ 544 index = pctrie_trimkey(index, 545 node->pn_clev); 546 } 547 index--; 548 KASSERT(!pctrie_keybarr(node, index), 549 ("pctrie_lookup_le: keybarr failed")); 550 } 551 slot = pctrie_slot(index, node->pn_clev); 552 child = node->pn_child[slot]; 553 if (pctrie_isleaf(child)) { 554 m = pctrie_toval(child); 555 if (*m <= index) 556 return (m); 557 } else if (child != NULL) 558 goto descend; 559 560 /* 561 * Look for an available edge or value within the current 562 * bisection node. 563 */ 564 if (slot > 0) { 565 inc = PCTRIE_UNITLEVEL(node->pn_clev); 566 index |= inc - 1; 567 do { 568 index -= inc; 569 slot--; 570 child = node->pn_child[slot]; 571 if (pctrie_isleaf(child)) { 572 m = pctrie_toval(child); 573 if (*m <= index) 574 return (m); 575 } else if (child != NULL) 576 goto descend; 577 } while (slot > 0); 578 } 579 KASSERT(child == NULL || pctrie_isleaf(child), 580 ("pctrie_lookup_le: child is radix node")); 581 582 /* 583 * If a value or edge smaller than the search slot is not found 584 * in the current node, ascend to the next higher-level node. 585 */ 586 goto ascend; 587 descend: 588 KASSERT(node->pn_clev > 0, 589 ("pctrie_lookup_le: pushing leaf's parent")); 590 KASSERT(tos < PCTRIE_LIMIT, 591 ("pctrie_lookup_le: stack overflow")); 592 stack[tos++] = node; 593 node = child; 594 } 595 } 596 597 /* 598 * Remove the specified index from the tree. 599 * Panics if the key is not present. 600 */ 601 void 602 pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn) 603 { 604 struct pctrie_node *node, *parent; 605 uint64_t *m; 606 int i, slot; 607 608 node = pctrie_getroot(ptree); 609 if (pctrie_isleaf(node)) { 610 m = pctrie_toval(node); 611 if (*m != index) 612 panic("%s: invalid key found", __func__); 613 pctrie_setroot(ptree, NULL); 614 return; 615 } 616 parent = NULL; 617 for (;;) { 618 if (node == NULL) 619 panic("pctrie_remove: impossible to locate the key"); 620 slot = pctrie_slot(index, node->pn_clev); 621 if (pctrie_isleaf(node->pn_child[slot])) { 622 m = pctrie_toval(node->pn_child[slot]); 623 if (*m != index) 624 panic("%s: invalid key found", __func__); 625 node->pn_child[slot] = NULL; 626 node->pn_count--; 627 if (node->pn_count > 1) 628 break; 629 for (i = 0; i < PCTRIE_COUNT; i++) 630 if (node->pn_child[i] != NULL) 631 break; 632 KASSERT(i != PCTRIE_COUNT, 633 ("%s: invalid node configuration", __func__)); 634 if (parent == NULL) 635 pctrie_setroot(ptree, node->pn_child[i]); 636 else { 637 slot = pctrie_slot(index, parent->pn_clev); 638 KASSERT(parent->pn_child[slot] == node, 639 ("%s: invalid child value", __func__)); 640 parent->pn_child[slot] = node->pn_child[i]; 641 } 642 node->pn_count--; 643 node->pn_child[i] = NULL; 644 pctrie_node_put(ptree, node, freefn); 645 break; 646 } 647 parent = node; 648 node = node->pn_child[slot]; 649 } 650 } 651 652 /* 653 * Remove and free all the nodes from the tree. 654 * This function is recursive but there is a tight control on it as the 655 * maximum depth of the tree is fixed. 656 */ 657 void 658 pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn) 659 { 660 struct pctrie_node *root; 661 662 root = pctrie_getroot(ptree); 663 if (root == NULL) 664 return; 665 pctrie_setroot(ptree, NULL); 666 if (!pctrie_isleaf(root)) 667 pctrie_reclaim_allnodes_int(ptree, root, freefn); 668 } 669 670 #ifdef DDB 671 /* 672 * Show details about the given node. 673 */ 674 DB_SHOW_COMMAND(pctrienode, db_show_pctrienode) 675 { 676 struct pctrie_node *node; 677 int i; 678 679 if (!have_addr) 680 return; 681 node = (struct pctrie_node *)addr; 682 db_printf("node %p, owner %jx, children count %u, level %u:\n", 683 (void *)node, (uintmax_t)node->pn_owner, node->pn_count, 684 node->pn_clev); 685 for (i = 0; i < PCTRIE_COUNT; i++) 686 if (node->pn_child[i] != NULL) 687 db_printf("slot: %d, val: %p, value: %p, clev: %d\n", 688 i, (void *)node->pn_child[i], 689 pctrie_isleaf(node->pn_child[i]) ? 690 pctrie_toval(node->pn_child[i]) : NULL, 691 node->pn_clev); 692 } 693 #endif /* DDB */ 694