1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2013 EMC Corp. 5 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org> 6 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com> 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 * 30 */ 31 32 /* 33 * Path-compressed radix trie implementation. 34 * 35 * The implementation takes into account the following rationale: 36 * - Size of the nodes should be as small as possible but still big enough 37 * to avoid a large maximum depth for the trie. 
This is a balance
 *    between the necessity to not wire too much physical memory for the nodes
 *    and the necessity to avoid too much cache pollution during the trie
 *    operations.
 *  - There is not a huge bias toward the number of lookup operations over
 *    the number of insert and remove operations.  This basically implies
 *    that optimizations supposedly helping one operation but hurting the
 *    other might be carefully evaluated.
 *  - On average not many nodes are expected to be fully populated, hence
 *    level compression may just complicate things.
 */

#include <sys/cdefs.h>
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#include <sys/pctrie.h>
#include <sys/proc.h>	/* smr.h depends on struct thread. */
#include <sys/smr.h>
#include <sys/smr_types.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

/*
 * The popmap has one bit per child slot, so its width is determined by the
 * node fanout (PCTRIE_COUNT == 1 << PCTRIE_WIDTH children per node).
 */
#if PCTRIE_WIDTH == 3
typedef uint8_t pn_popmap_t;
#elif PCTRIE_WIDTH == 4
typedef uint16_t pn_popmap_t;
#elif PCTRIE_WIDTH == 5
typedef uint32_t pn_popmap_t;
#else
#error Unsupported width
#endif
/* The popmap is manipulated with int-sized shift/bit operations below. */
_Static_assert(sizeof(pn_popmap_t) <= sizeof(int),
    "pn_popmap_t too wide");

struct pctrie_node;
typedef SMR_POINTER(struct pctrie_node *) smr_pctnode_t;

struct pctrie_node {
	uint64_t	pn_owner;		/* Owner of record. */
	pn_popmap_t	pn_popmap;		/* Valid children. */
	uint8_t		pn_clev;		/* Level * WIDTH. */
	smr_pctnode_t	pn_child[PCTRIE_COUNT];	/* Child nodes. */
};

/*
 * Map index to an array position for the children of node.
 */
static __inline int
pctrie_slot(struct pctrie_node *node, uint64_t index)
{
	return ((index >> node->pn_clev) & (PCTRIE_COUNT - 1));
}

/*
 * Returns true if index does not belong to the specified node.  Otherwise,
 * sets slot value, and returns false.
 */
static __inline bool
pctrie_keybarr(struct pctrie_node *node, uint64_t index, int *slot)
{
	/*
	 * Subtracting the owner prefix leaves only the low-order bits when
	 * index lies inside this node's subtree; any high bits surviving the
	 * shift mean the key diverges from the node's prefix.
	 */
	index = (index - node->pn_owner) >> node->pn_clev;
	if (index >= PCTRIE_COUNT)
		return (true);
	*slot = index;
	return (false);
}

/*
 * Check radix node.  Under INVARIANTS, asserts that a node about to be freed
 * has at most one live child (powerof2() also accepts 0) and that every
 * empty slot really points at PCTRIE_NULL.
 */
static __inline void
pctrie_node_put(struct pctrie_node *node)
{
#ifdef INVARIANTS
	int slot;

	KASSERT(powerof2(node->pn_popmap),
	    ("pctrie_node_put: node %p has too many children %04x", node,
	    node->pn_popmap));
	for (slot = 0; slot < PCTRIE_COUNT; slot++) {
		if ((node->pn_popmap & (1 << slot)) != 0)
			continue;
		KASSERT(smr_unserialized_load(&node->pn_child[slot], true) ==
		    PCTRIE_NULL,
		    ("pctrie_node_put: node %p has a child", node));
	}
#endif
}

/*
 * Access disciplines for child/root pointers:
 *  PCTRIE_SMR		 - lockless reader inside an SMR critical section;
 *  PCTRIE_LOCKED	 - writer holding the external tree lock;
 *  PCTRIE_UNSERIALIZED  - node not yet reachable by other threads.
 */
enum pctrie_access { PCTRIE_SMR, PCTRIE_LOCKED, PCTRIE_UNSERIALIZED };

/*
 * Fetch a node pointer from a slot.
 */
static __inline struct pctrie_node *
pctrie_node_load(smr_pctnode_t *p, smr_t smr, enum pctrie_access access)
{
	switch (access) {
	case PCTRIE_UNSERIALIZED:
		return (smr_unserialized_load(p, true));
	case PCTRIE_LOCKED:
		return (smr_serialized_load(p, true));
	case PCTRIE_SMR:
		return (smr_entered_load(p, smr));
	}
	__assert_unreachable();
}

/*
 * Store a node pointer into a slot.  Writes are never legal from within an
 * SMR read section; only readers may run locklessly.
 */
static __inline void
pctrie_node_store(smr_pctnode_t *p, void *v, enum pctrie_access access)
{
	switch (access) {
	case PCTRIE_UNSERIALIZED:
		smr_unserialized_store(p, v, true);
		break;
	case PCTRIE_LOCKED:
		smr_serialized_store(p, v, true);
		break;
	case PCTRIE_SMR:
		panic("%s: Not supported in SMR section.", __func__);
		break;
	default:
		__assert_unreachable();
		break;
	}
}

/*
 * Get the root node for a tree.
 */
static __inline struct pctrie_node *
pctrie_root_load(struct pctrie *ptree, smr_t smr, enum pctrie_access access)
{
	return (pctrie_node_load((smr_pctnode_t *)&ptree->pt_root, smr, access));
}

/*
 * Set the root node for a tree.
 */
static __inline void
pctrie_root_store(struct pctrie *ptree, struct pctrie_node *node,
    enum pctrie_access access)
{
	pctrie_node_store((smr_pctnode_t *)&ptree->pt_root, node, access);
}

/*
 * Returns TRUE if the specified node is a leaf and FALSE otherwise.
 */
static __inline bool
pctrie_isleaf(struct pctrie_node *node)
{
	return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
}

/*
 * Returns val with leaf bit set.
 */
static __inline void *
pctrie_toleaf(uint64_t *val)
{
	return ((void *)((uintptr_t)val | PCTRIE_ISLEAF));
}

/*
 * Returns the associated val extracted from node.
 */
static __inline uint64_t *
pctrie_toval(struct pctrie_node *node)
{
	return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
}

/*
 * Returns the associated pointer extracted from node and field offset.
 */
static __inline void *
pctrie_toptr(struct pctrie_node *node, int keyoff)
{
	return ((void *)(((uintptr_t)node & ~PCTRIE_FLAGS) - keyoff));
}

/*
 * Make 'child' a child of 'node'.
 */
static __inline void
pctrie_addnode(struct pctrie_node *node, uint64_t index,
    struct pctrie_node *child, enum pctrie_access access)
{
	int slot;

	slot = pctrie_slot(node, index);
	pctrie_node_store(&node->pn_child[slot], child, access);
	/* XOR requires the slot to be empty; the KASSERT verifies that. */
	node->pn_popmap ^= 1 << slot;
	KASSERT((node->pn_popmap & (1 << slot)) != 0,
	    ("%s: bad popmap slot %d in node %p", __func__, slot, node));
}

/*
 * pctrie node zone initializer.
 */
int
pctrie_zone_init(void *mem, int size __unused, int flags __unused)
{
	struct pctrie_node *node;

	node = mem;
	node->pn_popmap = 0;
	for (int i = 0; i < nitems(node->pn_child); i++)
		pctrie_node_store(&node->pn_child[i], PCTRIE_NULL,
		    PCTRIE_UNSERIALIZED);
	return (0);
}

size_t
pctrie_node_size(void)
{

	return (sizeof(struct pctrie_node));
}

enum pctrie_insert_neighbor_mode {
	PCTRIE_INSERT_NEIGHBOR_NONE,
	PCTRIE_INSERT_NEIGHBOR_LT,
	PCTRIE_INSERT_NEIGHBOR_GT,
};

/*
 * Look for where to insert the key-value pair into the trie.  Complete the
 * insertion if it replaces a null leaf.  Return the insertion location if the
 * insertion needs to be completed by the caller; otherwise return NULL.
 *
 * If the key is already present in the trie, populate *found_out as if by
 * pctrie_lookup().
 *
 * With mode PCTRIE_INSERT_NEIGHBOR_GT or PCTRIE_INSERT_NEIGHBOR_LT, set
 * *neighbor_out to the lowest level node we encounter during the insert lookup
 * that is a parent of the next greater or lesser entry.  The value is not
 * defined if the key was already present in the trie.
 *
 * Note that mode is expected to be a compile-time constant, and this procedure
 * is expected to be inlined into callers with extraneous code optimized out.
 */
static __always_inline void *
pctrie_insert_lookup_compound(struct pctrie *ptree, uint64_t *val,
    uint64_t **found_out, struct pctrie_node **neighbor_out,
    enum pctrie_insert_neighbor_mode mode)
{
	uint64_t index;
	struct pctrie_node *node, *parent;
	int slot;

	index = *val;

	/*
	 * The owner of record for root is not really important because it
	 * will never be used.
	 */
	node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
	parent = NULL;
	for (;;) {
		if (pctrie_isleaf(node)) {
			if (node == PCTRIE_NULL) {
				/* Empty slot: the insertion completes here. */
				if (parent == NULL)
					pctrie_root_store(ptree,
					    pctrie_toleaf(val), PCTRIE_LOCKED);
				else
					pctrie_addnode(parent, index,
					    pctrie_toleaf(val), PCTRIE_LOCKED);
				return (NULL);
			}
			if (*pctrie_toval(node) == index) {
				/* Key already present; report it. */
				*found_out = pctrie_toval(node);
				return (NULL);
			}
			break;
		}
		if (pctrie_keybarr(node, index, &slot))
			break;
		/*
		 * Descend.  If we're tracking the next neighbor and this node
		 * contains a neighboring entry in the right direction, record
		 * it.
		 */
		if (mode == PCTRIE_INSERT_NEIGHBOR_LT) {
			if ((node->pn_popmap & ((1 << slot) - 1)) != 0)
				*neighbor_out = node;
		} else if (mode == PCTRIE_INSERT_NEIGHBOR_GT) {
			if ((node->pn_popmap >> slot) > 1)
				*neighbor_out = node;
		}
		parent = node;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}

	/*
	 * The caller will split this node.  If we're tracking the next
	 * neighbor, record the old node if the old entry is in the right
	 * direction.
	 */
	if (mode == PCTRIE_INSERT_NEIGHBOR_LT) {
		if (*pctrie_toval(node) < index)
			*neighbor_out = node;
	} else if (mode == PCTRIE_INSERT_NEIGHBOR_GT) {
		if (*pctrie_toval(node) > index)
			*neighbor_out = node;
	}

	/*
	 * 'node' must be replaced in the tree with a new branch node, with
	 * children 'node' and 'val'.  Return the place that points to 'node'
	 * now, and will point to the new branching node later.
	 */
	return ((parent != NULL) ? &parent->pn_child[slot]:
	    (smr_pctnode_t *)&ptree->pt_root);
}

/*
 * Wrap pctrie_insert_lookup_compound to implement a strict insertion.  Panic
 * if the key already exists, and do not look for neighboring entries.
 */
void *
pctrie_insert_lookup_strict(struct pctrie *ptree, uint64_t *val)
{
	void *parentp;
	uint64_t *found;

	found = NULL;
	parentp = pctrie_insert_lookup_compound(ptree, val, &found, NULL,
	    PCTRIE_INSERT_NEIGHBOR_NONE);
	if (__predict_false(found != NULL))
		panic("%s: key %jx is already present", __func__,
		    (uintmax_t)*val);
	return (parentp);
}

/*
 * Wrap pctrie_insert_lookup_compound to implement find-or-insert.  Do not look
 * for neighboring entries.
 */
void *
pctrie_insert_lookup(struct pctrie *ptree, uint64_t *val,
    uint64_t **found_out)
{
	*found_out = NULL;
	return (pctrie_insert_lookup_compound(ptree, val, found_out, NULL,
	    PCTRIE_INSERT_NEIGHBOR_NONE));
}

/*
 * Wrap pctrie_insert_lookup_compound to implement find or insert and find next
 * greater entry.  Find a subtree that contains the next entry greater than the
 * newly-inserted or to-be-inserted entry.
 */
void *
pctrie_insert_lookup_gt(struct pctrie *ptree, uint64_t *val,
    uint64_t **found_out, struct pctrie_node **neighbor_out)
{
	*found_out = NULL;
	*neighbor_out = NULL;
	return (pctrie_insert_lookup_compound(ptree, val, found_out,
	    neighbor_out, PCTRIE_INSERT_NEIGHBOR_GT));
}

/*
 * Wrap pctrie_insert_lookup_compound to implement find or insert and find next
 * lesser entry.  Find a subtree that contains the next entry less than the
 * newly-inserted or to-be-inserted entry.
 */
void *
pctrie_insert_lookup_lt(struct pctrie *ptree, uint64_t *val,
    uint64_t **found_out, struct pctrie_node **neighbor_out)
{
	*found_out = NULL;
	*neighbor_out = NULL;
	return (pctrie_insert_lookup_compound(ptree, val, found_out,
	    neighbor_out, PCTRIE_INSERT_NEIGHBOR_LT));
}

/*
 * Uses new node to insert key-value pair into the trie at given location.
 */
void
pctrie_insert_node(void *parentp, struct pctrie_node *parent, uint64_t *val)
{
	struct pctrie_node *node;
	uint64_t index, newind;

	/*
	 * Clear the last child pointer of the newly allocated parent.  We want
	 * to clear it after the last SMR section has exited so lookup can not
	 * return false negatives.  It is done here because it will be
	 * cache-cold in the dtor callback.
	 */
	if (parent->pn_popmap != 0) {
		pctrie_node_store(&parent->pn_child[ffs(parent->pn_popmap) - 1],
		    PCTRIE_NULL, PCTRIE_UNSERIALIZED);
		parent->pn_popmap = 0;
	}

	/*
	 * Recover the values of the two children of the new parent node.  If
	 * 'node' is not a leaf, this stores into 'newind' the 'owner' field,
	 * which must be first in the node.
	 */
	index = *val;
	node = pctrie_node_load(parentp, NULL, PCTRIE_UNSERIALIZED);
	newind = *pctrie_toval(node);

	/*
	 * From the highest-order bit where the indexes differ,
	 * compute the highest level in the trie where they differ.  Then,
	 * compute the least index of this subtrie.
	 */
	_Static_assert(sizeof(long long) >= sizeof(uint64_t),
	    "uint64 too wide");
	_Static_assert(sizeof(uint64_t) * NBBY <=
	    (1 << (sizeof(parent->pn_clev) * NBBY)), "pn_clev too narrow");
	parent->pn_clev = rounddown(ilog2(index ^ newind), PCTRIE_WIDTH);
	/*
	 * -(PCTRIE_COUNT << pn_clev) is a mask of the bits above this level;
	 * the two-step computation keeps the shift within uint64_t range.
	 */
	parent->pn_owner = PCTRIE_COUNT;
	parent->pn_owner = index & -(parent->pn_owner << parent->pn_clev);


	/* These writes are not yet visible due to ordering. */
	pctrie_addnode(parent, index, pctrie_toleaf(val), PCTRIE_UNSERIALIZED);
	pctrie_addnode(parent, newind, node, PCTRIE_UNSERIALIZED);
	/* Synchronize to make the above visible. */
	pctrie_node_store(parentp, parent, PCTRIE_LOCKED);
}

/*
 * Return the value associated with the node, if the node is a leaf that matches
 * the index; otherwise NULL.
477 */ 478 static __always_inline uint64_t * 479 pctrie_match_value(struct pctrie_node *node, uint64_t index) 480 { 481 uint64_t *m; 482 483 if (!pctrie_isleaf(node) || (m = pctrie_toval(node)) == NULL || 484 *m != index) 485 m = NULL; 486 return (m); 487 } 488 489 /* 490 * Returns the value stored at the index. If the index is not present, 491 * NULL is returned. 492 */ 493 static __always_inline uint64_t * 494 _pctrie_lookup(struct pctrie *ptree, uint64_t index, smr_t smr, 495 enum pctrie_access access) 496 { 497 struct pctrie_node *node; 498 int slot; 499 500 node = pctrie_root_load(ptree, smr, access); 501 /* Seek a node that matches index. */ 502 while (!pctrie_isleaf(node) && !pctrie_keybarr(node, index, &slot)) 503 node = pctrie_node_load(&node->pn_child[slot], smr, access); 504 return (pctrie_match_value(node, index)); 505 } 506 507 /* 508 * Returns the value stored at the index, assuming access is externally 509 * synchronized by a lock. 510 * 511 * If the index is not present, NULL is returned. 512 */ 513 uint64_t * 514 pctrie_lookup(struct pctrie *ptree, uint64_t index) 515 { 516 return (_pctrie_lookup(ptree, index, NULL, PCTRIE_LOCKED)); 517 } 518 519 /* 520 * Returns the value stored at the index without requiring an external lock. 521 * 522 * If the index is not present, NULL is returned. 523 */ 524 uint64_t * 525 pctrie_lookup_unlocked(struct pctrie *ptree, uint64_t index, smr_t smr) 526 { 527 uint64_t *res; 528 529 smr_enter(smr); 530 res = _pctrie_lookup(ptree, index, smr, PCTRIE_SMR); 531 smr_exit(smr); 532 return (res); 533 } 534 535 /* 536 * Returns the last node examined in the search for the index, and updates the 537 * search path to that node. 
 */
static __always_inline struct pctrie_node *
_pctrie_iter_lookup_node(struct pctrie_iter *it, uint64_t index, smr_t smr,
    enum pctrie_access access)
{
	struct pctrie_node *node;
	int slot;

	/*
	 * Climb the search path to find the lowest node from which to start the
	 * search for a value matching 'index'.
	 */
	while (it->top != 0) {
		node = it->path[it->top - 1];
		/* A live interior node always has >= 2 children. */
		KASSERT(!powerof2(node->pn_popmap),
		    ("%s: freed node in iter path", __func__));
		if (!pctrie_keybarr(node, index, &slot)) {
			/* This ancestor covers 'index'; resume from here. */
			node = pctrie_node_load(
			    &node->pn_child[slot], smr, access);
			break;
		}
		--it->top;
	}
	if (it->top == 0)
		node = pctrie_root_load(it->ptree, smr, access);

	/* Seek a node that matches index, recording the path as we descend. */
	while (!pctrie_isleaf(node) && !pctrie_keybarr(node, index, &slot)) {
		KASSERT(it->top < nitems(it->path),
		    ("%s: path overflow in trie %p", __func__, it->ptree));
		it->path[it->top++] = node;
		node = pctrie_node_load(&node->pn_child[slot], smr, access);
	}
	return (node);
}

/*
 * Returns the value stored at a given index value, possibly NULL.
 */
static __always_inline uint64_t *
_pctrie_iter_lookup(struct pctrie_iter *it, uint64_t index, smr_t smr,
    enum pctrie_access access)
{
	struct pctrie_node *node;

	it->index = index;
	node = _pctrie_iter_lookup_node(it, index, smr, access);
	return (pctrie_match_value(node, index));
}

/*
 * Returns the value stored at a given index value, possibly NULL.
 */
uint64_t *
pctrie_iter_lookup(struct pctrie_iter *it, uint64_t index)
{
	return (_pctrie_iter_lookup(it, index, NULL, PCTRIE_LOCKED));
}

/*
 * Returns the value stored at a fixed offset from the current index value,
 * possibly NULL.
 */
static __always_inline uint64_t *
_pctrie_iter_stride(struct pctrie_iter *it, int stride, smr_t smr,
    enum pctrie_access access)
{
	uint64_t index = it->index + stride;

	/* Detect stride overflow. */
	if ((stride > 0) != (index > it->index))
		return (NULL);
	/* Detect crossing limit. */
	if ((index < it->limit) != (it->index < it->limit))
		return (NULL);

	return (_pctrie_iter_lookup(it, index, smr, access));
}

/*
 * Returns the value stored at a fixed offset from the current index value,
 * possibly NULL.
 */
uint64_t *
pctrie_iter_stride(struct pctrie_iter *it, int stride)
{
	return (_pctrie_iter_stride(it, stride, NULL, PCTRIE_LOCKED));
}

/*
 * Returns the value stored at one more than the current index value, possibly
 * NULL, assuming access is externally synchronized by a lock.
 */
uint64_t *
pctrie_iter_next(struct pctrie_iter *it)
{
	return (_pctrie_iter_stride(it, 1, NULL, PCTRIE_LOCKED));
}

/*
 * Returns the value stored at one less than the current index value, possibly
 * NULL, assuming access is externally synchronized by a lock.
 */
uint64_t *
pctrie_iter_prev(struct pctrie_iter *it)
{
	return (_pctrie_iter_stride(it, -1, NULL, PCTRIE_LOCKED));
}

/*
 * Returns the value with the least index that is greater than or equal to the
 * specified index, or NULL if there are no such values.
 *
 * Requires that access be externally synchronized by a lock.
 */
static __inline uint64_t *
pctrie_lookup_ge_node(struct pctrie_node *node, uint64_t index)
{
	struct pctrie_node *succ;
	uint64_t *m;
	int slot;

	/*
	 * Descend the trie as if performing an ordinary lookup for the
	 * specified value.  However, unlike an ordinary lookup, as we descend
	 * the trie, we use "succ" to remember the last branching-off point,
	 * that is, the interior node under which the least value that is both
	 * outside our current path down the trie and greater than the specified
	 * index resides.  (The node's popmap makes it fast and easy to
	 * recognize a branching-off point.)  If our ordinary lookup fails to
	 * yield a value that is greater than or equal to the specified index,
	 * then we will exit this loop and perform a lookup starting from
	 * "succ".  If "succ" is not NULL, then that lookup is guaranteed to
	 * succeed.
	 */
	succ = NULL;
	for (;;) {
		if (pctrie_isleaf(node)) {
			if ((m = pctrie_toval(node)) != NULL && *m >= index)
				return (m);
			break;
		}
		if (pctrie_keybarr(node, index, &slot)) {
			/*
			 * If all values in this subtree are > index, then the
			 * least value in this subtree is the answer.
			 */
			if (node->pn_owner > index)
				succ = node;
			break;
		}

		/*
		 * Just in case the next search step leads to a subtree of all
		 * values < index, check popmap to see if a next bigger step, to
		 * a subtree of all pages with values > index, is available.  If
		 * so, remember to restart the search here.
		 */
		if ((node->pn_popmap >> slot) > 1)
			succ = node;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}

	/*
	 * Restart the search from the last place visited in the subtree that
	 * included some values > index, if there was such a place.
	 */
	if (succ == NULL)
		return (NULL);
	if (succ != node) {
		/*
		 * Take a step to the next bigger sibling of the node chosen
		 * last time.  In that subtree, all values > index.
		 */
		slot = pctrie_slot(succ, index) + 1;
		KASSERT((succ->pn_popmap >> slot) != 0,
		    ("%s: no popmap siblings past slot %d in node %p",
		    __func__, slot, succ));
		slot += ffs(succ->pn_popmap >> slot) - 1;
		succ = pctrie_node_load(&succ->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}

	/*
	 * Find the value in the subtree rooted at "succ" with the least index.
	 */
	while (!pctrie_isleaf(succ)) {
		KASSERT(succ->pn_popmap != 0,
		    ("%s: no popmap children in node %p", __func__, succ));
		slot = ffs(succ->pn_popmap) - 1;
		succ = pctrie_node_load(&succ->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	return (pctrie_toval(succ));
}

uint64_t *
pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
{
	return (pctrie_lookup_ge_node(
	    pctrie_root_load(ptree, NULL, PCTRIE_LOCKED), index));
}

uint64_t *
pctrie_subtree_lookup_gt(struct pctrie_node *node, uint64_t index)
{
	/* index + 1 == 0 means index is UINT64_MAX; nothing can be greater. */
	if (node == NULL || index + 1 == 0)
		return (NULL);
	return (pctrie_lookup_ge_node(node, index + 1));
}

/*
 * Find first leaf >= index, and fill iter with the path to the parent of that
 * leaf.  Return NULL if there is no such leaf less than limit.
 */
uint64_t *
pctrie_iter_lookup_ge(struct pctrie_iter *it, uint64_t index)
{
	struct pctrie_node *node;
	uint64_t *m;
	int slot;

	/* Seek a node that matches index. */
	node = _pctrie_iter_lookup_node(it, index, NULL, PCTRIE_LOCKED);

	/*
	 * If no such node was found, and instead this path leads only to nodes
	 * < index, back up to find a subtrie with the least value > index.
	 */
	if (pctrie_isleaf(node) ?
	    (m = pctrie_toval(node)) == NULL || *m < index :
	    node->pn_owner < index) {
		/* Climb the path to find a node with a descendant > index. */
		while (it->top != 0) {
			node = it->path[it->top - 1];
			slot = pctrie_slot(node, index) + 1;
			if ((node->pn_popmap >> slot) != 0)
				break;
			--it->top;
		}
		if (it->top == 0)
			return (NULL);

		/* Step to the least child with a descendant > index. */
		slot += ffs(node->pn_popmap >> slot) - 1;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	/* Descend to the least leaf of the subtrie. */
	while (!pctrie_isleaf(node)) {
		if (it->limit != 0 && node->pn_owner >= it->limit)
			return (NULL);
		slot = ffs(node->pn_popmap) - 1;
		KASSERT(it->top < nitems(it->path),
		    ("%s: path overflow in trie %p", __func__, it->ptree));
		it->path[it->top++] = node;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	m = pctrie_toval(node);
	if (it->limit != 0 && *m >= it->limit)
		return (NULL);
	it->index = *m;
	return (m);
}

/*
 * Find the first leaf with value at least 'jump' greater than the previous
 * leaf.  Return NULL if that value is >= limit.
 */
uint64_t *
pctrie_iter_jump_ge(struct pctrie_iter *it, int64_t jump)
{
	uint64_t index = it->index + jump;

	/* Detect jump overflow. */
	if ((jump > 0) != (index > it->index))
		return (NULL);
	if (it->limit != 0 && index >= it->limit)
		return (NULL);
	return (pctrie_iter_lookup_ge(it, index));
}

#ifdef INVARIANTS
void
pctrie_subtree_lookup_gt_assert(struct pctrie_node *node, uint64_t index,
    struct pctrie *ptree, uint64_t *res)
{
	uint64_t *expected;

	if (index + 1 == 0)
		expected = NULL;
	else
		expected = pctrie_lookup_ge(ptree, index + 1);
	KASSERT(res == expected,
	    ("pctrie subtree lookup gt result different from root lookup: "
	    "ptree %p, index %ju, subtree %p, found %p, expected %p", ptree,
	    (uintmax_t)index, node, res, expected));
}
#endif

/*
 * Returns the value with the greatest index that is less than or equal to the
 * specified index, or NULL if there are no such values.
 *
 * Requires that access be externally synchronized by a lock.
 */
static __inline uint64_t *
pctrie_lookup_le_node(struct pctrie_node *node, uint64_t index)
{
	struct pctrie_node *pred;
	uint64_t *m;
	int slot;

	/*
	 * Mirror the implementation of pctrie_lookup_ge_node, described above.
	 */
	pred = NULL;
	for (;;) {
		if (pctrie_isleaf(node)) {
			if ((m = pctrie_toval(node)) != NULL && *m <= index)
				return (m);
			break;
		}
		if (pctrie_keybarr(node, index, &slot)) {
			/* All values in this subtree are < index. */
			if (node->pn_owner < index)
				pred = node;
			break;
		}
		/* Remember the last branch with a lesser sibling. */
		if ((node->pn_popmap & ((1 << slot) - 1)) != 0)
			pred = node;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	if (pred == NULL)
		return (NULL);
	if (pred != node) {
		/* Step to the greatest lesser sibling of the last choice. */
		slot = pctrie_slot(pred, index);
		KASSERT((pred->pn_popmap & ((1 << slot) - 1)) != 0,
		    ("%s: no popmap siblings before slot %d in node %p",
		    __func__, slot, pred));
		slot = ilog2(pred->pn_popmap & ((1 << slot) - 1));
		pred = pctrie_node_load(&pred->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	/* Find the greatest value in the subtree rooted at "pred". */
	while (!pctrie_isleaf(pred)) {
		KASSERT(pred->pn_popmap != 0,
		    ("%s: no popmap children in node %p", __func__, pred));
		slot = ilog2(pred->pn_popmap);
		pred = pctrie_node_load(&pred->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	return (pctrie_toval(pred));
}

uint64_t *
pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
{
	return (pctrie_lookup_le_node(
	    pctrie_root_load(ptree, NULL, PCTRIE_LOCKED), index));
}

uint64_t *
pctrie_subtree_lookup_lt(struct pctrie_node *node, uint64_t index)
{
	/* index == 0 has no lesser value. */
	if (node == NULL || index == 0)
		return (NULL);
	return (pctrie_lookup_le_node(node, index - 1));
}

/*
 * Find first leaf <= index, and fill iter with the path to the parent of that
 * leaf.  Return NULL if there is no such leaf greater than limit.
 */
uint64_t *
pctrie_iter_lookup_le(struct pctrie_iter *it, uint64_t index)
{
	struct pctrie_node *node;
	uint64_t *m;
	int slot;

	/* Seek a node that matches index. */
	node = _pctrie_iter_lookup_node(it, index, NULL, PCTRIE_LOCKED);

	/*
	 * If no such node was found, and instead this path leads only to nodes
	 * > index, back up to find a subtrie with the greatest value < index.
	 */
	if (pctrie_isleaf(node) ?
	    (m = pctrie_toval(node)) == NULL || *m > index :
	    node->pn_owner > index) {
		/* Climb the path to find a node with a descendant < index. */
		while (it->top != 0) {
			node = it->path[it->top - 1];
			slot = pctrie_slot(node, index);
			if ((node->pn_popmap & ((1 << slot) - 1)) != 0)
				break;
			--it->top;
		}
		if (it->top == 0)
			return (NULL);

		/* Step to the greatest child with a descendant < index. */
		slot = ilog2(node->pn_popmap & ((1 << slot) - 1));
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	/* Descend to the greatest leaf of the subtrie. */
	while (!pctrie_isleaf(node)) {
		/*
		 * pn_owner + (PCTRIE_COUNT << pn_clev) - 1 is the greatest
		 * index in this subtree; stop if all of it is <= limit.
		 */
		if (it->limit != 0 && it->limit >=
		    node->pn_owner + (PCTRIE_COUNT << node->pn_clev) - 1)
			return (NULL);
		slot = ilog2(node->pn_popmap);
		KASSERT(it->top < nitems(it->path),
		    ("%s: path overflow in trie %p", __func__, it->ptree));
		it->path[it->top++] = node;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	m = pctrie_toval(node);
	if (it->limit != 0 && *m <= it->limit)
		return (NULL);
	it->index = *m;
	return (m);
}

/*
 * Find the first leaf with value at most 'jump' less than the previous
 * leaf.  Return NULL if that value is <= limit.
 */
uint64_t *
pctrie_iter_jump_le(struct pctrie_iter *it, int64_t jump)
{
	uint64_t index = it->index - jump;

	/* Detect jump overflow. */
	if ((jump > 0) != (index < it->index))
		return (NULL);
	if (it->limit != 0 && index <= it->limit)
		return (NULL);
	return (pctrie_iter_lookup_le(it, index));
}

#ifdef INVARIANTS
void
pctrie_subtree_lookup_lt_assert(struct pctrie_node *node, uint64_t index,
    struct pctrie *ptree, uint64_t *res)
{
	uint64_t *expected;

	if (index == 0)
		expected = NULL;
	else
		expected = pctrie_lookup_le(ptree, index - 1);
	KASSERT(res == expected,
	    ("pctrie subtree lookup lt result different from root lookup: "
	    "ptree %p, index %ju, subtree %p, found %p, expected %p", ptree,
	    (uintmax_t)index, node, res, expected));
}
#endif

/*
 * Remove the leaf for 'index' from under 'node' ('parent' is node's parent).
 * If the removal leaves 'node' with a single child, collapse the node: hook
 * the surviving child directly to the parent and hand 'node' back through
 * *freenode for deferred (SMR-safe) deallocation.
 */
static void
pctrie_remove(struct pctrie *ptree, uint64_t index, struct pctrie_node *parent,
    struct pctrie_node *node, struct pctrie_node **freenode)
{
	struct pctrie_node *child;
	int slot;

	if (node == NULL) {
		/* The leaf was the root itself. */
		pctrie_root_store(ptree, PCTRIE_NULL, PCTRIE_LOCKED);
		return;
	}
	slot = pctrie_slot(node, index);
	KASSERT((node->pn_popmap & (1 << slot)) != 0,
	    ("%s: bad popmap slot %d in node %p",
	    __func__, slot, node));
	node->pn_popmap ^= 1 << slot;
	pctrie_node_store(&node->pn_child[slot], PCTRIE_NULL, PCTRIE_LOCKED);
	if (!powerof2(node->pn_popmap))
		return;
	KASSERT(node->pn_popmap != 0, ("%s: bad popmap all zeroes", __func__));
	/* One child left: splice it into node's place. */
	slot = ffs(node->pn_popmap) - 1;
	child = pctrie_node_load(&node->pn_child[slot], NULL, PCTRIE_LOCKED);
	KASSERT(child != PCTRIE_NULL,
	    ("%s: bad popmap slot %d in node %p", __func__, slot, node));
	if (parent == NULL)
		pctrie_root_store(ptree, child, PCTRIE_LOCKED);
	else {
		slot = pctrie_slot(parent, index);
		KASSERT(node ==
		    pctrie_node_load(&parent->pn_child[slot], NULL,
		    PCTRIE_LOCKED), ("%s: invalid child value", __func__));
		pctrie_node_store(&parent->pn_child[slot], child,
		    PCTRIE_LOCKED);
	}
	/*
	 * The child is still valid and we can not zero the
	 * pointer until all SMR references are gone.
	 */
	pctrie_node_put(node);
	*freenode = node;
}

/*
 * Remove the specified index from the tree, and return the value stored at
 * that index.  If the index is not present, return NULL.
 */
uint64_t *
pctrie_remove_lookup(struct pctrie *ptree, uint64_t index,
    struct pctrie_node **freenode)
{
	struct pctrie_node *child, *node, *parent;
	uint64_t *m;
	int slot;

	/* 'parent' is unused when the leaf hangs off the root. */
	DEBUG_POISON_POINTER(parent);
	*freenode = node = NULL;
	child = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
	while (!pctrie_isleaf(child)) {
		parent = node;
		node = child;
		slot = pctrie_slot(node, index);
		child = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	m = pctrie_match_value(child, index);
	if (m != NULL)
		pctrie_remove(ptree, index, parent, node, freenode);
	return (m);
}

/*
 * Remove from the trie the leaf last chosen by the iterator, and
 * adjust the path if it's last member is to be freed.
 */
uint64_t *
pctrie_iter_remove(struct pctrie_iter *it, struct pctrie_node **freenode)
{
	struct pctrie_node *child, *node, *parent;
	uint64_t *m;
	int slot;

	/* 'parent' is unused when the leaf hangs off the root. */
	DEBUG_POISON_POINTER(parent);
	*freenode = NULL;
	if (it->top >= 1) {
		parent = (it->top >= 2) ? it->path[it->top - 2] : NULL;
		node = it->path[it->top - 1];
		slot = pctrie_slot(node, it->index);
		child = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	} else {
		node = NULL;
		child = pctrie_root_load(it->ptree, NULL, PCTRIE_LOCKED);
	}
	m = pctrie_match_value(child, it->index);
	if (m != NULL)
		pctrie_remove(it->ptree, it->index, parent, node, freenode);
	/* If the path's last node was collapsed, drop it from the path. */
	if (*freenode != NULL)
		--it->top;
	return (m);
}

/*
 * Return the current leaf, assuming access is externally synchronized by a
 * lock.
 */
uint64_t *
pctrie_iter_value(struct pctrie_iter *it)
{
	struct pctrie_node *node;
	int slot;

	if (it->top == 0)
		node = pctrie_root_load(it->ptree, NULL,
		    PCTRIE_LOCKED);
	else {
		node = it->path[it->top - 1];
		slot = pctrie_slot(node, it->index);
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	return (pctrie_toval(node));
}

/*
 * Walk the subtrie rooted at *pnode in order, invoking callback on leaves and
 * using the leftmost child pointer for path reversal, until an interior node
 * is stripped of all children, and returned for deallocation, with *pnode left
 * pointing to the parent of that node.
 */
static __always_inline struct pctrie_node *
pctrie_reclaim_prune(struct pctrie_node **pnode, struct pctrie_node *parent,
    pctrie_cb_t callback, int keyoff, void *arg)
{
	struct pctrie_node *child, *node;
	int slot;

	node = *pnode;
	while (node->pn_popmap != 0) {
		/* Detach the lowest-numbered remaining child. */
		slot = ffs(node->pn_popmap) - 1;
		node->pn_popmap ^= 1 << slot;
		child = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_UNSERIALIZED);
		pctrie_node_store(&node->pn_child[slot], PCTRIE_NULL,
		    PCTRIE_UNSERIALIZED);
		if (pctrie_isleaf(child)) {
			if (callback != NULL)
				callback(pctrie_toptr(child, keyoff), arg);
			continue;
		}
		/*
		 * Climb one level down the trie. Slot 0 is clear by now
		 * (ffs() visits slots in ascending order), so reuse it to
		 * stash the parent pointer for later path reversal.
		 */
		pctrie_node_store(&node->pn_child[0], parent,
		    PCTRIE_UNSERIALIZED);
		parent = node;
		node = child;
	}
	*pnode = parent;
	return (node);
}

/*
 * Recover the node parent from its first child and continue pruning.
 */
static __always_inline struct pctrie_node *
pctrie_reclaim_resume_compound(struct pctrie_node **pnode,
    pctrie_cb_t callback, int keyoff, void *arg)
{
	struct pctrie_node *parent, *node;

	node = *pnode;
	if (node == NULL)
		return (NULL);
	/* Climb one level up the trie. */
	parent = pctrie_node_load(&node->pn_child[0], NULL,
	    PCTRIE_UNSERIALIZED);
	pctrie_node_store(&node->pn_child[0], PCTRIE_NULL, PCTRIE_UNSERIALIZED);
	return (pctrie_reclaim_prune(pnode, parent, callback, keyoff, arg));
}

/*
 * Find the trie root, and start pruning with a NULL parent.
 */
static __always_inline struct pctrie_node *
pctrie_reclaim_begin_compound(struct pctrie_node **pnode,
    struct pctrie *ptree,
    pctrie_cb_t callback, int keyoff, void *arg)
{
	struct pctrie_node *node;

	node = pctrie_root_load(ptree, NULL, PCTRIE_UNSERIALIZED);
	pctrie_root_store(ptree, PCTRIE_NULL, PCTRIE_UNSERIALIZED);
	if (pctrie_isleaf(node)) {
		/* A leaf (or empty) root: no interior node to prune. */
		if (callback != NULL && node != PCTRIE_NULL)
			callback(pctrie_toptr(node, keyoff), arg);
		return (NULL);
	}
	*pnode = node;
	return (pctrie_reclaim_prune(pnode, NULL, callback, keyoff, arg));
}

/*
 * Resume pruning the trie, without a per-leaf callback.
 */
struct pctrie_node *
pctrie_reclaim_resume(struct pctrie_node **pnode)
{
	return (pctrie_reclaim_resume_compound(pnode, NULL, 0, NULL));
}

/*
 * Begin reclaiming the trie, without a per-leaf callback.
 */
struct pctrie_node *
pctrie_reclaim_begin(struct pctrie_node **pnode, struct pctrie *ptree)
{
	return (pctrie_reclaim_begin_compound(pnode, ptree, NULL, 0, NULL));
}

/*
 * Resume pruning the trie, invoking callback on each leaf stripped.
 */
struct pctrie_node *
pctrie_reclaim_resume_cb(struct pctrie_node **pnode,
    pctrie_cb_t callback, int keyoff, void *arg)
{
	return (pctrie_reclaim_resume_compound(pnode, callback, keyoff, arg));
}

/*
 * Begin reclaiming the trie, invoking callback on each leaf stripped.
 */
struct pctrie_node *
pctrie_reclaim_begin_cb(struct pctrie_node **pnode, struct pctrie *ptree,
    pctrie_cb_t callback, int keyoff, void *arg)
{
	return (pctrie_reclaim_begin_compound(pnode, ptree,
	    callback, keyoff, arg));
}

/*
 * Replace an existing value in the trie with another one.
 * Panics if there is not an old value in the trie at the new value's index.
 */
uint64_t *
pctrie_replace(struct pctrie *ptree, uint64_t *newval)
{
	struct pctrie_node *leaf, *parent, *node;
	uint64_t *m;
	uint64_t index;
	int slot;

	leaf = pctrie_toleaf(newval);
	/* The first word of the value is its key in the trie. */
	index = *newval;
	node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
	parent = NULL;
	for (;;) {
		if (pctrie_isleaf(node)) {
			if ((m = pctrie_toval(node)) != NULL && *m == index) {
				/*
				 * 'slot' holds the value computed by
				 * pctrie_keybarr() while descending through
				 * 'parent' on the previous iteration.
				 */
				if (parent == NULL)
					pctrie_root_store(ptree,
					    leaf, PCTRIE_LOCKED);
				else
					pctrie_node_store(
					    &parent->pn_child[slot], leaf,
					    PCTRIE_LOCKED);
				return (m);
			}
			break;
		}
		if (pctrie_keybarr(node, index, &slot))
			break;
		parent = node;
		node = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_LOCKED);
	}
	panic("%s: original replacing value not found", __func__);
}

#ifdef DDB
/*
 * Show details about the given node.
 */
DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
{
	struct pctrie_node *node, *tmp;
	int slot;
	pn_popmap_t popmap;

	if (!have_addr)
		return;
	node = (struct pctrie_node *)addr;
	db_printf("node %p, owner %jx, children popmap %04x, level %u:\n",
	    (void *)node, (uintmax_t)node->pn_owner, node->pn_popmap,
	    node->pn_clev / PCTRIE_WIDTH);
	/*
	 * Visit each set popmap bit; 'slot' is assigned at the top of the
	 * body before the loop's update expression clears that bit.
	 */
	for (popmap = node->pn_popmap; popmap != 0; popmap ^= 1 << slot) {
		slot = ffs(popmap) - 1;
		tmp = pctrie_node_load(&node->pn_child[slot], NULL,
		    PCTRIE_UNSERIALIZED);
		db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
		    slot, (void *)tmp,
		    pctrie_isleaf(tmp) ? pctrie_toval(tmp) : NULL,
		    node->pn_clev / PCTRIE_WIDTH);
	}
}
#endif /* DDB */