/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013 EMC Corp.
 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

/*
 * Path-compressed radix trie implementation.
 * The following code is not generalized into a general purpose library
 * because there are way too many parameters embedded that should really
 * be decided by the library consumers.  At the same time, consumers
 * of this code must achieve the highest possible performance.
 *
 * The implementation takes into account the following rationale:
 *  - Size of the nodes should be as small as possible but still big enough
 *    to avoid a large maximum depth for the trie.  This is a balance
 *    between the necessity to not wire too much physical memory for the nodes
 *    and the necessity to avoid too much cache pollution during the trie
 *    operations.
 *  - There is not a huge bias toward the number of lookup operations over
 *    the number of insert and remove operations.  This basically implies
 *    that optimizations supposedly helping one operation but hurting the
 *    other should be carefully evaluated.
 *  - On average not many nodes are expected to be fully populated, hence
 *    level compression may just complicate things.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/smr.h>
#include <sys/smr_types.h>

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_radix.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

/*
 * These widths should allow the pointers to a node's children to fit within
 * a single cache line.  The extra levels from a narrow width should not be
 * a problem thanks to path compression.
 */
#ifdef __LP64__
#define VM_RADIX_WIDTH  4
#else
#define VM_RADIX_WIDTH  3
#endif

#define VM_RADIX_COUNT  (1 << VM_RADIX_WIDTH)
#define VM_RADIX_MASK   (VM_RADIX_COUNT - 1)
#define VM_RADIX_LIMIT                                                  \
        (howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1)

/* Flag bits stored in node pointers. */
#define VM_RADIX_ISLEAF 0x1
#define VM_RADIX_FLAGS  0x1
#define VM_RADIX_PAD    VM_RADIX_FLAGS

/* Returns one unit associated with specified level. */
#define VM_RADIX_UNITLEVEL(lev)                                         \
        ((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH))
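
/*
 * For example, on __LP64__ (VM_RADIX_WIDTH == 4) with a 64-bit vm_pindex_t,
 * the macros above evaluate to:
 *
 *      VM_RADIX_COUNT == 16            children per node
 *      VM_RADIX_MASK  == 0xf
 *      VM_RADIX_LIMIT == 15            deepest level, i.e. levels 0..15
 *      VM_RADIX_UNITLEVEL(1) == 0x10
 *
 * so each level consumes one nibble of the page index, starting with the
 * least significant nibble at level 0.
 */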

enum vm_radix_access { SMR, LOCKED, UNSERIALIZED };

struct vm_radix_node;
typedef SMR_POINTER(struct vm_radix_node *) smrnode_t;

struct vm_radix_node {
        vm_pindex_t     rn_owner;                       /* Owner of record. */
        uint16_t        rn_count;                       /* Valid children. */
        uint8_t         rn_clev;                        /* Current level. */
        int8_t          rn_last;                        /* zero last ptr. */
        smrnode_t       rn_child[VM_RADIX_COUNT];       /* Child nodes. */
};

static uma_zone_t vm_radix_node_zone;
static smr_t vm_radix_smr;

static void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
    enum vm_radix_access access);

/*
 * Allocate a radix node.
 */
static struct vm_radix_node *
vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
{
        struct vm_radix_node *rnode;

        rnode = uma_zalloc_smr(vm_radix_node_zone, M_NOWAIT);
        if (rnode == NULL)
                return (NULL);

        /*
         * We want to clear the last child pointer after the final section
         * has exited so lookup can not return false negatives.  It is done
         * here because it will be cache-cold in the dtor callback.
         */
        if (rnode->rn_last != 0) {
                vm_radix_node_store(&rnode->rn_child[rnode->rn_last - 1],
                    NULL, UNSERIALIZED);
                rnode->rn_last = 0;
        }
        rnode->rn_owner = owner;
        rnode->rn_count = count;
        rnode->rn_clev = clevel;
        return (rnode);
}

/*
 * Free radix node.
 */
static __inline void
vm_radix_node_put(struct vm_radix_node *rnode, int8_t last)
{
#ifdef INVARIANTS
        int slot;

        KASSERT(rnode->rn_count == 0,
            ("vm_radix_node_put: rnode %p has %d children", rnode,
            rnode->rn_count));
        for (slot = 0; slot < VM_RADIX_COUNT; slot++) {
                if (slot == last)
                        continue;
                KASSERT(smr_unserialized_load(&rnode->rn_child[slot], true) ==
                    NULL, ("vm_radix_node_put: rnode %p has a child", rnode));
        }
#endif
        /* Off by one so a freshly zero'd node is not assigned to. */
        rnode->rn_last = last + 1;
        uma_zfree_smr(vm_radix_node_zone, rnode);
}

/*
 * Return the position in the array for a given level.
 */
static __inline int
vm_radix_slot(vm_pindex_t index, uint16_t level)
{

        return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK);
}

/* Trims the key after the specified level. */
static __inline vm_pindex_t
vm_radix_trimkey(vm_pindex_t index, uint16_t level)
{
        vm_pindex_t ret;

        ret = index;
        if (level > 0) {
                ret >>= level * VM_RADIX_WIDTH;
                ret <<= level * VM_RADIX_WIDTH;
        }
        return (ret);
}
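
/*
 * For example, with VM_RADIX_WIDTH == 4 and index == 0x1234:
 *
 *      vm_radix_slot(0x1234, 0) == 0x4
 *      vm_radix_slot(0x1234, 1) == 0x3
 *      vm_radix_trimkey(0x1234, 1) == 0x1230
 *      vm_radix_trimkey(0x1234, 2) == 0x1200
 *
 * i.e. the slot is the nibble selected by the level, and trimming clears
 * every nibble below the specified level.
 */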

/*
 * Fetch a node pointer from a slot in another node.
 */
static __inline struct vm_radix_node *
vm_radix_node_load(smrnode_t *p, enum vm_radix_access access)
{

        switch (access) {
        case UNSERIALIZED:
                return (smr_unserialized_load(p, true));
        case LOCKED:
                return (smr_serialized_load(p, true));
        case SMR:
                return (smr_entered_load(p, vm_radix_smr));
        }
        __assert_unreachable();
}

static __inline void
vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
    enum vm_radix_access access)
{

        switch (access) {
        case UNSERIALIZED:
                smr_unserialized_store(p, v, true);
                break;
        case LOCKED:
                smr_serialized_store(p, v, true);
                break;
        case SMR:
                panic("vm_radix_node_store: Not supported in smr section.");
        }
}

/*
 * Get the root node for a radix tree.
 */
static __inline struct vm_radix_node *
vm_radix_root_load(struct vm_radix *rtree, enum vm_radix_access access)
{

        return (vm_radix_node_load((smrnode_t *)&rtree->rt_root, access));
}

/*
 * Set the root node for a radix tree.
 */
static __inline void
vm_radix_root_store(struct vm_radix *rtree, struct vm_radix_node *rnode,
    enum vm_radix_access access)
{

        vm_radix_node_store((smrnode_t *)&rtree->rt_root, rnode, access);
}

/*
 * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
 */
static __inline boolean_t
vm_radix_isleaf(struct vm_radix_node *rnode)
{

        return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
}

/*
 * Returns the associated page extracted from rnode.
 */
static __inline vm_page_t
vm_radix_topage(struct vm_radix_node *rnode)
{

        return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
}

/*
 * Adds the page as a child of the provided node.
 */
static __inline void
vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
    vm_page_t page, enum vm_radix_access access)
{
        int slot;

        slot = vm_radix_slot(index, clev);
        vm_radix_node_store(&rnode->rn_child[slot],
            (struct vm_radix_node *)((uintptr_t)page | VM_RADIX_ISLEAF),
            access);
}

/*
 * Returns the slot where two keys differ.
 * It cannot accept 2 equal keys.
 */
static __inline uint16_t
vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
{
        uint16_t clev;

        KASSERT(index1 != index2, ("%s: passing the same key value %jx",
            __func__, (uintmax_t)index1));

        index1 ^= index2;
        for (clev = VM_RADIX_LIMIT;; clev--)
                if (vm_radix_slot(index1, clev) != 0)
                        return (clev);
}

/*
 * Returns TRUE if it can be determined that key does not belong to the
 * specified rnode.  Otherwise, returns FALSE.
 */
static __inline boolean_t
vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
{

        if (rnode->rn_clev < VM_RADIX_LIMIT) {
                idx = vm_radix_trimkey(idx, rnode->rn_clev + 1);
                return (idx != rnode->rn_owner);
        }
        return (FALSE);
}
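
/*
 * For example, with VM_RADIX_WIDTH == 4, vm_radix_keydiff(0x10, 0x13) == 0
 * because the keys first differ in the level 0 nibble, while
 * vm_radix_keydiff(0x100, 0x200) == 2.  Correspondingly, a node with
 * rn_clev == 0 and rn_owner == 0x10 covers the keys 0x10 through 0x1f, and
 * vm_radix_keybarr() reports TRUE for any key outside that range.
 */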

/*
 * Internal helper for vm_radix_reclaim_allnodes().
 * This function is recursive.
 */
static void
vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
{
        struct vm_radix_node *child;
        int slot;

        KASSERT(rnode->rn_count <= VM_RADIX_COUNT,
            ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode));
        for (slot = 0; rnode->rn_count != 0; slot++) {
                child = vm_radix_node_load(&rnode->rn_child[slot],
                    UNSERIALIZED);
                if (child == NULL)
                        continue;
                if (!vm_radix_isleaf(child))
                        vm_radix_reclaim_allnodes_int(child);
                vm_radix_node_store(&rnode->rn_child[slot], NULL,
                    UNSERIALIZED);
                rnode->rn_count--;
        }
        vm_radix_node_put(rnode, -1);
}

#ifndef UMA_MD_SMALL_ALLOC
void vm_radix_reserve_kva(void);
/*
 * Reserve the KVA necessary to satisfy the node allocation.
 * This is mandatory on architectures that do not support direct mapping,
 * as they would otherwise need to carve into the kernel maps for every node
 * allocation, resulting in deadlocks for consumers already working with
 * kernel maps.
 */
void
vm_radix_reserve_kva(void)
{

        /*
         * Calculate the number of reserved nodes, discounting the pages that
         * are needed to store them.
         */
        if (!uma_zone_reserve_kva(vm_radix_node_zone,
            ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
            sizeof(struct vm_radix_node))))
                panic("%s: unable to reserve KVA", __func__);
}
#endif

/*
 * Initialize the UMA slab zone.
 */
void
vm_radix_zinit(void)
{

        vm_radix_node_zone = uma_zcreate("RADIX NODE",
            sizeof(struct vm_radix_node), NULL, NULL, NULL, NULL,
            VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_SMR | UMA_ZONE_ZINIT);
        vm_radix_smr = uma_zone_get_smr(vm_radix_node_zone);
}

/*
 * Inserts the key-value pair into the trie.
 * Panics if the key already exists.
 */
int
vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
{
        vm_pindex_t index, newind;
        struct vm_radix_node *rnode, *tmp;
        smrnode_t *parentp;
        vm_page_t m;
        int slot;
        uint16_t clev;

        index = page->pindex;

        /*
         * The owner of record for root is not really important because it
         * will never be used.
         */
        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL) {
                rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF;
                return (0);
        }
        parentp = (smrnode_t *)&rtree->rt_root;
        for (;;) {
                if (vm_radix_isleaf(rnode)) {
                        m = vm_radix_topage(rnode);
                        if (m->pindex == index)
                                panic("%s: key %jx is already present",
                                    __func__, (uintmax_t)index);
                        clev = vm_radix_keydiff(m->pindex, index);
                        tmp = vm_radix_node_get(vm_radix_trimkey(index,
                            clev + 1), 2, clev);
                        if (tmp == NULL)
                                return (ENOMEM);
                        /* These writes are not yet visible due to ordering. */
                        vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
                        vm_radix_addpage(tmp, m->pindex, clev, m, UNSERIALIZED);
                        /* Synchronize to make leaf visible. */
                        vm_radix_node_store(parentp, tmp, LOCKED);
                        return (0);
                } else if (vm_radix_keybarr(rnode, index))
                        break;
                slot = vm_radix_slot(index, rnode->rn_clev);
                parentp = &rnode->rn_child[slot];
                tmp = vm_radix_node_load(parentp, LOCKED);
                if (tmp == NULL) {
                        rnode->rn_count++;
                        vm_radix_addpage(rnode, index, rnode->rn_clev, page,
                            LOCKED);
                        return (0);
                }
                rnode = tmp;
        }

        /*
         * A new node is needed because the right insertion level is reached.
         * Set up the new intermediate node and add the 2 children: the
         * new object and the older edge.
         */
        newind = rnode->rn_owner;
        clev = vm_radix_keydiff(newind, index);
        tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
        if (tmp == NULL)
                return (ENOMEM);
        slot = vm_radix_slot(newind, clev);
        /* These writes are not yet visible due to ordering. */
        vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
        vm_radix_node_store(&tmp->rn_child[slot], rnode, UNSERIALIZED);
        /* Serializing write to make the above visible. */
        vm_radix_node_store(parentp, tmp, LOCKED);

        return (0);
}
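
/*
 * For example (VM_RADIX_WIDTH == 4), inserting pages with indices 0x10 and
 * 0x13 into an empty trie first stores the 0x10 leaf directly in rt_root;
 * the second insert finds that leaf, computes vm_radix_keydiff(0x10, 0x13)
 * == 0, and allocates a level 0 node with rn_owner == 0x10, placing the two
 * leaves in slots 0 and 3 before publishing the node with a serializing
 * store, so unlocked (SMR) lookups never observe a partially built subtree.
 */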

/*
 * Returns TRUE if the specified radix tree contains a single leaf and FALSE
 * otherwise.
 */
boolean_t
vm_radix_is_singleton(struct vm_radix *rtree)
{
        struct vm_radix_node *rnode;

        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL)
                return (FALSE);
        return (vm_radix_isleaf(rnode));
}

/*
 * Returns the value stored at the index.  If the index is not present,
 * NULL is returned.
 */
static __always_inline vm_page_t
_vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index,
    enum vm_radix_access access)
{
        struct vm_radix_node *rnode;
        vm_page_t m;
        int slot;

        rnode = vm_radix_root_load(rtree, access);
        while (rnode != NULL) {
                if (vm_radix_isleaf(rnode)) {
                        m = vm_radix_topage(rnode);
                        if (m->pindex == index)
                                return (m);
                        break;
                }
                if (vm_radix_keybarr(rnode, index))
                        break;
                slot = vm_radix_slot(index, rnode->rn_clev);
                rnode = vm_radix_node_load(&rnode->rn_child[slot], access);
        }
        return (NULL);
}

/*
 * Returns the value stored at the index assuming there is an external lock.
 *
 * If the index is not present, NULL is returned.
 */
vm_page_t
vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
{

        return (_vm_radix_lookup(rtree, index, LOCKED));
}

/*
 * Returns the value stored at the index without requiring an external lock.
 *
 * If the index is not present, NULL is returned.
 */
vm_page_t
vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index)
{
        vm_page_t m;

        smr_enter(vm_radix_smr);
        m = _vm_radix_lookup(rtree, index, SMR);
        smr_exit(vm_radix_smr);

        return (m);
}
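
/*
 * A minimal usage sketch (illustrative only; "rtree", "m" and "pindex" are
 * the caller's objects, and the serialization is whatever lock the consumer
 * already holds around its trie):
 *
 *      error = vm_radix_insert(rtree, m);              (writer, lock held)
 *      m = vm_radix_lookup(rtree, pindex);             (reader, lock held)
 *      m = vm_radix_lookup_unlocked(rtree, pindex);    (lockless reader)
 *
 * Lockless callers typically revalidate the returned page, as it may be
 * removed from the trie concurrently.
 */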

/*
 * Look up the nearest entry at a position greater than or equal to index.
 */
vm_page_t
vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
{
        struct vm_radix_node *stack[VM_RADIX_LIMIT];
        vm_pindex_t inc;
        vm_page_t m;
        struct vm_radix_node *child, *rnode;
#ifdef INVARIANTS
        int loops = 0;
#endif
        int slot, tos;

        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL)
                return (NULL);
        else if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                if (m->pindex >= index)
                        return (m);
                else
                        return (NULL);
        }
        tos = 0;
        for (;;) {
                /*
                 * If the keys differ before the current bisection node,
                 * then the search key might roll back to the earliest
                 * available bisection node or to the smallest key
                 * in the current node (if the owner is greater than the
                 * search key).
                 */
                if (vm_radix_keybarr(rnode, index)) {
                        if (index > rnode->rn_owner) {
ascend:
                                KASSERT(++loops < 1000,
                                    ("vm_radix_lookup_ge: too many loops"));

                                /*
                                 * Pop nodes from the stack until either the
                                 * stack is empty or a node that could have a
                                 * matching descendant is found.
                                 */
                                do {
                                        if (tos == 0)
                                                return (NULL);
                                        rnode = stack[--tos];
                                } while (vm_radix_slot(index,
                                    rnode->rn_clev) == (VM_RADIX_COUNT - 1));

                                /*
                                 * The following computation cannot overflow
                                 * because index's slot at the current level
                                 * is less than VM_RADIX_COUNT - 1.
                                 */
                                index = vm_radix_trimkey(index,
                                    rnode->rn_clev);
                                index += VM_RADIX_UNITLEVEL(rnode->rn_clev);
                        } else
                                index = rnode->rn_owner;
                        KASSERT(!vm_radix_keybarr(rnode, index),
                            ("vm_radix_lookup_ge: keybarr failed"));
                }
                slot = vm_radix_slot(index, rnode->rn_clev);
                child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(child)) {
                        m = vm_radix_topage(child);
                        if (m->pindex >= index)
                                return (m);
                } else if (child != NULL)
                        goto descend;

                /*
                 * Look for an available edge or page within the current
                 * bisection node.
                 */
                if (slot < (VM_RADIX_COUNT - 1)) {
                        inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
                        index = vm_radix_trimkey(index, rnode->rn_clev);
                        do {
                                index += inc;
                                slot++;
                                child = vm_radix_node_load(
                                    &rnode->rn_child[slot], LOCKED);
                                if (vm_radix_isleaf(child)) {
                                        m = vm_radix_topage(child);
                                        if (m->pindex >= index)
                                                return (m);
                                } else if (child != NULL)
                                        goto descend;
                        } while (slot < (VM_RADIX_COUNT - 1));
                }
                KASSERT(child == NULL || vm_radix_isleaf(child),
                    ("vm_radix_lookup_ge: child is radix node"));

                /*
                 * If a page or edge greater than the search slot is not found
                 * in the current node, ascend to the next higher-level node.
                 */
                goto ascend;
descend:
                KASSERT(rnode->rn_clev > 0,
                    ("vm_radix_lookup_ge: pushing leaf's parent"));
                KASSERT(tos < VM_RADIX_LIMIT,
                    ("vm_radix_lookup_ge: stack overflow"));
                stack[tos++] = rnode;
                rnode = child;
        }
}
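
/*
 * For example, if the trie contains only pages at indices 0x10 and 0x13,
 * vm_radix_lookup_ge(rtree, 0x11) walks the level 0 node, finds slot 1
 * empty, scans forward and returns the page at 0x13, while
 * vm_radix_lookup_ge(rtree, 0x14) finds no entry at or after slot 4 and
 * returns NULL.  vm_radix_lookup_le() below is the mirror image of this
 * walk.
 */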

/*
 * Look up the nearest entry at a position less than or equal to index.
 */
vm_page_t
vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
{
        struct vm_radix_node *stack[VM_RADIX_LIMIT];
        vm_pindex_t inc;
        vm_page_t m;
        struct vm_radix_node *child, *rnode;
#ifdef INVARIANTS
        int loops = 0;
#endif
        int slot, tos;

        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL)
                return (NULL);
        else if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                if (m->pindex <= index)
                        return (m);
                else
                        return (NULL);
        }
        tos = 0;
        for (;;) {
                /*
                 * If the keys differ before the current bisection node,
                 * then the search key might roll back to the earliest
                 * available bisection node or to the largest key
                 * in the current node (if the owner is smaller than the
                 * search key).
                 */
                if (vm_radix_keybarr(rnode, index)) {
                        if (index > rnode->rn_owner) {
                                index = rnode->rn_owner + VM_RADIX_COUNT *
                                    VM_RADIX_UNITLEVEL(rnode->rn_clev);
                        } else {
ascend:
                                KASSERT(++loops < 1000,
                                    ("vm_radix_lookup_le: too many loops"));

                                /*
                                 * Pop nodes from the stack until either the
                                 * stack is empty or a node that could have a
                                 * matching descendant is found.
                                 */
                                do {
                                        if (tos == 0)
                                                return (NULL);
                                        rnode = stack[--tos];
                                } while (vm_radix_slot(index,
                                    rnode->rn_clev) == 0);

                                /*
                                 * The following computation cannot overflow
                                 * because index's slot at the current level
                                 * is greater than 0.
                                 */
                                index = vm_radix_trimkey(index,
                                    rnode->rn_clev);
                        }
                        index--;
                        KASSERT(!vm_radix_keybarr(rnode, index),
                            ("vm_radix_lookup_le: keybarr failed"));
                }
                slot = vm_radix_slot(index, rnode->rn_clev);
                child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(child)) {
                        m = vm_radix_topage(child);
                        if (m->pindex <= index)
                                return (m);
                } else if (child != NULL)
                        goto descend;

                /*
                 * Look for an available edge or page within the current
                 * bisection node.
                 */
                if (slot > 0) {
                        inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
                        index |= inc - 1;
                        do {
                                index -= inc;
                                slot--;
                                child = vm_radix_node_load(
                                    &rnode->rn_child[slot], LOCKED);
                                if (vm_radix_isleaf(child)) {
                                        m = vm_radix_topage(child);
                                        if (m->pindex <= index)
                                                return (m);
                                } else if (child != NULL)
                                        goto descend;
                        } while (slot > 0);
                }
                KASSERT(child == NULL || vm_radix_isleaf(child),
                    ("vm_radix_lookup_le: child is radix node"));

                /*
                 * If a page or edge smaller than the search slot is not found
                 * in the current node, ascend to the next higher-level node.
                 */
                goto ascend;
descend:
                KASSERT(rnode->rn_clev > 0,
                    ("vm_radix_lookup_le: pushing leaf's parent"));
                KASSERT(tos < VM_RADIX_LIMIT,
                    ("vm_radix_lookup_le: stack overflow"));
                stack[tos++] = rnode;
                rnode = child;
        }
}

/*
 * Remove the specified index from the trie, and return the value stored at
 * that index.  If the index is not present, return NULL.
 */
vm_page_t
vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
{
        struct vm_radix_node *rnode, *parent, *tmp;
        vm_page_t m;
        int i, slot;

        rnode = vm_radix_root_load(rtree, LOCKED);
        if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                if (m->pindex != index)
                        return (NULL);
                vm_radix_root_store(rtree, NULL, LOCKED);
                return (m);
        }
        parent = NULL;
        for (;;) {
                if (rnode == NULL)
                        return (NULL);
                slot = vm_radix_slot(index, rnode->rn_clev);
                tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(tmp)) {
                        m = vm_radix_topage(tmp);
                        if (m->pindex != index)
                                return (NULL);
                        vm_radix_node_store(&rnode->rn_child[slot], NULL,
                            LOCKED);
                        rnode->rn_count--;
                        if (rnode->rn_count > 1)
                                return (m);
                        for (i = 0; i < VM_RADIX_COUNT; i++)
                                if (vm_radix_node_load(&rnode->rn_child[i],
                                    LOCKED) != NULL)
                                        break;
                        KASSERT(i != VM_RADIX_COUNT,
                            ("%s: invalid node configuration", __func__));
                        tmp = vm_radix_node_load(&rnode->rn_child[i], LOCKED);
                        if (parent == NULL)
                                vm_radix_root_store(rtree, tmp, LOCKED);
                        else {
                                slot = vm_radix_slot(index, parent->rn_clev);
                                KASSERT(vm_radix_node_load(
                                    &parent->rn_child[slot], LOCKED) == rnode,
                                    ("%s: invalid child value", __func__));
                                vm_radix_node_store(&parent->rn_child[slot],
                                    tmp, LOCKED);
                        }
                        /*
                         * The child is still valid and we can not zero the
                         * pointer until all smr references are gone.
                         */
                        rnode->rn_count--;
                        vm_radix_node_put(rnode, i);
                        return (m);
                }
                parent = rnode;
                rnode = tmp;
        }
}
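
/*
 * Continuing the 0x10/0x13 example, removing the page at 0x13 leaves the
 * level 0 node with a single child, so the remaining 0x10 leaf is promoted
 * into the parent slot (here the root) and the node is freed with
 * vm_radix_node_put(rnode, 0).  The surviving child pointer in the freed
 * node is deliberately left intact and recorded via rn_last; it is only
 * cleared when vm_radix_node_get() reuses the node, after concurrent SMR
 * readers can no longer observe it.
 */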

/*
 * Remove and free all the nodes from the radix tree.
 * This function is recursive, but the recursion is bounded because the
 * maximum depth of the tree is fixed.
 */
void
vm_radix_reclaim_allnodes(struct vm_radix *rtree)
{
        struct vm_radix_node *root;

        root = vm_radix_root_load(rtree, LOCKED);
        if (root == NULL)
                return;
        vm_radix_root_store(rtree, NULL, UNSERIALIZED);
        if (!vm_radix_isleaf(root))
                vm_radix_reclaim_allnodes_int(root);
}

/*
 * Replace an existing page in the trie with another one.
 * Panics if there is not an old page in the trie at the new page's index.
 */
vm_page_t
vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage)
{
        struct vm_radix_node *rnode, *tmp;
        vm_page_t m;
        vm_pindex_t index;
        int slot;

        index = newpage->pindex;
        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL)
                panic("%s: replacing page on an empty trie", __func__);
        if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                if (m->pindex != index)
                        panic("%s: original replacing root key not found",
                            __func__);
                rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF;
                return (m);
        }
        for (;;) {
                slot = vm_radix_slot(index, rnode->rn_clev);
                tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(tmp)) {
                        m = vm_radix_topage(tmp);
                        if (m->pindex == index) {
                                vm_radix_node_store(&rnode->rn_child[slot],
                                    (struct vm_radix_node *)((uintptr_t)newpage |
                                    VM_RADIX_ISLEAF), LOCKED);
                                return (m);
                        } else
                                break;
                } else if (tmp == NULL || vm_radix_keybarr(tmp, index))
                        break;
                rnode = tmp;
        }
        panic("%s: original replacing page not found", __func__);
}

void
vm_radix_wait(void)
{
        uma_zwait(vm_radix_node_zone);
}

#ifdef DDB
/*
 * Show details about the given radix node.
 */
DB_SHOW_COMMAND(radixnode, db_show_radixnode)
{
        struct vm_radix_node *rnode, *tmp;
        int i;

        if (!have_addr)
                return;
        rnode = (struct vm_radix_node *)addr;
        db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
            (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
            rnode->rn_clev);
        for (i = 0; i < VM_RADIX_COUNT; i++) {
                tmp = vm_radix_node_load(&rnode->rn_child[i], UNSERIALIZED);
                if (tmp != NULL)
                        db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
                            i, (void *)tmp,
                            vm_radix_isleaf(tmp) ? vm_radix_topage(tmp) : NULL,
                            rnode->rn_clev);
        }
}
#endif /* DDB */