/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013 EMC Corp.
 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

/*
 * Path-compressed radix trie implementation.
 * The following code is not generalized into a general-purpose library
 * because there are too many embedded parameters that should really be
 * decided by the library consumers.  At the same time, consumers of this
 * code must achieve the highest possible performance.
 *
 * The implementation takes into account the following rationale:
 * - The size of the nodes should be as small as possible but still big
 *   enough to avoid a large maximum depth for the trie.  This is a balance
 *   between the necessity to not wire too much physical memory for the
 *   nodes and the necessity to avoid too much cache pollution during the
 *   trie operations.
 * - There is not a huge bias toward the number of lookup operations over
 *   the number of insert and remove operations.  This basically implies
 *   that optimizations supposedly helping one operation but hurting the
 *   other must be carefully evaluated.
 * - On average not many nodes are expected to be fully populated, hence
 *   level compression may just complicate things.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/smr.h>
#include <sys/smr_types.h>

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_radix.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

/*
 * These widths should allow the pointers to a node's children to fit within
 * a single cache line.  The extra levels from a narrow width should not be
 * a problem thanks to path compression.
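 *
 * For example, with VM_RADIX_WIDTH 4 (LP64) each node holds
 * VM_RADIX_COUNT == 16 child pointers, a 64-bit vm_pindex_t spans at
 * most 16 levels (VM_RADIX_LIMIT == 15), and an index such as 0x1234
 * decomposes into the slots 1, 2, 3 and 4 at levels 3 through 0.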
 */
#ifdef __LP64__
#define	VM_RADIX_WIDTH	4
#else
#define	VM_RADIX_WIDTH	3
#endif

#define	VM_RADIX_COUNT	(1 << VM_RADIX_WIDTH)
#define	VM_RADIX_MASK	(VM_RADIX_COUNT - 1)
#define	VM_RADIX_LIMIT							\
	(howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1)

/* Flag bits stored in node pointers. */
#define	VM_RADIX_ISLEAF	0x1
#define	VM_RADIX_FLAGS	0x1
#define	VM_RADIX_PAD	VM_RADIX_FLAGS

/* Returns one unit associated with the specified level. */
#define	VM_RADIX_UNITLEVEL(lev)						\
	((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH))

enum vm_radix_access { SMR, LOCKED, UNSERIALIZED };

struct vm_radix_node;
typedef SMR_POINTER(struct vm_radix_node *) smrnode_t;

struct vm_radix_node {
	vm_pindex_t	rn_owner;			/* Owner of record. */
	uint16_t	rn_count;			/* Valid children. */
	uint8_t		rn_clev;			/* Current level. */
	int8_t		rn_last;			/* zero last ptr. */
	smrnode_t	rn_child[VM_RADIX_COUNT];	/* Child nodes. */
};

static uma_zone_t vm_radix_node_zone;
static smr_t vm_radix_smr;

static void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
    enum vm_radix_access access);

/*
 * Allocate a radix node.
 */
static struct vm_radix_node *
vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
{
	struct vm_radix_node *rnode;

	rnode = uma_zalloc_smr(vm_radix_node_zone, M_NOWAIT);
	if (rnode == NULL)
		return (NULL);

	/*
	 * We want to clear the last child pointer after the final section
	 * has exited so lookup can not return false negatives.  It is done
	 * here because it will be cache-cold in the dtor callback.
	 */
	if (rnode->rn_last != 0) {
		vm_radix_node_store(&rnode->rn_child[rnode->rn_last - 1],
		    NULL, UNSERIALIZED);
		rnode->rn_last = 0;
	}
	rnode->rn_owner = owner;
	rnode->rn_count = count;
	rnode->rn_clev = clevel;
	return (rnode);
}

/*
 * Free radix node.
 */
static __inline void
vm_radix_node_put(struct vm_radix_node *rnode, int8_t last)
{
#ifdef INVARIANTS
	int slot;

	KASSERT(rnode->rn_count == 0,
	    ("vm_radix_node_put: rnode %p has %d children", rnode,
	    rnode->rn_count));
	for (slot = 0; slot < VM_RADIX_COUNT; slot++) {
		if (slot == last)
			continue;
		KASSERT(smr_unserialized_load(&rnode->rn_child[slot], true) ==
		    NULL, ("vm_radix_node_put: rnode %p has a child", rnode));
	}
#endif
	/* Off by one so a freshly zero'd node is not assigned to. */
	rnode->rn_last = last + 1;
	uma_zfree_smr(vm_radix_node_zone, rnode);
}

/*
 * Return the position in the array for a given level.
 */
static __inline int
vm_radix_slot(vm_pindex_t index, uint16_t level)
{

	return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK);
}

/* Trims the key after the specified level. */
static __inline vm_pindex_t
vm_radix_trimkey(vm_pindex_t index, uint16_t level)
{
	vm_pindex_t ret;

	ret = index;
	if (level > 0) {
		ret >>= level * VM_RADIX_WIDTH;
		ret <<= level * VM_RADIX_WIDTH;
	}
	return (ret);
}

/*
 * Fetch a node pointer from a slot in another node.
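 * The access argument encodes the caller's synchronization state:
 * UNSERIALIZED is for nodes that are not (or are no longer) reachable by
 * concurrent readers, LOCKED is for writers serialized by the external
 * tree lock, and SMR is for lockless readers inside an SMR read section.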
 */
static __inline struct vm_radix_node *
vm_radix_node_load(smrnode_t *p, enum vm_radix_access access)
{

	switch (access) {
	case UNSERIALIZED:
		return (smr_unserialized_load(p, true));
	case LOCKED:
		return (smr_serialized_load(p, true));
	case SMR:
		return (smr_entered_load(p, vm_radix_smr));
	}
	__assert_unreachable();
}

static __inline void
vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
    enum vm_radix_access access)
{

	switch (access) {
	case UNSERIALIZED:
		smr_unserialized_store(p, v, true);
		break;
	case LOCKED:
		smr_serialized_store(p, v, true);
		break;
	case SMR:
		panic("vm_radix_node_store: Not supported in smr section.");
	}
}

/*
 * Get the root node for a radix tree.
 */
static __inline struct vm_radix_node *
vm_radix_root_load(struct vm_radix *rtree, enum vm_radix_access access)
{

	return (vm_radix_node_load((smrnode_t *)&rtree->rt_root, access));
}

/*
 * Set the root node for a radix tree.
 */
static __inline void
vm_radix_root_store(struct vm_radix *rtree, struct vm_radix_node *rnode,
    enum vm_radix_access access)
{

	vm_radix_node_store((smrnode_t *)&rtree->rt_root, rnode, access);
}

/*
 * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
 */
static __inline boolean_t
vm_radix_isleaf(struct vm_radix_node *rnode)
{

	return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
}

/*
 * Returns the associated page extracted from rnode.
 */
static __inline vm_page_t
vm_radix_topage(struct vm_radix_node *rnode)
{

	return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
}

/*
 * Adds the page as a child of the provided node.
 */
static __inline void
vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
    vm_page_t page, enum vm_radix_access access)
{
	int slot;

	slot = vm_radix_slot(index, clev);
	vm_radix_node_store(&rnode->rn_child[slot],
	    (struct vm_radix_node *)((uintptr_t)page | VM_RADIX_ISLEAF), access);
}

/*
 * Returns the slot where two keys differ.
 * It cannot accept 2 equal keys.
 */
static __inline uint16_t
vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
{
	uint16_t clev;

	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
	    __func__, (uintmax_t)index1));

	index1 ^= index2;
	for (clev = VM_RADIX_LIMIT;; clev--)
		if (vm_radix_slot(index1, clev) != 0)
			return (clev);
}

/*
 * Returns TRUE if it can be determined that key does not belong to the
 * specified rnode.  Otherwise, returns FALSE.
 */
static __inline boolean_t
vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
{

	if (rnode->rn_clev < VM_RADIX_LIMIT) {
		idx = vm_radix_trimkey(idx, rnode->rn_clev + 1);
		return (idx != rnode->rn_owner);
	}
	return (FALSE);
}

/*
 * Internal helper for vm_radix_reclaim_allnodes().
 * This function is recursive.
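 * The recursion depth is bounded by the fixed maximum depth of the trie,
 * i.e. at most VM_RADIX_LIMIT + 1 levels.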
 */
static void
vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
{
	struct vm_radix_node *child;
	int slot;

	KASSERT(rnode->rn_count <= VM_RADIX_COUNT,
	    ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode));
	for (slot = 0; rnode->rn_count != 0; slot++) {
		child = vm_radix_node_load(&rnode->rn_child[slot], UNSERIALIZED);
		if (child == NULL)
			continue;
		if (!vm_radix_isleaf(child))
			vm_radix_reclaim_allnodes_int(child);
		vm_radix_node_store(&rnode->rn_child[slot], NULL, UNSERIALIZED);
		rnode->rn_count--;
	}
	vm_radix_node_put(rnode, -1);
}

#ifndef UMA_MD_SMALL_ALLOC
void vm_radix_reserve_kva(void);
/*
 * Reserve the KVA necessary to satisfy the node allocation.
 * This is mandatory on architectures that do not support a direct mapping,
 * as they would otherwise have to carve into the kernel maps for every
 * node allocation, resulting in deadlocks for consumers already working
 * with the kernel maps.
 */
void
vm_radix_reserve_kva(void)
{

	/*
	 * Calculate the number of reserved nodes, discounting the pages that
	 * are needed to store them.
	 */
	if (!uma_zone_reserve_kva(vm_radix_node_zone,
	    ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
	    sizeof(struct vm_radix_node))))
		panic("%s: unable to reserve KVA", __func__);
}
#endif

/*
 * Initialize the UMA slab zone.
 */
void
vm_radix_zinit(void)
{

	vm_radix_node_zone = uma_zcreate("RADIX NODE",
	    sizeof(struct vm_radix_node), NULL, NULL, NULL, NULL,
	    VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_SMR | UMA_ZONE_ZINIT);
	vm_radix_smr = uma_zone_get_smr(vm_radix_node_zone);
}

/*
 * Inserts the key-value pair into the trie.
 * Panics if the key already exists.
 */
int
vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
{
	vm_pindex_t index, newind;
	struct vm_radix_node *rnode, *tmp;
	smrnode_t *parentp;
	vm_page_t m;
	int slot;
	uint16_t clev;

	index = page->pindex;

	/*
	 * The owner of record for root is not really important because it
	 * will never be used.
	 */
	rnode = vm_radix_root_load(rtree, LOCKED);
	if (rnode == NULL) {
		rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF;
		return (0);
	}
	parentp = (smrnode_t *)&rtree->rt_root;
	for (;;) {
		if (vm_radix_isleaf(rnode)) {
			m = vm_radix_topage(rnode);
			if (m->pindex == index)
				panic("%s: key %jx is already present",
				    __func__, (uintmax_t)index);
			clev = vm_radix_keydiff(m->pindex, index);
			tmp = vm_radix_node_get(vm_radix_trimkey(index,
			    clev + 1), 2, clev);
			if (tmp == NULL)
				return (ENOMEM);
			/* These writes are not yet visible due to ordering. */
			vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
			vm_radix_addpage(tmp, m->pindex, clev, m, UNSERIALIZED);
			/* Synchronize to make leaf visible. */
			vm_radix_node_store(parentp, tmp, LOCKED);
			return (0);
		} else if (vm_radix_keybarr(rnode, index))
			break;
		slot = vm_radix_slot(index, rnode->rn_clev);
		parentp = &rnode->rn_child[slot];
		tmp = vm_radix_node_load(parentp, LOCKED);
		if (tmp == NULL) {
			rnode->rn_count++;
			vm_radix_addpage(rnode, index, rnode->rn_clev, page,
			    LOCKED);
			return (0);
		}
		rnode = tmp;
	}

	/*
	 * A new node is needed because the right insertion level is reached.
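	 * The loop above stopped at a path-compressed node whose owner
	 * prefix no longer matches the key, so the trie has to be split
	 * above that node rather than below it.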
	 * Set up the new intermediate node and add the 2 children: the
	 * new page and the older edge.
	 */
	newind = rnode->rn_owner;
	clev = vm_radix_keydiff(newind, index);
	tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
	if (tmp == NULL)
		return (ENOMEM);
	slot = vm_radix_slot(newind, clev);
	/* These writes are not yet visible due to ordering. */
	vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
	vm_radix_node_store(&tmp->rn_child[slot], rnode, UNSERIALIZED);
	/* Serializing write to make the above visible. */
	vm_radix_node_store(parentp, tmp, LOCKED);

	return (0);
}

/*
 * Returns TRUE if the specified radix tree contains a single leaf and FALSE
 * otherwise.
 */
boolean_t
vm_radix_is_singleton(struct vm_radix *rtree)
{
	struct vm_radix_node *rnode;

	rnode = vm_radix_root_load(rtree, LOCKED);
	if (rnode == NULL)
		return (FALSE);
	return (vm_radix_isleaf(rnode));
}

/*
 * Returns the value stored at the index.  If the index is not present,
 * NULL is returned.
 */
static __always_inline vm_page_t
_vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index,
    enum vm_radix_access access)
{
	struct vm_radix_node *rnode;
	vm_page_t m;
	int slot;

	rnode = vm_radix_root_load(rtree, access);
	while (rnode != NULL) {
		if (vm_radix_isleaf(rnode)) {
			m = vm_radix_topage(rnode);
			if (m->pindex == index)
				return (m);
			break;
		}
		if (vm_radix_keybarr(rnode, index))
			break;
		slot = vm_radix_slot(index, rnode->rn_clev);
		rnode = vm_radix_node_load(&rnode->rn_child[slot], access);
	}
	return (NULL);
}

/*
 * Returns the value stored at the index assuming there is an external lock.
 *
 * If the index is not present, NULL is returned.
 */
vm_page_t
vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
{

	return _vm_radix_lookup(rtree, index, LOCKED);
}

/*
 * Returns the value stored at the index without requiring an external lock.
 *
 * If the index is not present, NULL is returned.
 */
vm_page_t
vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index)
{
	vm_page_t m;

	smr_enter(vm_radix_smr);
	m = _vm_radix_lookup(rtree, index, SMR);
	smr_exit(vm_radix_smr);

	return (m);
}

/*
 * Look up the nearest entry at a position greater than or equal to index.
 */
vm_page_t
vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
{
	struct vm_radix_node *stack[VM_RADIX_LIMIT];
	vm_pindex_t inc;
	vm_page_t m;
	struct vm_radix_node *child, *rnode;
#ifdef INVARIANTS
	int loops = 0;
#endif
	int slot, tos;

	rnode = vm_radix_root_load(rtree, LOCKED);
	if (rnode == NULL)
		return (NULL);
	else if (vm_radix_isleaf(rnode)) {
		m = vm_radix_topage(rnode);
		if (m->pindex >= index)
			return (m);
		else
			return (NULL);
	}
	tos = 0;
	for (;;) {
		/*
		 * If the keys differ before the current bisection node,
		 * then the search key might roll back to the earliest
		 * available bisection node or to the smallest key
		 * in the current node (if the owner is greater than the
		 * search key).
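		 * The owner is the smallest index that can appear anywhere
		 * under the node, so in the latter case the search can
		 * simply restart from it.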
		 */
		if (vm_radix_keybarr(rnode, index)) {
			if (index > rnode->rn_owner) {
ascend:
				KASSERT(++loops < 1000,
				    ("vm_radix_lookup_ge: too many loops"));

				/*
				 * Pop nodes from the stack until either the
				 * stack is empty or a node that could have a
				 * matching descendant is found.
				 */
				do {
					if (tos == 0)
						return (NULL);
					rnode = stack[--tos];
				} while (vm_radix_slot(index,
				    rnode->rn_clev) == (VM_RADIX_COUNT - 1));

				/*
				 * The following computation cannot overflow
				 * because index's slot at the current level
				 * is less than VM_RADIX_COUNT - 1.
				 */
				index = vm_radix_trimkey(index,
				    rnode->rn_clev);
				index += VM_RADIX_UNITLEVEL(rnode->rn_clev);
			} else
				index = rnode->rn_owner;
			KASSERT(!vm_radix_keybarr(rnode, index),
			    ("vm_radix_lookup_ge: keybarr failed"));
		}
		slot = vm_radix_slot(index, rnode->rn_clev);
		child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
		if (vm_radix_isleaf(child)) {
			m = vm_radix_topage(child);
			if (m->pindex >= index)
				return (m);
		} else if (child != NULL)
			goto descend;

		/*
		 * Look for an available edge or page within the current
		 * bisection node.
		 */
		if (slot < (VM_RADIX_COUNT - 1)) {
			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
			index = vm_radix_trimkey(index, rnode->rn_clev);
			do {
				index += inc;
				slot++;
				child = vm_radix_node_load(&rnode->rn_child[slot],
				    LOCKED);
				if (vm_radix_isleaf(child)) {
					m = vm_radix_topage(child);
					if (m->pindex >= index)
						return (m);
				} else if (child != NULL)
					goto descend;
			} while (slot < (VM_RADIX_COUNT - 1));
		}
		KASSERT(child == NULL || vm_radix_isleaf(child),
		    ("vm_radix_lookup_ge: child is radix node"));

		/*
		 * If a page or edge greater than the search slot is not found
		 * in the current node, ascend to the next higher-level node.
		 */
		goto ascend;
descend:
		KASSERT(rnode->rn_clev > 0,
		    ("vm_radix_lookup_ge: pushing leaf's parent"));
		KASSERT(tos < VM_RADIX_LIMIT,
		    ("vm_radix_lookup_ge: stack overflow"));
		stack[tos++] = rnode;
		rnode = child;
	}
}

/*
 * Look up the nearest entry at a position less than or equal to index.
 */
vm_page_t
vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
{
	struct vm_radix_node *stack[VM_RADIX_LIMIT];
	vm_pindex_t inc;
	vm_page_t m;
	struct vm_radix_node *child, *rnode;
#ifdef INVARIANTS
	int loops = 0;
#endif
	int slot, tos;

	rnode = vm_radix_root_load(rtree, LOCKED);
	if (rnode == NULL)
		return (NULL);
	else if (vm_radix_isleaf(rnode)) {
		m = vm_radix_topage(rnode);
		if (m->pindex <= index)
			return (m);
		else
			return (NULL);
	}
	tos = 0;
	for (;;) {
		/*
		 * If the keys differ before the current bisection node,
		 * then the search key might roll back to the earliest
		 * available bisection node or to the largest key
		 * in the current node (if the owner is smaller than the
		 * search key).
		 */
		if (vm_radix_keybarr(rnode, index)) {
			if (index > rnode->rn_owner) {
				index = rnode->rn_owner + VM_RADIX_COUNT *
				    VM_RADIX_UNITLEVEL(rnode->rn_clev);
			} else {
ascend:
				KASSERT(++loops < 1000,
				    ("vm_radix_lookup_le: too many loops"));

				/*
				 * Pop nodes from the stack until either the
				 * stack is empty or a node that could have a
				 * matching descendant is found.
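				 * For this predecessor search a node can
				 * still cover a smaller key only if the
				 * search key's slot at that node's level is
				 * non-zero, i.e. the key can be decremented
				 * without borrowing out of the node's prefix.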
				 */
				do {
					if (tos == 0)
						return (NULL);
					rnode = stack[--tos];
				} while (vm_radix_slot(index,
				    rnode->rn_clev) == 0);

				/*
				 * The following computation cannot overflow
				 * because index's slot at the current level
				 * is greater than 0.
				 */
				index = vm_radix_trimkey(index,
				    rnode->rn_clev);
			}
			index--;
			KASSERT(!vm_radix_keybarr(rnode, index),
			    ("vm_radix_lookup_le: keybarr failed"));
		}
		slot = vm_radix_slot(index, rnode->rn_clev);
		child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
		if (vm_radix_isleaf(child)) {
			m = vm_radix_topage(child);
			if (m->pindex <= index)
				return (m);
		} else if (child != NULL)
			goto descend;

		/*
		 * Look for an available edge or page within the current
		 * bisection node.
		 */
		if (slot > 0) {
			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
			index |= inc - 1;
			do {
				index -= inc;
				slot--;
				child = vm_radix_node_load(&rnode->rn_child[slot],
				    LOCKED);
				if (vm_radix_isleaf(child)) {
					m = vm_radix_topage(child);
					if (m->pindex <= index)
						return (m);
				} else if (child != NULL)
					goto descend;
			} while (slot > 0);
		}
		KASSERT(child == NULL || vm_radix_isleaf(child),
		    ("vm_radix_lookup_le: child is radix node"));

		/*
		 * If a page or edge smaller than the search slot is not found
		 * in the current node, ascend to the next higher-level node.
		 */
		goto ascend;
descend:
		KASSERT(rnode->rn_clev > 0,
		    ("vm_radix_lookup_le: pushing leaf's parent"));
		KASSERT(tos < VM_RADIX_LIMIT,
		    ("vm_radix_lookup_le: stack overflow"));
		stack[tos++] = rnode;
		rnode = child;
	}
}

/*
 * Remove the specified index from the trie, and return the value stored at
 * that index.  If the index is not present, return NULL.
 */
vm_page_t
vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
{
	struct vm_radix_node *rnode, *parent, *tmp;
	vm_page_t m;
	int i, slot;

	rnode = vm_radix_root_load(rtree, LOCKED);
	if (vm_radix_isleaf(rnode)) {
		m = vm_radix_topage(rnode);
		if (m->pindex != index)
			return (NULL);
		vm_radix_root_store(rtree, NULL, LOCKED);
		return (m);
	}
	parent = NULL;
	for (;;) {
		if (rnode == NULL)
			return (NULL);
		slot = vm_radix_slot(index, rnode->rn_clev);
		tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
		if (vm_radix_isleaf(tmp)) {
			m = vm_radix_topage(tmp);
			if (m->pindex != index)
				return (NULL);
			vm_radix_node_store(&rnode->rn_child[slot], NULL, LOCKED);
			rnode->rn_count--;
			if (rnode->rn_count > 1)
				return (m);
			for (i = 0; i < VM_RADIX_COUNT; i++)
				if (vm_radix_node_load(&rnode->rn_child[i],
				    LOCKED) != NULL)
					break;
			KASSERT(i != VM_RADIX_COUNT,
			    ("%s: invalid node configuration", __func__));
			tmp = vm_radix_node_load(&rnode->rn_child[i], LOCKED);
			if (parent == NULL)
				vm_radix_root_store(rtree, tmp, LOCKED);
			else {
				slot = vm_radix_slot(index, parent->rn_clev);
				KASSERT(vm_radix_node_load(
				    &parent->rn_child[slot], LOCKED) == rnode,
				    ("%s: invalid child value", __func__));
				vm_radix_node_store(&parent->rn_child[slot],
				    tmp, LOCKED);
			}
			/*
			 * The child is still valid and we can not zero the
			 * pointer until all smr references are gone.
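			 * Instead the slot is recorded by vm_radix_node_put()
			 * and the stale pointer is cleared later, in
			 * vm_radix_node_get(), once the node is reused after
			 * the SMR grace period has expired.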
			 */
			rnode->rn_count--;
			vm_radix_node_put(rnode, i);
			return (m);
		}
		parent = rnode;
		rnode = tmp;
	}
}

/*
 * Remove and free all the nodes from the radix tree.
 * This function is recursive but there is a tight control on it as the
 * maximum depth of the tree is fixed.
 */
void
vm_radix_reclaim_allnodes(struct vm_radix *rtree)
{
	struct vm_radix_node *root;

	root = vm_radix_root_load(rtree, LOCKED);
	if (root == NULL)
		return;
	vm_radix_root_store(rtree, NULL, UNSERIALIZED);
	if (!vm_radix_isleaf(root))
		vm_radix_reclaim_allnodes_int(root);
}

/*
 * Replace an existing page in the trie with another one.
 * Panics if there is not an old page in the trie at the new page's index.
 */
vm_page_t
vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage)
{
	struct vm_radix_node *rnode, *tmp;
	vm_page_t m;
	vm_pindex_t index;
	int slot;

	index = newpage->pindex;
	rnode = vm_radix_root_load(rtree, LOCKED);
	if (rnode == NULL)
		panic("%s: replacing page on an empty trie", __func__);
	if (vm_radix_isleaf(rnode)) {
		m = vm_radix_topage(rnode);
		if (m->pindex != index)
			panic("%s: original replacing root key not found",
			    __func__);
		rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF;
		return (m);
	}
	for (;;) {
		slot = vm_radix_slot(index, rnode->rn_clev);
		tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
		if (vm_radix_isleaf(tmp)) {
			m = vm_radix_topage(tmp);
			if (m->pindex == index) {
				vm_radix_node_store(&rnode->rn_child[slot],
				    (struct vm_radix_node *)((uintptr_t)newpage |
				    VM_RADIX_ISLEAF), LOCKED);
				return (m);
			} else
				break;
		} else if (tmp == NULL || vm_radix_keybarr(tmp, index))
			break;
		rnode = tmp;
	}
	panic("%s: original replacing page not found", __func__);
}

void
vm_radix_wait(void)
{
	uma_zwait(vm_radix_node_zone);
}

#ifdef DDB
/*
 * Show details about the given radix node.
 */
DB_SHOW_COMMAND(radixnode, db_show_radixnode)
{
	struct vm_radix_node *rnode, *tmp;
	int i;

	if (!have_addr)
		return;
	rnode = (struct vm_radix_node *)addr;
	db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
	    (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
	    rnode->rn_clev);
	for (i = 0; i < VM_RADIX_COUNT; i++) {
		tmp = vm_radix_node_load(&rnode->rn_child[i], UNSERIALIZED);
		if (tmp != NULL)
			db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
			    i, (void *)tmp,
			    vm_radix_isleaf(tmp) ? vm_radix_topage(tmp) : NULL,
			    rnode->rn_clev);
	}
}
#endif /* DDB */