/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013 EMC Corp.
 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

/*
 * Path-compressed radix trie implementation.
 * The following code is not generalized into a general purpose library
 * because there are way too many parameters embedded that should really
 * be decided by the library consumers.  At the same time, consumers
 * of this code must achieve the highest possible performance.
 *
 * The implementation takes into account the following rationale:
 * - Size of the nodes should be as small as possible but still big enough
 *   to avoid a large maximum depth for the trie.  This is a balance
 *   between the necessity to not wire too much physical memory for the nodes
 *   and the necessity to avoid too much cache pollution during the trie
 *   operations.
 * - There is not a huge bias toward the number of lookup operations over
 *   the number of insert and remove operations.  This basically implies
 *   that optimizations supposedly helping one operation but hurting the
 *   other must be carefully evaluated.
 * - On average not many nodes are expected to be fully populated, hence
 *   level compression may just complicate things.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/smr.h>
#include <sys/smr_types.h>

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_radix.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

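/*
 * Illustrative usage sketch (not part of the original code): a consumer such
 * as the VM object layer embeds a struct vm_radix and, with the appropriate
 * lock held, pairs calls along the lines of
 *
 *        vm_radix_insert(&object->rtree, m);
 *        m = vm_radix_lookup(&object->rtree, pindex);
 *        vm_radix_remove(&object->rtree, pindex);
 *
 * while vm_radix_lookup_unlocked() may be used without the lock, relying on
 * SMR to keep nodes stable.  The field name "rtree" is only an example.
 */
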
/*
 * These widths should allow the pointers to a node's children to fit within
 * a single cache line.  The extra levels from a narrow width should not be
 * a problem thanks to path compression.
 */
#ifdef __LP64__
#define VM_RADIX_WIDTH  4
#else
#define VM_RADIX_WIDTH  3
#endif

#define VM_RADIX_COUNT  (1 << VM_RADIX_WIDTH)
#define VM_RADIX_MASK   (VM_RADIX_COUNT - 1)
#define VM_RADIX_LIMIT                                                  \
        (howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1)

/* Flag bits stored in node pointers. */
#define VM_RADIX_ISLEAF 0x1
#define VM_RADIX_FLAGS  0x1
#define VM_RADIX_PAD    VM_RADIX_FLAGS

/* Returns one unit associated with specified level. */
#define VM_RADIX_UNITLEVEL(lev)                                         \
        ((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH))

enum vm_radix_access { SMR, LOCKED, UNSERIALIZED };

struct vm_radix_node;
typedef SMR_POINTER(struct vm_radix_node *) smrnode_t;

struct vm_radix_node {
        vm_pindex_t     rn_owner;                       /* Owner of record. */
        uint16_t        rn_count;                       /* Valid children. */
        uint8_t         rn_clev;                        /* Current level. */
        int8_t          rn_last;                        /* zero last ptr. */
        smrnode_t       rn_child[VM_RADIX_COUNT];       /* Child nodes. */
};

static uma_zone_t vm_radix_node_zone;
static smr_t vm_radix_smr;

static void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
    enum vm_radix_access access);

/*
 * Return the position in the array for a given level.
 */
static __inline int
vm_radix_slot(vm_pindex_t index, uint16_t level)
{
        return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK);
}

/* Computes the key (index) with the low-order 'level' radix-digits zeroed. */
static __inline vm_pindex_t
vm_radix_trimkey(vm_pindex_t index, uint16_t level)
{
        return (index & -VM_RADIX_UNITLEVEL(level));
}

/*
 * Allocate a radix node.
 */
static struct vm_radix_node *
vm_radix_node_get(vm_pindex_t index, uint16_t clevel)
{
        struct vm_radix_node *rnode;

        rnode = uma_zalloc_smr(vm_radix_node_zone, M_NOWAIT);
        if (rnode == NULL)
                return (NULL);

        /*
         * We want to clear the last child pointer after the final section
         * has exited so lookup can not return false negatives.  It is done
         * here because it will be cache-cold in the dtor callback.
         */
        if (rnode->rn_last != 0) {
                vm_radix_node_store(&rnode->rn_child[rnode->rn_last - 1],
                    NULL, UNSERIALIZED);
                rnode->rn_last = 0;
        }
        rnode->rn_owner = vm_radix_trimkey(index, clevel + 1);
        rnode->rn_count = 2;
        rnode->rn_clev = clevel;
        return (rnode);
}

/*
 * Free radix node.
 */
static __inline void
vm_radix_node_put(struct vm_radix_node *rnode, int8_t last)
{
#ifdef INVARIANTS
        int slot;

        KASSERT(rnode->rn_count == 0,
            ("vm_radix_node_put: rnode %p has %d children", rnode,
            rnode->rn_count));
        for (slot = 0; slot < VM_RADIX_COUNT; slot++) {
                if (slot == last)
                        continue;
                KASSERT(smr_unserialized_load(&rnode->rn_child[slot], true) ==
                    NULL, ("vm_radix_node_put: rnode %p has a child", rnode));
        }
#endif
        /* Off by one so a freshly zero'd node is not assigned to. */
        rnode->rn_last = last + 1;
        uma_zfree_smr(vm_radix_node_zone, rnode);
}

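/*
 * Note that vm_radix_node_put() leaves the last remaining child pointer
 * intact and only records its slot in rn_last, so that SMR readers still
 * traversing the freed node do not see the pointer go NULL prematurely;
 * vm_radix_node_get() clears that stale slot before the node is handed
 * out again.
 */
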
/*
 * Fetch a node pointer from a slot in another node.
 */
static __inline struct vm_radix_node *
vm_radix_node_load(smrnode_t *p, enum vm_radix_access access)
{

        switch (access) {
        case UNSERIALIZED:
                return (smr_unserialized_load(p, true));
        case LOCKED:
                return (smr_serialized_load(p, true));
        case SMR:
                return (smr_entered_load(p, vm_radix_smr));
        }
        __assert_unreachable();
}

static __inline void
vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
    enum vm_radix_access access)
{

        switch (access) {
        case UNSERIALIZED:
                smr_unserialized_store(p, v, true);
                break;
        case LOCKED:
                smr_serialized_store(p, v, true);
                break;
        case SMR:
                panic("vm_radix_node_store: Not supported in smr section.");
        }
}

/*
 * Get the root node for a radix tree.
 */
static __inline struct vm_radix_node *
vm_radix_root_load(struct vm_radix *rtree, enum vm_radix_access access)
{

        return (vm_radix_node_load((smrnode_t *)&rtree->rt_root, access));
}

/*
 * Set the root node for a radix tree.
 */
static __inline void
vm_radix_root_store(struct vm_radix *rtree, struct vm_radix_node *rnode,
    enum vm_radix_access access)
{

        vm_radix_node_store((smrnode_t *)&rtree->rt_root, rnode, access);
}

/*
 * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
 */
static __inline bool
vm_radix_isleaf(struct vm_radix_node *rnode)
{

        return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
}

/*
 * Returns page cast to radix node with leaf bit set.
 */
static __inline struct vm_radix_node *
vm_radix_toleaf(vm_page_t page)
{
        return ((struct vm_radix_node *)((uintptr_t)page | VM_RADIX_ISLEAF));
}

/*
 * Returns the associated page extracted from rnode.
 */
static __inline vm_page_t
vm_radix_topage(struct vm_radix_node *rnode)
{

        return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
}

/*
 * Adds the page as a child of the provided node.
 */
static __inline void
vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
    vm_page_t page, enum vm_radix_access access)
{
        int slot;

        slot = vm_radix_slot(index, clev);
        vm_radix_node_store(&rnode->rn_child[slot],
            vm_radix_toleaf(page), access);
}

/*
 * Returns the level where two keys differ.
 * It cannot accept 2 equal keys.
 */
static __inline uint16_t
vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
{

        KASSERT(index1 != index2, ("%s: passing the same key value %jx",
            __func__, (uintmax_t)index1));
        CTASSERT(sizeof(long long) >= sizeof(vm_pindex_t));

        /*
         * From the highest-order bit where the indexes differ,
         * compute the highest level in the trie where they differ.
         */
        return ((flsll(index1 ^ index2) - 1) / VM_RADIX_WIDTH);
}

/*
 * Returns TRUE if it can be determined that key does not belong to the
 * specified rnode.  Otherwise, returns FALSE.
 */
static __inline bool
vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
{

        if (rnode->rn_clev < VM_RADIX_LIMIT) {
                idx = vm_radix_trimkey(idx, rnode->rn_clev + 1);
                return (idx != rnode->rn_owner);
        }
        return (false);
}

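/*
 * Worked example, assuming VM_RADIX_WIDTH == 4: for the keys 0x1234 and
 * 0x1278, index1 ^ index2 == 0x004c, flsll() returns 7, and
 * vm_radix_keydiff() yields (7 - 1) / 4 == 1, i.e. the keys first differ at
 * level 1 (the second-lowest 4-bit digit).  A node created at that level
 * owns the prefix 0x1200, and vm_radix_keybarr() reports a mismatch for any
 * key whose trimmed prefix differs from that owner.
 */
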
/*
 * Internal helper for vm_radix_reclaim_allnodes().
 * This function is recursive.
 */
static void
vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
{
        struct vm_radix_node *child;
        int slot;

        KASSERT(rnode->rn_count <= VM_RADIX_COUNT,
            ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode));
        for (slot = 0; rnode->rn_count != 0; slot++) {
                child = vm_radix_node_load(&rnode->rn_child[slot],
                    UNSERIALIZED);
                if (child == NULL)
                        continue;
                if (!vm_radix_isleaf(child))
                        vm_radix_reclaim_allnodes_int(child);
                vm_radix_node_store(&rnode->rn_child[slot], NULL, UNSERIALIZED);
                rnode->rn_count--;
        }
        vm_radix_node_put(rnode, -1);
}

#ifndef UMA_MD_SMALL_ALLOC
void vm_radix_reserve_kva(void);
/*
 * Reserve the KVA necessary to satisfy the node allocation.
 * This is mandatory on architectures without a direct map because otherwise
 * every node allocation would have to carve into the kernel maps, resulting
 * in deadlocks for consumers already working with the kernel maps.
 */
void
vm_radix_reserve_kva(void)
{

        /*
         * Calculate the number of reserved nodes, discounting the pages that
         * are needed to store them.
         */
        if (!uma_zone_reserve_kva(vm_radix_node_zone,
            ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
            sizeof(struct vm_radix_node))))
                panic("%s: unable to reserve KVA", __func__);
}
#endif

/*
 * Initialize the UMA slab zone.
 */
void
vm_radix_zinit(void)
{

        vm_radix_node_zone = uma_zcreate("RADIX NODE",
            sizeof(struct vm_radix_node), NULL, NULL, NULL, NULL,
            VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_SMR | UMA_ZONE_ZINIT);
        vm_radix_smr = uma_zone_get_smr(vm_radix_node_zone);
}

/*
 * Inserts the key-value pair into the trie.
 * Panics if the key already exists.
 */
int
vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
{
        vm_pindex_t index, newind;
        struct vm_radix_node *rnode, *tmp;
        smrnode_t *parentp;
        vm_page_t m;
        int slot;
        uint16_t clev;

        index = page->pindex;

        /*
         * The owner of record for root is not really important because it
         * will never be used.
         */
        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL) {
                rtree->rt_root = (uintptr_t)vm_radix_toleaf(page);
                return (0);
        }
        parentp = (smrnode_t *)&rtree->rt_root;
        for (;;) {
                if (vm_radix_isleaf(rnode)) {
                        m = vm_radix_topage(rnode);
                        if (m->pindex == index)
                                panic("%s: key %jx is already present",
                                    __func__, (uintmax_t)index);
                        clev = vm_radix_keydiff(m->pindex, index);
                        tmp = vm_radix_node_get(index, clev);
                        if (tmp == NULL)
                                return (ENOMEM);
                        /* These writes are not yet visible due to ordering. */
                        vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
                        vm_radix_addpage(tmp, m->pindex, clev, m, UNSERIALIZED);
                        /* Synchronize to make leaf visible. */
                        vm_radix_node_store(parentp, tmp, LOCKED);
                        return (0);
                } else if (vm_radix_keybarr(rnode, index))
                        break;
                slot = vm_radix_slot(index, rnode->rn_clev);
                parentp = &rnode->rn_child[slot];
                tmp = vm_radix_node_load(parentp, LOCKED);
                if (tmp == NULL) {
                        rnode->rn_count++;
                        vm_radix_addpage(rnode, index, rnode->rn_clev, page,
                            LOCKED);
                        return (0);
                }
                rnode = tmp;
        }

        /*
         * A new node is needed because the right insertion level is reached.
         * Set up the new intermediate node and add the two children: the
         * new object and the older edge.
         */
        newind = rnode->rn_owner;
        clev = vm_radix_keydiff(newind, index);
        tmp = vm_radix_node_get(index, clev);
        if (tmp == NULL)
                return (ENOMEM);
        slot = vm_radix_slot(newind, clev);
        /* These writes are not yet visible due to ordering. */
        vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
        vm_radix_node_store(&tmp->rn_child[slot], rnode, UNSERIALIZED);
        /* Serializing write to make the above visible. */
        vm_radix_node_store(parentp, tmp, LOCKED);

        return (0);
}

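/*
 * In both insertion paths above, the children of a newly allocated node are
 * filled in with UNSERIALIZED stores and the node is published afterwards
 * with a single serializing (LOCKED) store to the parent slot, so unlocked
 * SMR lookups never observe a partially initialized node.
 */
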
/*
 * Returns the value stored at the index.  If the index is not present,
 * NULL is returned.
 */
static __always_inline vm_page_t
_vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index,
    enum vm_radix_access access)
{
        struct vm_radix_node *rnode;
        vm_page_t m;
        int slot;

        rnode = vm_radix_root_load(rtree, access);
        while (rnode != NULL) {
                if (vm_radix_isleaf(rnode)) {
                        m = vm_radix_topage(rnode);
                        if (m->pindex == index)
                                return (m);
                        break;
                }
                if (vm_radix_keybarr(rnode, index))
                        break;
                slot = vm_radix_slot(index, rnode->rn_clev);
                rnode = vm_radix_node_load(&rnode->rn_child[slot], access);
        }
        return (NULL);
}

/*
 * Returns the value stored at the index assuming there is an external lock.
 *
 * If the index is not present, NULL is returned.
 */
vm_page_t
vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
{

        return (_vm_radix_lookup(rtree, index, LOCKED));
}

/*
 * Returns the value stored at the index without requiring an external lock.
 *
 * If the index is not present, NULL is returned.
 */
vm_page_t
vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index)
{
        vm_page_t m;

        smr_enter(vm_radix_smr);
        m = _vm_radix_lookup(rtree, index, SMR);
        smr_exit(vm_radix_smr);

        return (m);
}

/*
 * Look up the nearest entry at a position greater than or equal to index.
 */
vm_page_t
vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
{
        struct vm_radix_node *stack[VM_RADIX_LIMIT];
        vm_pindex_t inc;
        vm_page_t m;
        struct vm_radix_node *child, *rnode;
#ifdef INVARIANTS
        int loops = 0;
#endif
        int slot, tos;

        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL)
                return (NULL);
        else if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                if (m->pindex >= index)
                        return (m);
                else
                        return (NULL);
        }
        tos = 0;
        for (;;) {
                /*
                 * If the keys differ before the current bisection node,
                 * then the search key might roll back to the earliest
                 * available bisection node or to the smallest key
                 * in the current node (if the owner is greater than the
                 * search key).
                 */
                if (vm_radix_keybarr(rnode, index)) {
                        if (index > rnode->rn_owner) {
ascend:
                                KASSERT(++loops < 1000,
                                    ("vm_radix_lookup_ge: too many loops"));

                                /*
                                 * Pop nodes from the stack until either the
                                 * stack is empty or a node that could have a
                                 * matching descendant is found.
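                                 * Such a node is one where index's slot is
                                 * not the last one, so a later slot may
                                 * still hold a key greater than index.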
                                 */
                                do {
                                        if (tos == 0)
                                                return (NULL);
                                        rnode = stack[--tos];
                                } while (vm_radix_slot(index,
                                    rnode->rn_clev) == (VM_RADIX_COUNT - 1));

                                /*
                                 * The following computation cannot overflow
                                 * because index's slot at the current level
                                 * is less than VM_RADIX_COUNT - 1.
                                 */
                                index = vm_radix_trimkey(index,
                                    rnode->rn_clev);
                                index += VM_RADIX_UNITLEVEL(rnode->rn_clev);
                        } else
                                index = rnode->rn_owner;
                        KASSERT(!vm_radix_keybarr(rnode, index),
                            ("vm_radix_lookup_ge: keybarr failed"));
                }
                slot = vm_radix_slot(index, rnode->rn_clev);
                child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(child)) {
                        m = vm_radix_topage(child);
                        if (m->pindex >= index)
                                return (m);
                } else if (child != NULL)
                        goto descend;

                /*
                 * Look for an available edge or page within the current
                 * bisection node.
                 */
                if (slot < (VM_RADIX_COUNT - 1)) {
                        inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
                        index = vm_radix_trimkey(index, rnode->rn_clev);
                        do {
                                index += inc;
                                slot++;
                                child = vm_radix_node_load(&rnode->rn_child[slot],
                                    LOCKED);
                                if (vm_radix_isleaf(child)) {
                                        m = vm_radix_topage(child);
                                        KASSERT(m->pindex >= index,
                                            ("vm_radix_lookup_ge: leaf<index"));
                                        return (m);
                                } else if (child != NULL)
                                        goto descend;
                        } while (slot < (VM_RADIX_COUNT - 1));
                }
                KASSERT(child == NULL || vm_radix_isleaf(child),
                    ("vm_radix_lookup_ge: child is radix node"));

                /*
                 * If a page or edge greater than the search slot is not found
                 * in the current node, ascend to the next higher-level node.
                 */
                goto ascend;
descend:
                KASSERT(rnode->rn_clev > 0,
                    ("vm_radix_lookup_ge: pushing leaf's parent"));
                KASSERT(tos < VM_RADIX_LIMIT,
                    ("vm_radix_lookup_ge: stack overflow"));
                stack[tos++] = rnode;
                rnode = child;
        }
}

/*
 * Look up the nearest entry at a position less than or equal to index.
 */
vm_page_t
vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
{
        struct vm_radix_node *stack[VM_RADIX_LIMIT];
        vm_pindex_t inc;
        vm_page_t m;
        struct vm_radix_node *child, *rnode;
#ifdef INVARIANTS
        int loops = 0;
#endif
        int slot, tos;

        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL)
                return (NULL);
        else if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                if (m->pindex <= index)
                        return (m);
                else
                        return (NULL);
        }
        tos = 0;
        for (;;) {
                /*
                 * If the keys differ before the current bisection node,
                 * then the search key might roll back to the earliest
                 * available bisection node or to the largest key
                 * in the current node (if the owner is smaller than the
                 * search key).
                 */
                if (vm_radix_keybarr(rnode, index)) {
                        if (index > rnode->rn_owner) {
                                index = rnode->rn_owner + VM_RADIX_COUNT *
                                    VM_RADIX_UNITLEVEL(rnode->rn_clev);
                        } else {
ascend:
                                KASSERT(++loops < 1000,
                                    ("vm_radix_lookup_le: too many loops"));

                                /*
                                 * Pop nodes from the stack until either the
                                 * stack is empty or a node that could have a
                                 * matching descendant is found.
                                 */
                                do {
                                        if (tos == 0)
                                                return (NULL);
                                        rnode = stack[--tos];
                                } while (vm_radix_slot(index,
                                    rnode->rn_clev) == 0);

                                /*
                                 * The following computation cannot overflow
                                 * because index's slot at the current level
                                 * is greater than 0.
                                 */
                                index = vm_radix_trimkey(index,
                                    rnode->rn_clev);
                        }
                        index--;
                        KASSERT(!vm_radix_keybarr(rnode, index),
                            ("vm_radix_lookup_le: keybarr failed"));
                }
                slot = vm_radix_slot(index, rnode->rn_clev);
                child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(child)) {
                        m = vm_radix_topage(child);
                        if (m->pindex <= index)
                                return (m);
                } else if (child != NULL)
                        goto descend;

                /*
                 * Look for an available edge or page within the current
                 * bisection node.
                 */
                if (slot > 0) {
                        inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
                        index |= inc - 1;
                        do {
                                index -= inc;
                                slot--;
                                child = vm_radix_node_load(&rnode->rn_child[slot],
                                    LOCKED);
                                if (vm_radix_isleaf(child)) {
                                        m = vm_radix_topage(child);
                                        KASSERT(m->pindex <= index,
                                            ("vm_radix_lookup_le: leaf>index"));
                                        return (m);
                                } else if (child != NULL)
                                        goto descend;
                        } while (slot > 0);
                }
                KASSERT(child == NULL || vm_radix_isleaf(child),
                    ("vm_radix_lookup_le: child is radix node"));

                /*
                 * If a page or edge smaller than the search slot is not found
                 * in the current node, ascend to the next higher-level node.
                 */
                goto ascend;
descend:
                KASSERT(rnode->rn_clev > 0,
                    ("vm_radix_lookup_le: pushing leaf's parent"));
                KASSERT(tos < VM_RADIX_LIMIT,
                    ("vm_radix_lookup_le: stack overflow"));
                stack[tos++] = rnode;
                rnode = child;
        }
}

/*
 * Remove the specified index from the trie, and return the value stored at
 * that index.  If the index is not present, return NULL.
 */
vm_page_t
vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
{
        struct vm_radix_node *rnode, *parent, *tmp;
        vm_page_t m;
        int i, slot;

        rnode = vm_radix_root_load(rtree, LOCKED);
        if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                if (m->pindex != index)
                        return (NULL);
                vm_radix_root_store(rtree, NULL, LOCKED);
                return (m);
        }
        parent = NULL;
        for (;;) {
                if (rnode == NULL)
                        return (NULL);
                slot = vm_radix_slot(index, rnode->rn_clev);
                tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(tmp)) {
                        m = vm_radix_topage(tmp);
                        if (m->pindex != index)
                                return (NULL);
                        vm_radix_node_store(
                            &rnode->rn_child[slot], NULL, LOCKED);
                        rnode->rn_count--;
                        if (rnode->rn_count > 1)
                                return (m);
                        for (i = 0; i < VM_RADIX_COUNT; i++) {
                                tmp = vm_radix_node_load(&rnode->rn_child[i],
                                    LOCKED);
                                if (tmp != NULL)
                                        break;
                        }
                        KASSERT(tmp != NULL,
                            ("%s: invalid node configuration", __func__));
                        if (parent == NULL)
                                vm_radix_root_store(rtree, tmp, LOCKED);
                        else {
                                slot = vm_radix_slot(index, parent->rn_clev);
                                KASSERT(vm_radix_node_load(
                                    &parent->rn_child[slot], LOCKED) == rnode,
                                    ("%s: invalid child value", __func__));
                                vm_radix_node_store(&parent->rn_child[slot],
                                    tmp, LOCKED);
                        }
                        /*
                         * The child is still valid and we can not zero the
                         * pointer until all SMR references are gone.
                         */
                        rnode->rn_count--;
                        vm_radix_node_put(rnode, i);
                        return (m);
                }
                parent = rnode;
                rnode = tmp;
        }
}

/*
 * Remove and free all the nodes from the radix tree.
 * This function is recursive, but the recursion depth is bounded because the
 * maximum depth of the tree is fixed.
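 * The root is detached with an UNSERIALIZED store, so callers are expected
 * to prevent concurrent unlocked lookups on a trie that is being reclaimed.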
 */
void
vm_radix_reclaim_allnodes(struct vm_radix *rtree)
{
        struct vm_radix_node *root;

        root = vm_radix_root_load(rtree, LOCKED);
        if (root == NULL)
                return;
        vm_radix_root_store(rtree, NULL, UNSERIALIZED);
        if (!vm_radix_isleaf(root))
                vm_radix_reclaim_allnodes_int(root);
}

/*
 * Replace an existing page in the trie with another one.
 * Panics if there is not an old page in the trie at the new page's index.
 */
vm_page_t
vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage)
{
        struct vm_radix_node *rnode, *tmp;
        vm_page_t m;
        vm_pindex_t index;
        int slot;

        index = newpage->pindex;
        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL)
                panic("%s: replacing page on an empty trie", __func__);
        if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                if (m->pindex != index)
                        panic("%s: original replacing root key not found",
                            __func__);
                rtree->rt_root = (uintptr_t)vm_radix_toleaf(newpage);
                return (m);
        }
        for (;;) {
                slot = vm_radix_slot(index, rnode->rn_clev);
                tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(tmp)) {
                        m = vm_radix_topage(tmp);
                        if (m->pindex != index)
                                break;
                        vm_radix_node_store(&rnode->rn_child[slot],
                            vm_radix_toleaf(newpage), LOCKED);
                        return (m);
                } else if (tmp == NULL || vm_radix_keybarr(tmp, index))
                        break;
                rnode = tmp;
        }
        panic("%s: original replacing page not found", __func__);
}

void
vm_radix_wait(void)
{
        uma_zwait(vm_radix_node_zone);
}

#ifdef DDB
/*
 * Show details about the given radix node.
 */
DB_SHOW_COMMAND(radixnode, db_show_radixnode)
{
        struct vm_radix_node *rnode, *tmp;
        int i;

        if (!have_addr)
                return;
        rnode = (struct vm_radix_node *)addr;
        db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
            (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
            rnode->rn_clev);
        for (i = 0; i < VM_RADIX_COUNT; i++) {
                tmp = vm_radix_node_load(&rnode->rn_child[i], UNSERIALIZED);
                if (tmp != NULL)
                        db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
                            i, (void *)tmp,
                            vm_radix_isleaf(tmp) ? vm_radix_topage(tmp) : NULL,
                            rnode->rn_clev);
        }
}
#endif /* DDB */

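/*
 * Usage note (illustrative, not part of the original code): with DDB
 * compiled in, the command above can be invoked from the debugger prompt,
 * e.g.
 *
 *        db> show radixnode <node address>
 *
 * to dump a node's owner, child count, level and populated slots.
 */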