/*
 * Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Copyright (C) 2005 SGI, Christoph Lameter
 * Copyright (C) 2006 Nick Piggin
 * Copyright (C) 2012 Konstantin Khlebnikov
 * Copyright (C) 2016 Intel, Matthew Wilcox
 * Copyright (C) 2016 Intel, Ross Zwisler
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/radix-tree.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/kmemleak.h>
#include <linux/string.h>
#include <linux/bitops.h>
#include <linux/rcupdate.h>
#include <linux/preempt.h>		/* in_interrupt() */


/* Number of nodes in fully populated tree of given height */
static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;

/*
 * Radix tree node cache.
 */
static struct kmem_cache *radix_tree_node_cachep;

/*
 * The radix tree is variable-height, so an insert operation not only has
 * to build the branch to its corresponding item, it also has to build the
 * branch to existing items if the size has to be increased (by
 * radix_tree_extend).
 *
 * The worst case is a zero height tree with just a single item at index 0,
 * and then inserting an item at index ULONG_MAX. This requires 2 new branches
 * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared.
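 *
 * As an illustration of how callers rely on this guarantee (a sketch only;
 * "tree", "tree_lock", "index" and "item" are made-up caller-side names,
 * not part of this file):
 *
 *	error = radix_tree_preload(GFP_KERNEL);
 *	if (error)
 *		return error;
 *	spin_lock(&tree_lock);
 *	error = radix_tree_insert(&tree, index, item);
 *	spin_unlock(&tree_lock);
 *	radix_tree_preload_end();
 *
 * With the usual RADIX_TREE_MAP_SHIFT of 6 and 64-bit indices,
 * RADIX_TREE_MAX_PATH is DIV_ROUND_UP(64, 6) == 11, so the figure below
 * works out to 21 preallocated nodes per CPU.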
58 * Hence: 59 */ 60 #define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1) 61 62 /* 63 * Per-cpu pool of preloaded nodes 64 */ 65 struct radix_tree_preload { 66 unsigned nr; 67 /* nodes->private_data points to next preallocated node */ 68 struct radix_tree_node *nodes; 69 }; 70 static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; 71 72 static inline struct radix_tree_node *entry_to_node(void *ptr) 73 { 74 return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); 75 } 76 77 static inline void *node_to_entry(void *ptr) 78 { 79 return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); 80 } 81 82 #define RADIX_TREE_RETRY node_to_entry(NULL) 83 84 #ifdef CONFIG_RADIX_TREE_MULTIORDER 85 /* Sibling slots point directly to another slot in the same node */ 86 static inline 87 bool is_sibling_entry(const struct radix_tree_node *parent, void *node) 88 { 89 void **ptr = node; 90 return (parent->slots <= ptr) && 91 (ptr < parent->slots + RADIX_TREE_MAP_SIZE); 92 } 93 #else 94 static inline 95 bool is_sibling_entry(const struct radix_tree_node *parent, void *node) 96 { 97 return false; 98 } 99 #endif 100 101 static inline 102 unsigned long get_slot_offset(const struct radix_tree_node *parent, void **slot) 103 { 104 return slot - parent->slots; 105 } 106 107 static unsigned int radix_tree_descend(const struct radix_tree_node *parent, 108 struct radix_tree_node **nodep, unsigned long index) 109 { 110 unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK; 111 void **entry = rcu_dereference_raw(parent->slots[offset]); 112 113 #ifdef CONFIG_RADIX_TREE_MULTIORDER 114 if (radix_tree_is_internal_node(entry)) { 115 if (is_sibling_entry(parent, entry)) { 116 void **sibentry = (void **) entry_to_node(entry); 117 offset = get_slot_offset(parent, sibentry); 118 entry = rcu_dereference_raw(*sibentry); 119 } 120 } 121 #endif 122 123 *nodep = (void *)entry; 124 return offset; 125 } 126 127 static inline gfp_t root_gfp_mask(const struct radix_tree_root *root) 128 { 129 return root->gfp_mask & __GFP_BITS_MASK; 130 } 131 132 static inline void tag_set(struct radix_tree_node *node, unsigned int tag, 133 int offset) 134 { 135 __set_bit(offset, node->tags[tag]); 136 } 137 138 static inline void tag_clear(struct radix_tree_node *node, unsigned int tag, 139 int offset) 140 { 141 __clear_bit(offset, node->tags[tag]); 142 } 143 144 static inline int tag_get(const struct radix_tree_node *node, unsigned int tag, 145 int offset) 146 { 147 return test_bit(offset, node->tags[tag]); 148 } 149 150 static inline void root_tag_set(struct radix_tree_root *root, unsigned tag) 151 { 152 root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT)); 153 } 154 155 static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag) 156 { 157 root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT)); 158 } 159 160 static inline void root_tag_clear_all(struct radix_tree_root *root) 161 { 162 root->gfp_mask &= __GFP_BITS_MASK; 163 } 164 165 static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag) 166 { 167 return (__force int)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT)); 168 } 169 170 static inline unsigned root_tags_get(const struct radix_tree_root *root) 171 { 172 return (__force unsigned)root->gfp_mask >> __GFP_BITS_SHIFT; 173 } 174 175 /* 176 * Returns 1 if any slot in the node has this tag set. 177 * Otherwise returns 0. 
178 */ 179 static inline int any_tag_set(const struct radix_tree_node *node, 180 unsigned int tag) 181 { 182 unsigned idx; 183 for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { 184 if (node->tags[tag][idx]) 185 return 1; 186 } 187 return 0; 188 } 189 190 /** 191 * radix_tree_find_next_bit - find the next set bit in a memory region 192 * 193 * @addr: The address to base the search on 194 * @size: The bitmap size in bits 195 * @offset: The bitnumber to start searching at 196 * 197 * Unrollable variant of find_next_bit() for constant size arrays. 198 * Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero. 199 * Returns next bit offset, or size if nothing found. 200 */ 201 static __always_inline unsigned long 202 radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag, 203 unsigned long offset) 204 { 205 const unsigned long *addr = node->tags[tag]; 206 207 if (offset < RADIX_TREE_MAP_SIZE) { 208 unsigned long tmp; 209 210 addr += offset / BITS_PER_LONG; 211 tmp = *addr >> (offset % BITS_PER_LONG); 212 if (tmp) 213 return __ffs(tmp) + offset; 214 offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); 215 while (offset < RADIX_TREE_MAP_SIZE) { 216 tmp = *++addr; 217 if (tmp) 218 return __ffs(tmp) + offset; 219 offset += BITS_PER_LONG; 220 } 221 } 222 return RADIX_TREE_MAP_SIZE; 223 } 224 225 static unsigned int iter_offset(const struct radix_tree_iter *iter) 226 { 227 return (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK; 228 } 229 230 /* 231 * The maximum index which can be stored in a radix tree 232 */ 233 static inline unsigned long shift_maxindex(unsigned int shift) 234 { 235 return (RADIX_TREE_MAP_SIZE << shift) - 1; 236 } 237 238 static inline unsigned long node_maxindex(const struct radix_tree_node *node) 239 { 240 return shift_maxindex(node->shift); 241 } 242 243 #ifndef __KERNEL__ 244 static void dump_node(struct radix_tree_node *node, unsigned long index) 245 { 246 unsigned long i; 247 248 pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d exceptional %d\n", 249 node, node->offset, index, index | node_maxindex(node), 250 node->parent, 251 node->tags[0][0], node->tags[1][0], node->tags[2][0], 252 node->shift, node->count, node->exceptional); 253 254 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { 255 unsigned long first = index | (i << node->shift); 256 unsigned long last = first | ((1UL << node->shift) - 1); 257 void *entry = node->slots[i]; 258 if (!entry) 259 continue; 260 if (entry == RADIX_TREE_RETRY) { 261 pr_debug("radix retry offset %ld indices %lu-%lu parent %p\n", 262 i, first, last, node); 263 } else if (!radix_tree_is_internal_node(entry)) { 264 pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n", 265 entry, i, first, last, node); 266 } else if (is_sibling_entry(node, entry)) { 267 pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n", 268 entry, i, first, last, node, 269 *(void **)entry_to_node(entry)); 270 } else { 271 dump_node(entry_to_node(entry), first); 272 } 273 } 274 } 275 276 /* For debug */ 277 static void radix_tree_dump(struct radix_tree_root *root) 278 { 279 pr_debug("radix root: %p rnode %p tags %x\n", 280 root, root->rnode, 281 root->gfp_mask >> __GFP_BITS_SHIFT); 282 if (!radix_tree_is_internal_node(root->rnode)) 283 return; 284 dump_node(entry_to_node(root->rnode), 0); 285 } 286 #endif 287 288 /* 289 * This assumes that the caller has performed appropriate preallocation, and 290 * that the caller has pinned this thread of control to the 
current CPU. 291 */ 292 static struct radix_tree_node * 293 radix_tree_node_alloc(struct radix_tree_root *root, 294 struct radix_tree_node *parent, 295 unsigned int shift, unsigned int offset, 296 unsigned int count, unsigned int exceptional) 297 { 298 struct radix_tree_node *ret = NULL; 299 gfp_t gfp_mask = root_gfp_mask(root); 300 301 /* 302 * Preload code isn't irq safe and it doesn't make sense to use 303 * preloading during an interrupt anyway as all the allocations have 304 * to be atomic. So just do normal allocation when in interrupt. 305 */ 306 if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) { 307 struct radix_tree_preload *rtp; 308 309 /* 310 * Even if the caller has preloaded, try to allocate from the 311 * cache first for the new node to get accounted to the memory 312 * cgroup. 313 */ 314 ret = kmem_cache_alloc(radix_tree_node_cachep, 315 gfp_mask | __GFP_NOWARN); 316 if (ret) 317 goto out; 318 319 /* 320 * Provided the caller has preloaded here, we will always 321 * succeed in getting a node here (and never reach 322 * kmem_cache_alloc) 323 */ 324 rtp = this_cpu_ptr(&radix_tree_preloads); 325 if (rtp->nr) { 326 ret = rtp->nodes; 327 rtp->nodes = ret->private_data; 328 ret->private_data = NULL; 329 rtp->nr--; 330 } 331 /* 332 * Update the allocation stack trace as this is more useful 333 * for debugging. 334 */ 335 kmemleak_update_trace(ret); 336 goto out; 337 } 338 ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); 339 out: 340 BUG_ON(radix_tree_is_internal_node(ret)); 341 if (ret) { 342 ret->parent = parent; 343 ret->shift = shift; 344 ret->offset = offset; 345 ret->count = count; 346 ret->exceptional = exceptional; 347 } 348 return ret; 349 } 350 351 static void radix_tree_node_rcu_free(struct rcu_head *head) 352 { 353 struct radix_tree_node *node = 354 container_of(head, struct radix_tree_node, rcu_head); 355 356 /* 357 * Must only free zeroed nodes into the slab. We can be left with 358 * non-NULL entries by radix_tree_free_nodes, so clear the entries 359 * and tags here. 360 */ 361 memset(node->slots, 0, sizeof(node->slots)); 362 memset(node->tags, 0, sizeof(node->tags)); 363 INIT_LIST_HEAD(&node->private_list); 364 365 kmem_cache_free(radix_tree_node_cachep, node); 366 } 367 368 static inline void 369 radix_tree_node_free(struct radix_tree_node *node) 370 { 371 call_rcu(&node->rcu_head, radix_tree_node_rcu_free); 372 } 373 374 /* 375 * Load up this CPU's radix_tree_node buffer with sufficient objects to 376 * ensure that the addition of a single element in the tree cannot fail. On 377 * success, return zero, with preemption disabled. On error, return -ENOMEM 378 * with preemption not disabled. 379 * 380 * To make use of this facility, the radix tree must be initialised without 381 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). 382 */ 383 static int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) 384 { 385 struct radix_tree_preload *rtp; 386 struct radix_tree_node *node; 387 int ret = -ENOMEM; 388 389 /* 390 * Nodes preloaded by one cgroup can be be used by another cgroup, so 391 * they should never be accounted to any particular memory cgroup. 
392 */ 393 gfp_mask &= ~__GFP_ACCOUNT; 394 395 preempt_disable(); 396 rtp = this_cpu_ptr(&radix_tree_preloads); 397 while (rtp->nr < nr) { 398 preempt_enable(); 399 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); 400 if (node == NULL) 401 goto out; 402 preempt_disable(); 403 rtp = this_cpu_ptr(&radix_tree_preloads); 404 if (rtp->nr < nr) { 405 node->private_data = rtp->nodes; 406 rtp->nodes = node; 407 rtp->nr++; 408 } else { 409 kmem_cache_free(radix_tree_node_cachep, node); 410 } 411 } 412 ret = 0; 413 out: 414 return ret; 415 } 416 417 /* 418 * Load up this CPU's radix_tree_node buffer with sufficient objects to 419 * ensure that the addition of a single element in the tree cannot fail. On 420 * success, return zero, with preemption disabled. On error, return -ENOMEM 421 * with preemption not disabled. 422 * 423 * To make use of this facility, the radix tree must be initialised without 424 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). 425 */ 426 int radix_tree_preload(gfp_t gfp_mask) 427 { 428 /* Warn on non-sensical use... */ 429 WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); 430 return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); 431 } 432 EXPORT_SYMBOL(radix_tree_preload); 433 434 /* 435 * The same as above function, except we don't guarantee preloading happens. 436 * We do it, if we decide it helps. On success, return zero with preemption 437 * disabled. On error, return -ENOMEM with preemption not disabled. 438 */ 439 int radix_tree_maybe_preload(gfp_t gfp_mask) 440 { 441 if (gfpflags_allow_blocking(gfp_mask)) 442 return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); 443 /* Preloading doesn't help anything with this gfp mask, skip it */ 444 preempt_disable(); 445 return 0; 446 } 447 EXPORT_SYMBOL(radix_tree_maybe_preload); 448 449 #ifdef CONFIG_RADIX_TREE_MULTIORDER 450 /* 451 * Preload with enough objects to ensure that we can split a single entry 452 * of order @old_order into many entries of size @new_order 453 */ 454 int radix_tree_split_preload(unsigned int old_order, unsigned int new_order, 455 gfp_t gfp_mask) 456 { 457 unsigned top = 1 << (old_order % RADIX_TREE_MAP_SHIFT); 458 unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) - 459 (new_order / RADIX_TREE_MAP_SHIFT); 460 unsigned nr = 0; 461 462 WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); 463 BUG_ON(new_order >= old_order); 464 465 while (layers--) 466 nr = nr * RADIX_TREE_MAP_SIZE + 1; 467 return __radix_tree_preload(gfp_mask, top * nr); 468 } 469 #endif 470 471 /* 472 * The same as function above, but preload number of nodes required to insert 473 * (1 << order) continuous naturally-aligned elements. 474 */ 475 int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order) 476 { 477 unsigned long nr_subtrees; 478 int nr_nodes, subtree_height; 479 480 /* Preloading doesn't help anything with this gfp mask, skip it */ 481 if (!gfpflags_allow_blocking(gfp_mask)) { 482 preempt_disable(); 483 return 0; 484 } 485 486 /* 487 * Calculate number and height of fully populated subtrees it takes to 488 * store (1 << order) elements. 489 */ 490 nr_subtrees = 1 << order; 491 for (subtree_height = 0; nr_subtrees > RADIX_TREE_MAP_SIZE; 492 subtree_height++) 493 nr_subtrees >>= RADIX_TREE_MAP_SHIFT; 494 495 /* 496 * The worst case is zero height tree with a single item at index 0 and 497 * then inserting items starting at ULONG_MAX - (1 << order). 498 * 499 * This requires RADIX_TREE_MAX_PATH nodes to build branch from root to 500 * 0-index item. 
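 *
 * (Worked example, assuming the usual RADIX_TREE_MAP_SHIFT of 6 and 64-bit
 * indices, so RADIX_TREE_MAX_PATH == 11: for order == 9, the 512 elements
 * fit in eight fully populated subtrees of height 1, and the sums below
 * come to 11 + (11 - 1) - 1 + 8 * 1 == 28 preloaded nodes.)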
501 */ 502 nr_nodes = RADIX_TREE_MAX_PATH; 503 504 /* Plus branch to fully populated subtrees. */ 505 nr_nodes += RADIX_TREE_MAX_PATH - subtree_height; 506 507 /* Root node is shared. */ 508 nr_nodes--; 509 510 /* Plus nodes required to build subtrees. */ 511 nr_nodes += nr_subtrees * height_to_maxnodes[subtree_height]; 512 513 return __radix_tree_preload(gfp_mask, nr_nodes); 514 } 515 516 static unsigned radix_tree_load_root(const struct radix_tree_root *root, 517 struct radix_tree_node **nodep, unsigned long *maxindex) 518 { 519 struct radix_tree_node *node = rcu_dereference_raw(root->rnode); 520 521 *nodep = node; 522 523 if (likely(radix_tree_is_internal_node(node))) { 524 node = entry_to_node(node); 525 *maxindex = node_maxindex(node); 526 return node->shift + RADIX_TREE_MAP_SHIFT; 527 } 528 529 *maxindex = 0; 530 return 0; 531 } 532 533 /* 534 * Extend a radix tree so it can store key @index. 535 */ 536 static int radix_tree_extend(struct radix_tree_root *root, 537 unsigned long index, unsigned int shift) 538 { 539 struct radix_tree_node *slot; 540 unsigned int maxshift; 541 int tag; 542 543 /* Figure out what the shift should be. */ 544 maxshift = shift; 545 while (index > shift_maxindex(maxshift)) 546 maxshift += RADIX_TREE_MAP_SHIFT; 547 548 slot = root->rnode; 549 if (!slot) 550 goto out; 551 552 do { 553 struct radix_tree_node *node = radix_tree_node_alloc(root, 554 NULL, shift, 0, 1, 0); 555 if (!node) 556 return -ENOMEM; 557 558 /* Propagate the aggregated tag info into the new root */ 559 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { 560 if (root_tag_get(root, tag)) 561 tag_set(node, tag, 0); 562 } 563 564 BUG_ON(shift > BITS_PER_LONG); 565 if (radix_tree_is_internal_node(slot)) { 566 entry_to_node(slot)->parent = node; 567 } else if (radix_tree_exceptional_entry(slot)) { 568 /* Moving an exceptional root->rnode to a node */ 569 node->exceptional = 1; 570 } 571 node->slots[0] = slot; 572 slot = node_to_entry(node); 573 rcu_assign_pointer(root->rnode, slot); 574 shift += RADIX_TREE_MAP_SHIFT; 575 } while (shift <= maxshift); 576 out: 577 return maxshift + RADIX_TREE_MAP_SHIFT; 578 } 579 580 /** 581 * radix_tree_shrink - shrink radix tree to minimum height 582 * @root radix tree root 583 */ 584 static inline void radix_tree_shrink(struct radix_tree_root *root, 585 radix_tree_update_node_t update_node, 586 void *private) 587 { 588 for (;;) { 589 struct radix_tree_node *node = root->rnode; 590 struct radix_tree_node *child; 591 592 if (!radix_tree_is_internal_node(node)) 593 break; 594 node = entry_to_node(node); 595 596 /* 597 * The candidate node has more than one child, or its child 598 * is not at the leftmost slot, or the child is a multiorder 599 * entry, we cannot shrink. 600 */ 601 if (node->count != 1) 602 break; 603 child = node->slots[0]; 604 if (!child) 605 break; 606 if (!radix_tree_is_internal_node(child) && node->shift) 607 break; 608 609 if (radix_tree_is_internal_node(child)) 610 entry_to_node(child)->parent = NULL; 611 612 /* 613 * We don't need rcu_assign_pointer(), since we are simply 614 * moving the node from one part of the tree to another: if it 615 * was safe to dereference the old pointer to it 616 * (node->slots[0]), it will be safe to dereference the new 617 * one (root->rnode) as far as dependent read barriers go. 618 */ 619 root->rnode = child; 620 621 /* 622 * We have a dilemma here. The node's slot[0] must not be 623 * NULLed in case there are concurrent lookups expecting to 624 * find the item. 
However if this was a bottom-level node, 625 * then it may be subject to the slot pointer being visible 626 * to callers dereferencing it. If item corresponding to 627 * slot[0] is subsequently deleted, these callers would expect 628 * their slot to become empty sooner or later. 629 * 630 * For example, lockless pagecache will look up a slot, deref 631 * the page pointer, and if the page has 0 refcount it means it 632 * was concurrently deleted from pagecache so try the deref 633 * again. Fortunately there is already a requirement for logic 634 * to retry the entire slot lookup -- the indirect pointer 635 * problem (replacing direct root node with an indirect pointer 636 * also results in a stale slot). So tag the slot as indirect 637 * to force callers to retry. 638 */ 639 node->count = 0; 640 if (!radix_tree_is_internal_node(child)) { 641 node->slots[0] = RADIX_TREE_RETRY; 642 if (update_node) 643 update_node(node, private); 644 } 645 646 WARN_ON_ONCE(!list_empty(&node->private_list)); 647 radix_tree_node_free(node); 648 } 649 } 650 651 static void delete_node(struct radix_tree_root *root, 652 struct radix_tree_node *node, 653 radix_tree_update_node_t update_node, void *private) 654 { 655 do { 656 struct radix_tree_node *parent; 657 658 if (node->count) { 659 if (node == entry_to_node(root->rnode)) 660 radix_tree_shrink(root, update_node, private); 661 return; 662 } 663 664 parent = node->parent; 665 if (parent) { 666 parent->slots[node->offset] = NULL; 667 parent->count--; 668 } else { 669 root_tag_clear_all(root); 670 root->rnode = NULL; 671 } 672 673 WARN_ON_ONCE(!list_empty(&node->private_list)); 674 radix_tree_node_free(node); 675 676 node = parent; 677 } while (node); 678 } 679 680 /** 681 * __radix_tree_create - create a slot in a radix tree 682 * @root: radix tree root 683 * @index: index key 684 * @order: index occupies 2^order aligned slots 685 * @nodep: returns node 686 * @slotp: returns slot 687 * 688 * Create, if necessary, and return the node and slot for an item 689 * at position @index in the radix tree @root. 690 * 691 * Until there is more than one item in the tree, no nodes are 692 * allocated and @root->rnode is used as a direct slot instead of 693 * pointing to a node, in which case *@nodep will be NULL. 694 * 695 * Returns -ENOMEM, or 0 for success. 696 */ 697 int __radix_tree_create(struct radix_tree_root *root, unsigned long index, 698 unsigned order, struct radix_tree_node **nodep, 699 void ***slotp) 700 { 701 struct radix_tree_node *node = NULL, *child; 702 void **slot = (void **)&root->rnode; 703 unsigned long maxindex; 704 unsigned int shift, offset = 0; 705 unsigned long max = index | ((1UL << order) - 1); 706 707 shift = radix_tree_load_root(root, &child, &maxindex); 708 709 /* Make sure the tree is high enough. */ 710 if (order > 0 && max == ((1UL << order) - 1)) 711 max++; 712 if (max > maxindex) { 713 int error = radix_tree_extend(root, max, shift); 714 if (error < 0) 715 return error; 716 shift = error; 717 child = root->rnode; 718 } 719 720 while (shift > order) { 721 shift -= RADIX_TREE_MAP_SHIFT; 722 if (child == NULL) { 723 /* Have to add a child node. 
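				 *
				 * (Explanatory note: the new node is
				 * allocated empty and only published below
				 * via rcu_assign_pointer(), so concurrent
				 * RCU lookups never observe a
				 * half-initialised level.)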
*/ 724 child = radix_tree_node_alloc(root, node, shift, 725 offset, 0, 0); 726 if (!child) 727 return -ENOMEM; 728 rcu_assign_pointer(*slot, node_to_entry(child)); 729 if (node) 730 node->count++; 731 } else if (!radix_tree_is_internal_node(child)) 732 break; 733 734 /* Go a level down */ 735 node = entry_to_node(child); 736 offset = radix_tree_descend(node, &child, index); 737 slot = &node->slots[offset]; 738 } 739 740 if (nodep) 741 *nodep = node; 742 if (slotp) 743 *slotp = slot; 744 return 0; 745 } 746 747 #ifdef CONFIG_RADIX_TREE_MULTIORDER 748 /* 749 * Free any nodes below this node. The tree is presumed to not need 750 * shrinking, and any user data in the tree is presumed to not need a 751 * destructor called on it. If we need to add a destructor, we can 752 * add that functionality later. Note that we may not clear tags or 753 * slots from the tree as an RCU walker may still have a pointer into 754 * this subtree. We could replace the entries with RADIX_TREE_RETRY, 755 * but we'll still have to clear those in rcu_free. 756 */ 757 static void radix_tree_free_nodes(struct radix_tree_node *node) 758 { 759 unsigned offset = 0; 760 struct radix_tree_node *child = entry_to_node(node); 761 762 for (;;) { 763 void *entry = child->slots[offset]; 764 if (radix_tree_is_internal_node(entry) && 765 !is_sibling_entry(child, entry)) { 766 child = entry_to_node(entry); 767 offset = 0; 768 continue; 769 } 770 offset++; 771 while (offset == RADIX_TREE_MAP_SIZE) { 772 struct radix_tree_node *old = child; 773 offset = child->offset + 1; 774 child = child->parent; 775 WARN_ON_ONCE(!list_empty(&old->private_list)); 776 radix_tree_node_free(old); 777 if (old == entry_to_node(node)) 778 return; 779 } 780 } 781 } 782 783 static inline int insert_entries(struct radix_tree_node *node, void **slot, 784 void *item, unsigned order, bool replace) 785 { 786 struct radix_tree_node *child; 787 unsigned i, n, tag, offset, tags = 0; 788 789 if (node) { 790 if (order > node->shift) 791 n = 1 << (order - node->shift); 792 else 793 n = 1; 794 offset = get_slot_offset(node, slot); 795 } else { 796 n = 1; 797 offset = 0; 798 } 799 800 if (n > 1) { 801 offset = offset & ~(n - 1); 802 slot = &node->slots[offset]; 803 } 804 child = node_to_entry(slot); 805 806 for (i = 0; i < n; i++) { 807 if (slot[i]) { 808 if (replace) { 809 node->count--; 810 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) 811 if (tag_get(node, tag, offset + i)) 812 tags |= 1 << tag; 813 } else 814 return -EEXIST; 815 } 816 } 817 818 for (i = 0; i < n; i++) { 819 struct radix_tree_node *old = slot[i]; 820 if (i) { 821 rcu_assign_pointer(slot[i], child); 822 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) 823 if (tags & (1 << tag)) 824 tag_clear(node, tag, offset + i); 825 } else { 826 rcu_assign_pointer(slot[i], item); 827 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) 828 if (tags & (1 << tag)) 829 tag_set(node, tag, offset); 830 } 831 if (radix_tree_is_internal_node(old) && 832 !is_sibling_entry(node, old) && 833 (old != RADIX_TREE_RETRY)) 834 radix_tree_free_nodes(old); 835 if (radix_tree_exceptional_entry(old)) 836 node->exceptional--; 837 } 838 if (node) { 839 node->count += n; 840 if (radix_tree_exceptional_entry(item)) 841 node->exceptional += n; 842 } 843 return n; 844 } 845 #else 846 static inline int insert_entries(struct radix_tree_node *node, void **slot, 847 void *item, unsigned order, bool replace) 848 { 849 if (*slot) 850 return -EEXIST; 851 rcu_assign_pointer(*slot, item); 852 if (node) { 853 node->count++; 854 if 
(radix_tree_exceptional_entry(item)) 855 node->exceptional++; 856 } 857 return 1; 858 } 859 #endif 860 861 /** 862 * __radix_tree_insert - insert into a radix tree 863 * @root: radix tree root 864 * @index: index key 865 * @order: key covers the 2^order indices around index 866 * @item: item to insert 867 * 868 * Insert an item into the radix tree at position @index. 869 */ 870 int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, 871 unsigned order, void *item) 872 { 873 struct radix_tree_node *node; 874 void **slot; 875 int error; 876 877 BUG_ON(radix_tree_is_internal_node(item)); 878 879 error = __radix_tree_create(root, index, order, &node, &slot); 880 if (error) 881 return error; 882 883 error = insert_entries(node, slot, item, order, false); 884 if (error < 0) 885 return error; 886 887 if (node) { 888 unsigned offset = get_slot_offset(node, slot); 889 BUG_ON(tag_get(node, 0, offset)); 890 BUG_ON(tag_get(node, 1, offset)); 891 BUG_ON(tag_get(node, 2, offset)); 892 } else { 893 BUG_ON(root_tags_get(root)); 894 } 895 896 return 0; 897 } 898 EXPORT_SYMBOL(__radix_tree_insert); 899 900 /** 901 * __radix_tree_lookup - lookup an item in a radix tree 902 * @root: radix tree root 903 * @index: index key 904 * @nodep: returns node 905 * @slotp: returns slot 906 * 907 * Lookup and return the item at position @index in the radix 908 * tree @root. 909 * 910 * Until there is more than one item in the tree, no nodes are 911 * allocated and @root->rnode is used as a direct slot instead of 912 * pointing to a node, in which case *@nodep will be NULL. 913 */ 914 void *__radix_tree_lookup(const struct radix_tree_root *root, 915 unsigned long index, struct radix_tree_node **nodep, 916 void ***slotp) 917 { 918 struct radix_tree_node *node, *parent; 919 unsigned long maxindex; 920 void **slot; 921 922 restart: 923 parent = NULL; 924 slot = (void **)&root->rnode; 925 radix_tree_load_root(root, &node, &maxindex); 926 if (index > maxindex) 927 return NULL; 928 929 while (radix_tree_is_internal_node(node)) { 930 unsigned offset; 931 932 if (node == RADIX_TREE_RETRY) 933 goto restart; 934 parent = entry_to_node(node); 935 offset = radix_tree_descend(parent, &node, index); 936 slot = parent->slots + offset; 937 } 938 939 if (nodep) 940 *nodep = parent; 941 if (slotp) 942 *slotp = slot; 943 return node; 944 } 945 946 /** 947 * radix_tree_lookup_slot - lookup a slot in a radix tree 948 * @root: radix tree root 949 * @index: index key 950 * 951 * Returns: the slot corresponding to the position @index in the 952 * radix tree @root. This is useful for update-if-exists operations. 953 * 954 * This function can be called under rcu_read_lock iff the slot is not 955 * modified by radix_tree_replace_slot, otherwise it must be called 956 * exclusive from other writers. Any dereference of the slot must be done 957 * using radix_tree_deref_slot. 958 */ 959 void **radix_tree_lookup_slot(const struct radix_tree_root *root, 960 unsigned long index) 961 { 962 void **slot; 963 964 if (!__radix_tree_lookup(root, index, NULL, &slot)) 965 return NULL; 966 return slot; 967 } 968 EXPORT_SYMBOL(radix_tree_lookup_slot); 969 970 /** 971 * radix_tree_lookup - perform lookup operation on a radix tree 972 * @root: radix tree root 973 * @index: index key 974 * 975 * Lookup the item at the position @index in the radix tree @root. 976 * 977 * This function can be called under rcu_read_lock, however the caller 978 * must manage lifetimes of leaf nodes (eg. RCU may also be used to free 979 * them safely). 
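 * An illustrative sketch of such a lookup ("tree", "index" and
 * "do_something" are made-up caller-side names, not part of this file):
 *
 *	rcu_read_lock();
 *	item = radix_tree_lookup(&tree, index);
 *	if (item)
 *		do_something(item);
 *	rcu_read_unlock();
 *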
No RCU barriers are required to access or modify the 980 * returned item, however. 981 */ 982 void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index) 983 { 984 return __radix_tree_lookup(root, index, NULL, NULL); 985 } 986 EXPORT_SYMBOL(radix_tree_lookup); 987 988 static inline int slot_count(struct radix_tree_node *node, 989 void **slot) 990 { 991 int n = 1; 992 #ifdef CONFIG_RADIX_TREE_MULTIORDER 993 void *ptr = node_to_entry(slot); 994 unsigned offset = get_slot_offset(node, slot); 995 int i; 996 997 for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { 998 if (node->slots[offset + i] != ptr) 999 break; 1000 n++; 1001 } 1002 #endif 1003 return n; 1004 } 1005 1006 static void replace_slot(struct radix_tree_root *root, 1007 struct radix_tree_node *node, 1008 void **slot, void *item, 1009 bool warn_typeswitch) 1010 { 1011 void *old = rcu_dereference_raw(*slot); 1012 int count, exceptional; 1013 1014 WARN_ON_ONCE(radix_tree_is_internal_node(item)); 1015 1016 count = !!item - !!old; 1017 exceptional = !!radix_tree_exceptional_entry(item) - 1018 !!radix_tree_exceptional_entry(old); 1019 1020 WARN_ON_ONCE(warn_typeswitch && (count || exceptional)); 1021 1022 if (node) { 1023 node->count += count; 1024 if (exceptional) { 1025 exceptional *= slot_count(node, slot); 1026 node->exceptional += exceptional; 1027 } 1028 } 1029 1030 rcu_assign_pointer(*slot, item); 1031 } 1032 1033 static inline void delete_sibling_entries(struct radix_tree_node *node, 1034 void **slot) 1035 { 1036 #ifdef CONFIG_RADIX_TREE_MULTIORDER 1037 bool exceptional = radix_tree_exceptional_entry(*slot); 1038 void *ptr = node_to_entry(slot); 1039 unsigned offset = get_slot_offset(node, slot); 1040 int i; 1041 1042 for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { 1043 if (node->slots[offset + i] != ptr) 1044 break; 1045 node->slots[offset + i] = NULL; 1046 node->count--; 1047 if (exceptional) 1048 node->exceptional--; 1049 } 1050 #endif 1051 } 1052 1053 /** 1054 * __radix_tree_replace - replace item in a slot 1055 * @root: radix tree root 1056 * @node: pointer to tree node 1057 * @slot: pointer to slot in @node 1058 * @item: new item to store in the slot. 1059 * @update_node: callback for changing leaf nodes 1060 * @private: private data to pass to @update_node 1061 * 1062 * For use with __radix_tree_lookup(). Caller must hold tree write locked 1063 * across slot lookup and replacement. 1064 */ 1065 void __radix_tree_replace(struct radix_tree_root *root, 1066 struct radix_tree_node *node, 1067 void **slot, void *item, 1068 radix_tree_update_node_t update_node, void *private) 1069 { 1070 if (!item) 1071 delete_sibling_entries(node, slot); 1072 /* 1073 * This function supports replacing exceptional entries and 1074 * deleting entries, but that needs accounting against the 1075 * node unless the slot is root->rnode. 1076 */ 1077 replace_slot(root, node, slot, item, 1078 !node && slot != (void **)&root->rnode); 1079 1080 if (!node) 1081 return; 1082 1083 if (update_node) 1084 update_node(node, private); 1085 1086 delete_node(root, node, update_node, private); 1087 } 1088 1089 /** 1090 * radix_tree_replace_slot - replace item in a slot 1091 * @root: radix tree root 1092 * @slot: pointer to slot 1093 * @item: new item to store in the slot. 1094 * 1095 * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(), 1096 * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked 1097 * across slot lookup and replacement. 
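 *
 * An illustrative sketch of that pattern ("tree", "tree_lock", "index" and
 * "new_item" are made-up caller-side names, not part of this file):
 *
 *	spin_lock(&tree_lock);
 *	slot = radix_tree_lookup_slot(&tree, index);
 *	if (slot)
 *		radix_tree_replace_slot(&tree, slot, new_item);
 *	spin_unlock(&tree_lock);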
1098 * 1099 * NOTE: This cannot be used to switch between non-entries (empty slots), 1100 * regular entries, and exceptional entries, as that requires accounting 1101 * inside the radix tree node. When switching from one type of entry or 1102 * deleting, use __radix_tree_lookup() and __radix_tree_replace() or 1103 * radix_tree_iter_replace(). 1104 */ 1105 void radix_tree_replace_slot(struct radix_tree_root *root, 1106 void **slot, void *item) 1107 { 1108 replace_slot(root, NULL, slot, item, true); 1109 } 1110 1111 /** 1112 * radix_tree_iter_replace - replace item in a slot 1113 * @root: radix tree root 1114 * @slot: pointer to slot 1115 * @item: new item to store in the slot. 1116 * 1117 * For use with radix_tree_split() and radix_tree_for_each_slot(). 1118 * Caller must hold tree write locked across split and replacement. 1119 */ 1120 void radix_tree_iter_replace(struct radix_tree_root *root, 1121 const struct radix_tree_iter *iter, void **slot, void *item) 1122 { 1123 __radix_tree_replace(root, iter->node, slot, item, NULL, NULL); 1124 } 1125 1126 #ifdef CONFIG_RADIX_TREE_MULTIORDER 1127 /** 1128 * radix_tree_join - replace multiple entries with one multiorder entry 1129 * @root: radix tree root 1130 * @index: an index inside the new entry 1131 * @order: order of the new entry 1132 * @item: new entry 1133 * 1134 * Call this function to replace several entries with one larger entry. 1135 * The existing entries are presumed to not need freeing as a result of 1136 * this call. 1137 * 1138 * The replacement entry will have all the tags set on it that were set 1139 * on any of the entries it is replacing. 1140 */ 1141 int radix_tree_join(struct radix_tree_root *root, unsigned long index, 1142 unsigned order, void *item) 1143 { 1144 struct radix_tree_node *node; 1145 void **slot; 1146 int error; 1147 1148 BUG_ON(radix_tree_is_internal_node(item)); 1149 1150 error = __radix_tree_create(root, index, order, &node, &slot); 1151 if (!error) 1152 error = insert_entries(node, slot, item, order, true); 1153 if (error > 0) 1154 error = 0; 1155 1156 return error; 1157 } 1158 1159 /** 1160 * radix_tree_split - Split an entry into smaller entries 1161 * @root: radix tree root 1162 * @index: An index within the large entry 1163 * @order: Order of new entries 1164 * 1165 * Call this function as the first step in replacing a multiorder entry 1166 * with several entries of lower order. After this function returns, 1167 * loop over the relevant portion of the tree using radix_tree_for_each_slot() 1168 * and call radix_tree_iter_replace() to set up each new entry. 1169 * 1170 * The tags from this entry are replicated to all the new entries. 1171 * 1172 * The radix tree should be locked against modification during the entire 1173 * replacement operation. Lock-free lookups will see RADIX_TREE_RETRY which 1174 * should prompt RCU walkers to restart the lookup from the root. 
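 *
 * An illustrative sketch of that two-step sequence (a sketch only; "tree",
 * "tree_lock", "start", "new_order" and "new_item" are made-up caller-side
 * names, preloading and error handling are omitted, and the caller is
 * assumed to stop once the old entry's index range has been covered):
 *
 *	spin_lock(&tree_lock);
 *	radix_tree_split(&tree, start, new_order);
 *	radix_tree_for_each_slot(slot, &tree, &iter, start) {
 *		radix_tree_iter_replace(&tree, &iter, slot, new_item);
 *	}
 *	spin_unlock(&tree_lock);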
1175 */ 1176 int radix_tree_split(struct radix_tree_root *root, unsigned long index, 1177 unsigned order) 1178 { 1179 struct radix_tree_node *parent, *node, *child; 1180 void **slot; 1181 unsigned int offset, end; 1182 unsigned n, tag, tags = 0; 1183 1184 if (!__radix_tree_lookup(root, index, &parent, &slot)) 1185 return -ENOENT; 1186 if (!parent) 1187 return -ENOENT; 1188 1189 offset = get_slot_offset(parent, slot); 1190 1191 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) 1192 if (tag_get(parent, tag, offset)) 1193 tags |= 1 << tag; 1194 1195 for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) { 1196 if (!is_sibling_entry(parent, parent->slots[end])) 1197 break; 1198 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) 1199 if (tags & (1 << tag)) 1200 tag_set(parent, tag, end); 1201 /* rcu_assign_pointer ensures tags are set before RETRY */ 1202 rcu_assign_pointer(parent->slots[end], RADIX_TREE_RETRY); 1203 } 1204 rcu_assign_pointer(parent->slots[offset], RADIX_TREE_RETRY); 1205 parent->exceptional -= (end - offset); 1206 1207 if (order == parent->shift) 1208 return 0; 1209 if (order > parent->shift) { 1210 while (offset < end) 1211 offset += insert_entries(parent, &parent->slots[offset], 1212 RADIX_TREE_RETRY, order, true); 1213 return 0; 1214 } 1215 1216 node = parent; 1217 1218 for (;;) { 1219 if (node->shift > order) { 1220 child = radix_tree_node_alloc(root, node, 1221 node->shift - RADIX_TREE_MAP_SHIFT, 1222 offset, 0, 0); 1223 if (!child) 1224 goto nomem; 1225 if (node != parent) { 1226 node->count++; 1227 node->slots[offset] = node_to_entry(child); 1228 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) 1229 if (tags & (1 << tag)) 1230 tag_set(node, tag, offset); 1231 } 1232 1233 node = child; 1234 offset = 0; 1235 continue; 1236 } 1237 1238 n = insert_entries(node, &node->slots[offset], 1239 RADIX_TREE_RETRY, order, false); 1240 BUG_ON(n > RADIX_TREE_MAP_SIZE); 1241 1242 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) 1243 if (tags & (1 << tag)) 1244 tag_set(node, tag, offset); 1245 offset += n; 1246 1247 while (offset == RADIX_TREE_MAP_SIZE) { 1248 if (node == parent) 1249 break; 1250 offset = node->offset; 1251 child = node; 1252 node = node->parent; 1253 rcu_assign_pointer(node->slots[offset], 1254 node_to_entry(child)); 1255 offset++; 1256 } 1257 if ((node == parent) && (offset == end)) 1258 return 0; 1259 } 1260 1261 nomem: 1262 /* Shouldn't happen; did user forget to preload? */ 1263 /* TODO: free all the allocated nodes */ 1264 WARN_ON(1); 1265 return -ENOMEM; 1266 } 1267 #endif 1268 1269 /** 1270 * radix_tree_tag_set - set a tag on a radix tree node 1271 * @root: radix tree root 1272 * @index: index key 1273 * @tag: tag index 1274 * 1275 * Set the search tag (which must be < RADIX_TREE_MAX_TAGS) 1276 * corresponding to @index in the radix tree. From 1277 * the root all the way down to the leaf node. 1278 * 1279 * Returns the address of the tagged item. Setting a tag on a not-present 1280 * item is a bug. 
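 *
 * An illustrative sketch ("tree", "tree_lock" and "index" are made-up
 * caller-side names, an entry is assumed to already exist at @index, and
 * tag 0 is used as an example):
 *
 *	spin_lock(&tree_lock);
 *	radix_tree_tag_set(&tree, index, 0);
 *	spin_unlock(&tree_lock);
 *	...
 *	tagged = radix_tree_tag_get(&tree, index, 0);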
1281 */ 1282 void *radix_tree_tag_set(struct radix_tree_root *root, 1283 unsigned long index, unsigned int tag) 1284 { 1285 struct radix_tree_node *node, *parent; 1286 unsigned long maxindex; 1287 1288 radix_tree_load_root(root, &node, &maxindex); 1289 BUG_ON(index > maxindex); 1290 1291 while (radix_tree_is_internal_node(node)) { 1292 unsigned offset; 1293 1294 parent = entry_to_node(node); 1295 offset = radix_tree_descend(parent, &node, index); 1296 BUG_ON(!node); 1297 1298 if (!tag_get(parent, tag, offset)) 1299 tag_set(parent, tag, offset); 1300 } 1301 1302 /* set the root's tag bit */ 1303 if (!root_tag_get(root, tag)) 1304 root_tag_set(root, tag); 1305 1306 return node; 1307 } 1308 EXPORT_SYMBOL(radix_tree_tag_set); 1309 1310 static void node_tag_clear(struct radix_tree_root *root, 1311 struct radix_tree_node *node, 1312 unsigned int tag, unsigned int offset) 1313 { 1314 while (node) { 1315 if (!tag_get(node, tag, offset)) 1316 return; 1317 tag_clear(node, tag, offset); 1318 if (any_tag_set(node, tag)) 1319 return; 1320 1321 offset = node->offset; 1322 node = node->parent; 1323 } 1324 1325 /* clear the root's tag bit */ 1326 if (root_tag_get(root, tag)) 1327 root_tag_clear(root, tag); 1328 } 1329 1330 static void node_tag_set(struct radix_tree_root *root, 1331 struct radix_tree_node *node, 1332 unsigned int tag, unsigned int offset) 1333 { 1334 while (node) { 1335 if (tag_get(node, tag, offset)) 1336 return; 1337 tag_set(node, tag, offset); 1338 offset = node->offset; 1339 node = node->parent; 1340 } 1341 1342 if (!root_tag_get(root, tag)) 1343 root_tag_set(root, tag); 1344 } 1345 1346 /** 1347 * radix_tree_iter_tag_set - set a tag on the current iterator entry 1348 * @root: radix tree root 1349 * @iter: iterator state 1350 * @tag: tag to set 1351 */ 1352 void radix_tree_iter_tag_set(struct radix_tree_root *root, 1353 const struct radix_tree_iter *iter, unsigned int tag) 1354 { 1355 node_tag_set(root, iter->node, tag, iter_offset(iter)); 1356 } 1357 1358 /** 1359 * radix_tree_tag_clear - clear a tag on a radix tree node 1360 * @root: radix tree root 1361 * @index: index key 1362 * @tag: tag index 1363 * 1364 * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS) 1365 * corresponding to @index in the radix tree. If this causes 1366 * the leaf node to have no tags set then clear the tag in the 1367 * next-to-leaf node, etc. 1368 * 1369 * Returns the address of the tagged item on success, else NULL. ie: 1370 * has the same return value and semantics as radix_tree_lookup(). 
1371 */ 1372 void *radix_tree_tag_clear(struct radix_tree_root *root, 1373 unsigned long index, unsigned int tag) 1374 { 1375 struct radix_tree_node *node, *parent; 1376 unsigned long maxindex; 1377 int uninitialized_var(offset); 1378 1379 radix_tree_load_root(root, &node, &maxindex); 1380 if (index > maxindex) 1381 return NULL; 1382 1383 parent = NULL; 1384 1385 while (radix_tree_is_internal_node(node)) { 1386 parent = entry_to_node(node); 1387 offset = radix_tree_descend(parent, &node, index); 1388 } 1389 1390 if (node) 1391 node_tag_clear(root, parent, tag, offset); 1392 1393 return node; 1394 } 1395 EXPORT_SYMBOL(radix_tree_tag_clear); 1396 1397 /** 1398 * radix_tree_tag_get - get a tag on a radix tree node 1399 * @root: radix tree root 1400 * @index: index key 1401 * @tag: tag index (< RADIX_TREE_MAX_TAGS) 1402 * 1403 * Return values: 1404 * 1405 * 0: tag not present or not set 1406 * 1: tag set 1407 * 1408 * Note that the return value of this function may not be relied on, even if 1409 * the RCU lock is held, unless tag modification and node deletion are excluded 1410 * from concurrency. 1411 */ 1412 int radix_tree_tag_get(const struct radix_tree_root *root, 1413 unsigned long index, unsigned int tag) 1414 { 1415 struct radix_tree_node *node, *parent; 1416 unsigned long maxindex; 1417 1418 if (!root_tag_get(root, tag)) 1419 return 0; 1420 1421 radix_tree_load_root(root, &node, &maxindex); 1422 if (index > maxindex) 1423 return 0; 1424 if (node == NULL) 1425 return 0; 1426 1427 while (radix_tree_is_internal_node(node)) { 1428 unsigned offset; 1429 1430 parent = entry_to_node(node); 1431 offset = radix_tree_descend(parent, &node, index); 1432 1433 if (!node) 1434 return 0; 1435 if (!tag_get(parent, tag, offset)) 1436 return 0; 1437 if (node == RADIX_TREE_RETRY) 1438 break; 1439 } 1440 1441 return 1; 1442 } 1443 EXPORT_SYMBOL(radix_tree_tag_get); 1444 1445 static inline void __set_iter_shift(struct radix_tree_iter *iter, 1446 unsigned int shift) 1447 { 1448 #ifdef CONFIG_RADIX_TREE_MULTIORDER 1449 iter->shift = shift; 1450 #endif 1451 } 1452 1453 /* Construct iter->tags bit-mask from node->tags[tag] array */ 1454 static void set_iter_tags(struct radix_tree_iter *iter, 1455 struct radix_tree_node *node, unsigned offset, 1456 unsigned tag) 1457 { 1458 unsigned tag_long = offset / BITS_PER_LONG; 1459 unsigned tag_bit = offset % BITS_PER_LONG; 1460 1461 iter->tags = node->tags[tag][tag_long] >> tag_bit; 1462 1463 /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ 1464 if (tag_long < RADIX_TREE_TAG_LONGS - 1) { 1465 /* Pick tags from next element */ 1466 if (tag_bit) 1467 iter->tags |= node->tags[tag][tag_long + 1] << 1468 (BITS_PER_LONG - tag_bit); 1469 /* Clip chunk size, here only BITS_PER_LONG tags */ 1470 iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG); 1471 } 1472 } 1473 1474 #ifdef CONFIG_RADIX_TREE_MULTIORDER 1475 static void **skip_siblings(struct radix_tree_node **nodep, 1476 void **slot, struct radix_tree_iter *iter) 1477 { 1478 void *sib = node_to_entry(slot - 1); 1479 1480 while (iter->index < iter->next_index) { 1481 *nodep = rcu_dereference_raw(*slot); 1482 if (*nodep && *nodep != sib) 1483 return slot; 1484 slot++; 1485 iter->index = __radix_tree_iter_add(iter, 1); 1486 iter->tags >>= 1; 1487 } 1488 1489 *nodep = NULL; 1490 return NULL; 1491 } 1492 1493 void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, 1494 unsigned flags) 1495 { 1496 unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; 1497 struct radix_tree_node *node = 
rcu_dereference_raw(*slot); 1498 1499 slot = skip_siblings(&node, slot, iter); 1500 1501 while (radix_tree_is_internal_node(node)) { 1502 unsigned offset; 1503 unsigned long next_index; 1504 1505 if (node == RADIX_TREE_RETRY) 1506 return slot; 1507 node = entry_to_node(node); 1508 iter->node = node; 1509 iter->shift = node->shift; 1510 1511 if (flags & RADIX_TREE_ITER_TAGGED) { 1512 offset = radix_tree_find_next_bit(node, tag, 0); 1513 if (offset == RADIX_TREE_MAP_SIZE) 1514 return NULL; 1515 slot = &node->slots[offset]; 1516 iter->index = __radix_tree_iter_add(iter, offset); 1517 set_iter_tags(iter, node, offset, tag); 1518 node = rcu_dereference_raw(*slot); 1519 } else { 1520 offset = 0; 1521 slot = &node->slots[0]; 1522 for (;;) { 1523 node = rcu_dereference_raw(*slot); 1524 if (node) 1525 break; 1526 slot++; 1527 offset++; 1528 if (offset == RADIX_TREE_MAP_SIZE) 1529 return NULL; 1530 } 1531 iter->index = __radix_tree_iter_add(iter, offset); 1532 } 1533 if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0)) 1534 goto none; 1535 next_index = (iter->index | shift_maxindex(iter->shift)) + 1; 1536 if (next_index < iter->next_index) 1537 iter->next_index = next_index; 1538 } 1539 1540 return slot; 1541 none: 1542 iter->next_index = 0; 1543 return NULL; 1544 } 1545 EXPORT_SYMBOL(__radix_tree_next_slot); 1546 #else 1547 static void **skip_siblings(struct radix_tree_node **nodep, 1548 void **slot, struct radix_tree_iter *iter) 1549 { 1550 return slot; 1551 } 1552 #endif 1553 1554 void **radix_tree_iter_resume(void **slot, struct radix_tree_iter *iter) 1555 { 1556 struct radix_tree_node *node; 1557 1558 slot++; 1559 iter->index = __radix_tree_iter_add(iter, 1); 1560 node = rcu_dereference_raw(*slot); 1561 skip_siblings(&node, slot, iter); 1562 iter->next_index = iter->index; 1563 iter->tags = 0; 1564 return NULL; 1565 } 1566 EXPORT_SYMBOL(radix_tree_iter_resume); 1567 1568 /** 1569 * radix_tree_next_chunk - find next chunk of slots for iteration 1570 * 1571 * @root: radix tree root 1572 * @iter: iterator state 1573 * @flags: RADIX_TREE_ITER_* flags and tag index 1574 * Returns: pointer to chunk first slot, or NULL if iteration is over 1575 */ 1576 void **radix_tree_next_chunk(const struct radix_tree_root *root, 1577 struct radix_tree_iter *iter, unsigned flags) 1578 { 1579 unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; 1580 struct radix_tree_node *node, *child; 1581 unsigned long index, offset, maxindex; 1582 1583 if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) 1584 return NULL; 1585 1586 /* 1587 * Catch next_index overflow after ~0UL. iter->index never overflows 1588 * during iterating; it can be zero only at the beginning. 1589 * And we cannot overflow iter->next_index in a single step, 1590 * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. 1591 * 1592 * This condition also used by radix_tree_next_slot() to stop 1593 * contiguous iterating, and forbid switching to the next chunk. 
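	 *
	 * (Illustrative note: once a chunk ending at index ~0UL has been
	 * returned, iter->next_index wraps to 0 while iter->index stays
	 * non-zero, so the test below terminates the walk.)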
1594 */ 1595 index = iter->next_index; 1596 if (!index && iter->index) 1597 return NULL; 1598 1599 restart: 1600 radix_tree_load_root(root, &child, &maxindex); 1601 if (index > maxindex) 1602 return NULL; 1603 if (!child) 1604 return NULL; 1605 1606 if (!radix_tree_is_internal_node(child)) { 1607 /* Single-slot tree */ 1608 iter->index = index; 1609 iter->next_index = maxindex + 1; 1610 iter->tags = 1; 1611 iter->node = NULL; 1612 __set_iter_shift(iter, 0); 1613 return (void **)&root->rnode; 1614 } 1615 1616 do { 1617 node = entry_to_node(child); 1618 offset = radix_tree_descend(node, &child, index); 1619 1620 if ((flags & RADIX_TREE_ITER_TAGGED) ? 1621 !tag_get(node, tag, offset) : !child) { 1622 /* Hole detected */ 1623 if (flags & RADIX_TREE_ITER_CONTIG) 1624 return NULL; 1625 1626 if (flags & RADIX_TREE_ITER_TAGGED) 1627 offset = radix_tree_find_next_bit(node, tag, 1628 offset + 1); 1629 else 1630 while (++offset < RADIX_TREE_MAP_SIZE) { 1631 void *slot = node->slots[offset]; 1632 if (is_sibling_entry(node, slot)) 1633 continue; 1634 if (slot) 1635 break; 1636 } 1637 index &= ~node_maxindex(node); 1638 index += offset << node->shift; 1639 /* Overflow after ~0UL */ 1640 if (!index) 1641 return NULL; 1642 if (offset == RADIX_TREE_MAP_SIZE) 1643 goto restart; 1644 child = rcu_dereference_raw(node->slots[offset]); 1645 } 1646 1647 if (!child) 1648 goto restart; 1649 if (child == RADIX_TREE_RETRY) 1650 break; 1651 } while (radix_tree_is_internal_node(child)); 1652 1653 /* Update the iterator state */ 1654 iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); 1655 iter->next_index = (index | node_maxindex(node)) + 1; 1656 iter->node = node; 1657 __set_iter_shift(iter, node->shift); 1658 1659 if (flags & RADIX_TREE_ITER_TAGGED) 1660 set_iter_tags(iter, node, offset, tag); 1661 1662 return node->slots + offset; 1663 } 1664 EXPORT_SYMBOL(radix_tree_next_chunk); 1665 1666 /** 1667 * radix_tree_gang_lookup - perform multiple lookup on a radix tree 1668 * @root: radix tree root 1669 * @results: where the results of the lookup are placed 1670 * @first_index: start the lookup from this key 1671 * @max_items: place up to this many items at *results 1672 * 1673 * Performs an index-ascending scan of the tree for present items. Places 1674 * them at *@results and returns the number of items which were placed at 1675 * *@results. 1676 * 1677 * The implementation is naive. 1678 * 1679 * Like radix_tree_lookup, radix_tree_gang_lookup may be called under 1680 * rcu_read_lock. In this case, rather than the returned results being 1681 * an atomic snapshot of the tree at a single point in time, the 1682 * semantics of an RCU protected gang lookup are as though multiple 1683 * radix_tree_lookups have been issued in individual locks, and results 1684 * stored in 'results'. 
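 *
 * An illustrative sketch ("tree", "first" and "do_something" are made-up
 * caller-side names; a real caller would normally take its own reference
 * on each item before dropping the RCU read lock):
 *
 *	void *items[16];
 *	unsigned int i, nr;
 *
 *	rcu_read_lock();
 *	nr = radix_tree_gang_lookup(&tree, items, first, ARRAY_SIZE(items));
 *	for (i = 0; i < nr; i++)
 *		do_something(items[i]);
 *	rcu_read_unlock();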
1685 */ 1686 unsigned int 1687 radix_tree_gang_lookup(const struct radix_tree_root *root, void **results, 1688 unsigned long first_index, unsigned int max_items) 1689 { 1690 struct radix_tree_iter iter; 1691 void **slot; 1692 unsigned int ret = 0; 1693 1694 if (unlikely(!max_items)) 1695 return 0; 1696 1697 radix_tree_for_each_slot(slot, root, &iter, first_index) { 1698 results[ret] = rcu_dereference_raw(*slot); 1699 if (!results[ret]) 1700 continue; 1701 if (radix_tree_is_internal_node(results[ret])) { 1702 slot = radix_tree_iter_retry(&iter); 1703 continue; 1704 } 1705 if (++ret == max_items) 1706 break; 1707 } 1708 1709 return ret; 1710 } 1711 EXPORT_SYMBOL(radix_tree_gang_lookup); 1712 1713 /** 1714 * radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree 1715 * @root: radix tree root 1716 * @results: where the results of the lookup are placed 1717 * @indices: where their indices should be placed (but usually NULL) 1718 * @first_index: start the lookup from this key 1719 * @max_items: place up to this many items at *results 1720 * 1721 * Performs an index-ascending scan of the tree for present items. Places 1722 * their slots at *@results and returns the number of items which were 1723 * placed at *@results. 1724 * 1725 * The implementation is naive. 1726 * 1727 * Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must 1728 * be dereferenced with radix_tree_deref_slot, and if using only RCU 1729 * protection, radix_tree_deref_slot may fail requiring a retry. 1730 */ 1731 unsigned int 1732 radix_tree_gang_lookup_slot(const struct radix_tree_root *root, 1733 void ***results, unsigned long *indices, 1734 unsigned long first_index, unsigned int max_items) 1735 { 1736 struct radix_tree_iter iter; 1737 void **slot; 1738 unsigned int ret = 0; 1739 1740 if (unlikely(!max_items)) 1741 return 0; 1742 1743 radix_tree_for_each_slot(slot, root, &iter, first_index) { 1744 results[ret] = slot; 1745 if (indices) 1746 indices[ret] = iter.index; 1747 if (++ret == max_items) 1748 break; 1749 } 1750 1751 return ret; 1752 } 1753 EXPORT_SYMBOL(radix_tree_gang_lookup_slot); 1754 1755 /** 1756 * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree 1757 * based on a tag 1758 * @root: radix tree root 1759 * @results: where the results of the lookup are placed 1760 * @first_index: start the lookup from this key 1761 * @max_items: place up to this many items at *results 1762 * @tag: the tag index (< RADIX_TREE_MAX_TAGS) 1763 * 1764 * Performs an index-ascending scan of the tree for present items which 1765 * have the tag indexed by @tag set. Places the items at *@results and 1766 * returns the number of items which were placed at *@results. 
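 *
 * An illustrative sketch using the iterator interface instead ("tree" and
 * "do_something" are made-up caller-side names, tag 0 is used as an
 * example, and the walk is done under the RCU read lock):
 *
 *	rcu_read_lock();
 *	radix_tree_for_each_tagged(slot, &tree, &iter, 0, 0) {
 *		void *item = radix_tree_deref_slot(slot);
 *		if (radix_tree_deref_retry(item)) {
 *			slot = radix_tree_iter_retry(&iter);
 *			continue;
 *		}
 *		if (item)
 *			do_something(item);
 *	}
 *	rcu_read_unlock();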
1767 */ 1768 unsigned int 1769 radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results, 1770 unsigned long first_index, unsigned int max_items, 1771 unsigned int tag) 1772 { 1773 struct radix_tree_iter iter; 1774 void **slot; 1775 unsigned int ret = 0; 1776 1777 if (unlikely(!max_items)) 1778 return 0; 1779 1780 radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { 1781 results[ret] = rcu_dereference_raw(*slot); 1782 if (!results[ret]) 1783 continue; 1784 if (radix_tree_is_internal_node(results[ret])) { 1785 slot = radix_tree_iter_retry(&iter); 1786 continue; 1787 } 1788 if (++ret == max_items) 1789 break; 1790 } 1791 1792 return ret; 1793 } 1794 EXPORT_SYMBOL(radix_tree_gang_lookup_tag); 1795 1796 /** 1797 * radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a 1798 * radix tree based on a tag 1799 * @root: radix tree root 1800 * @results: where the results of the lookup are placed 1801 * @first_index: start the lookup from this key 1802 * @max_items: place up to this many items at *results 1803 * @tag: the tag index (< RADIX_TREE_MAX_TAGS) 1804 * 1805 * Performs an index-ascending scan of the tree for present items which 1806 * have the tag indexed by @tag set. Places the slots at *@results and 1807 * returns the number of slots which were placed at *@results. 1808 */ 1809 unsigned int 1810 radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root, 1811 void ***results, unsigned long first_index, 1812 unsigned int max_items, unsigned int tag) 1813 { 1814 struct radix_tree_iter iter; 1815 void **slot; 1816 unsigned int ret = 0; 1817 1818 if (unlikely(!max_items)) 1819 return 0; 1820 1821 radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { 1822 results[ret] = slot; 1823 if (++ret == max_items) 1824 break; 1825 } 1826 1827 return ret; 1828 } 1829 EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); 1830 1831 /** 1832 * __radix_tree_delete_node - try to free node after clearing a slot 1833 * @root: radix tree root 1834 * @node: node containing @index 1835 * @update_node: callback for changing leaf nodes 1836 * @private: private data to pass to @update_node 1837 * 1838 * After clearing the slot at @index in @node from radix tree 1839 * rooted at @root, call this function to attempt freeing the 1840 * node and shrinking the tree. 1841 */ 1842 void __radix_tree_delete_node(struct radix_tree_root *root, 1843 struct radix_tree_node *node, 1844 radix_tree_update_node_t update_node, 1845 void *private) 1846 { 1847 delete_node(root, node, update_node, private); 1848 } 1849 1850 /** 1851 * radix_tree_delete_item - delete an item from a radix tree 1852 * @root: radix tree root 1853 * @index: index key 1854 * @item: expected item 1855 * 1856 * Remove @item at @index from the radix tree rooted at @root. 1857 * 1858 * Returns the address of the deleted item, or NULL if it was not present 1859 * or the entry at the given @index was not @item. 1860 */ 1861 void *radix_tree_delete_item(struct radix_tree_root *root, 1862 unsigned long index, void *item) 1863 { 1864 struct radix_tree_node *node; 1865 unsigned int offset; 1866 void **slot; 1867 void *entry; 1868 int tag; 1869 1870 entry = __radix_tree_lookup(root, index, &node, &slot); 1871 if (!entry) 1872 return NULL; 1873 1874 if (item && entry != item) 1875 return NULL; 1876 1877 if (!node) { 1878 root_tag_clear_all(root); 1879 root->rnode = NULL; 1880 return entry; 1881 } 1882 1883 offset = get_slot_offset(node, slot); 1884 1885 /* Clear all tags associated with the item to be deleted. 
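	 * (Explanatory note: node_tag_clear() also propagates the clearing
	 * towards the root, so no ancestor, nor the root tag bits, is left
	 * advertising a tag for a subtree with no tagged entries.)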
*/ 1886 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) 1887 node_tag_clear(root, node, tag, offset); 1888 1889 __radix_tree_replace(root, node, slot, NULL, NULL, NULL); 1890 1891 return entry; 1892 } 1893 EXPORT_SYMBOL(radix_tree_delete_item); 1894 1895 /** 1896 * radix_tree_delete - delete an item from a radix tree 1897 * @root: radix tree root 1898 * @index: index key 1899 * 1900 * Remove the item at @index from the radix tree rooted at @root. 1901 * 1902 * Returns the address of the deleted item, or NULL if it was not present. 1903 */ 1904 void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) 1905 { 1906 return radix_tree_delete_item(root, index, NULL); 1907 } 1908 EXPORT_SYMBOL(radix_tree_delete); 1909 1910 void radix_tree_clear_tags(struct radix_tree_root *root, 1911 struct radix_tree_node *node, 1912 void **slot) 1913 { 1914 if (node) { 1915 unsigned int tag, offset = get_slot_offset(node, slot); 1916 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) 1917 node_tag_clear(root, node, tag, offset); 1918 } else { 1919 /* Clear root node tags */ 1920 root->gfp_mask &= __GFP_BITS_MASK; 1921 } 1922 } 1923 1924 /** 1925 * radix_tree_tagged - test whether any items in the tree are tagged 1926 * @root: radix tree root 1927 * @tag: tag to test 1928 */ 1929 int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag) 1930 { 1931 return root_tag_get(root, tag); 1932 } 1933 EXPORT_SYMBOL(radix_tree_tagged); 1934 1935 static void 1936 radix_tree_node_ctor(void *arg) 1937 { 1938 struct radix_tree_node *node = arg; 1939 1940 memset(node, 0, sizeof(*node)); 1941 INIT_LIST_HEAD(&node->private_list); 1942 } 1943 1944 static __init unsigned long __maxindex(unsigned int height) 1945 { 1946 unsigned int width = height * RADIX_TREE_MAP_SHIFT; 1947 int shift = RADIX_TREE_INDEX_BITS - width; 1948 1949 if (shift < 0) 1950 return ~0UL; 1951 if (shift >= BITS_PER_LONG) 1952 return 0UL; 1953 return ~0UL >> shift; 1954 } 1955 1956 static __init void radix_tree_init_maxnodes(void) 1957 { 1958 unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1]; 1959 unsigned int i, j; 1960 1961 for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) 1962 height_to_maxindex[i] = __maxindex(i); 1963 for (i = 0; i < ARRAY_SIZE(height_to_maxnodes); i++) { 1964 for (j = i; j > 0; j--) 1965 height_to_maxnodes[i] += height_to_maxindex[j - 1] + 1; 1966 } 1967 } 1968 1969 static int radix_tree_cpu_dead(unsigned int cpu) 1970 { 1971 struct radix_tree_preload *rtp; 1972 struct radix_tree_node *node; 1973 1974 /* Free per-cpu pool of preloaded nodes */ 1975 rtp = &per_cpu(radix_tree_preloads, cpu); 1976 while (rtp->nr) { 1977 node = rtp->nodes; 1978 rtp->nodes = node->private_data; 1979 kmem_cache_free(radix_tree_node_cachep, node); 1980 rtp->nr--; 1981 } 1982 return 0; 1983 } 1984 1985 void __init radix_tree_init(void) 1986 { 1987 int ret; 1988 radix_tree_node_cachep = kmem_cache_create("radix_tree_node", 1989 sizeof(struct radix_tree_node), 0, 1990 SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, 1991 radix_tree_node_ctor); 1992 radix_tree_init_maxnodes(); 1993 ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead", 1994 NULL, radix_tree_cpu_dead); 1995 WARN_ON(ret < 0); 1996 } 1997