/*
 * Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Copyright (C) 2005 SGI, Christoph Lameter
 * Copyright (C) 2006 Nick Piggin
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/radix-tree.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/string.h>
#include <linux/bitops.h>
#include <linux/rcupdate.h>


#ifdef __KERNEL__
#define RADIX_TREE_MAP_SHIFT	(CONFIG_BASE_SMALL ? 4 : 6)
#else
#define RADIX_TREE_MAP_SHIFT	3	/* For more stressful testing */
#endif

#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
#define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)

#define RADIX_TREE_TAG_LONGS	\
	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct radix_tree_node {
	unsigned int	height;		/* Height from the bottom */
	unsigned int	count;
	struct rcu_head	rcu_head;
	void		*slots[RADIX_TREE_MAP_SIZE];
	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
};

struct radix_tree_path {
	struct radix_tree_node *node;
	int offset;
};

#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
					  RADIX_TREE_MAP_SHIFT))

/*
 * The height_to_maxindex array needs to be one deeper than the maximum
 * path as height 0 holds only 1 entry.
 */
static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly;

/*
 * Radix tree node cache.
 */
static struct kmem_cache *radix_tree_node_cachep;

/*
 * Per-cpu pool of preloaded nodes
 */
struct radix_tree_preload {
	int nr;
	struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
};
static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };

static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
{
	return root->gfp_mask & __GFP_BITS_MASK;
}

static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
		int offset)
{
	__set_bit(offset, node->tags[tag]);
}

static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
		int offset)
{
	__clear_bit(offset, node->tags[tag]);
}

static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
		int offset)
{
	return test_bit(offset, node->tags[tag]);
}

static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
{
	root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
}

static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
{
	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
}

static inline void root_tag_clear_all(struct radix_tree_root *root)
{
	root->gfp_mask &= __GFP_BITS_MASK;
}

static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
{
	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
}

/*
 * Returns 1 if any slot in the node has this tag set.
 * Otherwise returns 0.
 */
static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
{
	int idx;
	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
		if (node->tags[tag][idx])
			return 1;
	}
	return 0;
}
/*
 * This assumes that the caller has performed appropriate preallocation, and
 * that the caller has pinned this thread of control to the current CPU.
 */
static struct radix_tree_node *
radix_tree_node_alloc(struct radix_tree_root *root)
{
	struct radix_tree_node *ret = NULL;
	gfp_t gfp_mask = root_gfp_mask(root);

	if (!(gfp_mask & __GFP_WAIT)) {
		struct radix_tree_preload *rtp;

		/*
		 * Provided the caller has preloaded here, we will always
		 * succeed in getting a node here (and never reach
		 * kmem_cache_alloc)
		 */
		rtp = &__get_cpu_var(radix_tree_preloads);
		if (rtp->nr) {
			ret = rtp->nodes[rtp->nr - 1];
			rtp->nodes[rtp->nr - 1] = NULL;
			rtp->nr--;
		}
	}
	if (ret == NULL)
		ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);

	BUG_ON(radix_tree_is_indirect_ptr(ret));
	return ret;
}

static void radix_tree_node_rcu_free(struct rcu_head *head)
{
	struct radix_tree_node *node =
			container_of(head, struct radix_tree_node, rcu_head);

	/*
	 * must only free zeroed nodes into the slab. radix_tree_shrink
	 * can leave us with a non-NULL entry in the first slot, so clear
	 * that here to make sure.
	 */
	tag_clear(node, 0, 0);
	tag_clear(node, 1, 0);
	node->slots[0] = NULL;
	node->count = 0;

	kmem_cache_free(radix_tree_node_cachep, node);
}

static inline void
radix_tree_node_free(struct radix_tree_node *node)
{
	call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
}

/*
 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 * ensure that the addition of a single element in the tree cannot fail.  On
 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 * with preemption not disabled.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_WAIT being passed to INIT_RADIX_TREE().
 */
int radix_tree_preload(gfp_t gfp_mask)
{
	struct radix_tree_preload *rtp;
	struct radix_tree_node *node;
	int ret = -ENOMEM;

	preempt_disable();
	rtp = &__get_cpu_var(radix_tree_preloads);
	while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
		preempt_enable();
		node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
		if (node == NULL)
			goto out;
		preempt_disable();
		rtp = &__get_cpu_var(radix_tree_preloads);
		if (rtp->nr < ARRAY_SIZE(rtp->nodes))
			rtp->nodes[rtp->nr++] = node;
		else
			kmem_cache_free(radix_tree_node_cachep, node);
	}
	ret = 0;
out:
	return ret;
}
EXPORT_SYMBOL(radix_tree_preload);
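/*
 * A minimal usage sketch of the preload facility, assuming the tree was
 * initialised with GFP_ATOMIC and is protected by a caller-provided spinlock;
 * "my_tree", "my_lock", "index" and "item" are placeholder names.
 * radix_tree_preload_end() is declared in <linux/radix-tree.h> and simply
 * re-enables preemption.
 *
 *	if (radix_tree_preload(GFP_KERNEL))
 *		return -ENOMEM;
 *	spin_lock(&my_lock);
 *	error = radix_tree_insert(&my_tree, index, item);
 *	spin_unlock(&my_lock);
 *	radix_tree_preload_end();
 *
 * On success radix_tree_preload() returns with preemption disabled, so the
 * nodes it set aside cannot be consumed by another task before the insert;
 * on failure it returns -ENOMEM with preemption enabled, as documented above.
 */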
/*
 * Return the maximum key which can be stored in a
 * radix tree with height HEIGHT.
 */
static inline unsigned long radix_tree_maxindex(unsigned int height)
{
	return height_to_maxindex[height];
}

/*
 * Extend a radix tree so it can store key @index.
 */
static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
{
	struct radix_tree_node *node;
	unsigned int height;
	int tag;

	/* Figure out what the height should be. */
	height = root->height + 1;
	while (index > radix_tree_maxindex(height))
		height++;

	if (root->rnode == NULL) {
		root->height = height;
		goto out;
	}

	do {
		unsigned int newheight;
		if (!(node = radix_tree_node_alloc(root)))
			return -ENOMEM;

		/* Increase the height. */
		node->slots[0] = radix_tree_indirect_to_ptr(root->rnode);

		/* Propagate the aggregated tag info into the new root */
		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
			if (root_tag_get(root, tag))
				tag_set(node, tag, 0);
		}

		newheight = root->height+1;
		node->height = newheight;
		node->count = 1;
		node = radix_tree_ptr_to_indirect(node);
		rcu_assign_pointer(root->rnode, node);
		root->height = newheight;
	} while (height > root->height);
out:
	return 0;
}

/**
 * radix_tree_insert - insert into a radix tree
 * @root:		radix tree root
 * @index:		index key
 * @item:		item to insert
 *
 * Insert an item into the radix tree at position @index.
 */
int radix_tree_insert(struct radix_tree_root *root,
			unsigned long index, void *item)
{
	struct radix_tree_node *node = NULL, *slot;
	unsigned int height, shift;
	int offset;
	int error;

	BUG_ON(radix_tree_is_indirect_ptr(item));

	/* Make sure the tree is high enough. */
	if (index > radix_tree_maxindex(root->height)) {
		error = radix_tree_extend(root, index);
		if (error)
			return error;
	}

	slot = radix_tree_indirect_to_ptr(root->rnode);

	height = root->height;
	shift = (height-1) * RADIX_TREE_MAP_SHIFT;

	offset = 0;			/* uninitialised var warning */
	while (height > 0) {
		if (slot == NULL) {
			/* Have to add a child node. */
			if (!(slot = radix_tree_node_alloc(root)))
				return -ENOMEM;
			slot->height = height;
			if (node) {
				rcu_assign_pointer(node->slots[offset], slot);
				node->count++;
			} else
				rcu_assign_pointer(root->rnode,
					radix_tree_ptr_to_indirect(slot));
		}

		/* Go a level down */
		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
		node = slot;
		slot = node->slots[offset];
		shift -= RADIX_TREE_MAP_SHIFT;
		height--;
	}

	if (slot != NULL)
		return -EEXIST;

	if (node) {
		node->count++;
		rcu_assign_pointer(node->slots[offset], item);
		BUG_ON(tag_get(node, 0, offset));
		BUG_ON(tag_get(node, 1, offset));
	} else {
		rcu_assign_pointer(root->rnode, item);
		BUG_ON(root_tag_get(root, 0));
		BUG_ON(root_tag_get(root, 1));
	}

	return 0;
}
EXPORT_SYMBOL(radix_tree_insert);

/*
 * is_slot == 1 : search for the slot.
 * is_slot == 0 : search for the node.
 */
static void *radix_tree_lookup_element(struct radix_tree_root *root,
				unsigned long index, int is_slot)
{
	unsigned int height, shift;
	struct radix_tree_node *node, **slot;

	node = rcu_dereference_raw(root->rnode);
	if (node == NULL)
		return NULL;

	if (!radix_tree_is_indirect_ptr(node)) {
		if (index > 0)
			return NULL;
		return is_slot ? (void *)&root->rnode : node;
	}
	node = radix_tree_indirect_to_ptr(node);

	height = node->height;
	if (index > radix_tree_maxindex(height))
		return NULL;

	shift = (height-1) * RADIX_TREE_MAP_SHIFT;

	do {
		slot = (struct radix_tree_node **)
			(node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
		node = rcu_dereference_raw(*slot);
		if (node == NULL)
			return NULL;

		shift -= RADIX_TREE_MAP_SHIFT;
		height--;
	} while (height > 0);

	return is_slot ? (void *)slot : node;
}

/**
 * radix_tree_lookup_slot - lookup a slot in a radix tree
 * @root:		radix tree root
 * @index:		index key
 *
 * Returns: the slot corresponding to the position @index in the
 * radix tree @root. This is useful for update-if-exists operations.
 *
 * This function can be called under rcu_read_lock iff the slot is not
 * modified by radix_tree_replace_slot, otherwise it must be called
 * exclusive from other writers. Any dereference of the slot must be done
 * using radix_tree_deref_slot.
 */
void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
{
	return (void **)radix_tree_lookup_element(root, index, 1);
}
EXPORT_SYMBOL(radix_tree_lookup_slot);
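/*
 * A minimal update-if-exists sketch built on radix_tree_lookup_slot(); the
 * names "my_tree", "index", "old" and "new_item" are placeholders.  The slot
 * is dereferenced with radix_tree_deref_slot() (declared in
 * <linux/radix-tree.h>), and radix_tree_replace_slot() may only be used by a
 * writer that excludes other writers, per the rules above.
 *
 *	void **slot;
 *	void *old;
 *
 *	slot = radix_tree_lookup_slot(&my_tree, index);
 *	if (slot) {
 *		old = radix_tree_deref_slot(slot);
 *		if (old)
 *			radix_tree_replace_slot(slot, new_item);
 *	}
 */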
/**
 * radix_tree_lookup - perform lookup operation on a radix tree
 * @root:		radix tree root
 * @index:		index key
 *
 * Lookup the item at the position @index in the radix tree @root.
 *
 * This function can be called under rcu_read_lock, however the caller
 * must manage lifetimes of leaf nodes (eg. RCU may also be used to free
 * them safely). No RCU barriers are required to access or modify the
 * returned item, however.
 */
void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
{
	return radix_tree_lookup_element(root, index, 0);
}
EXPORT_SYMBOL(radix_tree_lookup);

/**
 * radix_tree_tag_set - set a tag on a radix tree node
 * @root:		radix tree root
 * @index:		index key
 * @tag:		tag index
 *
 * Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
 * corresponding to @index in the radix tree, from the root all the way
 * down to the leaf node.
 *
 * Returns the address of the tagged item.  Setting a tag on a not-present
 * item is a bug.
 */
void *radix_tree_tag_set(struct radix_tree_root *root,
			unsigned long index, unsigned int tag)
{
	unsigned int height, shift;
	struct radix_tree_node *slot;

	height = root->height;
	BUG_ON(index > radix_tree_maxindex(height));

	slot = radix_tree_indirect_to_ptr(root->rnode);
	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;

	while (height > 0) {
		int offset;

		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
		if (!tag_get(slot, tag, offset))
			tag_set(slot, tag, offset);
		slot = slot->slots[offset];
		BUG_ON(slot == NULL);
		shift -= RADIX_TREE_MAP_SHIFT;
		height--;
	}

	/* set the root's tag bit */
	if (slot && !root_tag_get(root, tag))
		root_tag_set(root, tag);

	return slot;
}
EXPORT_SYMBOL(radix_tree_tag_set);

/**
 * radix_tree_tag_clear - clear a tag on a radix tree node
 * @root:		radix tree root
 * @index:		index key
 * @tag:		tag index
 *
 * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
 * corresponding to @index in the radix tree.  If
 * this causes the leaf node to have no tags set then clear the tag in the
 * next-to-leaf node, etc.
 *
 * Returns the address of the tagged item on success, else NULL.  ie:
 * has the same return value and semantics as radix_tree_lookup().
 */
void *radix_tree_tag_clear(struct radix_tree_root *root,
			unsigned long index, unsigned int tag)
{
	/*
	 * The radix tree path needs to be one longer than the maximum path
	 * since the "list" is null terminated.
	 */
	struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
	struct radix_tree_node *slot = NULL;
	unsigned int height, shift;

	height = root->height;
	if (index > radix_tree_maxindex(height))
		goto out;

	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
	pathp->node = NULL;
	slot = radix_tree_indirect_to_ptr(root->rnode);

	while (height > 0) {
		int offset;

		if (slot == NULL)
			goto out;

		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
		pathp[1].offset = offset;
		pathp[1].node = slot;
		slot = slot->slots[offset];
		pathp++;
		shift -= RADIX_TREE_MAP_SHIFT;
		height--;
	}

	if (slot == NULL)
		goto out;

	while (pathp->node) {
		if (!tag_get(pathp->node, tag, pathp->offset))
			goto out;
		tag_clear(pathp->node, tag, pathp->offset);
		if (any_tag_set(pathp->node, tag))
			goto out;
		pathp--;
	}

	/* clear the root's tag bit */
	if (root_tag_get(root, tag))
		root_tag_clear(root, tag);

out:
	return slot;
}
EXPORT_SYMBOL(radix_tree_tag_clear);

/**
 * radix_tree_tag_get - get a tag on a radix tree node
 * @root:		radix tree root
 * @index:		index key
 * @tag:		tag index (< RADIX_TREE_MAX_TAGS)
 *
 * Return values:
 *
 *  0: tag not present or not set
 *  1: tag set
 */
int radix_tree_tag_get(struct radix_tree_root *root,
			unsigned long index, unsigned int tag)
{
	unsigned int height, shift;
	struct radix_tree_node *node;
	int saw_unset_tag = 0;

	/* check the root's tag bit */
	if (!root_tag_get(root, tag))
		return 0;

	node = rcu_dereference_raw(root->rnode);
	if (node == NULL)
		return 0;

	if (!radix_tree_is_indirect_ptr(node))
		return (index == 0);
	node = radix_tree_indirect_to_ptr(node);

	height = node->height;
	if (index > radix_tree_maxindex(height))
		return 0;

	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;

	for ( ; ; ) {
		int offset;

		if (node == NULL)
			return 0;

		offset = (index >> shift) & RADIX_TREE_MAP_MASK;

		/*
		 * This is just a debug check.  Later, we can bail as soon as
		 * we see an unset tag.
		 */
		if (!tag_get(node, tag, offset))
			saw_unset_tag = 1;
		if (height == 1) {
			int ret = tag_get(node, tag, offset);

			BUG_ON(ret && saw_unset_tag);
			return !!ret;
		}
		node = rcu_dereference_raw(node->slots[offset]);
		shift -= RADIX_TREE_MAP_SHIFT;
		height--;
	}
}
EXPORT_SYMBOL(radix_tree_tag_get);

/**
 * radix_tree_next_hole - find the next hole (not-present entry)
 * @root:	tree root
 * @index:	index key
 * @max_scan:	maximum range to search
 *
 * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest
 * indexed hole.
 *
 * Returns: the index of the hole if found, otherwise returns an index
 * outside of the set specified (in which case 'return - index >= max_scan'
 * will be true). In rare cases of index wrap-around, 0 will be returned.
 *
 * radix_tree_next_hole may be called under rcu_read_lock. However, like
 * radix_tree_gang_lookup, this will not atomically search a snapshot of
 * the tree at a single point in time. For example, if a hole is created
 * at index 5, then subsequently a hole is created at index 10,
 * radix_tree_next_hole covering both indexes may return 10 if called
 * under rcu_read_lock.
 */
unsigned long radix_tree_next_hole(struct radix_tree_root *root,
				unsigned long index, unsigned long max_scan)
{
	unsigned long i;

	for (i = 0; i < max_scan; i++) {
		if (!radix_tree_lookup(root, index))
			break;
		index++;
		if (index == 0)
			break;
	}

	return index;
}
EXPORT_SYMBOL(radix_tree_next_hole);

/**
 * radix_tree_prev_hole - find the prev hole (not-present entry)
 * @root:	tree root
 * @index:	index key
 * @max_scan:	maximum range to search
 *
 * Search backwards in the range [max(index-max_scan+1, 0), index]
 * for the first hole.
 *
 * Returns: the index of the hole if found, otherwise returns an index
 * outside of the set specified (in which case 'index - return >= max_scan'
 * will be true). In rare cases of wrap-around, LONG_MAX will be returned.
 *
 * radix_tree_prev_hole may be called under rcu_read_lock. However, like
 * radix_tree_gang_lookup, this will not atomically search a snapshot of
 * the tree at a single point in time. For example, if a hole is created
 * at index 10, then subsequently a hole is created at index 5,
 * radix_tree_prev_hole covering both indexes may return 5 if called under
 * rcu_read_lock.
 */
unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
				   unsigned long index, unsigned long max_scan)
{
	unsigned long i;

	for (i = 0; i < max_scan; i++) {
		if (!radix_tree_lookup(root, index))
			break;
		index--;
		if (index == LONG_MAX)
			break;
	}

	return index;
}
EXPORT_SYMBOL(radix_tree_prev_hole);

static unsigned int
__lookup(struct radix_tree_node *slot, void ***results, unsigned long index,
	unsigned int max_items, unsigned long *next_index)
{
	unsigned int nr_found = 0;
	unsigned int shift, height;
	unsigned long i;

	height = slot->height;
	if (height == 0)
		goto out;
	shift = (height-1) * RADIX_TREE_MAP_SHIFT;

	for ( ; height > 1; height--) {
		i = (index >> shift) & RADIX_TREE_MAP_MASK;
		for (;;) {
			if (slot->slots[i] != NULL)
				break;
			index &= ~((1UL << shift) - 1);
			index += 1UL << shift;
			if (index == 0)
				goto out;	/* 32-bit wraparound */
			i++;
			if (i == RADIX_TREE_MAP_SIZE)
				goto out;
		}

		shift -= RADIX_TREE_MAP_SHIFT;
		slot = rcu_dereference_raw(slot->slots[i]);
		if (slot == NULL)
			goto out;
	}

	/* Bottom level: grab some items */
	for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
		index++;
		if (slot->slots[i]) {
			results[nr_found++] = &(slot->slots[i]);
			if (nr_found == max_items)
				goto out;
		}
	}
out:
	*next_index = index;
	return nr_found;
}

/**
 * radix_tree_gang_lookup - perform multiple lookup on a radix tree
 * @root:		radix tree root
 * @results:	where the results of the lookup are placed
 * @first_index:	start the lookup from this key
 * @max_items:	place up to this many items at *results
 *
 * Performs an index-ascending scan of the tree for present items.  Places
 * them at *@results and returns the number of items which were placed at
 * *@results.
 *
 * The implementation is naive.
 *
 * Like radix_tree_lookup, radix_tree_gang_lookup may be called under
 * rcu_read_lock. In this case, rather than the returned results being
 * an atomic snapshot of the tree at a single point in time, the semantics
 * of an RCU protected gang lookup are as though multiple radix_tree_lookups
 * have been issued in individual locks, and results stored in 'results'.
 */
unsigned int
radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
			unsigned long first_index, unsigned int max_items)
{
	unsigned long max_index;
	struct radix_tree_node *node;
	unsigned long cur_index = first_index;
	unsigned int ret;

	node = rcu_dereference_raw(root->rnode);
	if (!node)
		return 0;

	if (!radix_tree_is_indirect_ptr(node)) {
		if (first_index > 0)
			return 0;
		results[0] = node;
		return 1;
	}
	node = radix_tree_indirect_to_ptr(node);

	max_index = radix_tree_maxindex(node->height);

	ret = 0;
	while (ret < max_items) {
		unsigned int nr_found, slots_found, i;
		unsigned long next_index;	/* Index of next search */

		if (cur_index > max_index)
			break;
		slots_found = __lookup(node, (void ***)results + ret, cur_index,
					max_items - ret, &next_index);
		nr_found = 0;
		for (i = 0; i < slots_found; i++) {
			struct radix_tree_node *slot;
			slot = *(((void ***)results)[ret + i]);
			if (!slot)
				continue;
			results[ret + nr_found] = rcu_dereference_raw(slot);
			nr_found++;
		}
		ret += nr_found;
		if (next_index == 0)
			break;
		cur_index = next_index;
	}

	return ret;
}
EXPORT_SYMBOL(radix_tree_gang_lookup);

/**
 * radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree
 * @root:		radix tree root
 * @results:	where the results of the lookup are placed
 * @first_index:	start the lookup from this key
 * @max_items:	place up to this many items at *results
 *
 * Performs an index-ascending scan of the tree for present items.  Places
 * their slots at *@results and returns the number of items which were
 * placed at *@results.
 *
 * The implementation is naive.
 *
 * Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must
 * be dereferenced with radix_tree_deref_slot, and if using only RCU
 * protection, radix_tree_deref_slot may fail requiring a retry.
 */
unsigned int
radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
			unsigned long first_index, unsigned int max_items)
{
	unsigned long max_index;
	struct radix_tree_node *node;
	unsigned long cur_index = first_index;
	unsigned int ret;

	node = rcu_dereference_raw(root->rnode);
	if (!node)
		return 0;

	if (!radix_tree_is_indirect_ptr(node)) {
		if (first_index > 0)
			return 0;
		results[0] = (void **)&root->rnode;
		return 1;
	}
	node = radix_tree_indirect_to_ptr(node);

	max_index = radix_tree_maxindex(node->height);

	ret = 0;
	while (ret < max_items) {
		unsigned int slots_found;
		unsigned long next_index;	/* Index of next search */

		if (cur_index > max_index)
			break;
		slots_found = __lookup(node, results + ret, cur_index,
					max_items - ret, &next_index);
		ret += slots_found;
		if (next_index == 0)
			break;
		cur_index = next_index;
	}

	return ret;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_slot);
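/*
 * A minimal batched-lookup sketch under RCU, assuming "my_tree" contains
 * items handled by a placeholder process() helper:
 *
 *	void *batch[16];
 *	unsigned int i, n;
 *
 *	rcu_read_lock();
 *	n = radix_tree_gang_lookup(&my_tree, batch, first_index, 16);
 *	for (i = 0; i < n; i++)
 *		process(batch[i]);
 *	rcu_read_unlock();
 *
 * Only the items are returned, not their indices, so a caller that wants to
 * continue the scan must be able to derive the next start index from the
 * items themselves (as the page cache does with page->index).
 */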
/*
 * FIXME: the two tag_get()s here should use find_next_bit() instead of
 * open-coding the search.
 */
static unsigned int
__lookup_tag(struct radix_tree_node *slot, void ***results, unsigned long index,
	unsigned int max_items, unsigned long *next_index, unsigned int tag)
{
	unsigned int nr_found = 0;
	unsigned int shift, height;

	height = slot->height;
	if (height == 0)
		goto out;
	shift = (height-1) * RADIX_TREE_MAP_SHIFT;

	while (height > 0) {
		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;

		for (;;) {
			if (tag_get(slot, tag, i))
				break;
			index &= ~((1UL << shift) - 1);
			index += 1UL << shift;
			if (index == 0)
				goto out;	/* 32-bit wraparound */
			i++;
			if (i == RADIX_TREE_MAP_SIZE)
				goto out;
		}
		height--;
		if (height == 0) {	/* Bottom level: grab some items */
			unsigned long j = index & RADIX_TREE_MAP_MASK;

			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
				index++;
				if (!tag_get(slot, tag, j))
					continue;
				/*
				 * Even though the tag was found set, we need to
				 * recheck that we have a non-NULL node, because
				 * if this lookup is lockless, it may have been
				 * subsequently deleted.
				 *
				 * Similar care must be taken in any place that
				 * we look up ->slots[x] without a lock (ie. can't
				 * rely on its value remaining the same).
				 */
				if (slot->slots[j]) {
					results[nr_found++] = &(slot->slots[j]);
					if (nr_found == max_items)
						goto out;
				}
			}
		}
		shift -= RADIX_TREE_MAP_SHIFT;
		slot = rcu_dereference_raw(slot->slots[i]);
		if (slot == NULL)
			break;
	}
out:
	*next_index = index;
	return nr_found;
}

/**
 * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
 *                              based on a tag
 * @root:		radix tree root
 * @results:	where the results of the lookup are placed
 * @first_index:	start the lookup from this key
 * @max_items:	place up to this many items at *results
 * @tag:		the tag index (< RADIX_TREE_MAX_TAGS)
 *
 * Performs an index-ascending scan of the tree for present items which
 * have the tag indexed by @tag set.  Places the items at *@results and
 * returns the number of items which were placed at *@results.
 */
unsigned int
radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
		unsigned long first_index, unsigned int max_items,
		unsigned int tag)
{
	struct radix_tree_node *node;
	unsigned long max_index;
	unsigned long cur_index = first_index;
	unsigned int ret;

	/* check the root's tag bit */
	if (!root_tag_get(root, tag))
		return 0;

	node = rcu_dereference_raw(root->rnode);
	if (!node)
		return 0;

	if (!radix_tree_is_indirect_ptr(node)) {
		if (first_index > 0)
			return 0;
		results[0] = node;
		return 1;
	}
	node = radix_tree_indirect_to_ptr(node);

	max_index = radix_tree_maxindex(node->height);

	ret = 0;
	while (ret < max_items) {
		unsigned int nr_found, slots_found, i;
		unsigned long next_index;	/* Index of next search */

		if (cur_index > max_index)
			break;
		slots_found = __lookup_tag(node, (void ***)results + ret,
				cur_index, max_items - ret, &next_index, tag);
		nr_found = 0;
		for (i = 0; i < slots_found; i++) {
			struct radix_tree_node *slot;
			slot = *(((void ***)results)[ret + i]);
			if (!slot)
				continue;
			results[ret + nr_found] = rcu_dereference_raw(slot);
			nr_found++;
		}
		ret += nr_found;
		if (next_index == 0)
			break;
		cur_index = next_index;
	}

	return ret;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag);

/**
 * radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a
 *                                   radix tree based on a tag
 * @root:		radix tree root
 * @results:	where the results of the lookup are placed
 * @first_index:	start the lookup from this key
 * @max_items:	place up to this many items at *results
 * @tag:		the tag index (< RADIX_TREE_MAX_TAGS)
 *
 * Performs an index-ascending scan of the tree for present items which
 * have the tag indexed by @tag set.  Places the slots at *@results and
 * returns the number of slots which were placed at *@results.
 */
unsigned int
radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
		unsigned long first_index, unsigned int max_items,
		unsigned int tag)
{
	struct radix_tree_node *node;
	unsigned long max_index;
	unsigned long cur_index = first_index;
	unsigned int ret;

	/* check the root's tag bit */
	if (!root_tag_get(root, tag))
		return 0;

	node = rcu_dereference_raw(root->rnode);
	if (!node)
		return 0;

	if (!radix_tree_is_indirect_ptr(node)) {
		if (first_index > 0)
			return 0;
		results[0] = (void **)&root->rnode;
		return 1;
	}
	node = radix_tree_indirect_to_ptr(node);

	max_index = radix_tree_maxindex(node->height);

	ret = 0;
	while (ret < max_items) {
		unsigned int slots_found;
		unsigned long next_index;	/* Index of next search */

		if (cur_index > max_index)
			break;
		slots_found = __lookup_tag(node, results + ret,
				cur_index, max_items - ret, &next_index, tag);
		ret += slots_found;
		if (next_index == 0)
			break;
		cur_index = next_index;
	}

	return ret;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);


/**
 * radix_tree_shrink - shrink height of a radix tree to minimal
 * @root:		radix tree root
 */
static inline void radix_tree_shrink(struct radix_tree_root *root)
{
	/* try to shrink tree height */
	while (root->height > 0) {
		struct radix_tree_node *to_free = root->rnode;
		void *newptr;

		BUG_ON(!radix_tree_is_indirect_ptr(to_free));
		to_free = radix_tree_indirect_to_ptr(to_free);

		/*
		 * If the candidate node has more than one child, or its child
		 * is not at the leftmost slot, we cannot shrink.
		 */
		if (to_free->count != 1)
			break;
		if (!to_free->slots[0])
			break;

		/*
		 * We don't need rcu_assign_pointer(), since we are simply
		 * moving the node from one part of the tree to another. If
		 * it was safe to dereference the old pointer to it
		 * (to_free->slots[0]), it will be safe to dereference the new
		 * one (root->rnode).
		 */
		newptr = to_free->slots[0];
		if (root->height > 1)
			newptr = radix_tree_ptr_to_indirect(newptr);
		root->rnode = newptr;
		root->height--;
		radix_tree_node_free(to_free);
	}
}

/**
 * radix_tree_delete - delete an item from a radix tree
 * @root:		radix tree root
 * @index:		index key
 *
 * Remove the item at @index from the radix tree rooted at @root.
 *
 * Returns the address of the deleted item, or NULL if it was not present.
 */
void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
{
	/*
	 * The radix tree path needs to be one longer than the maximum path
	 * since the "list" is null terminated.
	 */
	struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
	struct radix_tree_node *slot = NULL;
	struct radix_tree_node *to_free;
	unsigned int height, shift;
	int tag;
	int offset;

	height = root->height;
	if (index > radix_tree_maxindex(height))
		goto out;

	slot = root->rnode;
	if (height == 0) {
		root_tag_clear_all(root);
		root->rnode = NULL;
		goto out;
	}
	slot = radix_tree_indirect_to_ptr(slot);

	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
	pathp->node = NULL;

	do {
		if (slot == NULL)
			goto out;

		pathp++;
		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
		pathp->offset = offset;
		pathp->node = slot;
		slot = slot->slots[offset];
		shift -= RADIX_TREE_MAP_SHIFT;
		height--;
	} while (height > 0);

	if (slot == NULL)
		goto out;

	/*
	 * Clear all tags associated with the just-deleted item
	 */
	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
		if (tag_get(pathp->node, tag, pathp->offset))
			radix_tree_tag_clear(root, index, tag);
	}

	to_free = NULL;
	/* Now free the nodes we do not need anymore */
	while (pathp->node) {
		pathp->node->slots[pathp->offset] = NULL;
		pathp->node->count--;
		/*
		 * Queue the node for deferred freeing after the
		 * last reference to it disappears (set NULL, above).
		 */
		if (to_free)
			radix_tree_node_free(to_free);

		if (pathp->node->count) {
			if (pathp->node ==
					radix_tree_indirect_to_ptr(root->rnode))
				radix_tree_shrink(root);
			goto out;
		}

		/* Node with zero slots in use so free it */
		to_free = pathp->node;
		pathp--;

	}
	root_tag_clear_all(root);
	root->height = 0;
	root->rnode = NULL;
	if (to_free)
		radix_tree_node_free(to_free);

out:
	return slot;
}
EXPORT_SYMBOL(radix_tree_delete);

/**
 * radix_tree_tagged - test whether any items in the tree are tagged
 * @root:		radix tree root
 * @tag:		tag to test
 */
int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
{
	return root_tag_get(root, tag);
}
EXPORT_SYMBOL(radix_tree_tagged);

static void
radix_tree_node_ctor(void *node)
{
	memset(node, 0, sizeof(struct radix_tree_node));
}

static __init unsigned long __maxindex(unsigned int height)
{
	unsigned int width = height * RADIX_TREE_MAP_SHIFT;
	int shift = RADIX_TREE_INDEX_BITS - width;

	if (shift < 0)
		return ~0UL;
	if (shift >= BITS_PER_LONG)
		return 0UL;
	return ~0UL >> shift;
}

static __init void radix_tree_init_maxindex(void)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
		height_to_maxindex[i] = __maxindex(i);
}

static int radix_tree_callback(struct notifier_block *nfb,
				unsigned long action,
				void *hcpu)
{
	int cpu = (long)hcpu;
	struct radix_tree_preload *rtp;

	/* Free per-cpu pool of preloaded nodes */
	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
		rtp = &per_cpu(radix_tree_preloads, cpu);
		while (rtp->nr) {
			kmem_cache_free(radix_tree_node_cachep,
					rtp->nodes[rtp->nr-1]);
			rtp->nodes[rtp->nr-1] = NULL;
			rtp->nr--;
		}
	}
	return NOTIFY_OK;
}

void __init radix_tree_init(void)
{
	radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
			sizeof(struct radix_tree_node), 0,
			SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
			radix_tree_node_ctor);
	radix_tree_init_maxindex();
	hotcpu_notifier(radix_tree_callback, 0);
}