xref: /linux/lib/radix-tree.c (revision 2b8232ce512105e28453f301d1510de8363bccd1)
1 /*
2  * Copyright (C) 2001 Momchil Velikov
3  * Portions Copyright (C) 2001 Christoph Hellwig
4  * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
5  * Copyright (C) 2006 Nick Piggin
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2, or (at
10  * your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20  */
21 
22 #include <linux/errno.h>
23 #include <linux/init.h>
24 #include <linux/kernel.h>
25 #include <linux/module.h>
26 #include <linux/radix-tree.h>
27 #include <linux/percpu.h>
28 #include <linux/slab.h>
29 #include <linux/notifier.h>
30 #include <linux/cpu.h>
31 #include <linux/gfp.h>
32 #include <linux/string.h>
33 #include <linux/bitops.h>
34 #include <linux/rcupdate.h>
35 
36 
37 #ifdef __KERNEL__
38 #define RADIX_TREE_MAP_SHIFT	(CONFIG_BASE_SMALL ? 4 : 6)
39 #else
40 #define RADIX_TREE_MAP_SHIFT	3	/* For more stressful testing */
41 #endif
42 
43 #define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
44 #define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)
45 
46 #define RADIX_TREE_TAG_LONGS	\
47 	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
48 
49 struct radix_tree_node {
50 	unsigned int	height;		/* Height from the bottom */
51 	unsigned int	count;
52 	struct rcu_head	rcu_head;
53 	void		*slots[RADIX_TREE_MAP_SIZE];
54 	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
55 };
56 
57 struct radix_tree_path {
58 	struct radix_tree_node *node;
59 	int offset;
60 };
61 
62 #define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
63 #define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)
64 
65 static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH] __read_mostly;
66 
67 /*
68  * Radix tree node cache.
69  */
70 static struct kmem_cache *radix_tree_node_cachep;
71 
72 /*
73  * Per-cpu pool of preloaded nodes
74  */
75 struct radix_tree_preload {
76 	int nr;
77 	struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
78 };
79 DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
80 
81 static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
82 {
83 	return root->gfp_mask & __GFP_BITS_MASK;
84 }
85 
86 /*
87  * This assumes that the caller has performed appropriate preallocation, and
88  * that the caller has pinned this thread of control to the current CPU.
89  */
90 static struct radix_tree_node *
91 radix_tree_node_alloc(struct radix_tree_root *root)
92 {
93 	struct radix_tree_node *ret;
94 	gfp_t gfp_mask = root_gfp_mask(root);
95 
96 	ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
97 	if (ret == NULL && !(gfp_mask & __GFP_WAIT)) {
98 		struct radix_tree_preload *rtp;
99 
100 		rtp = &__get_cpu_var(radix_tree_preloads);
101 		if (rtp->nr) {
102 			ret = rtp->nodes[rtp->nr - 1];
103 			rtp->nodes[rtp->nr - 1] = NULL;
104 			rtp->nr--;
105 		}
106 	}
107 	BUG_ON(radix_tree_is_direct_ptr(ret));
108 	return ret;
109 }
110 
111 static void radix_tree_node_rcu_free(struct rcu_head *head)
112 {
113 	struct radix_tree_node *node =
114 			container_of(head, struct radix_tree_node, rcu_head);
115 	kmem_cache_free(radix_tree_node_cachep, node);
116 }
117 
118 static inline void
119 radix_tree_node_free(struct radix_tree_node *node)
120 {
121 	call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
122 }
123 
124 /*
125  * Load up this CPU's radix_tree_node buffer with sufficient objects to
126  * ensure that the addition of a single element in the tree cannot fail.  On
127  * success, return zero, with preemption disabled.  On error, return -ENOMEM
128  * with preemption not disabled.
129  */
130 int radix_tree_preload(gfp_t gfp_mask)
131 {
132 	struct radix_tree_preload *rtp;
133 	struct radix_tree_node *node;
134 	int ret = -ENOMEM;
135 
136 	preempt_disable();
137 	rtp = &__get_cpu_var(radix_tree_preloads);
138 	while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
139 		preempt_enable();
140 		node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
141 		if (node == NULL)
142 			goto out;
143 		preempt_disable();
144 		rtp = &__get_cpu_var(radix_tree_preloads);
145 		if (rtp->nr < ARRAY_SIZE(rtp->nodes))
146 			rtp->nodes[rtp->nr++] = node;
147 		else
148 			kmem_cache_free(radix_tree_node_cachep, node);
149 	}
150 	ret = 0;
151 out:
152 	return ret;
153 }
154 EXPORT_SYMBOL(radix_tree_preload);
155 
156 static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
157 		int offset)
158 {
159 	__set_bit(offset, node->tags[tag]);
160 }
161 
162 static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
163 		int offset)
164 {
165 	__clear_bit(offset, node->tags[tag]);
166 }
167 
168 static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
169 		int offset)
170 {
171 	return test_bit(offset, node->tags[tag]);
172 }
173 
174 static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
175 {
176 	root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
177 }
178 
179 
180 static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
181 {
182 	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
183 }
184 
185 static inline void root_tag_clear_all(struct radix_tree_root *root)
186 {
187 	root->gfp_mask &= __GFP_BITS_MASK;
188 }
189 
190 static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
191 {
192 	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
193 }
194 
195 /*
196  * Returns 1 if any slot in the node has this tag set.
197  * Otherwise returns 0.
198  */
199 static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
200 {
201 	int idx;
202 	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
203 		if (node->tags[tag][idx])
204 			return 1;
205 	}
206 	return 0;
207 }
208 
209 /*
210  *	Return the maximum key which can be store into a
211  *	radix tree with height HEIGHT.
212  */
213 static inline unsigned long radix_tree_maxindex(unsigned int height)
214 {
215 	return height_to_maxindex[height];
216 }
217 
218 /*
219  *	Extend a radix tree so it can store key @index.
220  */
221 static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
222 {
223 	struct radix_tree_node *node;
224 	unsigned int height;
225 	int tag;
226 
227 	/* Figure out what the height should be.  */
228 	height = root->height + 1;
229 	while (index > radix_tree_maxindex(height))
230 		height++;
231 
232 	if (root->rnode == NULL) {
233 		root->height = height;
234 		goto out;
235 	}
236 
237 	do {
238 		unsigned int newheight;
239 		if (!(node = radix_tree_node_alloc(root)))
240 			return -ENOMEM;
241 
242 		/* Increase the height.  */
243 		node->slots[0] = radix_tree_direct_to_ptr(root->rnode);
244 
245 		/* Propagate the aggregated tag info into the new root */
246 		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
247 			if (root_tag_get(root, tag))
248 				tag_set(node, tag, 0);
249 		}
250 
251 		newheight = root->height+1;
252 		node->height = newheight;
253 		node->count = 1;
254 		rcu_assign_pointer(root->rnode, node);
255 		root->height = newheight;
256 	} while (height > root->height);
257 out:
258 	return 0;
259 }
260 
261 /**
262  *	radix_tree_insert    -    insert into a radix tree
263  *	@root:		radix tree root
264  *	@index:		index key
265  *	@item:		item to insert
266  *
267  *	Insert an item into the radix tree at position @index.
268  */
269 int radix_tree_insert(struct radix_tree_root *root,
270 			unsigned long index, void *item)
271 {
272 	struct radix_tree_node *node = NULL, *slot;
273 	unsigned int height, shift;
274 	int offset;
275 	int error;
276 
277 	BUG_ON(radix_tree_is_direct_ptr(item));
278 
279 	/* Make sure the tree is high enough.  */
280 	if (index > radix_tree_maxindex(root->height)) {
281 		error = radix_tree_extend(root, index);
282 		if (error)
283 			return error;
284 	}
285 
286 	slot = root->rnode;
287 	height = root->height;
288 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
289 
290 	offset = 0;			/* uninitialised var warning */
291 	while (height > 0) {
292 		if (slot == NULL) {
293 			/* Have to add a child node.  */
294 			if (!(slot = radix_tree_node_alloc(root)))
295 				return -ENOMEM;
296 			slot->height = height;
297 			if (node) {
298 				rcu_assign_pointer(node->slots[offset], slot);
299 				node->count++;
300 			} else
301 				rcu_assign_pointer(root->rnode, slot);
302 		}
303 
304 		/* Go a level down */
305 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
306 		node = slot;
307 		slot = node->slots[offset];
308 		shift -= RADIX_TREE_MAP_SHIFT;
309 		height--;
310 	}
311 
312 	if (slot != NULL)
313 		return -EEXIST;
314 
315 	if (node) {
316 		node->count++;
317 		rcu_assign_pointer(node->slots[offset], item);
318 		BUG_ON(tag_get(node, 0, offset));
319 		BUG_ON(tag_get(node, 1, offset));
320 	} else {
321 		rcu_assign_pointer(root->rnode, radix_tree_ptr_to_direct(item));
322 		BUG_ON(root_tag_get(root, 0));
323 		BUG_ON(root_tag_get(root, 1));
324 	}
325 
326 	return 0;
327 }
328 EXPORT_SYMBOL(radix_tree_insert);
329 
330 /**
331  *	radix_tree_lookup_slot    -    lookup a slot in a radix tree
332  *	@root:		radix tree root
333  *	@index:		index key
334  *
335  *	Returns:  the slot corresponding to the position @index in the
336  *	radix tree @root. This is useful for update-if-exists operations.
337  *
338  *	This function cannot be called under rcu_read_lock, it must be
339  *	excluded from writers, as must the returned slot for subsequent
340  *	use by radix_tree_deref_slot() and radix_tree_replace slot.
341  *	Caller must hold tree write locked across slot lookup and
342  *	replace.
343  */
344 void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
345 {
346 	unsigned int height, shift;
347 	struct radix_tree_node *node, **slot;
348 
349 	node = root->rnode;
350 	if (node == NULL)
351 		return NULL;
352 
353 	if (radix_tree_is_direct_ptr(node)) {
354 		if (index > 0)
355 			return NULL;
356 		return (void **)&root->rnode;
357 	}
358 
359 	height = node->height;
360 	if (index > radix_tree_maxindex(height))
361 		return NULL;
362 
363 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
364 
365 	do {
366 		slot = (struct radix_tree_node **)
367 			(node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
368 		node = *slot;
369 		if (node == NULL)
370 			return NULL;
371 
372 		shift -= RADIX_TREE_MAP_SHIFT;
373 		height--;
374 	} while (height > 0);
375 
376 	return (void **)slot;
377 }
378 EXPORT_SYMBOL(radix_tree_lookup_slot);
379 
380 /**
381  *	radix_tree_lookup    -    perform lookup operation on a radix tree
382  *	@root:		radix tree root
383  *	@index:		index key
384  *
385  *	Lookup the item at the position @index in the radix tree @root.
386  *
387  *	This function can be called under rcu_read_lock, however the caller
388  *	must manage lifetimes of leaf nodes (eg. RCU may also be used to free
389  *	them safely). No RCU barriers are required to access or modify the
390  *	returned item, however.
391  */
392 void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
393 {
394 	unsigned int height, shift;
395 	struct radix_tree_node *node, **slot;
396 
397 	node = rcu_dereference(root->rnode);
398 	if (node == NULL)
399 		return NULL;
400 
401 	if (radix_tree_is_direct_ptr(node)) {
402 		if (index > 0)
403 			return NULL;
404 		return radix_tree_direct_to_ptr(node);
405 	}
406 
407 	height = node->height;
408 	if (index > radix_tree_maxindex(height))
409 		return NULL;
410 
411 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
412 
413 	do {
414 		slot = (struct radix_tree_node **)
415 			(node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
416 		node = rcu_dereference(*slot);
417 		if (node == NULL)
418 			return NULL;
419 
420 		shift -= RADIX_TREE_MAP_SHIFT;
421 		height--;
422 	} while (height > 0);
423 
424 	return node;
425 }
426 EXPORT_SYMBOL(radix_tree_lookup);
427 
428 /**
429  *	radix_tree_tag_set - set a tag on a radix tree node
430  *	@root:		radix tree root
431  *	@index:		index key
432  *	@tag: 		tag index
433  *
434  *	Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
435  *	corresponding to @index in the radix tree.  From
436  *	the root all the way down to the leaf node.
437  *
438  *	Returns the address of the tagged item.   Setting a tag on a not-present
439  *	item is a bug.
440  */
441 void *radix_tree_tag_set(struct radix_tree_root *root,
442 			unsigned long index, unsigned int tag)
443 {
444 	unsigned int height, shift;
445 	struct radix_tree_node *slot;
446 
447 	height = root->height;
448 	BUG_ON(index > radix_tree_maxindex(height));
449 
450 	slot = root->rnode;
451 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
452 
453 	while (height > 0) {
454 		int offset;
455 
456 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
457 		if (!tag_get(slot, tag, offset))
458 			tag_set(slot, tag, offset);
459 		slot = slot->slots[offset];
460 		BUG_ON(slot == NULL);
461 		shift -= RADIX_TREE_MAP_SHIFT;
462 		height--;
463 	}
464 
465 	/* set the root's tag bit */
466 	if (slot && !root_tag_get(root, tag))
467 		root_tag_set(root, tag);
468 
469 	return slot;
470 }
471 EXPORT_SYMBOL(radix_tree_tag_set);
472 
473 /**
474  *	radix_tree_tag_clear - clear a tag on a radix tree node
475  *	@root:		radix tree root
476  *	@index:		index key
477  *	@tag: 		tag index
478  *
479  *	Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
480  *	corresponding to @index in the radix tree.  If
481  *	this causes the leaf node to have no tags set then clear the tag in the
482  *	next-to-leaf node, etc.
483  *
484  *	Returns the address of the tagged item on success, else NULL.  ie:
485  *	has the same return value and semantics as radix_tree_lookup().
486  */
487 void *radix_tree_tag_clear(struct radix_tree_root *root,
488 			unsigned long index, unsigned int tag)
489 {
490 	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
491 	struct radix_tree_node *slot = NULL;
492 	unsigned int height, shift;
493 
494 	height = root->height;
495 	if (index > radix_tree_maxindex(height))
496 		goto out;
497 
498 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
499 	pathp->node = NULL;
500 	slot = root->rnode;
501 
502 	while (height > 0) {
503 		int offset;
504 
505 		if (slot == NULL)
506 			goto out;
507 
508 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
509 		pathp[1].offset = offset;
510 		pathp[1].node = slot;
511 		slot = slot->slots[offset];
512 		pathp++;
513 		shift -= RADIX_TREE_MAP_SHIFT;
514 		height--;
515 	}
516 
517 	if (slot == NULL)
518 		goto out;
519 
520 	while (pathp->node) {
521 		if (!tag_get(pathp->node, tag, pathp->offset))
522 			goto out;
523 		tag_clear(pathp->node, tag, pathp->offset);
524 		if (any_tag_set(pathp->node, tag))
525 			goto out;
526 		pathp--;
527 	}
528 
529 	/* clear the root's tag bit */
530 	if (root_tag_get(root, tag))
531 		root_tag_clear(root, tag);
532 
533 out:
534 	return slot;
535 }
536 EXPORT_SYMBOL(radix_tree_tag_clear);
537 
538 #ifndef __KERNEL__	/* Only the test harness uses this at present */
539 /**
540  * radix_tree_tag_get - get a tag on a radix tree node
541  * @root:		radix tree root
542  * @index:		index key
543  * @tag: 		tag index (< RADIX_TREE_MAX_TAGS)
544  *
545  * Return values:
546  *
547  *  0: tag not present or not set
548  *  1: tag set
549  */
550 int radix_tree_tag_get(struct radix_tree_root *root,
551 			unsigned long index, unsigned int tag)
552 {
553 	unsigned int height, shift;
554 	struct radix_tree_node *node;
555 	int saw_unset_tag = 0;
556 
557 	/* check the root's tag bit */
558 	if (!root_tag_get(root, tag))
559 		return 0;
560 
561 	node = rcu_dereference(root->rnode);
562 	if (node == NULL)
563 		return 0;
564 
565 	if (radix_tree_is_direct_ptr(node))
566 		return (index == 0);
567 
568 	height = node->height;
569 	if (index > radix_tree_maxindex(height))
570 		return 0;
571 
572 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
573 
574 	for ( ; ; ) {
575 		int offset;
576 
577 		if (node == NULL)
578 			return 0;
579 
580 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
581 
582 		/*
583 		 * This is just a debug check.  Later, we can bale as soon as
584 		 * we see an unset tag.
585 		 */
586 		if (!tag_get(node, tag, offset))
587 			saw_unset_tag = 1;
588 		if (height == 1) {
589 			int ret = tag_get(node, tag, offset);
590 
591 			BUG_ON(ret && saw_unset_tag);
592 			return !!ret;
593 		}
594 		node = rcu_dereference(node->slots[offset]);
595 		shift -= RADIX_TREE_MAP_SHIFT;
596 		height--;
597 	}
598 }
599 EXPORT_SYMBOL(radix_tree_tag_get);
600 #endif
601 
602 static unsigned int
603 __lookup(struct radix_tree_node *slot, void **results, unsigned long index,
604 	unsigned int max_items, unsigned long *next_index)
605 {
606 	unsigned int nr_found = 0;
607 	unsigned int shift, height;
608 	unsigned long i;
609 
610 	height = slot->height;
611 	if (height == 0)
612 		goto out;
613 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
614 
615 	for ( ; height > 1; height--) {
616 		i = (index >> shift) & RADIX_TREE_MAP_MASK;
617 		for (;;) {
618 			if (slot->slots[i] != NULL)
619 				break;
620 			index &= ~((1UL << shift) - 1);
621 			index += 1UL << shift;
622 			if (index == 0)
623 				goto out;	/* 32-bit wraparound */
624 			i++;
625 			if (i == RADIX_TREE_MAP_SIZE)
626 				goto out;
627 		}
628 
629 		shift -= RADIX_TREE_MAP_SHIFT;
630 		slot = rcu_dereference(slot->slots[i]);
631 		if (slot == NULL)
632 			goto out;
633 	}
634 
635 	/* Bottom level: grab some items */
636 	for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
637 		struct radix_tree_node *node;
638 		index++;
639 		node = slot->slots[i];
640 		if (node) {
641 			results[nr_found++] = rcu_dereference(node);
642 			if (nr_found == max_items)
643 				goto out;
644 		}
645 	}
646 out:
647 	*next_index = index;
648 	return nr_found;
649 }
650 
651 /**
652  *	radix_tree_gang_lookup - perform multiple lookup on a radix tree
653  *	@root:		radix tree root
654  *	@results:	where the results of the lookup are placed
655  *	@first_index:	start the lookup from this key
656  *	@max_items:	place up to this many items at *results
657  *
658  *	Performs an index-ascending scan of the tree for present items.  Places
659  *	them at *@results and returns the number of items which were placed at
660  *	*@results.
661  *
662  *	The implementation is naive.
663  *
664  *	Like radix_tree_lookup, radix_tree_gang_lookup may be called under
665  *	rcu_read_lock. In this case, rather than the returned results being
666  *	an atomic snapshot of the tree at a single point in time, the semantics
667  *	of an RCU protected gang lookup are as though multiple radix_tree_lookups
668  *	have been issued in individual locks, and results stored in 'results'.
669  */
670 unsigned int
671 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
672 			unsigned long first_index, unsigned int max_items)
673 {
674 	unsigned long max_index;
675 	struct radix_tree_node *node;
676 	unsigned long cur_index = first_index;
677 	unsigned int ret;
678 
679 	node = rcu_dereference(root->rnode);
680 	if (!node)
681 		return 0;
682 
683 	if (radix_tree_is_direct_ptr(node)) {
684 		if (first_index > 0)
685 			return 0;
686 		node = radix_tree_direct_to_ptr(node);
687 		results[0] = rcu_dereference(node);
688 		return 1;
689 	}
690 
691 	max_index = radix_tree_maxindex(node->height);
692 
693 	ret = 0;
694 	while (ret < max_items) {
695 		unsigned int nr_found;
696 		unsigned long next_index;	/* Index of next search */
697 
698 		if (cur_index > max_index)
699 			break;
700 		nr_found = __lookup(node, results + ret, cur_index,
701 					max_items - ret, &next_index);
702 		ret += nr_found;
703 		if (next_index == 0)
704 			break;
705 		cur_index = next_index;
706 	}
707 
708 	return ret;
709 }
710 EXPORT_SYMBOL(radix_tree_gang_lookup);
711 
712 /*
713  * FIXME: the two tag_get()s here should use find_next_bit() instead of
714  * open-coding the search.
715  */
716 static unsigned int
717 __lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index,
718 	unsigned int max_items, unsigned long *next_index, unsigned int tag)
719 {
720 	unsigned int nr_found = 0;
721 	unsigned int shift, height;
722 
723 	height = slot->height;
724 	if (height == 0)
725 		goto out;
726 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
727 
728 	while (height > 0) {
729 		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK ;
730 
731 		for (;;) {
732 			if (tag_get(slot, tag, i))
733 				break;
734 			index &= ~((1UL << shift) - 1);
735 			index += 1UL << shift;
736 			if (index == 0)
737 				goto out;	/* 32-bit wraparound */
738 			i++;
739 			if (i == RADIX_TREE_MAP_SIZE)
740 				goto out;
741 		}
742 		height--;
743 		if (height == 0) {	/* Bottom level: grab some items */
744 			unsigned long j = index & RADIX_TREE_MAP_MASK;
745 
746 			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
747 				struct radix_tree_node *node;
748 				index++;
749 				if (!tag_get(slot, tag, j))
750 					continue;
751 				node = slot->slots[j];
752 				/*
753 				 * Even though the tag was found set, we need to
754 				 * recheck that we have a non-NULL node, because
755 				 * if this lookup is lockless, it may have been
756 				 * subsequently deleted.
757 				 *
758 				 * Similar care must be taken in any place that
759 				 * lookup ->slots[x] without a lock (ie. can't
760 				 * rely on its value remaining the same).
761 				 */
762 				if (node) {
763 					node = rcu_dereference(node);
764 					results[nr_found++] = node;
765 					if (nr_found == max_items)
766 						goto out;
767 				}
768 			}
769 		}
770 		shift -= RADIX_TREE_MAP_SHIFT;
771 		slot = rcu_dereference(slot->slots[i]);
772 		if (slot == NULL)
773 			break;
774 	}
775 out:
776 	*next_index = index;
777 	return nr_found;
778 }
779 
780 /**
781  *	radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
782  *	                             based on a tag
783  *	@root:		radix tree root
784  *	@results:	where the results of the lookup are placed
785  *	@first_index:	start the lookup from this key
786  *	@max_items:	place up to this many items at *results
787  *	@tag:		the tag index (< RADIX_TREE_MAX_TAGS)
788  *
789  *	Performs an index-ascending scan of the tree for present items which
790  *	have the tag indexed by @tag set.  Places the items at *@results and
791  *	returns the number of items which were placed at *@results.
792  */
793 unsigned int
794 radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
795 		unsigned long first_index, unsigned int max_items,
796 		unsigned int tag)
797 {
798 	struct radix_tree_node *node;
799 	unsigned long max_index;
800 	unsigned long cur_index = first_index;
801 	unsigned int ret;
802 
803 	/* check the root's tag bit */
804 	if (!root_tag_get(root, tag))
805 		return 0;
806 
807 	node = rcu_dereference(root->rnode);
808 	if (!node)
809 		return 0;
810 
811 	if (radix_tree_is_direct_ptr(node)) {
812 		if (first_index > 0)
813 			return 0;
814 		node = radix_tree_direct_to_ptr(node);
815 		results[0] = rcu_dereference(node);
816 		return 1;
817 	}
818 
819 	max_index = radix_tree_maxindex(node->height);
820 
821 	ret = 0;
822 	while (ret < max_items) {
823 		unsigned int nr_found;
824 		unsigned long next_index;	/* Index of next search */
825 
826 		if (cur_index > max_index)
827 			break;
828 		nr_found = __lookup_tag(node, results + ret, cur_index,
829 					max_items - ret, &next_index, tag);
830 		ret += nr_found;
831 		if (next_index == 0)
832 			break;
833 		cur_index = next_index;
834 	}
835 
836 	return ret;
837 }
838 EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
839 
840 /**
841  *	radix_tree_shrink    -    shrink height of a radix tree to minimal
842  *	@root		radix tree root
843  */
844 static inline void radix_tree_shrink(struct radix_tree_root *root)
845 {
846 	/* try to shrink tree height */
847 	while (root->height > 0 &&
848 			root->rnode->count == 1 &&
849 			root->rnode->slots[0]) {
850 		struct radix_tree_node *to_free = root->rnode;
851 		void *newptr;
852 
853 		/*
854 		 * We don't need rcu_assign_pointer(), since we are simply
855 		 * moving the node from one part of the tree to another. If
856 		 * it was safe to dereference the old pointer to it
857 		 * (to_free->slots[0]), it will be safe to dereference the new
858 		 * one (root->rnode).
859 		 */
860 		newptr = to_free->slots[0];
861 		if (root->height == 1)
862 			newptr = radix_tree_ptr_to_direct(newptr);
863 		root->rnode = newptr;
864 		root->height--;
865 		/* must only free zeroed nodes into the slab */
866 		tag_clear(to_free, 0, 0);
867 		tag_clear(to_free, 1, 0);
868 		to_free->slots[0] = NULL;
869 		to_free->count = 0;
870 		radix_tree_node_free(to_free);
871 	}
872 }
873 
874 /**
875  *	radix_tree_delete    -    delete an item from a radix tree
876  *	@root:		radix tree root
877  *	@index:		index key
878  *
879  *	Remove the item at @index from the radix tree rooted at @root.
880  *
881  *	Returns the address of the deleted item, or NULL if it was not present.
882  */
883 void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
884 {
885 	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
886 	struct radix_tree_node *slot = NULL;
887 	struct radix_tree_node *to_free;
888 	unsigned int height, shift;
889 	int tag;
890 	int offset;
891 
892 	height = root->height;
893 	if (index > radix_tree_maxindex(height))
894 		goto out;
895 
896 	slot = root->rnode;
897 	if (height == 0 && root->rnode) {
898 		slot = radix_tree_direct_to_ptr(slot);
899 		root_tag_clear_all(root);
900 		root->rnode = NULL;
901 		goto out;
902 	}
903 
904 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
905 	pathp->node = NULL;
906 
907 	do {
908 		if (slot == NULL)
909 			goto out;
910 
911 		pathp++;
912 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
913 		pathp->offset = offset;
914 		pathp->node = slot;
915 		slot = slot->slots[offset];
916 		shift -= RADIX_TREE_MAP_SHIFT;
917 		height--;
918 	} while (height > 0);
919 
920 	if (slot == NULL)
921 		goto out;
922 
923 	/*
924 	 * Clear all tags associated with the just-deleted item
925 	 */
926 	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
927 		if (tag_get(pathp->node, tag, pathp->offset))
928 			radix_tree_tag_clear(root, index, tag);
929 	}
930 
931 	to_free = NULL;
932 	/* Now free the nodes we do not need anymore */
933 	while (pathp->node) {
934 		pathp->node->slots[pathp->offset] = NULL;
935 		pathp->node->count--;
936 		/*
937 		 * Queue the node for deferred freeing after the
938 		 * last reference to it disappears (set NULL, above).
939 		 */
940 		if (to_free)
941 			radix_tree_node_free(to_free);
942 
943 		if (pathp->node->count) {
944 			if (pathp->node == root->rnode)
945 				radix_tree_shrink(root);
946 			goto out;
947 		}
948 
949 		/* Node with zero slots in use so free it */
950 		to_free = pathp->node;
951 		pathp--;
952 
953 	}
954 	root_tag_clear_all(root);
955 	root->height = 0;
956 	root->rnode = NULL;
957 	if (to_free)
958 		radix_tree_node_free(to_free);
959 
960 out:
961 	return slot;
962 }
963 EXPORT_SYMBOL(radix_tree_delete);
964 
965 /**
966  *	radix_tree_tagged - test whether any items in the tree are tagged
967  *	@root:		radix tree root
968  *	@tag:		tag to test
969  */
970 int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
971 {
972 	return root_tag_get(root, tag);
973 }
974 EXPORT_SYMBOL(radix_tree_tagged);
975 
976 static void
977 radix_tree_node_ctor(void *node, struct kmem_cache *cachep, unsigned long flags)
978 {
979 	memset(node, 0, sizeof(struct radix_tree_node));
980 }
981 
982 static __init unsigned long __maxindex(unsigned int height)
983 {
984 	unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
985 	unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
986 
987 	if (tmp >= RADIX_TREE_INDEX_BITS)
988 		index = ~0UL;
989 	return index;
990 }
991 
992 static __init void radix_tree_init_maxindex(void)
993 {
994 	unsigned int i;
995 
996 	for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
997 		height_to_maxindex[i] = __maxindex(i);
998 }
999 
1000 static int radix_tree_callback(struct notifier_block *nfb,
1001                             unsigned long action,
1002                             void *hcpu)
1003 {
1004        int cpu = (long)hcpu;
1005        struct radix_tree_preload *rtp;
1006 
1007        /* Free per-cpu pool of perloaded nodes */
1008        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
1009                rtp = &per_cpu(radix_tree_preloads, cpu);
1010                while (rtp->nr) {
1011                        kmem_cache_free(radix_tree_node_cachep,
1012                                        rtp->nodes[rtp->nr-1]);
1013                        rtp->nodes[rtp->nr-1] = NULL;
1014                        rtp->nr--;
1015                }
1016        }
1017        return NOTIFY_OK;
1018 }
1019 
1020 void __init radix_tree_init(void)
1021 {
1022 	radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
1023 			sizeof(struct radix_tree_node), 0,
1024 			SLAB_PANIC, radix_tree_node_ctor);
1025 	radix_tree_init_maxindex();
1026 	hotcpu_notifier(radix_tree_callback, 0);
1027 }
1028