/* xref: /linux/drivers/md/persistent-data/dm-btree.c (revision f2ee442115c9b6219083c019939a9cc0c9abb2f8) */
/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

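/*
 * Worked example (for documentation only, nothing in this file calls it
 * this way): given base = {4, 7, 12} with nr_elts = 3 and elt_size =
 * sizeof(uint64_t), inserting 9 at index 1 first memmoves {7, 12} up one
 * slot and then copies 9 in, leaving {4, 9, 7, 12}.  Callers must ensure
 * the array has room for nr_elts + 1 elements.
 */
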
/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

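/*
 * Illustration of lower_bound() semantics (documentation only): for a
 * node with keys {10, 20, 30}, lower_bound(n, 25) returns 1 (the index
 * of 20, the greatest key <= 25), lower_bound(n, 30) returns 2, and
 * lower_bound(n, 5) returns -1 because every key is greater than 5.
 */
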
void inc_children(struct dm_transaction_manager *tm, struct node *n,
		  struct dm_btree_value_type *vt)
{
	unsigned i;
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		for (i = 0; i < nr_entries; i++)
			dm_tm_inc(tm, value64(n, i));
	else if (vt->inc)
		for (i = 0; i < nr_entries; i++)
			vt->inc(vt->context,
				value_ptr(n, i, vt->size));
}

static int insert_at(size_t value_size, struct node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;		/* rounds down */

	return 3 * n;
}

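/*
 * Worked example (assuming a 4096 byte block and a 32 byte struct
 * node_header): with 8 byte values, elt_size = 8 + 8 = 16, total =
 * (4096 - 32) / 16 = 254, n = 84, so max_entries = 252.
 */
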
int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	return unlock_block(info, b);
}
EXPORT_SYMBOL_GPL(dm_btree_empty);

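/*
 * Minimal usage sketch (illustrative; assumes the caller has already
 * filled in "info", and error handling is elided):
 *
 *	dm_block_t root;
 *	int r = dm_btree_empty(&info, &root);
 *
 * On success, root identifies an empty leaf and may be handed to
 * dm_btree_insert() and friends.
 */
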
/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm; since we have limited stack space
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1) {
		/*
		 * This is a shared node, so we can just decrement its
		 * reference count and leave the children.
		 */
		dm_tm_dec(s->tm, b);

	} else {
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	s->tm = info->tm;
	s->top = -1;

	/* Levels are numbered from 0; leaves sit at level info->levels - 1. */
	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (f->level != (info->levels - 1)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec) {
				unsigned i;

				for (i = 0; i < f->nr_children; i++)
					info->value_type.dec(info->value_type.context,
							     value_ptr(f->n, i, info->value_type.size));
			}
			f->current_child = f->nr_children;
		}
	}

out:
	kfree(s);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

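/*
 * Usage sketch (illustrative): releasing a whole tree when the owning
 * target is being deleted:
 *
 *	r = dm_btree_del(&info, root);
 *
 * This drops a reference on every reachable block, so blocks whose
 * counts fall to zero are returned to the space map.  Shared sub-trees
 * are only decremented, not walked.
 */
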
/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i, value_size), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);

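/*
 * Usage sketch (illustrative): looking up the mapping for block "b" of
 * device "dev" in a two-level tree (info->levels == 2) whose values are
 * __le64:
 *
 *	uint64_t keys[2] = { dev, b };
 *	__le64 value;
 *	int r = dm_btree_lookup(&info, root, keys, &value);
 *
 * r is 0 on success and -ENODATA if the key is absent.  Note the value
 * comes back in its on-disk (little-endian) form.
 */
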
/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *	  +--------+
 *	  | Parent |
 *	  +--------+
 *	     |
 *	     v
 *	+----------+
 *	| A ++++++ |
 *	+----------+
 *
 *
 * After:
 *		+--------+
 *		| Parent |
 *		+--------+
 *		  |	|
 *		  v	+------+
 *	    +---------+	       |
 *	    | A* +++  |	       v
 *	    +---------+	  +-------+
 *			  | B +++ |
 *			  +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s,
			       unsigned parent_index, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size),
	       size * nr_right);

	/*
	 * Patch up the parent
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		unlock_block(s->info, right);
		return r;
	}

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *	  +----------+
 *	  | A ++++++ |
 *	  +----------+
 *
 *
 * After:
 *	+------------+
 *	| A (shadow) |
 *	+------------+
 *	    |	|
 *   +------+	+----+
 *   |		     |
 *   v		     v
 * +-------+	 +-------+
 * | B +++ |	 | C +++ |
 * +-------+	 +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	pn = dm_block_data(new_parent);
	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;

	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;
	memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size);
	memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64));

	/*
	 * Rejig the spine.  This is ugly, since it knows too
	 * much about the spine.
	 */
	if (s->nodes[0] != new_parent) {
		unlock_block(s->info, s->nodes[0]);
		s->nodes[0] = new_parent;
	}
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}
	s->count = 2;

	return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node; ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary. */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)),
				    &location, sizeof(__le64));
		}

		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, i, key);

			if (r < 0)
				return r;
		}

		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

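/*
 * On return from btree_insert_raw() the bottom of the shadow spine is a
 * leaf, and *index is either the slot that already holds "key" or the
 * slot where it should be inserted.  For example (illustrative): with
 * leaf keys {10, 20}, key 20 yields index 1 (overwrite) while key 15
 * also yields index 1 (shift 20 up and insert before it).
 */
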
static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
		  __dm_written_to_disk(value)
{
	int r, need_insert;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct node *n;
	struct dm_btree_value_type le64_type;

	le64_type.context = NULL;
	le64_type.size = sizeof(__le64);
	le64_type.inc = NULL;
	le64_type.dec = NULL;
	le64_type.equal = NULL;

	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));
		need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
			       (le64_to_cpu(n->keys[index]) != keys[level]));

		if (need_insert) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(__le64), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));
	need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
		       (le64_to_cpu(n->keys[index]) != keys[level]));

	if (need_insert) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index, info->value_type.size),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index, info->value_type.size));
		}
		memcpy_disk(value_ptr(n, index, info->value_type.size),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
		    __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
			   __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);

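/*
 * Insertion usage sketch (illustrative; assumes a single-level tree with
 * __le64 values):
 *
 *	uint64_t keys[1] = { 17 };
 *	__le64 value = cpu_to_le64(1234);
 *	dm_block_t new_root;
 *	int r;
 *
 *	__dm_bless_for_disk(&value);
 *	r = dm_btree_insert(&info, root, keys, &value, &new_root);
 *
 * Because the tree is copy-on-write, the (possibly unchanged) root comes
 * back in new_root and the caller must switch over to it.
 */
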
/*----------------------------------------------------------------*/

static int find_highest_key(struct ro_spine *s, dm_block_t block,
			    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		if (next_block || flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_highest_key(&spine, root, result_keys + level,
				     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
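
/*
 * Usage sketch (illustrative): for a two-level tree,
 *
 *	uint64_t keys[2];
 *	int r = dm_btree_find_highest_key(&info, root, keys);
 *
 * fills keys[] with the highest key at each level and returns the number
 * of levels filled (here 2), or a negative errno.
 */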
806