xref: /linux/fs/xfs/libxfs/xfs_iext_tree.c (revision e3b9f1e81de2083f359bacd2a94bf1c024f2ede0)
1 /*
2  * Copyright (c) 2017 Christoph Hellwig.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  */
13 
14 #include <linux/cache.h>
15 #include <linux/kernel.h>
16 #include <linux/slab.h>
17 #include "xfs.h"
18 #include "xfs_format.h"
19 #include "xfs_bit.h"
20 #include "xfs_log_format.h"
21 #include "xfs_inode.h"
22 #include "xfs_inode_fork.h"
23 #include "xfs_trans_resv.h"
24 #include "xfs_mount.h"
25 #include "xfs_trace.h"
26 
27 /*
28  * In-core extent record layout:
29  *
30  * +-------+----------------------------+
31  * | 00:53 | all 54 bits of startoff    |
32  * | 54:63 | low 10 bits of startblock  |
33  * +-------+----------------------------+
34  * | 00:20 | all 21 bits of length      |
35  * |    21 | unwritten extent bit       |
36  * | 22:63 | high 42 bits of startblock |
37  * +-------+----------------------------+
38  */
39 #define XFS_IEXT_STARTOFF_MASK		xfs_mask64lo(BMBT_STARTOFF_BITLEN)
40 #define XFS_IEXT_LENGTH_MASK		xfs_mask64lo(BMBT_BLOCKCOUNT_BITLEN)
41 #define XFS_IEXT_STARTBLOCK_MASK	xfs_mask64lo(BMBT_STARTBLOCK_BITLEN)
42 
43 struct xfs_iext_rec {
44 	uint64_t			lo;
45 	uint64_t			hi;
46 };
47 
48 /*
49  * Given that the length can't be a zero, only an empty hi value indicates an
50  * unused record.
51  */
52 static bool xfs_iext_rec_is_empty(struct xfs_iext_rec *rec)
53 {
54 	return rec->hi == 0;
55 }
56 
57 static inline void xfs_iext_rec_clear(struct xfs_iext_rec *rec)
58 {
59 	rec->lo = 0;
60 	rec->hi = 0;
61 }
62 
63 static void
64 xfs_iext_set(
65 	struct xfs_iext_rec	*rec,
66 	struct xfs_bmbt_irec	*irec)
67 {
68 	ASSERT((irec->br_startoff & ~XFS_IEXT_STARTOFF_MASK) == 0);
69 	ASSERT((irec->br_blockcount & ~XFS_IEXT_LENGTH_MASK) == 0);
70 	ASSERT((irec->br_startblock & ~XFS_IEXT_STARTBLOCK_MASK) == 0);
71 
72 	rec->lo = irec->br_startoff & XFS_IEXT_STARTOFF_MASK;
73 	rec->hi = irec->br_blockcount & XFS_IEXT_LENGTH_MASK;
74 
75 	rec->lo |= (irec->br_startblock << 54);
76 	rec->hi |= ((irec->br_startblock & ~xfs_mask64lo(10)) << (22 - 10));
77 
78 	if (irec->br_state == XFS_EXT_UNWRITTEN)
79 		rec->hi |= (1 << 21);
80 }
81 
82 static void
83 xfs_iext_get(
84 	struct xfs_bmbt_irec	*irec,
85 	struct xfs_iext_rec	*rec)
86 {
87 	irec->br_startoff = rec->lo & XFS_IEXT_STARTOFF_MASK;
88 	irec->br_blockcount = rec->hi & XFS_IEXT_LENGTH_MASK;
89 
90 	irec->br_startblock = rec->lo >> 54;
91 	irec->br_startblock |= (rec->hi & xfs_mask64hi(42)) >> (22 - 10);
92 
93 	if (rec->hi & (1 << 21))
94 		irec->br_state = XFS_EXT_UNWRITTEN;
95 	else
96 		irec->br_state = XFS_EXT_NORM;
97 }
98 
99 enum {
100 	NODE_SIZE	= 256,
101 	KEYS_PER_NODE	= NODE_SIZE / (sizeof(uint64_t) + sizeof(void *)),
102 	RECS_PER_LEAF	= (NODE_SIZE - (2 * sizeof(struct xfs_iext_leaf *))) /
103 				sizeof(struct xfs_iext_rec),
104 };
105 
106 /*
107  * In-core extent btree block layout:
108  *
109  * There are two types of blocks in the btree: leaf and inner (non-leaf) blocks.
110  *
111  * The leaf blocks are made up by %KEYS_PER_NODE extent records, which each
112  * contain the startoffset, blockcount, startblock and unwritten extent flag.
113  * See above for the exact format, followed by pointers to the previous and next
114  * leaf blocks (if there are any).
115  *
116  * The inner (non-leaf) blocks first contain KEYS_PER_NODE lookup keys, followed
117  * by an equal number of pointers to the btree blocks at the next lower level.
118  *
119  *		+-------+-------+-------+-------+-------+----------+----------+
120  * Leaf:	| rec 1 | rec 2 | rec 3 | rec 4 | rec N | prev-ptr | next-ptr |
121  *		+-------+-------+-------+-------+-------+----------+----------+
122  *
123  *		+-------+-------+-------+-------+-------+-------+------+-------+
124  * Inner:	| key 1 | key 2 | key 3 | key N | ptr 1 | ptr 2 | ptr3 | ptr N |
125  *		+-------+-------+-------+-------+-------+-------+------+-------+
126  */
127 struct xfs_iext_node {
128 	uint64_t		keys[KEYS_PER_NODE];
129 #define XFS_IEXT_KEY_INVALID	(1ULL << 63)
130 	void			*ptrs[KEYS_PER_NODE];
131 };
132 
133 struct xfs_iext_leaf {
134 	struct xfs_iext_rec	recs[RECS_PER_LEAF];
135 	struct xfs_iext_leaf	*prev;
136 	struct xfs_iext_leaf	*next;
137 };
138 
139 inline xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp)
140 {
141 	return ifp->if_bytes / sizeof(struct xfs_iext_rec);
142 }
143 
144 static inline int xfs_iext_max_recs(struct xfs_ifork *ifp)
145 {
146 	if (ifp->if_height == 1)
147 		return xfs_iext_count(ifp);
148 	return RECS_PER_LEAF;
149 }
150 
151 static inline struct xfs_iext_rec *cur_rec(struct xfs_iext_cursor *cur)
152 {
153 	return &cur->leaf->recs[cur->pos];
154 }
155 
156 static inline bool xfs_iext_valid(struct xfs_ifork *ifp,
157 		struct xfs_iext_cursor *cur)
158 {
159 	if (!cur->leaf)
160 		return false;
161 	if (cur->pos < 0 || cur->pos >= xfs_iext_max_recs(ifp))
162 		return false;
163 	if (xfs_iext_rec_is_empty(cur_rec(cur)))
164 		return false;
165 	return true;
166 }
167 
168 static void *
169 xfs_iext_find_first_leaf(
170 	struct xfs_ifork	*ifp)
171 {
172 	struct xfs_iext_node	*node = ifp->if_u1.if_root;
173 	int			height;
174 
175 	if (!ifp->if_height)
176 		return NULL;
177 
178 	for (height = ifp->if_height; height > 1; height--) {
179 		node = node->ptrs[0];
180 		ASSERT(node);
181 	}
182 
183 	return node;
184 }
185 
186 static void *
187 xfs_iext_find_last_leaf(
188 	struct xfs_ifork	*ifp)
189 {
190 	struct xfs_iext_node	*node = ifp->if_u1.if_root;
191 	int			height, i;
192 
193 	if (!ifp->if_height)
194 		return NULL;
195 
196 	for (height = ifp->if_height; height > 1; height--) {
197 		for (i = 1; i < KEYS_PER_NODE; i++)
198 			if (!node->ptrs[i])
199 				break;
200 		node = node->ptrs[i - 1];
201 		ASSERT(node);
202 	}
203 
204 	return node;
205 }
206 
207 void
208 xfs_iext_first(
209 	struct xfs_ifork	*ifp,
210 	struct xfs_iext_cursor	*cur)
211 {
212 	cur->pos = 0;
213 	cur->leaf = xfs_iext_find_first_leaf(ifp);
214 }
215 
216 void
217 xfs_iext_last(
218 	struct xfs_ifork	*ifp,
219 	struct xfs_iext_cursor	*cur)
220 {
221 	int			i;
222 
223 	cur->leaf = xfs_iext_find_last_leaf(ifp);
224 	if (!cur->leaf) {
225 		cur->pos = 0;
226 		return;
227 	}
228 
229 	for (i = 1; i < xfs_iext_max_recs(ifp); i++) {
230 		if (xfs_iext_rec_is_empty(&cur->leaf->recs[i]))
231 			break;
232 	}
233 	cur->pos = i - 1;
234 }
235 
236 void
237 xfs_iext_next(
238 	struct xfs_ifork	*ifp,
239 	struct xfs_iext_cursor	*cur)
240 {
241 	if (!cur->leaf) {
242 		ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
243 		xfs_iext_first(ifp, cur);
244 		return;
245 	}
246 
247 	ASSERT(cur->pos >= 0);
248 	ASSERT(cur->pos < xfs_iext_max_recs(ifp));
249 
250 	cur->pos++;
251 	if (ifp->if_height > 1 && !xfs_iext_valid(ifp, cur) &&
252 	    cur->leaf->next) {
253 		cur->leaf = cur->leaf->next;
254 		cur->pos = 0;
255 	}
256 }
257 
258 void
259 xfs_iext_prev(
260 	struct xfs_ifork	*ifp,
261 	struct xfs_iext_cursor	*cur)
262 {
263 	if (!cur->leaf) {
264 		ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
265 		xfs_iext_last(ifp, cur);
266 		return;
267 	}
268 
269 	ASSERT(cur->pos >= 0);
270 	ASSERT(cur->pos <= RECS_PER_LEAF);
271 
272 recurse:
273 	do {
274 		cur->pos--;
275 		if (xfs_iext_valid(ifp, cur))
276 			return;
277 	} while (cur->pos > 0);
278 
279 	if (ifp->if_height > 1 && cur->leaf->prev) {
280 		cur->leaf = cur->leaf->prev;
281 		cur->pos = RECS_PER_LEAF;
282 		goto recurse;
283 	}
284 }
285 
286 static inline int
287 xfs_iext_key_cmp(
288 	struct xfs_iext_node	*node,
289 	int			n,
290 	xfs_fileoff_t		offset)
291 {
292 	if (node->keys[n] > offset)
293 		return 1;
294 	if (node->keys[n] < offset)
295 		return -1;
296 	return 0;
297 }
298 
299 static inline int
300 xfs_iext_rec_cmp(
301 	struct xfs_iext_rec	*rec,
302 	xfs_fileoff_t		offset)
303 {
304 	uint64_t		rec_offset = rec->lo & XFS_IEXT_STARTOFF_MASK;
305 	uint32_t		rec_len = rec->hi & XFS_IEXT_LENGTH_MASK;
306 
307 	if (rec_offset > offset)
308 		return 1;
309 	if (rec_offset + rec_len <= offset)
310 		return -1;
311 	return 0;
312 }
313 
314 static void *
315 xfs_iext_find_level(
316 	struct xfs_ifork	*ifp,
317 	xfs_fileoff_t		offset,
318 	int			level)
319 {
320 	struct xfs_iext_node	*node = ifp->if_u1.if_root;
321 	int			height, i;
322 
323 	if (!ifp->if_height)
324 		return NULL;
325 
326 	for (height = ifp->if_height; height > level; height--) {
327 		for (i = 1; i < KEYS_PER_NODE; i++)
328 			if (xfs_iext_key_cmp(node, i, offset) > 0)
329 				break;
330 
331 		node = node->ptrs[i - 1];
332 		if (!node)
333 			break;
334 	}
335 
336 	return node;
337 }
338 
339 static int
340 xfs_iext_node_pos(
341 	struct xfs_iext_node	*node,
342 	xfs_fileoff_t		offset)
343 {
344 	int			i;
345 
346 	for (i = 1; i < KEYS_PER_NODE; i++) {
347 		if (xfs_iext_key_cmp(node, i, offset) > 0)
348 			break;
349 	}
350 
351 	return i - 1;
352 }
353 
354 static int
355 xfs_iext_node_insert_pos(
356 	struct xfs_iext_node	*node,
357 	xfs_fileoff_t		offset)
358 {
359 	int			i;
360 
361 	for (i = 0; i < KEYS_PER_NODE; i++) {
362 		if (xfs_iext_key_cmp(node, i, offset) > 0)
363 			return i;
364 	}
365 
366 	return KEYS_PER_NODE;
367 }
368 
369 static int
370 xfs_iext_node_nr_entries(
371 	struct xfs_iext_node	*node,
372 	int			start)
373 {
374 	int			i;
375 
376 	for (i = start; i < KEYS_PER_NODE; i++) {
377 		if (node->keys[i] == XFS_IEXT_KEY_INVALID)
378 			break;
379 	}
380 
381 	return i;
382 }
383 
384 static int
385 xfs_iext_leaf_nr_entries(
386 	struct xfs_ifork	*ifp,
387 	struct xfs_iext_leaf	*leaf,
388 	int			start)
389 {
390 	int			i;
391 
392 	for (i = start; i < xfs_iext_max_recs(ifp); i++) {
393 		if (xfs_iext_rec_is_empty(&leaf->recs[i]))
394 			break;
395 	}
396 
397 	return i;
398 }
399 
400 static inline uint64_t
401 xfs_iext_leaf_key(
402 	struct xfs_iext_leaf	*leaf,
403 	int			n)
404 {
405 	return leaf->recs[n].lo & XFS_IEXT_STARTOFF_MASK;
406 }
407 
408 static void
409 xfs_iext_grow(
410 	struct xfs_ifork	*ifp)
411 {
412 	struct xfs_iext_node	*node = kmem_zalloc(NODE_SIZE, KM_NOFS);
413 	int			i;
414 
415 	if (ifp->if_height == 1) {
416 		struct xfs_iext_leaf *prev = ifp->if_u1.if_root;
417 
418 		node->keys[0] = xfs_iext_leaf_key(prev, 0);
419 		node->ptrs[0] = prev;
420 	} else  {
421 		struct xfs_iext_node *prev = ifp->if_u1.if_root;
422 
423 		ASSERT(ifp->if_height > 1);
424 
425 		node->keys[0] = prev->keys[0];
426 		node->ptrs[0] = prev;
427 	}
428 
429 	for (i = 1; i < KEYS_PER_NODE; i++)
430 		node->keys[i] = XFS_IEXT_KEY_INVALID;
431 
432 	ifp->if_u1.if_root = node;
433 	ifp->if_height++;
434 }
435 
436 static void
437 xfs_iext_update_node(
438 	struct xfs_ifork	*ifp,
439 	xfs_fileoff_t		old_offset,
440 	xfs_fileoff_t		new_offset,
441 	int			level,
442 	void			*ptr)
443 {
444 	struct xfs_iext_node	*node = ifp->if_u1.if_root;
445 	int			height, i;
446 
447 	for (height = ifp->if_height; height > level; height--) {
448 		for (i = 0; i < KEYS_PER_NODE; i++) {
449 			if (i > 0 && xfs_iext_key_cmp(node, i, old_offset) > 0)
450 				break;
451 			if (node->keys[i] == old_offset)
452 				node->keys[i] = new_offset;
453 		}
454 		node = node->ptrs[i - 1];
455 		ASSERT(node);
456 	}
457 
458 	ASSERT(node == ptr);
459 }
460 
461 static struct xfs_iext_node *
462 xfs_iext_split_node(
463 	struct xfs_iext_node	**nodep,
464 	int			*pos,
465 	int			*nr_entries)
466 {
467 	struct xfs_iext_node	*node = *nodep;
468 	struct xfs_iext_node	*new = kmem_zalloc(NODE_SIZE, KM_NOFS);
469 	const int		nr_move = KEYS_PER_NODE / 2;
470 	int			nr_keep = nr_move + (KEYS_PER_NODE & 1);
471 	int			i = 0;
472 
473 	/* for sequential append operations just spill over into the new node */
474 	if (*pos == KEYS_PER_NODE) {
475 		*nodep = new;
476 		*pos = 0;
477 		*nr_entries = 0;
478 		goto done;
479 	}
480 
481 
482 	for (i = 0; i < nr_move; i++) {
483 		new->keys[i] = node->keys[nr_keep + i];
484 		new->ptrs[i] = node->ptrs[nr_keep + i];
485 
486 		node->keys[nr_keep + i] = XFS_IEXT_KEY_INVALID;
487 		node->ptrs[nr_keep + i] = NULL;
488 	}
489 
490 	if (*pos >= nr_keep) {
491 		*nodep = new;
492 		*pos -= nr_keep;
493 		*nr_entries = nr_move;
494 	} else {
495 		*nr_entries = nr_keep;
496 	}
497 done:
498 	for (; i < KEYS_PER_NODE; i++)
499 		new->keys[i] = XFS_IEXT_KEY_INVALID;
500 	return new;
501 }
502 
503 static void
504 xfs_iext_insert_node(
505 	struct xfs_ifork	*ifp,
506 	uint64_t		offset,
507 	void			*ptr,
508 	int			level)
509 {
510 	struct xfs_iext_node	*node, *new;
511 	int			i, pos, nr_entries;
512 
513 again:
514 	if (ifp->if_height < level)
515 		xfs_iext_grow(ifp);
516 
517 	new = NULL;
518 	node = xfs_iext_find_level(ifp, offset, level);
519 	pos = xfs_iext_node_insert_pos(node, offset);
520 	nr_entries = xfs_iext_node_nr_entries(node, pos);
521 
522 	ASSERT(pos >= nr_entries || xfs_iext_key_cmp(node, pos, offset) != 0);
523 	ASSERT(nr_entries <= KEYS_PER_NODE);
524 
525 	if (nr_entries == KEYS_PER_NODE)
526 		new = xfs_iext_split_node(&node, &pos, &nr_entries);
527 
528 	/*
529 	 * Update the pointers in higher levels if the first entry changes
530 	 * in an existing node.
531 	 */
532 	if (node != new && pos == 0 && nr_entries > 0)
533 		xfs_iext_update_node(ifp, node->keys[0], offset, level, node);
534 
535 	for (i = nr_entries; i > pos; i--) {
536 		node->keys[i] = node->keys[i - 1];
537 		node->ptrs[i] = node->ptrs[i - 1];
538 	}
539 	node->keys[pos] = offset;
540 	node->ptrs[pos] = ptr;
541 
542 	if (new) {
543 		offset = new->keys[0];
544 		ptr = new;
545 		level++;
546 		goto again;
547 	}
548 }
549 
550 static struct xfs_iext_leaf *
551 xfs_iext_split_leaf(
552 	struct xfs_iext_cursor	*cur,
553 	int			*nr_entries)
554 {
555 	struct xfs_iext_leaf	*leaf = cur->leaf;
556 	struct xfs_iext_leaf	*new = kmem_zalloc(NODE_SIZE, KM_NOFS);
557 	const int		nr_move = RECS_PER_LEAF / 2;
558 	int			nr_keep = nr_move + (RECS_PER_LEAF & 1);
559 	int			i;
560 
561 	/* for sequential append operations just spill over into the new node */
562 	if (cur->pos == RECS_PER_LEAF) {
563 		cur->leaf = new;
564 		cur->pos = 0;
565 		*nr_entries = 0;
566 		goto done;
567 	}
568 
569 	for (i = 0; i < nr_move; i++) {
570 		new->recs[i] = leaf->recs[nr_keep + i];
571 		xfs_iext_rec_clear(&leaf->recs[nr_keep + i]);
572 	}
573 
574 	if (cur->pos >= nr_keep) {
575 		cur->leaf = new;
576 		cur->pos -= nr_keep;
577 		*nr_entries = nr_move;
578 	} else {
579 		*nr_entries = nr_keep;
580 	}
581 done:
582 	if (leaf->next)
583 		leaf->next->prev = new;
584 	new->next = leaf->next;
585 	new->prev = leaf;
586 	leaf->next = new;
587 	return new;
588 }
589 
590 static void
591 xfs_iext_alloc_root(
592 	struct xfs_ifork	*ifp,
593 	struct xfs_iext_cursor	*cur)
594 {
595 	ASSERT(ifp->if_bytes == 0);
596 
597 	ifp->if_u1.if_root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS);
598 	ifp->if_height = 1;
599 
600 	/* now that we have a node step into it */
601 	cur->leaf = ifp->if_u1.if_root;
602 	cur->pos = 0;
603 }
604 
605 static void
606 xfs_iext_realloc_root(
607 	struct xfs_ifork	*ifp,
608 	struct xfs_iext_cursor	*cur)
609 {
610 	size_t new_size = ifp->if_bytes + sizeof(struct xfs_iext_rec);
611 	void *new;
612 
613 	/* account for the prev/next pointers */
614 	if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
615 		new_size = NODE_SIZE;
616 
617 	new = kmem_realloc(ifp->if_u1.if_root, new_size, KM_NOFS);
618 	memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
619 	ifp->if_u1.if_root = new;
620 	cur->leaf = new;
621 }
622 
623 void
624 xfs_iext_insert(
625 	struct xfs_inode	*ip,
626 	struct xfs_iext_cursor	*cur,
627 	struct xfs_bmbt_irec	*irec,
628 	int			state)
629 {
630 	struct xfs_ifork	*ifp = xfs_iext_state_to_fork(ip, state);
631 	xfs_fileoff_t		offset = irec->br_startoff;
632 	struct xfs_iext_leaf	*new = NULL;
633 	int			nr_entries, i;
634 
635 	if (ifp->if_height == 0)
636 		xfs_iext_alloc_root(ifp, cur);
637 	else if (ifp->if_height == 1)
638 		xfs_iext_realloc_root(ifp, cur);
639 
640 	nr_entries = xfs_iext_leaf_nr_entries(ifp, cur->leaf, cur->pos);
641 	ASSERT(nr_entries <= RECS_PER_LEAF);
642 	ASSERT(cur->pos >= nr_entries ||
643 	       xfs_iext_rec_cmp(cur_rec(cur), irec->br_startoff) != 0);
644 
645 	if (nr_entries == RECS_PER_LEAF)
646 		new = xfs_iext_split_leaf(cur, &nr_entries);
647 
648 	/*
649 	 * Update the pointers in higher levels if the first entry changes
650 	 * in an existing node.
651 	 */
652 	if (cur->leaf != new && cur->pos == 0 && nr_entries > 0) {
653 		xfs_iext_update_node(ifp, xfs_iext_leaf_key(cur->leaf, 0),
654 				offset, 1, cur->leaf);
655 	}
656 
657 	for (i = nr_entries; i > cur->pos; i--)
658 		cur->leaf->recs[i] = cur->leaf->recs[i - 1];
659 	xfs_iext_set(cur_rec(cur), irec);
660 	ifp->if_bytes += sizeof(struct xfs_iext_rec);
661 
662 	trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
663 
664 	if (new)
665 		xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2);
666 }
667 
668 static struct xfs_iext_node *
669 xfs_iext_rebalance_node(
670 	struct xfs_iext_node	*parent,
671 	int			*pos,
672 	struct xfs_iext_node	*node,
673 	int			nr_entries)
674 {
675 	/*
676 	 * If the neighbouring nodes are completely full, or have different
677 	 * parents, we might never be able to merge our node, and will only
678 	 * delete it once the number of entries hits zero.
679 	 */
680 	if (nr_entries == 0)
681 		return node;
682 
683 	if (*pos > 0) {
684 		struct xfs_iext_node *prev = parent->ptrs[*pos - 1];
685 		int nr_prev = xfs_iext_node_nr_entries(prev, 0), i;
686 
687 		if (nr_prev + nr_entries <= KEYS_PER_NODE) {
688 			for (i = 0; i < nr_entries; i++) {
689 				prev->keys[nr_prev + i] = node->keys[i];
690 				prev->ptrs[nr_prev + i] = node->ptrs[i];
691 			}
692 			return node;
693 		}
694 	}
695 
696 	if (*pos + 1 < xfs_iext_node_nr_entries(parent, *pos)) {
697 		struct xfs_iext_node *next = parent->ptrs[*pos + 1];
698 		int nr_next = xfs_iext_node_nr_entries(next, 0), i;
699 
700 		if (nr_entries + nr_next <= KEYS_PER_NODE) {
701 			/*
702 			 * Merge the next node into this node so that we don't
703 			 * have to do an additional update of the keys in the
704 			 * higher levels.
705 			 */
706 			for (i = 0; i < nr_next; i++) {
707 				node->keys[nr_entries + i] = next->keys[i];
708 				node->ptrs[nr_entries + i] = next->ptrs[i];
709 			}
710 
711 			++*pos;
712 			return next;
713 		}
714 	}
715 
716 	return NULL;
717 }
718 
719 static void
720 xfs_iext_remove_node(
721 	struct xfs_ifork	*ifp,
722 	xfs_fileoff_t		offset,
723 	void			*victim)
724 {
725 	struct xfs_iext_node	*node, *parent;
726 	int			level = 2, pos, nr_entries, i;
727 
728 	ASSERT(level <= ifp->if_height);
729 	node = xfs_iext_find_level(ifp, offset, level);
730 	pos = xfs_iext_node_pos(node, offset);
731 again:
732 	ASSERT(node->ptrs[pos]);
733 	ASSERT(node->ptrs[pos] == victim);
734 	kmem_free(victim);
735 
736 	nr_entries = xfs_iext_node_nr_entries(node, pos) - 1;
737 	offset = node->keys[0];
738 	for (i = pos; i < nr_entries; i++) {
739 		node->keys[i] = node->keys[i + 1];
740 		node->ptrs[i] = node->ptrs[i + 1];
741 	}
742 	node->keys[nr_entries] = XFS_IEXT_KEY_INVALID;
743 	node->ptrs[nr_entries] = NULL;
744 
745 	if (pos == 0 && nr_entries > 0) {
746 		xfs_iext_update_node(ifp, offset, node->keys[0], level, node);
747 		offset = node->keys[0];
748 	}
749 
750 	if (nr_entries >= KEYS_PER_NODE / 2)
751 		return;
752 
753 	if (level < ifp->if_height) {
754 		/*
755 		 * If we aren't at the root yet try to find a neighbour node to
756 		 * merge with (or delete the node if it is empty), and then
757 		 * recurse up to the next level.
758 		 */
759 		level++;
760 		parent = xfs_iext_find_level(ifp, offset, level);
761 		pos = xfs_iext_node_pos(parent, offset);
762 
763 		ASSERT(pos != KEYS_PER_NODE);
764 		ASSERT(parent->ptrs[pos] == node);
765 
766 		node = xfs_iext_rebalance_node(parent, &pos, node, nr_entries);
767 		if (node) {
768 			victim = node;
769 			node = parent;
770 			goto again;
771 		}
772 	} else if (nr_entries == 1) {
773 		/*
774 		 * If we are at the root and only one entry is left we can just
775 		 * free this node and update the root pointer.
776 		 */
777 		ASSERT(node == ifp->if_u1.if_root);
778 		ifp->if_u1.if_root = node->ptrs[0];
779 		ifp->if_height--;
780 		kmem_free(node);
781 	}
782 }
783 
784 static void
785 xfs_iext_rebalance_leaf(
786 	struct xfs_ifork	*ifp,
787 	struct xfs_iext_cursor	*cur,
788 	struct xfs_iext_leaf	*leaf,
789 	xfs_fileoff_t		offset,
790 	int			nr_entries)
791 {
792 	/*
793 	 * If the neighbouring nodes are completely full we might never be able
794 	 * to merge our node, and will only delete it once the number of
795 	 * entries hits zero.
796 	 */
797 	if (nr_entries == 0)
798 		goto remove_node;
799 
800 	if (leaf->prev) {
801 		int nr_prev = xfs_iext_leaf_nr_entries(ifp, leaf->prev, 0), i;
802 
803 		if (nr_prev + nr_entries <= RECS_PER_LEAF) {
804 			for (i = 0; i < nr_entries; i++)
805 				leaf->prev->recs[nr_prev + i] = leaf->recs[i];
806 
807 			if (cur->leaf == leaf) {
808 				cur->leaf = leaf->prev;
809 				cur->pos += nr_prev;
810 			}
811 			goto remove_node;
812 		}
813 	}
814 
815 	if (leaf->next) {
816 		int nr_next = xfs_iext_leaf_nr_entries(ifp, leaf->next, 0), i;
817 
818 		if (nr_entries + nr_next <= RECS_PER_LEAF) {
819 			/*
820 			 * Merge the next node into this node so that we don't
821 			 * have to do an additional update of the keys in the
822 			 * higher levels.
823 			 */
824 			for (i = 0; i < nr_next; i++) {
825 				leaf->recs[nr_entries + i] =
826 					leaf->next->recs[i];
827 			}
828 
829 			if (cur->leaf == leaf->next) {
830 				cur->leaf = leaf;
831 				cur->pos += nr_entries;
832 			}
833 
834 			offset = xfs_iext_leaf_key(leaf->next, 0);
835 			leaf = leaf->next;
836 			goto remove_node;
837 		}
838 	}
839 
840 	return;
841 remove_node:
842 	if (leaf->prev)
843 		leaf->prev->next = leaf->next;
844 	if (leaf->next)
845 		leaf->next->prev = leaf->prev;
846 	xfs_iext_remove_node(ifp, offset, leaf);
847 }
848 
849 static void
850 xfs_iext_free_last_leaf(
851 	struct xfs_ifork	*ifp)
852 {
853 	ifp->if_height--;
854 	kmem_free(ifp->if_u1.if_root);
855 	ifp->if_u1.if_root = NULL;
856 }
857 
858 void
859 xfs_iext_remove(
860 	struct xfs_inode	*ip,
861 	struct xfs_iext_cursor	*cur,
862 	int			state)
863 {
864 	struct xfs_ifork	*ifp = xfs_iext_state_to_fork(ip, state);
865 	struct xfs_iext_leaf	*leaf = cur->leaf;
866 	xfs_fileoff_t		offset = xfs_iext_leaf_key(leaf, 0);
867 	int			i, nr_entries;
868 
869 	trace_xfs_iext_remove(ip, cur, state, _RET_IP_);
870 
871 	ASSERT(ifp->if_height > 0);
872 	ASSERT(ifp->if_u1.if_root != NULL);
873 	ASSERT(xfs_iext_valid(ifp, cur));
874 
875 	nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
876 	for (i = cur->pos; i < nr_entries; i++)
877 		leaf->recs[i] = leaf->recs[i + 1];
878 	xfs_iext_rec_clear(&leaf->recs[nr_entries]);
879 	ifp->if_bytes -= sizeof(struct xfs_iext_rec);
880 
881 	if (cur->pos == 0 && nr_entries > 0) {
882 		xfs_iext_update_node(ifp, offset, xfs_iext_leaf_key(leaf, 0), 1,
883 				leaf);
884 		offset = xfs_iext_leaf_key(leaf, 0);
885 	} else if (cur->pos == nr_entries) {
886 		if (ifp->if_height > 1 && leaf->next)
887 			cur->leaf = leaf->next;
888 		else
889 			cur->leaf = NULL;
890 		cur->pos = 0;
891 	}
892 
893 	if (nr_entries >= RECS_PER_LEAF / 2)
894 		return;
895 
896 	if (ifp->if_height > 1)
897 		xfs_iext_rebalance_leaf(ifp, cur, leaf, offset, nr_entries);
898 	else if (nr_entries == 0)
899 		xfs_iext_free_last_leaf(ifp);
900 }
901 
902 /*
903  * Lookup the extent covering bno.
904  *
905  * If there is an extent covering bno return the extent index, and store the
906  * expanded extent structure in *gotp, and the extent cursor in *cur.
907  * If there is no extent covering bno, but there is an extent after it (e.g.
908  * it lies in a hole) return that extent in *gotp and its cursor in *cur
909  * instead.
910  * If bno is beyond the last extent return false, and return an invalid
911  * cursor value.
912  */
913 bool
914 xfs_iext_lookup_extent(
915 	struct xfs_inode	*ip,
916 	struct xfs_ifork	*ifp,
917 	xfs_fileoff_t		offset,
918 	struct xfs_iext_cursor	*cur,
919 	struct xfs_bmbt_irec	*gotp)
920 {
921 	XFS_STATS_INC(ip->i_mount, xs_look_exlist);
922 
923 	cur->leaf = xfs_iext_find_level(ifp, offset, 1);
924 	if (!cur->leaf) {
925 		cur->pos = 0;
926 		return false;
927 	}
928 
929 	for (cur->pos = 0; cur->pos < xfs_iext_max_recs(ifp); cur->pos++) {
930 		struct xfs_iext_rec *rec = cur_rec(cur);
931 
932 		if (xfs_iext_rec_is_empty(rec))
933 			break;
934 		if (xfs_iext_rec_cmp(rec, offset) >= 0)
935 			goto found;
936 	}
937 
938 	/* Try looking in the next node for an entry > offset */
939 	if (ifp->if_height == 1 || !cur->leaf->next)
940 		return false;
941 	cur->leaf = cur->leaf->next;
942 	cur->pos = 0;
943 	if (!xfs_iext_valid(ifp, cur))
944 		return false;
945 found:
946 	xfs_iext_get(gotp, cur_rec(cur));
947 	return true;
948 }
949 
950 /*
951  * Returns the last extent before end, and if this extent doesn't cover
952  * end, update end to the end of the extent.
953  */
954 bool
955 xfs_iext_lookup_extent_before(
956 	struct xfs_inode	*ip,
957 	struct xfs_ifork	*ifp,
958 	xfs_fileoff_t		*end,
959 	struct xfs_iext_cursor	*cur,
960 	struct xfs_bmbt_irec	*gotp)
961 {
962 	/* could be optimized to not even look up the next on a match.. */
963 	if (xfs_iext_lookup_extent(ip, ifp, *end - 1, cur, gotp) &&
964 	    gotp->br_startoff <= *end - 1)
965 		return true;
966 	if (!xfs_iext_prev_extent(ifp, cur, gotp))
967 		return false;
968 	*end = gotp->br_startoff + gotp->br_blockcount;
969 	return true;
970 }
971 
972 void
973 xfs_iext_update_extent(
974 	struct xfs_inode	*ip,
975 	int			state,
976 	struct xfs_iext_cursor	*cur,
977 	struct xfs_bmbt_irec	*new)
978 {
979 	struct xfs_ifork	*ifp = xfs_iext_state_to_fork(ip, state);
980 
981 	if (cur->pos == 0) {
982 		struct xfs_bmbt_irec	old;
983 
984 		xfs_iext_get(&old, cur_rec(cur));
985 		if (new->br_startoff != old.br_startoff) {
986 			xfs_iext_update_node(ifp, old.br_startoff,
987 					new->br_startoff, 1, cur->leaf);
988 		}
989 	}
990 
991 	trace_xfs_bmap_pre_update(ip, cur, state, _RET_IP_);
992 	xfs_iext_set(cur_rec(cur), new);
993 	trace_xfs_bmap_post_update(ip, cur, state, _RET_IP_);
994 }
995 
996 /*
997  * Return true if the cursor points at an extent and return the extent structure
998  * in gotp.  Else return false.
999  */
1000 bool
1001 xfs_iext_get_extent(
1002 	struct xfs_ifork	*ifp,
1003 	struct xfs_iext_cursor	*cur,
1004 	struct xfs_bmbt_irec	*gotp)
1005 {
1006 	if (!xfs_iext_valid(ifp, cur))
1007 		return false;
1008 	xfs_iext_get(gotp, cur_rec(cur));
1009 	return true;
1010 }
1011 
1012 /*
1013  * This is a recursive function, because of that we need to be extremely
1014  * careful with stack usage.
1015  */
1016 static void
1017 xfs_iext_destroy_node(
1018 	struct xfs_iext_node	*node,
1019 	int			level)
1020 {
1021 	int			i;
1022 
1023 	if (level > 1) {
1024 		for (i = 0; i < KEYS_PER_NODE; i++) {
1025 			if (node->keys[i] == XFS_IEXT_KEY_INVALID)
1026 				break;
1027 			xfs_iext_destroy_node(node->ptrs[i], level - 1);
1028 		}
1029 	}
1030 
1031 	kmem_free(node);
1032 }
1033 
1034 void
1035 xfs_iext_destroy(
1036 	struct xfs_ifork	*ifp)
1037 {
1038 	xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height);
1039 
1040 	ifp->if_bytes = 0;
1041 	ifp->if_height = 0;
1042 	ifp->if_u1.if_root = NULL;
1043 }
1044