xref: /linux/fs/xfs/libxfs/xfs_iext_tree.c (revision c532de5a67a70f8533d495f8f2aaa9a0491c3ad0)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2017 Christoph Hellwig.
4  */
5 
6 #include "xfs.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_bit.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trace.h"
15 
16 /*
17  * In-core extent record layout:
18  *
19  * +-------+----------------------------+
20  * | 00:53 | all 54 bits of startoff    |
21  * | 54:63 | low 10 bits of startblock  |
22  * +-------+----------------------------+
23  * | 00:20 | all 21 bits of length      |
24  * |    21 | unwritten extent bit       |
25  * | 22:63 | high 42 bits of startblock |
26  * +-------+----------------------------+
27  */
28 #define XFS_IEXT_STARTOFF_MASK		xfs_mask64lo(BMBT_STARTOFF_BITLEN)
29 #define XFS_IEXT_LENGTH_MASK		xfs_mask64lo(BMBT_BLOCKCOUNT_BITLEN)
30 #define XFS_IEXT_STARTBLOCK_MASK	xfs_mask64lo(BMBT_STARTBLOCK_BITLEN)
31 
32 struct xfs_iext_rec {
33 	uint64_t			lo;
34 	uint64_t			hi;
35 };
36 
37 /*
38  * Given that the length can't be a zero, only an empty hi value indicates an
39  * unused record.
40  */
41 static bool xfs_iext_rec_is_empty(struct xfs_iext_rec *rec)
42 {
43 	return rec->hi == 0;
44 }
45 
46 static inline void xfs_iext_rec_clear(struct xfs_iext_rec *rec)
47 {
48 	rec->lo = 0;
49 	rec->hi = 0;
50 }
51 
52 static void
53 xfs_iext_set(
54 	struct xfs_iext_rec	*rec,
55 	struct xfs_bmbt_irec	*irec)
56 {
57 	ASSERT((irec->br_startoff & ~XFS_IEXT_STARTOFF_MASK) == 0);
58 	ASSERT((irec->br_blockcount & ~XFS_IEXT_LENGTH_MASK) == 0);
59 	ASSERT((irec->br_startblock & ~XFS_IEXT_STARTBLOCK_MASK) == 0);
60 
61 	rec->lo = irec->br_startoff & XFS_IEXT_STARTOFF_MASK;
62 	rec->hi = irec->br_blockcount & XFS_IEXT_LENGTH_MASK;
63 
64 	rec->lo |= (irec->br_startblock << 54);
65 	rec->hi |= ((irec->br_startblock & ~xfs_mask64lo(10)) << (22 - 10));
66 
67 	if (irec->br_state == XFS_EXT_UNWRITTEN)
68 		rec->hi |= (1 << 21);
69 }
70 
71 static void
72 xfs_iext_get(
73 	struct xfs_bmbt_irec	*irec,
74 	struct xfs_iext_rec	*rec)
75 {
76 	irec->br_startoff = rec->lo & XFS_IEXT_STARTOFF_MASK;
77 	irec->br_blockcount = rec->hi & XFS_IEXT_LENGTH_MASK;
78 
79 	irec->br_startblock = rec->lo >> 54;
80 	irec->br_startblock |= (rec->hi & xfs_mask64hi(42)) >> (22 - 10);
81 
82 	if (rec->hi & (1 << 21))
83 		irec->br_state = XFS_EXT_UNWRITTEN;
84 	else
85 		irec->br_state = XFS_EXT_NORM;
86 }
87 
88 enum {
89 	NODE_SIZE	= 256,
90 	KEYS_PER_NODE	= NODE_SIZE / (sizeof(uint64_t) + sizeof(void *)),
91 	RECS_PER_LEAF	= (NODE_SIZE - (2 * sizeof(struct xfs_iext_leaf *))) /
92 				sizeof(struct xfs_iext_rec),
93 };
94 
95 /*
96  * In-core extent btree block layout:
97  *
98  * There are two types of blocks in the btree: leaf and inner (non-leaf) blocks.
99  *
100  * The leaf blocks are made up by %KEYS_PER_NODE extent records, which each
101  * contain the startoffset, blockcount, startblock and unwritten extent flag.
102  * See above for the exact format, followed by pointers to the previous and next
103  * leaf blocks (if there are any).
104  *
105  * The inner (non-leaf) blocks first contain KEYS_PER_NODE lookup keys, followed
106  * by an equal number of pointers to the btree blocks at the next lower level.
107  *
108  *		+-------+-------+-------+-------+-------+----------+----------+
109  * Leaf:	| rec 1 | rec 2 | rec 3 | rec 4 | rec N | prev-ptr | next-ptr |
110  *		+-------+-------+-------+-------+-------+----------+----------+
111  *
112  *		+-------+-------+-------+-------+-------+-------+------+-------+
113  * Inner:	| key 1 | key 2 | key 3 | key N | ptr 1 | ptr 2 | ptr3 | ptr N |
114  *		+-------+-------+-------+-------+-------+-------+------+-------+
115  */
116 struct xfs_iext_node {
117 	uint64_t		keys[KEYS_PER_NODE];
118 #define XFS_IEXT_KEY_INVALID	(1ULL << 63)
119 	void			*ptrs[KEYS_PER_NODE];
120 };
121 
122 struct xfs_iext_leaf {
123 	struct xfs_iext_rec	recs[RECS_PER_LEAF];
124 	struct xfs_iext_leaf	*prev;
125 	struct xfs_iext_leaf	*next;
126 };
127 
128 inline xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp)
129 {
130 	return ifp->if_bytes / sizeof(struct xfs_iext_rec);
131 }
132 
133 static inline int xfs_iext_max_recs(struct xfs_ifork *ifp)
134 {
135 	if (ifp->if_height == 1)
136 		return xfs_iext_count(ifp);
137 	return RECS_PER_LEAF;
138 }
139 
140 static inline struct xfs_iext_rec *cur_rec(struct xfs_iext_cursor *cur)
141 {
142 	return &cur->leaf->recs[cur->pos];
143 }
144 
145 static inline bool xfs_iext_valid(struct xfs_ifork *ifp,
146 		struct xfs_iext_cursor *cur)
147 {
148 	if (!cur->leaf)
149 		return false;
150 	if (cur->pos < 0 || cur->pos >= xfs_iext_max_recs(ifp))
151 		return false;
152 	if (xfs_iext_rec_is_empty(cur_rec(cur)))
153 		return false;
154 	return true;
155 }
156 
157 static void *
158 xfs_iext_find_first_leaf(
159 	struct xfs_ifork	*ifp)
160 {
161 	struct xfs_iext_node	*node = ifp->if_data;
162 	int			height;
163 
164 	if (!ifp->if_height)
165 		return NULL;
166 
167 	for (height = ifp->if_height; height > 1; height--) {
168 		node = node->ptrs[0];
169 		ASSERT(node);
170 	}
171 
172 	return node;
173 }
174 
175 static void *
176 xfs_iext_find_last_leaf(
177 	struct xfs_ifork	*ifp)
178 {
179 	struct xfs_iext_node	*node = ifp->if_data;
180 	int			height, i;
181 
182 	if (!ifp->if_height)
183 		return NULL;
184 
185 	for (height = ifp->if_height; height > 1; height--) {
186 		for (i = 1; i < KEYS_PER_NODE; i++)
187 			if (!node->ptrs[i])
188 				break;
189 		node = node->ptrs[i - 1];
190 		ASSERT(node);
191 	}
192 
193 	return node;
194 }
195 
196 void
197 xfs_iext_first(
198 	struct xfs_ifork	*ifp,
199 	struct xfs_iext_cursor	*cur)
200 {
201 	cur->pos = 0;
202 	cur->leaf = xfs_iext_find_first_leaf(ifp);
203 }
204 
205 void
206 xfs_iext_last(
207 	struct xfs_ifork	*ifp,
208 	struct xfs_iext_cursor	*cur)
209 {
210 	int			i;
211 
212 	cur->leaf = xfs_iext_find_last_leaf(ifp);
213 	if (!cur->leaf) {
214 		cur->pos = 0;
215 		return;
216 	}
217 
218 	for (i = 1; i < xfs_iext_max_recs(ifp); i++) {
219 		if (xfs_iext_rec_is_empty(&cur->leaf->recs[i]))
220 			break;
221 	}
222 	cur->pos = i - 1;
223 }
224 
225 void
226 xfs_iext_next(
227 	struct xfs_ifork	*ifp,
228 	struct xfs_iext_cursor	*cur)
229 {
230 	if (!cur->leaf) {
231 		ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
232 		xfs_iext_first(ifp, cur);
233 		return;
234 	}
235 
236 	ASSERT(cur->pos >= 0);
237 	ASSERT(cur->pos < xfs_iext_max_recs(ifp));
238 
239 	cur->pos++;
240 	if (ifp->if_height > 1 && !xfs_iext_valid(ifp, cur) &&
241 	    cur->leaf->next) {
242 		cur->leaf = cur->leaf->next;
243 		cur->pos = 0;
244 	}
245 }
246 
247 void
248 xfs_iext_prev(
249 	struct xfs_ifork	*ifp,
250 	struct xfs_iext_cursor	*cur)
251 {
252 	if (!cur->leaf) {
253 		ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
254 		xfs_iext_last(ifp, cur);
255 		return;
256 	}
257 
258 	ASSERT(cur->pos >= 0);
259 	ASSERT(cur->pos <= RECS_PER_LEAF);
260 
261 recurse:
262 	do {
263 		cur->pos--;
264 		if (xfs_iext_valid(ifp, cur))
265 			return;
266 	} while (cur->pos > 0);
267 
268 	if (ifp->if_height > 1 && cur->leaf->prev) {
269 		cur->leaf = cur->leaf->prev;
270 		cur->pos = RECS_PER_LEAF;
271 		goto recurse;
272 	}
273 }
274 
275 static inline int
276 xfs_iext_key_cmp(
277 	struct xfs_iext_node	*node,
278 	int			n,
279 	xfs_fileoff_t		offset)
280 {
281 	if (node->keys[n] > offset)
282 		return 1;
283 	if (node->keys[n] < offset)
284 		return -1;
285 	return 0;
286 }
287 
288 static inline int
289 xfs_iext_rec_cmp(
290 	struct xfs_iext_rec	*rec,
291 	xfs_fileoff_t		offset)
292 {
293 	uint64_t		rec_offset = rec->lo & XFS_IEXT_STARTOFF_MASK;
294 	uint32_t		rec_len = rec->hi & XFS_IEXT_LENGTH_MASK;
295 
296 	if (rec_offset > offset)
297 		return 1;
298 	if (rec_offset + rec_len <= offset)
299 		return -1;
300 	return 0;
301 }
302 
303 static void *
304 xfs_iext_find_level(
305 	struct xfs_ifork	*ifp,
306 	xfs_fileoff_t		offset,
307 	int			level)
308 {
309 	struct xfs_iext_node	*node = ifp->if_data;
310 	int			height, i;
311 
312 	if (!ifp->if_height)
313 		return NULL;
314 
315 	for (height = ifp->if_height; height > level; height--) {
316 		for (i = 1; i < KEYS_PER_NODE; i++)
317 			if (xfs_iext_key_cmp(node, i, offset) > 0)
318 				break;
319 
320 		node = node->ptrs[i - 1];
321 		if (!node)
322 			break;
323 	}
324 
325 	return node;
326 }
327 
328 static int
329 xfs_iext_node_pos(
330 	struct xfs_iext_node	*node,
331 	xfs_fileoff_t		offset)
332 {
333 	int			i;
334 
335 	for (i = 1; i < KEYS_PER_NODE; i++) {
336 		if (xfs_iext_key_cmp(node, i, offset) > 0)
337 			break;
338 	}
339 
340 	return i - 1;
341 }
342 
343 static int
344 xfs_iext_node_insert_pos(
345 	struct xfs_iext_node	*node,
346 	xfs_fileoff_t		offset)
347 {
348 	int			i;
349 
350 	for (i = 0; i < KEYS_PER_NODE; i++) {
351 		if (xfs_iext_key_cmp(node, i, offset) > 0)
352 			return i;
353 	}
354 
355 	return KEYS_PER_NODE;
356 }
357 
358 static int
359 xfs_iext_node_nr_entries(
360 	struct xfs_iext_node	*node,
361 	int			start)
362 {
363 	int			i;
364 
365 	for (i = start; i < KEYS_PER_NODE; i++) {
366 		if (node->keys[i] == XFS_IEXT_KEY_INVALID)
367 			break;
368 	}
369 
370 	return i;
371 }
372 
373 static int
374 xfs_iext_leaf_nr_entries(
375 	struct xfs_ifork	*ifp,
376 	struct xfs_iext_leaf	*leaf,
377 	int			start)
378 {
379 	int			i;
380 
381 	for (i = start; i < xfs_iext_max_recs(ifp); i++) {
382 		if (xfs_iext_rec_is_empty(&leaf->recs[i]))
383 			break;
384 	}
385 
386 	return i;
387 }
388 
389 static inline uint64_t
390 xfs_iext_leaf_key(
391 	struct xfs_iext_leaf	*leaf,
392 	int			n)
393 {
394 	return leaf->recs[n].lo & XFS_IEXT_STARTOFF_MASK;
395 }
396 
397 static inline void *
398 xfs_iext_alloc_node(
399 	int	size)
400 {
401 	return kzalloc(size, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
402 }
403 
404 static void
405 xfs_iext_grow(
406 	struct xfs_ifork	*ifp)
407 {
408 	struct xfs_iext_node	*node = xfs_iext_alloc_node(NODE_SIZE);
409 	int			i;
410 
411 	if (ifp->if_height == 1) {
412 		struct xfs_iext_leaf *prev = ifp->if_data;
413 
414 		node->keys[0] = xfs_iext_leaf_key(prev, 0);
415 		node->ptrs[0] = prev;
416 	} else  {
417 		struct xfs_iext_node *prev = ifp->if_data;
418 
419 		ASSERT(ifp->if_height > 1);
420 
421 		node->keys[0] = prev->keys[0];
422 		node->ptrs[0] = prev;
423 	}
424 
425 	for (i = 1; i < KEYS_PER_NODE; i++)
426 		node->keys[i] = XFS_IEXT_KEY_INVALID;
427 
428 	ifp->if_data = node;
429 	ifp->if_height++;
430 }
431 
432 static void
433 xfs_iext_update_node(
434 	struct xfs_ifork	*ifp,
435 	xfs_fileoff_t		old_offset,
436 	xfs_fileoff_t		new_offset,
437 	int			level,
438 	void			*ptr)
439 {
440 	struct xfs_iext_node	*node = ifp->if_data;
441 	int			height, i;
442 
443 	for (height = ifp->if_height; height > level; height--) {
444 		for (i = 0; i < KEYS_PER_NODE; i++) {
445 			if (i > 0 && xfs_iext_key_cmp(node, i, old_offset) > 0)
446 				break;
447 			if (node->keys[i] == old_offset)
448 				node->keys[i] = new_offset;
449 		}
450 		node = node->ptrs[i - 1];
451 		ASSERT(node);
452 	}
453 
454 	ASSERT(node == ptr);
455 }
456 
457 static struct xfs_iext_node *
458 xfs_iext_split_node(
459 	struct xfs_iext_node	**nodep,
460 	int			*pos,
461 	int			*nr_entries)
462 {
463 	struct xfs_iext_node	*node = *nodep;
464 	struct xfs_iext_node	*new = xfs_iext_alloc_node(NODE_SIZE);
465 	const int		nr_move = KEYS_PER_NODE / 2;
466 	int			nr_keep = nr_move + (KEYS_PER_NODE & 1);
467 	int			i = 0;
468 
469 	/* for sequential append operations just spill over into the new node */
470 	if (*pos == KEYS_PER_NODE) {
471 		*nodep = new;
472 		*pos = 0;
473 		*nr_entries = 0;
474 		goto done;
475 	}
476 
477 
478 	for (i = 0; i < nr_move; i++) {
479 		new->keys[i] = node->keys[nr_keep + i];
480 		new->ptrs[i] = node->ptrs[nr_keep + i];
481 
482 		node->keys[nr_keep + i] = XFS_IEXT_KEY_INVALID;
483 		node->ptrs[nr_keep + i] = NULL;
484 	}
485 
486 	if (*pos >= nr_keep) {
487 		*nodep = new;
488 		*pos -= nr_keep;
489 		*nr_entries = nr_move;
490 	} else {
491 		*nr_entries = nr_keep;
492 	}
493 done:
494 	for (; i < KEYS_PER_NODE; i++)
495 		new->keys[i] = XFS_IEXT_KEY_INVALID;
496 	return new;
497 }
498 
499 static void
500 xfs_iext_insert_node(
501 	struct xfs_ifork	*ifp,
502 	uint64_t		offset,
503 	void			*ptr,
504 	int			level)
505 {
506 	struct xfs_iext_node	*node, *new;
507 	int			i, pos, nr_entries;
508 
509 again:
510 	if (ifp->if_height < level)
511 		xfs_iext_grow(ifp);
512 
513 	new = NULL;
514 	node = xfs_iext_find_level(ifp, offset, level);
515 	pos = xfs_iext_node_insert_pos(node, offset);
516 	nr_entries = xfs_iext_node_nr_entries(node, pos);
517 
518 	ASSERT(pos >= nr_entries || xfs_iext_key_cmp(node, pos, offset) != 0);
519 	ASSERT(nr_entries <= KEYS_PER_NODE);
520 
521 	if (nr_entries == KEYS_PER_NODE)
522 		new = xfs_iext_split_node(&node, &pos, &nr_entries);
523 
524 	/*
525 	 * Update the pointers in higher levels if the first entry changes
526 	 * in an existing node.
527 	 */
528 	if (node != new && pos == 0 && nr_entries > 0)
529 		xfs_iext_update_node(ifp, node->keys[0], offset, level, node);
530 
531 	for (i = nr_entries; i > pos; i--) {
532 		node->keys[i] = node->keys[i - 1];
533 		node->ptrs[i] = node->ptrs[i - 1];
534 	}
535 	node->keys[pos] = offset;
536 	node->ptrs[pos] = ptr;
537 
538 	if (new) {
539 		offset = new->keys[0];
540 		ptr = new;
541 		level++;
542 		goto again;
543 	}
544 }
545 
546 static struct xfs_iext_leaf *
547 xfs_iext_split_leaf(
548 	struct xfs_iext_cursor	*cur,
549 	int			*nr_entries)
550 {
551 	struct xfs_iext_leaf	*leaf = cur->leaf;
552 	struct xfs_iext_leaf	*new = xfs_iext_alloc_node(NODE_SIZE);
553 	const int		nr_move = RECS_PER_LEAF / 2;
554 	int			nr_keep = nr_move + (RECS_PER_LEAF & 1);
555 	int			i;
556 
557 	/* for sequential append operations just spill over into the new node */
558 	if (cur->pos == RECS_PER_LEAF) {
559 		cur->leaf = new;
560 		cur->pos = 0;
561 		*nr_entries = 0;
562 		goto done;
563 	}
564 
565 	for (i = 0; i < nr_move; i++) {
566 		new->recs[i] = leaf->recs[nr_keep + i];
567 		xfs_iext_rec_clear(&leaf->recs[nr_keep + i]);
568 	}
569 
570 	if (cur->pos >= nr_keep) {
571 		cur->leaf = new;
572 		cur->pos -= nr_keep;
573 		*nr_entries = nr_move;
574 	} else {
575 		*nr_entries = nr_keep;
576 	}
577 done:
578 	if (leaf->next)
579 		leaf->next->prev = new;
580 	new->next = leaf->next;
581 	new->prev = leaf;
582 	leaf->next = new;
583 	return new;
584 }
585 
586 static void
587 xfs_iext_alloc_root(
588 	struct xfs_ifork	*ifp,
589 	struct xfs_iext_cursor	*cur)
590 {
591 	ASSERT(ifp->if_bytes == 0);
592 
593 	ifp->if_data = xfs_iext_alloc_node(sizeof(struct xfs_iext_rec));
594 	ifp->if_height = 1;
595 
596 	/* now that we have a node step into it */
597 	cur->leaf = ifp->if_data;
598 	cur->pos = 0;
599 }
600 
601 static void
602 xfs_iext_realloc_root(
603 	struct xfs_ifork	*ifp,
604 	struct xfs_iext_cursor	*cur)
605 {
606 	int64_t new_size = ifp->if_bytes + sizeof(struct xfs_iext_rec);
607 	void *new;
608 
609 	/* account for the prev/next pointers */
610 	if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
611 		new_size = NODE_SIZE;
612 
613 	new = krealloc(ifp->if_data, new_size,
614 			GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
615 	memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
616 	ifp->if_data = new;
617 	cur->leaf = new;
618 }
619 
620 /*
621  * Increment the sequence counter on extent tree changes. If we are on a COW
622  * fork, this allows the writeback code to skip looking for a COW extent if the
623  * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the
624  * sequence counter is seen before the modifications to the extent tree itself
625  * take effect.
626  */
627 static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp)
628 {
629 	WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
630 }
631 
632 void
633 xfs_iext_insert_raw(
634 	struct xfs_ifork	*ifp,
635 	struct xfs_iext_cursor	*cur,
636 	struct xfs_bmbt_irec	*irec)
637 {
638 	xfs_fileoff_t		offset = irec->br_startoff;
639 	struct xfs_iext_leaf	*new = NULL;
640 	int			nr_entries, i;
641 
642 	xfs_iext_inc_seq(ifp);
643 
644 	if (ifp->if_height == 0)
645 		xfs_iext_alloc_root(ifp, cur);
646 	else if (ifp->if_height == 1)
647 		xfs_iext_realloc_root(ifp, cur);
648 
649 	nr_entries = xfs_iext_leaf_nr_entries(ifp, cur->leaf, cur->pos);
650 	ASSERT(nr_entries <= RECS_PER_LEAF);
651 	ASSERT(cur->pos >= nr_entries ||
652 	       xfs_iext_rec_cmp(cur_rec(cur), irec->br_startoff) != 0);
653 
654 	if (nr_entries == RECS_PER_LEAF)
655 		new = xfs_iext_split_leaf(cur, &nr_entries);
656 
657 	/*
658 	 * Update the pointers in higher levels if the first entry changes
659 	 * in an existing node.
660 	 */
661 	if (cur->leaf != new && cur->pos == 0 && nr_entries > 0) {
662 		xfs_iext_update_node(ifp, xfs_iext_leaf_key(cur->leaf, 0),
663 				offset, 1, cur->leaf);
664 	}
665 
666 	for (i = nr_entries; i > cur->pos; i--)
667 		cur->leaf->recs[i] = cur->leaf->recs[i - 1];
668 	xfs_iext_set(cur_rec(cur), irec);
669 	ifp->if_bytes += sizeof(struct xfs_iext_rec);
670 
671 	if (new)
672 		xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2);
673 }
674 
675 void
676 xfs_iext_insert(
677 	struct xfs_inode	*ip,
678 	struct xfs_iext_cursor	*cur,
679 	struct xfs_bmbt_irec	*irec,
680 	int			state)
681 {
682 	struct xfs_ifork	*ifp = xfs_iext_state_to_fork(ip, state);
683 
684 	xfs_iext_insert_raw(ifp, cur, irec);
685 	trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
686 }
687 
688 static struct xfs_iext_node *
689 xfs_iext_rebalance_node(
690 	struct xfs_iext_node	*parent,
691 	int			*pos,
692 	struct xfs_iext_node	*node,
693 	int			nr_entries)
694 {
695 	/*
696 	 * If the neighbouring nodes are completely full, or have different
697 	 * parents, we might never be able to merge our node, and will only
698 	 * delete it once the number of entries hits zero.
699 	 */
700 	if (nr_entries == 0)
701 		return node;
702 
703 	if (*pos > 0) {
704 		struct xfs_iext_node *prev = parent->ptrs[*pos - 1];
705 		int nr_prev = xfs_iext_node_nr_entries(prev, 0), i;
706 
707 		if (nr_prev + nr_entries <= KEYS_PER_NODE) {
708 			for (i = 0; i < nr_entries; i++) {
709 				prev->keys[nr_prev + i] = node->keys[i];
710 				prev->ptrs[nr_prev + i] = node->ptrs[i];
711 			}
712 			return node;
713 		}
714 	}
715 
716 	if (*pos + 1 < xfs_iext_node_nr_entries(parent, *pos)) {
717 		struct xfs_iext_node *next = parent->ptrs[*pos + 1];
718 		int nr_next = xfs_iext_node_nr_entries(next, 0), i;
719 
720 		if (nr_entries + nr_next <= KEYS_PER_NODE) {
721 			/*
722 			 * Merge the next node into this node so that we don't
723 			 * have to do an additional update of the keys in the
724 			 * higher levels.
725 			 */
726 			for (i = 0; i < nr_next; i++) {
727 				node->keys[nr_entries + i] = next->keys[i];
728 				node->ptrs[nr_entries + i] = next->ptrs[i];
729 			}
730 
731 			++*pos;
732 			return next;
733 		}
734 	}
735 
736 	return NULL;
737 }
738 
739 static void
740 xfs_iext_remove_node(
741 	struct xfs_ifork	*ifp,
742 	xfs_fileoff_t		offset,
743 	void			*victim)
744 {
745 	struct xfs_iext_node	*node, *parent;
746 	int			level = 2, pos, nr_entries, i;
747 
748 	ASSERT(level <= ifp->if_height);
749 	node = xfs_iext_find_level(ifp, offset, level);
750 	pos = xfs_iext_node_pos(node, offset);
751 again:
752 	ASSERT(node->ptrs[pos]);
753 	ASSERT(node->ptrs[pos] == victim);
754 	kfree(victim);
755 
756 	nr_entries = xfs_iext_node_nr_entries(node, pos) - 1;
757 	offset = node->keys[0];
758 	for (i = pos; i < nr_entries; i++) {
759 		node->keys[i] = node->keys[i + 1];
760 		node->ptrs[i] = node->ptrs[i + 1];
761 	}
762 	node->keys[nr_entries] = XFS_IEXT_KEY_INVALID;
763 	node->ptrs[nr_entries] = NULL;
764 
765 	if (pos == 0 && nr_entries > 0) {
766 		xfs_iext_update_node(ifp, offset, node->keys[0], level, node);
767 		offset = node->keys[0];
768 	}
769 
770 	if (nr_entries >= KEYS_PER_NODE / 2)
771 		return;
772 
773 	if (level < ifp->if_height) {
774 		/*
775 		 * If we aren't at the root yet try to find a neighbour node to
776 		 * merge with (or delete the node if it is empty), and then
777 		 * recurse up to the next level.
778 		 */
779 		level++;
780 		parent = xfs_iext_find_level(ifp, offset, level);
781 		pos = xfs_iext_node_pos(parent, offset);
782 
783 		ASSERT(pos != KEYS_PER_NODE);
784 		ASSERT(parent->ptrs[pos] == node);
785 
786 		node = xfs_iext_rebalance_node(parent, &pos, node, nr_entries);
787 		if (node) {
788 			victim = node;
789 			node = parent;
790 			goto again;
791 		}
792 	} else if (nr_entries == 1) {
793 		/*
794 		 * If we are at the root and only one entry is left we can just
795 		 * free this node and update the root pointer.
796 		 */
797 		ASSERT(node == ifp->if_data);
798 		ifp->if_data = node->ptrs[0];
799 		ifp->if_height--;
800 		kfree(node);
801 	}
802 }
803 
804 static void
805 xfs_iext_rebalance_leaf(
806 	struct xfs_ifork	*ifp,
807 	struct xfs_iext_cursor	*cur,
808 	struct xfs_iext_leaf	*leaf,
809 	xfs_fileoff_t		offset,
810 	int			nr_entries)
811 {
812 	/*
813 	 * If the neighbouring nodes are completely full we might never be able
814 	 * to merge our node, and will only delete it once the number of
815 	 * entries hits zero.
816 	 */
817 	if (nr_entries == 0)
818 		goto remove_node;
819 
820 	if (leaf->prev) {
821 		int nr_prev = xfs_iext_leaf_nr_entries(ifp, leaf->prev, 0), i;
822 
823 		if (nr_prev + nr_entries <= RECS_PER_LEAF) {
824 			for (i = 0; i < nr_entries; i++)
825 				leaf->prev->recs[nr_prev + i] = leaf->recs[i];
826 
827 			if (cur->leaf == leaf) {
828 				cur->leaf = leaf->prev;
829 				cur->pos += nr_prev;
830 			}
831 			goto remove_node;
832 		}
833 	}
834 
835 	if (leaf->next) {
836 		int nr_next = xfs_iext_leaf_nr_entries(ifp, leaf->next, 0), i;
837 
838 		if (nr_entries + nr_next <= RECS_PER_LEAF) {
839 			/*
840 			 * Merge the next node into this node so that we don't
841 			 * have to do an additional update of the keys in the
842 			 * higher levels.
843 			 */
844 			for (i = 0; i < nr_next; i++) {
845 				leaf->recs[nr_entries + i] =
846 					leaf->next->recs[i];
847 			}
848 
849 			if (cur->leaf == leaf->next) {
850 				cur->leaf = leaf;
851 				cur->pos += nr_entries;
852 			}
853 
854 			offset = xfs_iext_leaf_key(leaf->next, 0);
855 			leaf = leaf->next;
856 			goto remove_node;
857 		}
858 	}
859 
860 	return;
861 remove_node:
862 	if (leaf->prev)
863 		leaf->prev->next = leaf->next;
864 	if (leaf->next)
865 		leaf->next->prev = leaf->prev;
866 	xfs_iext_remove_node(ifp, offset, leaf);
867 }
868 
869 static void
870 xfs_iext_free_last_leaf(
871 	struct xfs_ifork	*ifp)
872 {
873 	ifp->if_height--;
874 	kfree(ifp->if_data);
875 	ifp->if_data = NULL;
876 }
877 
878 void
879 xfs_iext_remove(
880 	struct xfs_inode	*ip,
881 	struct xfs_iext_cursor	*cur,
882 	int			state)
883 {
884 	struct xfs_ifork	*ifp = xfs_iext_state_to_fork(ip, state);
885 	struct xfs_iext_leaf	*leaf = cur->leaf;
886 	xfs_fileoff_t		offset = xfs_iext_leaf_key(leaf, 0);
887 	int			i, nr_entries;
888 
889 	trace_xfs_iext_remove(ip, cur, state, _RET_IP_);
890 
891 	ASSERT(ifp->if_height > 0);
892 	ASSERT(ifp->if_data != NULL);
893 	ASSERT(xfs_iext_valid(ifp, cur));
894 
895 	xfs_iext_inc_seq(ifp);
896 
897 	nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
898 	for (i = cur->pos; i < nr_entries; i++)
899 		leaf->recs[i] = leaf->recs[i + 1];
900 	xfs_iext_rec_clear(&leaf->recs[nr_entries]);
901 	ifp->if_bytes -= sizeof(struct xfs_iext_rec);
902 
903 	if (cur->pos == 0 && nr_entries > 0) {
904 		xfs_iext_update_node(ifp, offset, xfs_iext_leaf_key(leaf, 0), 1,
905 				leaf);
906 		offset = xfs_iext_leaf_key(leaf, 0);
907 	} else if (cur->pos == nr_entries) {
908 		if (ifp->if_height > 1 && leaf->next)
909 			cur->leaf = leaf->next;
910 		else
911 			cur->leaf = NULL;
912 		cur->pos = 0;
913 	}
914 
915 	if (nr_entries >= RECS_PER_LEAF / 2)
916 		return;
917 
918 	if (ifp->if_height > 1)
919 		xfs_iext_rebalance_leaf(ifp, cur, leaf, offset, nr_entries);
920 	else if (nr_entries == 0)
921 		xfs_iext_free_last_leaf(ifp);
922 }
923 
924 /*
925  * Lookup the extent covering bno.
926  *
927  * If there is an extent covering bno return the extent index, and store the
928  * expanded extent structure in *gotp, and the extent cursor in *cur.
929  * If there is no extent covering bno, but there is an extent after it (e.g.
930  * it lies in a hole) return that extent in *gotp and its cursor in *cur
931  * instead.
932  * If bno is beyond the last extent return false, and return an invalid
933  * cursor value.
934  */
935 bool
936 xfs_iext_lookup_extent(
937 	struct xfs_inode	*ip,
938 	struct xfs_ifork	*ifp,
939 	xfs_fileoff_t		offset,
940 	struct xfs_iext_cursor	*cur,
941 	struct xfs_bmbt_irec	*gotp)
942 {
943 	XFS_STATS_INC(ip->i_mount, xs_look_exlist);
944 
945 	cur->leaf = xfs_iext_find_level(ifp, offset, 1);
946 	if (!cur->leaf) {
947 		cur->pos = 0;
948 		return false;
949 	}
950 
951 	for (cur->pos = 0; cur->pos < xfs_iext_max_recs(ifp); cur->pos++) {
952 		struct xfs_iext_rec *rec = cur_rec(cur);
953 
954 		if (xfs_iext_rec_is_empty(rec))
955 			break;
956 		if (xfs_iext_rec_cmp(rec, offset) >= 0)
957 			goto found;
958 	}
959 
960 	/* Try looking in the next node for an entry > offset */
961 	if (ifp->if_height == 1 || !cur->leaf->next)
962 		return false;
963 	cur->leaf = cur->leaf->next;
964 	cur->pos = 0;
965 	if (!xfs_iext_valid(ifp, cur))
966 		return false;
967 found:
968 	xfs_iext_get(gotp, cur_rec(cur));
969 	return true;
970 }
971 
972 /*
973  * Returns the last extent before end, and if this extent doesn't cover
974  * end, update end to the end of the extent.
975  */
976 bool
977 xfs_iext_lookup_extent_before(
978 	struct xfs_inode	*ip,
979 	struct xfs_ifork	*ifp,
980 	xfs_fileoff_t		*end,
981 	struct xfs_iext_cursor	*cur,
982 	struct xfs_bmbt_irec	*gotp)
983 {
984 	/* could be optimized to not even look up the next on a match.. */
985 	if (xfs_iext_lookup_extent(ip, ifp, *end - 1, cur, gotp) &&
986 	    gotp->br_startoff <= *end - 1)
987 		return true;
988 	if (!xfs_iext_prev_extent(ifp, cur, gotp))
989 		return false;
990 	*end = gotp->br_startoff + gotp->br_blockcount;
991 	return true;
992 }
993 
994 void
995 xfs_iext_update_extent(
996 	struct xfs_inode	*ip,
997 	int			state,
998 	struct xfs_iext_cursor	*cur,
999 	struct xfs_bmbt_irec	*new)
1000 {
1001 	struct xfs_ifork	*ifp = xfs_iext_state_to_fork(ip, state);
1002 
1003 	xfs_iext_inc_seq(ifp);
1004 
1005 	if (cur->pos == 0) {
1006 		struct xfs_bmbt_irec	old;
1007 
1008 		xfs_iext_get(&old, cur_rec(cur));
1009 		if (new->br_startoff != old.br_startoff) {
1010 			xfs_iext_update_node(ifp, old.br_startoff,
1011 					new->br_startoff, 1, cur->leaf);
1012 		}
1013 	}
1014 
1015 	trace_xfs_bmap_pre_update(ip, cur, state, _RET_IP_);
1016 	xfs_iext_set(cur_rec(cur), new);
1017 	trace_xfs_bmap_post_update(ip, cur, state, _RET_IP_);
1018 }
1019 
1020 /*
1021  * Return true if the cursor points at an extent and return the extent structure
1022  * in gotp.  Else return false.
1023  */
1024 bool
1025 xfs_iext_get_extent(
1026 	struct xfs_ifork	*ifp,
1027 	struct xfs_iext_cursor	*cur,
1028 	struct xfs_bmbt_irec	*gotp)
1029 {
1030 	if (!xfs_iext_valid(ifp, cur))
1031 		return false;
1032 	xfs_iext_get(gotp, cur_rec(cur));
1033 	return true;
1034 }
1035 
1036 /*
1037  * This is a recursive function, because of that we need to be extremely
1038  * careful with stack usage.
1039  */
1040 static void
1041 xfs_iext_destroy_node(
1042 	struct xfs_iext_node	*node,
1043 	int			level)
1044 {
1045 	int			i;
1046 
1047 	if (level > 1) {
1048 		for (i = 0; i < KEYS_PER_NODE; i++) {
1049 			if (node->keys[i] == XFS_IEXT_KEY_INVALID)
1050 				break;
1051 			xfs_iext_destroy_node(node->ptrs[i], level - 1);
1052 		}
1053 	}
1054 
1055 	kfree(node);
1056 }
1057 
1058 void
1059 xfs_iext_destroy(
1060 	struct xfs_ifork	*ifp)
1061 {
1062 	xfs_iext_destroy_node(ifp->if_data, ifp->if_height);
1063 
1064 	ifp->if_bytes = 0;
1065 	ifp->if_height = 0;
1066 	ifp->if_data = NULL;
1067 }
1068