xref: /linux/fs/xfs/libxfs/xfs_btree.c (revision d7bf4786b5250b0e490a937d1f8a16ee3a54adbe)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_bit.h"
13 #include "xfs_mount.h"
14 #include "xfs_inode.h"
15 #include "xfs_trans.h"
16 #include "xfs_buf_item.h"
17 #include "xfs_btree.h"
18 #include "xfs_errortag.h"
19 #include "xfs_error.h"
20 #include "xfs_trace.h"
21 #include "xfs_alloc.h"
22 #include "xfs_log.h"
23 #include "xfs_btree_staging.h"
24 #include "xfs_ag.h"
25 #include "xfs_alloc_btree.h"
26 #include "xfs_ialloc_btree.h"
27 #include "xfs_bmap_btree.h"
28 #include "xfs_rmap_btree.h"
29 #include "xfs_refcount_btree.h"
30 #include "xfs_health.h"
31 #include "xfs_buf_mem.h"
32 #include "xfs_btree_mem.h"
33 #include "xfs_rtrmap_btree.h"
34 #include "xfs_bmap.h"
35 #include "xfs_rmap.h"
36 #include "xfs_quota.h"
37 #include "xfs_metafile.h"
38 #include "xfs_rtrefcount_btree.h"
39 
40 /*
41  * Btree magic numbers.
42  */
43 uint32_t
44 xfs_btree_magic(
45 	struct xfs_mount		*mp,
46 	const struct xfs_btree_ops	*ops)
47 {
48 	int				idx = xfs_has_crc(mp) ? 1 : 0;
49 	__be32				magic = ops->buf_ops->magic[idx];
50 
51 	/* Ensure we asked for crc for crc-only magics. */
52 	ASSERT(magic != 0);
53 	return be32_to_cpu(magic);
54 }
55 
56 /*
57  * These sibling pointer checks are optimised for null sibling pointers. This
58  * happens a lot, and we don't need to byte swap at runtime if the sibling
59  * pointer is NULL.
60  *
61  * These are explicitly marked at inline because the cost of calling them as
62  * functions instead of inlining them is about 36 bytes extra code per call site
63  * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these
64  * two sibling check functions reduces the compiled code size by over 300
65  * bytes.
66  */
67 static inline xfs_failaddr_t
68 xfs_btree_check_fsblock_siblings(
69 	struct xfs_mount	*mp,
70 	xfs_fsblock_t		fsb,
71 	__be64			dsibling)
72 {
73 	xfs_fsblock_t		sibling;
74 
75 	if (dsibling == cpu_to_be64(NULLFSBLOCK))
76 		return NULL;
77 
78 	sibling = be64_to_cpu(dsibling);
79 	if (sibling == fsb)
80 		return __this_address;
81 	if (!xfs_verify_fsbno(mp, sibling))
82 		return __this_address;
83 	return NULL;
84 }
85 
86 static inline xfs_failaddr_t
87 xfs_btree_check_memblock_siblings(
88 	struct xfs_buftarg	*btp,
89 	xfbno_t			bno,
90 	__be64			dsibling)
91 {
92 	xfbno_t			sibling;
93 
94 	if (dsibling == cpu_to_be64(NULLFSBLOCK))
95 		return NULL;
96 
97 	sibling = be64_to_cpu(dsibling);
98 	if (sibling == bno)
99 		return __this_address;
100 	if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling)))
101 		return __this_address;
102 	return NULL;
103 }
104 
105 static inline xfs_failaddr_t
106 xfs_btree_check_agblock_siblings(
107 	struct xfs_perag	*pag,
108 	xfs_agblock_t		agbno,
109 	__be32			dsibling)
110 {
111 	xfs_agblock_t		sibling;
112 
113 	if (dsibling == cpu_to_be32(NULLAGBLOCK))
114 		return NULL;
115 
116 	sibling = be32_to_cpu(dsibling);
117 	if (sibling == agbno)
118 		return __this_address;
119 	if (!xfs_verify_agbno(pag, sibling))
120 		return __this_address;
121 	return NULL;
122 }
123 
124 static xfs_failaddr_t
125 __xfs_btree_check_lblock_hdr(
126 	struct xfs_btree_cur	*cur,
127 	struct xfs_btree_block	*block,
128 	int			level,
129 	struct xfs_buf		*bp)
130 {
131 	struct xfs_mount	*mp = cur->bc_mp;
132 
133 	if (xfs_has_crc(mp)) {
134 		if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
135 			return __this_address;
136 		if (block->bb_u.l.bb_blkno !=
137 		    cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
138 			return __this_address;
139 		if (block->bb_u.l.bb_pad != cpu_to_be32(0))
140 			return __this_address;
141 	}
142 
143 	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
144 		return __this_address;
145 	if (be16_to_cpu(block->bb_level) != level)
146 		return __this_address;
147 	if (be16_to_cpu(block->bb_numrecs) >
148 	    cur->bc_ops->get_maxrecs(cur, level))
149 		return __this_address;
150 
151 	return NULL;
152 }
153 
154 /*
155  * Check a long btree block header.  Return the address of the failing check,
156  * or NULL if everything is ok.
157  */
158 static xfs_failaddr_t
159 __xfs_btree_check_fsblock(
160 	struct xfs_btree_cur	*cur,
161 	struct xfs_btree_block	*block,
162 	int			level,
163 	struct xfs_buf		*bp)
164 {
165 	struct xfs_mount	*mp = cur->bc_mp;
166 	xfs_failaddr_t		fa;
167 	xfs_fsblock_t		fsb;
168 
169 	fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
170 	if (fa)
171 		return fa;
172 
173 	/*
174 	 * For inode-rooted btrees, the root block sits in the inode fork.  In
175 	 * that case bp is NULL, and the block must not have any siblings.
176 	 */
177 	if (!bp) {
178 		if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK))
179 			return __this_address;
180 		if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK))
181 			return __this_address;
182 		return NULL;
183 	}
184 
185 	fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
186 	fa = xfs_btree_check_fsblock_siblings(mp, fsb,
187 			block->bb_u.l.bb_leftsib);
188 	if (!fa)
189 		fa = xfs_btree_check_fsblock_siblings(mp, fsb,
190 				block->bb_u.l.bb_rightsib);
191 	return fa;
192 }
193 
194 /*
195  * Check an in-memory btree block header.  Return the address of the failing
196  * check, or NULL if everything is ok.
197  */
198 static xfs_failaddr_t
199 __xfs_btree_check_memblock(
200 	struct xfs_btree_cur	*cur,
201 	struct xfs_btree_block	*block,
202 	int			level,
203 	struct xfs_buf		*bp)
204 {
205 	struct xfs_buftarg	*btp = cur->bc_mem.xfbtree->target;
206 	xfs_failaddr_t		fa;
207 	xfbno_t			bno;
208 
209 	fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
210 	if (fa)
211 		return fa;
212 
213 	bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
214 	fa = xfs_btree_check_memblock_siblings(btp, bno,
215 			block->bb_u.l.bb_leftsib);
216 	if (!fa)
217 		fa = xfs_btree_check_memblock_siblings(btp, bno,
218 				block->bb_u.l.bb_rightsib);
219 	return fa;
220 }
221 
222 /*
223  * Check a short btree block header.  Return the address of the failing check,
224  * or NULL if everything is ok.
225  */
226 static xfs_failaddr_t
227 __xfs_btree_check_agblock(
228 	struct xfs_btree_cur	*cur,
229 	struct xfs_btree_block	*block,
230 	int			level,
231 	struct xfs_buf		*bp)
232 {
233 	struct xfs_mount	*mp = cur->bc_mp;
234 	struct xfs_perag	*pag = to_perag(cur->bc_group);
235 	xfs_failaddr_t		fa;
236 	xfs_agblock_t		agbno;
237 
238 	if (xfs_has_crc(mp)) {
239 		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
240 			return __this_address;
241 		if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
242 			return __this_address;
243 	}
244 
245 	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
246 		return __this_address;
247 	if (be16_to_cpu(block->bb_level) != level)
248 		return __this_address;
249 	if (be16_to_cpu(block->bb_numrecs) >
250 	    cur->bc_ops->get_maxrecs(cur, level))
251 		return __this_address;
252 
253 	agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
254 	fa = xfs_btree_check_agblock_siblings(pag, agbno,
255 			block->bb_u.s.bb_leftsib);
256 	if (!fa)
257 		fa = xfs_btree_check_agblock_siblings(pag, agbno,
258 				block->bb_u.s.bb_rightsib);
259 	return fa;
260 }
261 
262 /*
263  * Internal btree block check.
264  *
265  * Return NULL if the block is ok or the address of the failed check otherwise.
266  */
267 xfs_failaddr_t
268 __xfs_btree_check_block(
269 	struct xfs_btree_cur	*cur,
270 	struct xfs_btree_block	*block,
271 	int			level,
272 	struct xfs_buf		*bp)
273 {
274 	switch (cur->bc_ops->type) {
275 	case XFS_BTREE_TYPE_MEM:
276 		return __xfs_btree_check_memblock(cur, block, level, bp);
277 	case XFS_BTREE_TYPE_AG:
278 		return __xfs_btree_check_agblock(cur, block, level, bp);
279 	case XFS_BTREE_TYPE_INODE:
280 		return __xfs_btree_check_fsblock(cur, block, level, bp);
281 	default:
282 		ASSERT(0);
283 		return __this_address;
284 	}
285 }
286 
287 static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur)
288 {
289 	if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN)
290 		return XFS_ERRTAG_BTREE_CHECK_SBLOCK;
291 	return XFS_ERRTAG_BTREE_CHECK_LBLOCK;
292 }
293 
294 /*
295  * Debug routine: check that block header is ok.
296  */
297 int
298 xfs_btree_check_block(
299 	struct xfs_btree_cur	*cur,	/* btree cursor */
300 	struct xfs_btree_block	*block,	/* generic btree block pointer */
301 	int			level,	/* level of the btree block */
302 	struct xfs_buf		*bp)	/* buffer containing block, if any */
303 {
304 	struct xfs_mount	*mp = cur->bc_mp;
305 	xfs_failaddr_t		fa;
306 
307 	fa = __xfs_btree_check_block(cur, block, level, bp);
308 	if (XFS_IS_CORRUPT(mp, fa != NULL) ||
309 	    XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) {
310 		if (bp)
311 			trace_xfs_btree_corrupt(bp, _RET_IP_);
312 		xfs_btree_mark_sick(cur);
313 		return -EFSCORRUPTED;
314 	}
315 	return 0;
316 }
317 
318 int
319 __xfs_btree_check_ptr(
320 	struct xfs_btree_cur		*cur,
321 	const union xfs_btree_ptr	*ptr,
322 	int				index,
323 	int				level)
324 {
325 	if (level <= 0)
326 		return -EFSCORRUPTED;
327 
328 	switch (cur->bc_ops->type) {
329 	case XFS_BTREE_TYPE_MEM:
330 		if (!xfbtree_verify_bno(cur->bc_mem.xfbtree,
331 				be64_to_cpu((&ptr->l)[index])))
332 			return -EFSCORRUPTED;
333 		break;
334 	case XFS_BTREE_TYPE_INODE:
335 		if (!xfs_verify_fsbno(cur->bc_mp,
336 				be64_to_cpu((&ptr->l)[index])))
337 			return -EFSCORRUPTED;
338 		break;
339 	case XFS_BTREE_TYPE_AG:
340 		if (!xfs_verify_agbno(to_perag(cur->bc_group),
341 				be32_to_cpu((&ptr->s)[index])))
342 			return -EFSCORRUPTED;
343 		break;
344 	}
345 
346 	return 0;
347 }
348 
349 /*
350  * Check that a given (indexed) btree pointer at a certain level of a
351  * btree is valid and doesn't point past where it should.
352  */
353 static int
354 xfs_btree_check_ptr(
355 	struct xfs_btree_cur		*cur,
356 	const union xfs_btree_ptr	*ptr,
357 	int				index,
358 	int				level)
359 {
360 	int				error;
361 
362 	error = __xfs_btree_check_ptr(cur, ptr, index, level);
363 	if (error) {
364 		switch (cur->bc_ops->type) {
365 		case XFS_BTREE_TYPE_MEM:
366 			xfs_err(cur->bc_mp,
367 "In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.",
368 				cur->bc_ops->name, cur->bc_flags, level, index,
369 				__this_address);
370 			break;
371 		case XFS_BTREE_TYPE_INODE:
372 			xfs_err(cur->bc_mp,
373 "Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.",
374 				cur->bc_ino.ip->i_ino,
375 				cur->bc_ino.whichfork, cur->bc_ops->name,
376 				level, index);
377 			break;
378 		case XFS_BTREE_TYPE_AG:
379 			xfs_err(cur->bc_mp,
380 "AG %u: Corrupt %sbt pointer at level %d index %d.",
381 				cur->bc_group->xg_gno, cur->bc_ops->name,
382 				level, index);
383 			break;
384 		}
385 		xfs_btree_mark_sick(cur);
386 	}
387 
388 	return error;
389 }
390 
391 #ifdef DEBUG
392 # define xfs_btree_debug_check_ptr	xfs_btree_check_ptr
393 #else
394 # define xfs_btree_debug_check_ptr(...)	(0)
395 #endif
396 
397 /*
398  * Calculate CRC on the whole btree block and stuff it into the
399  * long-form btree header.
400  *
401  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
402  * it into the buffer so recovery knows what the last modification was that made
403  * it to disk.
404  */
405 void
406 xfs_btree_fsblock_calc_crc(
407 	struct xfs_buf		*bp)
408 {
409 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
410 	struct xfs_buf_log_item	*bip = bp->b_log_item;
411 
412 	if (!xfs_has_crc(bp->b_mount))
413 		return;
414 	if (bip)
415 		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
416 	xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
417 }
418 
419 bool
420 xfs_btree_fsblock_verify_crc(
421 	struct xfs_buf		*bp)
422 {
423 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
424 	struct xfs_mount	*mp = bp->b_mount;
425 
426 	if (xfs_has_crc(mp)) {
427 		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
428 			return false;
429 		return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
430 	}
431 
432 	return true;
433 }
434 
435 /*
436  * Calculate CRC on the whole btree block and stuff it into the
437  * short-form btree header.
438  *
439  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
440  * it into the buffer so recovery knows what the last modification was that made
441  * it to disk.
442  */
443 void
444 xfs_btree_agblock_calc_crc(
445 	struct xfs_buf		*bp)
446 {
447 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
448 	struct xfs_buf_log_item	*bip = bp->b_log_item;
449 
450 	if (!xfs_has_crc(bp->b_mount))
451 		return;
452 	if (bip)
453 		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
454 	xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
455 }
456 
457 bool
458 xfs_btree_agblock_verify_crc(
459 	struct xfs_buf		*bp)
460 {
461 	struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
462 	struct xfs_mount	*mp = bp->b_mount;
463 
464 	if (xfs_has_crc(mp)) {
465 		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
466 			return false;
467 		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
468 	}
469 
470 	return true;
471 }
472 
473 static int
474 xfs_btree_free_block(
475 	struct xfs_btree_cur	*cur,
476 	struct xfs_buf		*bp)
477 {
478 	int			error;
479 
480 	trace_xfs_btree_free_block(cur, bp);
481 
482 	/*
483 	 * Don't allow block freeing for a staging cursor, because staging
484 	 * cursors do not support regular btree modifications.
485 	 */
486 	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
487 		ASSERT(0);
488 		return -EFSCORRUPTED;
489 	}
490 
491 	error = cur->bc_ops->free_block(cur, bp);
492 	if (!error) {
493 		xfs_trans_binval(cur->bc_tp, bp);
494 		XFS_BTREE_STATS_INC(cur, free);
495 	}
496 	return error;
497 }
498 
499 /*
500  * Delete the btree cursor.
501  */
502 void
503 xfs_btree_del_cursor(
504 	struct xfs_btree_cur	*cur,		/* btree cursor */
505 	int			error)		/* del because of error */
506 {
507 	int			i;		/* btree level */
508 
509 	/*
510 	 * Clear the buffer pointers and release the buffers. If we're doing
511 	 * this because of an error, inspect all of the entries in the bc_bufs
512 	 * array for buffers to be unlocked. This is because some of the btree
513 	 * code works from level n down to 0, and if we get an error along the
514 	 * way we won't have initialized all the entries down to 0.
515 	 */
516 	for (i = 0; i < cur->bc_nlevels; i++) {
517 		if (cur->bc_levels[i].bp)
518 			xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp);
519 		else if (!error)
520 			break;
521 	}
522 
523 	/*
524 	 * If we are doing a BMBT update, the number of unaccounted blocks
525 	 * allocated during this cursor life time should be zero. If it's not
526 	 * zero, then we should be shut down or on our way to shutdown due to
527 	 * cancelling a dirty transaction on error.
528 	 */
529 	ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 ||
530 	       xfs_is_shutdown(cur->bc_mp) || error != 0);
531 
532 	if (cur->bc_group)
533 		xfs_group_put(cur->bc_group);
534 	kmem_cache_free(cur->bc_cache, cur);
535 }
536 
537 /* Return the buffer target for this btree's buffer. */
538 static inline struct xfs_buftarg *
539 xfs_btree_buftarg(
540 	struct xfs_btree_cur	*cur)
541 {
542 	if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
543 		return cur->bc_mem.xfbtree->target;
544 	return cur->bc_mp->m_ddev_targp;
545 }
546 
547 /* Return the block size (in units of 512b sectors) for this btree. */
548 static inline unsigned int
549 xfs_btree_bbsize(
550 	struct xfs_btree_cur	*cur)
551 {
552 	if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
553 		return XFBNO_BBSIZE;
554 	return cur->bc_mp->m_bsize;
555 }
556 
557 /*
558  * Duplicate the btree cursor.
559  * Allocate a new one, copy the record, re-get the buffers.
560  */
561 int						/* error */
562 xfs_btree_dup_cursor(
563 	struct xfs_btree_cur	*cur,		/* input cursor */
564 	struct xfs_btree_cur	**ncur)		/* output cursor */
565 {
566 	struct xfs_mount	*mp = cur->bc_mp;
567 	struct xfs_trans	*tp = cur->bc_tp;
568 	struct xfs_buf		*bp;
569 	struct xfs_btree_cur	*new;
570 	int			error;
571 	int			i;
572 
573 	/*
574 	 * Don't allow staging cursors to be duplicated because they're supposed
575 	 * to be kept private to a single thread.
576 	 */
577 	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
578 		ASSERT(0);
579 		return -EFSCORRUPTED;
580 	}
581 
582 	/*
583 	 * Allocate a new cursor like the old one.
584 	 */
585 	new = cur->bc_ops->dup_cursor(cur);
586 
587 	/*
588 	 * Copy the record currently in the cursor.
589 	 */
590 	new->bc_rec = cur->bc_rec;
591 
592 	/*
593 	 * For each level current, re-get the buffer and copy the ptr value.
594 	 */
595 	for (i = 0; i < new->bc_nlevels; i++) {
596 		new->bc_levels[i].ptr = cur->bc_levels[i].ptr;
597 		new->bc_levels[i].ra = cur->bc_levels[i].ra;
598 		bp = cur->bc_levels[i].bp;
599 		if (bp) {
600 			error = xfs_trans_read_buf(mp, tp,
601 					xfs_btree_buftarg(cur),
602 					xfs_buf_daddr(bp),
603 					xfs_btree_bbsize(cur), 0, &bp,
604 					cur->bc_ops->buf_ops);
605 			if (xfs_metadata_is_sick(error))
606 				xfs_btree_mark_sick(new);
607 			if (error) {
608 				xfs_btree_del_cursor(new, error);
609 				*ncur = NULL;
610 				return error;
611 			}
612 		}
613 		new->bc_levels[i].bp = bp;
614 	}
615 	*ncur = new;
616 	return 0;
617 }
618 
619 /*
620  * XFS btree block layout and addressing:
621  *
622  * There are two types of blocks in the btree: leaf and non-leaf blocks.
623  *
624  * The leaf record start with a header then followed by records containing
625  * the values.  A non-leaf block also starts with the same header, and
626  * then first contains lookup keys followed by an equal number of pointers
627  * to the btree blocks at the previous level.
628  *
629  *		+--------+-------+-------+-------+-------+-------+-------+
630  * Leaf:	| header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
631  *		+--------+-------+-------+-------+-------+-------+-------+
632  *
633  *		+--------+-------+-------+-------+-------+-------+-------+
634  * Non-Leaf:	| header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
635  *		+--------+-------+-------+-------+-------+-------+-------+
636  *
637  * The header is called struct xfs_btree_block for reasons better left unknown
638  * and comes in different versions for short (32bit) and long (64bit) block
639  * pointers.  The record and key structures are defined by the btree instances
640  * and opaque to the btree core.  The block pointers are simple disk endian
641  * integers, available in a short (32bit) and long (64bit) variant.
642  *
643  * The helpers below calculate the offset of a given record, key or pointer
644  * into a btree block (xfs_btree_*_offset) or return a pointer to the given
645  * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
646  * inside the btree block is done using indices starting at one, not zero!
647  *
648  * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing
649  * overlapping intervals.  In such a tree, records are still sorted lowest to
650  * highest and indexed by the smallest key value that refers to the record.
651  * However, nodes are different: each pointer has two associated keys -- one
652  * indexing the lowest key available in the block(s) below (the same behavior
653  * as the key in a regular btree) and another indexing the highest key
654  * available in the block(s) below.  Because records are /not/ sorted by the
655  * highest key, all leaf block updates require us to compute the highest key
656  * that matches any record in the leaf and to recursively update the high keys
657  * in the nodes going further up in the tree, if necessary.  Nodes look like
658  * this:
659  *
660  *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
661  * Non-Leaf:	| header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
662  *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
663  *
664  * To perform an interval query on an overlapped tree, perform the usual
665  * depth-first search and use the low and high keys to decide if we can skip
666  * that particular node.  If a leaf node is reached, return the records that
667  * intersect the interval.  Note that an interval query may return numerous
668  * entries.  For a non-overlapped tree, simply search for the record associated
669  * with the lowest key and iterate forward until a non-matching record is
670  * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
671  * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
672  * more detail.
673  *
674  * Why do we care about overlapping intervals?  Let's say you have a bunch of
675  * reverse mapping records on a reflink filesystem:
676  *
677  * 1: +- file A startblock B offset C length D -----------+
678  * 2:      +- file E startblock F offset G length H --------------+
679  * 3:      +- file I startblock F offset J length K --+
680  * 4:                                                        +- file L... --+
681  *
682  * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
683  * we'd simply increment the length of record 1.  But how do we find the record
684  * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
685  * record 3 because the keys are ordered first by startblock.  An interval
686  * query would return records 1 and 2 because they both overlap (B+D-1), and
687  * from that we can pick out record 1 as the appropriate left neighbor.
688  *
689  * In the non-overlapped case you can do a LE lookup and decrement the cursor
690  * because a record's interval must end before the next record.
691  */
692 
693 /*
694  * Return size of the btree block header for this btree instance.
695  */
696 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
697 {
698 	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
699 		if (xfs_has_crc(cur->bc_mp))
700 			return XFS_BTREE_LBLOCK_CRC_LEN;
701 		return XFS_BTREE_LBLOCK_LEN;
702 	}
703 	if (xfs_has_crc(cur->bc_mp))
704 		return XFS_BTREE_SBLOCK_CRC_LEN;
705 	return XFS_BTREE_SBLOCK_LEN;
706 }
707 
708 /*
709  * Calculate offset of the n-th record in a btree block.
710  */
711 STATIC size_t
712 xfs_btree_rec_offset(
713 	struct xfs_btree_cur	*cur,
714 	int			n)
715 {
716 	return xfs_btree_block_len(cur) +
717 		(n - 1) * cur->bc_ops->rec_len;
718 }
719 
720 /*
721  * Calculate offset of the n-th key in a btree block.
722  */
723 STATIC size_t
724 xfs_btree_key_offset(
725 	struct xfs_btree_cur	*cur,
726 	int			n)
727 {
728 	return xfs_btree_block_len(cur) +
729 		(n - 1) * cur->bc_ops->key_len;
730 }
731 
732 /*
733  * Calculate offset of the n-th high key in a btree block.
734  */
735 STATIC size_t
736 xfs_btree_high_key_offset(
737 	struct xfs_btree_cur	*cur,
738 	int			n)
739 {
740 	return xfs_btree_block_len(cur) +
741 		(n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
742 }
743 
744 /*
745  * Calculate offset of the n-th block pointer in a btree block.
746  */
747 STATIC size_t
748 xfs_btree_ptr_offset(
749 	struct xfs_btree_cur	*cur,
750 	int			n,
751 	int			level)
752 {
753 	return xfs_btree_block_len(cur) +
754 		cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
755 		(n - 1) * cur->bc_ops->ptr_len;
756 }
757 
758 /*
759  * Return a pointer to the n-th record in the btree block.
760  */
761 union xfs_btree_rec *
762 xfs_btree_rec_addr(
763 	struct xfs_btree_cur	*cur,
764 	int			n,
765 	struct xfs_btree_block	*block)
766 {
767 	return (union xfs_btree_rec *)
768 		((char *)block + xfs_btree_rec_offset(cur, n));
769 }
770 
771 /*
772  * Return a pointer to the n-th key in the btree block.
773  */
774 union xfs_btree_key *
775 xfs_btree_key_addr(
776 	struct xfs_btree_cur	*cur,
777 	int			n,
778 	struct xfs_btree_block	*block)
779 {
780 	return (union xfs_btree_key *)
781 		((char *)block + xfs_btree_key_offset(cur, n));
782 }
783 
784 /*
785  * Return a pointer to the n-th high key in the btree block.
786  */
787 union xfs_btree_key *
788 xfs_btree_high_key_addr(
789 	struct xfs_btree_cur	*cur,
790 	int			n,
791 	struct xfs_btree_block	*block)
792 {
793 	return (union xfs_btree_key *)
794 		((char *)block + xfs_btree_high_key_offset(cur, n));
795 }
796 
797 /*
798  * Return a pointer to the n-th block pointer in the btree block.
799  */
800 union xfs_btree_ptr *
801 xfs_btree_ptr_addr(
802 	struct xfs_btree_cur	*cur,
803 	int			n,
804 	struct xfs_btree_block	*block)
805 {
806 	int			level = xfs_btree_get_level(block);
807 
808 	ASSERT(block->bb_level != 0);
809 
810 	return (union xfs_btree_ptr *)
811 		((char *)block + xfs_btree_ptr_offset(cur, n, level));
812 }
813 
814 struct xfs_ifork *
815 xfs_btree_ifork_ptr(
816 	struct xfs_btree_cur	*cur)
817 {
818 	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
819 
820 	if (cur->bc_flags & XFS_BTREE_STAGING)
821 		return cur->bc_ino.ifake->if_fork;
822 	return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork);
823 }
824 
825 /*
826  * Get the root block which is stored in the inode.
827  *
828  * For now this btree implementation assumes the btree root is always
829  * stored in the if_broot field of an inode fork.
830  */
831 STATIC struct xfs_btree_block *
832 xfs_btree_get_iroot(
833 	struct xfs_btree_cur	*cur)
834 {
835 	struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
836 
837 	return (struct xfs_btree_block *)ifp->if_broot;
838 }
839 
840 /*
841  * Retrieve the block pointer from the cursor at the given level.
842  * This may be an inode btree root or from a buffer.
843  */
844 struct xfs_btree_block *		/* generic btree block pointer */
845 xfs_btree_get_block(
846 	struct xfs_btree_cur	*cur,	/* btree cursor */
847 	int			level,	/* level in btree */
848 	struct xfs_buf		**bpp)	/* buffer containing the block */
849 {
850 	if (xfs_btree_at_iroot(cur, level)) {
851 		*bpp = NULL;
852 		return xfs_btree_get_iroot(cur);
853 	}
854 
855 	*bpp = cur->bc_levels[level].bp;
856 	return XFS_BUF_TO_BLOCK(*bpp);
857 }
858 
859 /*
860  * Change the cursor to point to the first record at the given level.
861  * Other levels are unaffected.
862  */
863 STATIC int				/* success=1, failure=0 */
864 xfs_btree_firstrec(
865 	struct xfs_btree_cur	*cur,	/* btree cursor */
866 	int			level)	/* level to change */
867 {
868 	struct xfs_btree_block	*block;	/* generic btree block pointer */
869 	struct xfs_buf		*bp;	/* buffer containing block */
870 
871 	/*
872 	 * Get the block pointer for this level.
873 	 */
874 	block = xfs_btree_get_block(cur, level, &bp);
875 	if (xfs_btree_check_block(cur, block, level, bp))
876 		return 0;
877 	/*
878 	 * It's empty, there is no such record.
879 	 */
880 	if (!block->bb_numrecs)
881 		return 0;
882 	/*
883 	 * Set the ptr value to 1, that's the first record/key.
884 	 */
885 	cur->bc_levels[level].ptr = 1;
886 	return 1;
887 }
888 
889 /*
890  * Change the cursor to point to the last record in the current block
891  * at the given level.  Other levels are unaffected.
892  */
893 STATIC int				/* success=1, failure=0 */
894 xfs_btree_lastrec(
895 	struct xfs_btree_cur	*cur,	/* btree cursor */
896 	int			level)	/* level to change */
897 {
898 	struct xfs_btree_block	*block;	/* generic btree block pointer */
899 	struct xfs_buf		*bp;	/* buffer containing block */
900 
901 	/*
902 	 * Get the block pointer for this level.
903 	 */
904 	block = xfs_btree_get_block(cur, level, &bp);
905 	if (xfs_btree_check_block(cur, block, level, bp))
906 		return 0;
907 	/*
908 	 * It's empty, there is no such record.
909 	 */
910 	if (!block->bb_numrecs)
911 		return 0;
912 	/*
913 	 * Set the ptr value to numrecs, that's the last record/key.
914 	 */
915 	cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs);
916 	return 1;
917 }
918 
919 /*
920  * Compute first and last byte offsets for the fields given.
921  * Interprets the offsets table, which contains struct field offsets.
922  */
923 void
924 xfs_btree_offsets(
925 	uint32_t	fields,		/* bitmask of fields */
926 	const short	*offsets,	/* table of field offsets */
927 	int		nbits,		/* number of bits to inspect */
928 	int		*first,		/* output: first byte offset */
929 	int		*last)		/* output: last byte offset */
930 {
931 	int		i;		/* current bit number */
932 	uint32_t	imask;		/* mask for current bit number */
933 
934 	ASSERT(fields != 0);
935 	/*
936 	 * Find the lowest bit, so the first byte offset.
937 	 */
938 	for (i = 0, imask = 1u; ; i++, imask <<= 1) {
939 		if (imask & fields) {
940 			*first = offsets[i];
941 			break;
942 		}
943 	}
944 	/*
945 	 * Find the highest bit, so the last byte offset.
946 	 */
947 	for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) {
948 		if (imask & fields) {
949 			*last = offsets[i + 1] - 1;
950 			break;
951 		}
952 	}
953 }
954 
955 STATIC int
956 xfs_btree_readahead_fsblock(
957 	struct xfs_btree_cur	*cur,
958 	int			lr,
959 	struct xfs_btree_block	*block)
960 {
961 	struct xfs_mount	*mp = cur->bc_mp;
962 	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
963 	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
964 	int			rval = 0;
965 
966 	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
967 		xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left),
968 				mp->m_bsize, cur->bc_ops->buf_ops);
969 		rval++;
970 	}
971 
972 	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
973 		xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right),
974 				mp->m_bsize, cur->bc_ops->buf_ops);
975 		rval++;
976 	}
977 
978 	return rval;
979 }
980 
981 STATIC int
982 xfs_btree_readahead_memblock(
983 	struct xfs_btree_cur	*cur,
984 	int			lr,
985 	struct xfs_btree_block	*block)
986 {
987 	struct xfs_buftarg	*btp = cur->bc_mem.xfbtree->target;
988 	xfbno_t			left = be64_to_cpu(block->bb_u.l.bb_leftsib);
989 	xfbno_t			right = be64_to_cpu(block->bb_u.l.bb_rightsib);
990 	int			rval = 0;
991 
992 	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
993 		xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE,
994 				cur->bc_ops->buf_ops);
995 		rval++;
996 	}
997 
998 	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
999 		xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE,
1000 				cur->bc_ops->buf_ops);
1001 		rval++;
1002 	}
1003 
1004 	return rval;
1005 }
1006 
1007 STATIC int
1008 xfs_btree_readahead_agblock(
1009 	struct xfs_btree_cur	*cur,
1010 	int			lr,
1011 	struct xfs_btree_block	*block)
1012 {
1013 	struct xfs_mount	*mp = cur->bc_mp;
1014 	struct xfs_perag	*pag = to_perag(cur->bc_group);
1015 	xfs_agblock_t		left = be32_to_cpu(block->bb_u.s.bb_leftsib);
1016 	xfs_agblock_t		right = be32_to_cpu(block->bb_u.s.bb_rightsib);
1017 	int			rval = 0;
1018 
1019 	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
1020 		xfs_buf_readahead(mp->m_ddev_targp,
1021 				xfs_agbno_to_daddr(pag, left), mp->m_bsize,
1022 				cur->bc_ops->buf_ops);
1023 		rval++;
1024 	}
1025 
1026 	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
1027 		xfs_buf_readahead(mp->m_ddev_targp,
1028 				xfs_agbno_to_daddr(pag, right), mp->m_bsize,
1029 				cur->bc_ops->buf_ops);
1030 		rval++;
1031 	}
1032 
1033 	return rval;
1034 }
1035 
1036 /*
1037  * Read-ahead btree blocks, at the given level.
1038  * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
1039  */
1040 STATIC int
1041 xfs_btree_readahead(
1042 	struct xfs_btree_cur	*cur,		/* btree cursor */
1043 	int			lev,		/* level in btree */
1044 	int			lr)		/* left/right bits */
1045 {
1046 	struct xfs_btree_block	*block;
1047 
1048 	/*
1049 	 * No readahead needed if we are at the root level and the
1050 	 * btree root is stored in the inode.
1051 	 */
1052 	if (xfs_btree_at_iroot(cur, lev))
1053 		return 0;
1054 
1055 	if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
1056 		return 0;
1057 
1058 	cur->bc_levels[lev].ra |= lr;
1059 	block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);
1060 
1061 	switch (cur->bc_ops->type) {
1062 	case XFS_BTREE_TYPE_AG:
1063 		return xfs_btree_readahead_agblock(cur, lr, block);
1064 	case XFS_BTREE_TYPE_INODE:
1065 		return xfs_btree_readahead_fsblock(cur, lr, block);
1066 	case XFS_BTREE_TYPE_MEM:
1067 		return xfs_btree_readahead_memblock(cur, lr, block);
1068 	default:
1069 		ASSERT(0);
1070 		return 0;
1071 	}
1072 }
1073 
1074 STATIC int
1075 xfs_btree_ptr_to_daddr(
1076 	struct xfs_btree_cur		*cur,
1077 	const union xfs_btree_ptr	*ptr,
1078 	xfs_daddr_t			*daddr)
1079 {
1080 	int			error;
1081 
1082 	error = xfs_btree_check_ptr(cur, ptr, 0, 1);
1083 	if (error)
1084 		return error;
1085 
1086 	switch (cur->bc_ops->type) {
1087 	case XFS_BTREE_TYPE_AG:
1088 		*daddr = xfs_agbno_to_daddr(to_perag(cur->bc_group),
1089 				be32_to_cpu(ptr->s));
1090 		break;
1091 	case XFS_BTREE_TYPE_INODE:
1092 		*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
1093 		break;
1094 	case XFS_BTREE_TYPE_MEM:
1095 		*daddr = xfbno_to_daddr(be64_to_cpu(ptr->l));
1096 		break;
1097 	}
1098 	return 0;
1099 }
1100 
1101 /*
1102  * Readahead @count btree blocks at the given @ptr location.
1103  *
1104  * We don't need to care about long or short form btrees here as we have a
1105  * method of converting the ptr directly to a daddr available to us.
1106  */
1107 STATIC void
1108 xfs_btree_readahead_ptr(
1109 	struct xfs_btree_cur	*cur,
1110 	union xfs_btree_ptr	*ptr,
1111 	xfs_extlen_t		count)
1112 {
1113 	xfs_daddr_t		daddr;
1114 
1115 	if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
1116 		return;
1117 	xfs_buf_readahead(xfs_btree_buftarg(cur), daddr,
1118 			xfs_btree_bbsize(cur) * count,
1119 			cur->bc_ops->buf_ops);
1120 }
1121 
1122 /*
1123  * Set the buffer for level "lev" in the cursor to bp, releasing
1124  * any previous buffer.
1125  */
1126 STATIC void
1127 xfs_btree_setbuf(
1128 	struct xfs_btree_cur	*cur,	/* btree cursor */
1129 	int			lev,	/* level in btree */
1130 	struct xfs_buf		*bp)	/* new buffer to set */
1131 {
1132 	struct xfs_btree_block	*b;	/* btree block */
1133 
1134 	if (cur->bc_levels[lev].bp)
1135 		xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp);
1136 	cur->bc_levels[lev].bp = bp;
1137 	cur->bc_levels[lev].ra = 0;
1138 
1139 	b = XFS_BUF_TO_BLOCK(bp);
1140 	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
1141 		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
1142 			cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
1143 		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
1144 			cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
1145 	} else {
1146 		if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
1147 			cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
1148 		if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
1149 			cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
1150 	}
1151 }
1152 
1153 bool
1154 xfs_btree_ptr_is_null(
1155 	struct xfs_btree_cur		*cur,
1156 	const union xfs_btree_ptr	*ptr)
1157 {
1158 	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
1159 		return ptr->l == cpu_to_be64(NULLFSBLOCK);
1160 	else
1161 		return ptr->s == cpu_to_be32(NULLAGBLOCK);
1162 }
1163 
1164 void
1165 xfs_btree_set_ptr_null(
1166 	struct xfs_btree_cur	*cur,
1167 	union xfs_btree_ptr	*ptr)
1168 {
1169 	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
1170 		ptr->l = cpu_to_be64(NULLFSBLOCK);
1171 	else
1172 		ptr->s = cpu_to_be32(NULLAGBLOCK);
1173 }
1174 
1175 static inline bool
1176 xfs_btree_ptrs_equal(
1177 	struct xfs_btree_cur		*cur,
1178 	union xfs_btree_ptr		*ptr1,
1179 	union xfs_btree_ptr		*ptr2)
1180 {
1181 	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
1182 		return ptr1->l == ptr2->l;
1183 	return ptr1->s == ptr2->s;
1184 }
1185 
1186 /*
1187  * Get/set/init sibling pointers
1188  */
1189 void
1190 xfs_btree_get_sibling(
1191 	struct xfs_btree_cur	*cur,
1192 	struct xfs_btree_block	*block,
1193 	union xfs_btree_ptr	*ptr,
1194 	int			lr)
1195 {
1196 	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1197 
1198 	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
1199 		if (lr == XFS_BB_RIGHTSIB)
1200 			ptr->l = block->bb_u.l.bb_rightsib;
1201 		else
1202 			ptr->l = block->bb_u.l.bb_leftsib;
1203 	} else {
1204 		if (lr == XFS_BB_RIGHTSIB)
1205 			ptr->s = block->bb_u.s.bb_rightsib;
1206 		else
1207 			ptr->s = block->bb_u.s.bb_leftsib;
1208 	}
1209 }
1210 
1211 void
1212 xfs_btree_set_sibling(
1213 	struct xfs_btree_cur		*cur,
1214 	struct xfs_btree_block		*block,
1215 	const union xfs_btree_ptr	*ptr,
1216 	int				lr)
1217 {
1218 	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1219 
1220 	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
1221 		if (lr == XFS_BB_RIGHTSIB)
1222 			block->bb_u.l.bb_rightsib = ptr->l;
1223 		else
1224 			block->bb_u.l.bb_leftsib = ptr->l;
1225 	} else {
1226 		if (lr == XFS_BB_RIGHTSIB)
1227 			block->bb_u.s.bb_rightsib = ptr->s;
1228 		else
1229 			block->bb_u.s.bb_leftsib = ptr->s;
1230 	}
1231 }
1232 
1233 static void
1234 __xfs_btree_init_block(
1235 	struct xfs_mount	*mp,
1236 	struct xfs_btree_block	*buf,
1237 	const struct xfs_btree_ops *ops,
1238 	xfs_daddr_t		blkno,
1239 	__u16			level,
1240 	__u16			numrecs,
1241 	__u64			owner)
1242 {
1243 	bool			crc = xfs_has_crc(mp);
1244 	__u32			magic = xfs_btree_magic(mp, ops);
1245 
1246 	buf->bb_magic = cpu_to_be32(magic);
1247 	buf->bb_level = cpu_to_be16(level);
1248 	buf->bb_numrecs = cpu_to_be16(numrecs);
1249 
1250 	if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
1251 		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1252 		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1253 		if (crc) {
1254 			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1255 			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1256 			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1257 			buf->bb_u.l.bb_pad = 0;
1258 			buf->bb_u.l.bb_lsn = 0;
1259 		}
1260 	} else {
1261 		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1262 		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1263 		if (crc) {
1264 			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1265 			/* owner is a 32 bit value on short blocks */
1266 			buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner);
1267 			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1268 			buf->bb_u.s.bb_lsn = 0;
1269 		}
1270 	}
1271 }
1272 
1273 void
1274 xfs_btree_init_block(
1275 	struct xfs_mount	*mp,
1276 	struct xfs_btree_block	*block,
1277 	const struct xfs_btree_ops *ops,
1278 	__u16			level,
1279 	__u16			numrecs,
1280 	__u64			owner)
1281 {
1282 	__xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level,
1283 			numrecs, owner);
1284 }
1285 
1286 void
1287 xfs_btree_init_buf(
1288 	struct xfs_mount		*mp,
1289 	struct xfs_buf			*bp,
1290 	const struct xfs_btree_ops	*ops,
1291 	__u16				level,
1292 	__u16				numrecs,
1293 	__u64				owner)
1294 {
1295 	__xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops,
1296 			xfs_buf_daddr(bp), level, numrecs, owner);
1297 	bp->b_ops = ops->buf_ops;
1298 }
1299 
1300 static inline __u64
1301 xfs_btree_owner(
1302 	struct xfs_btree_cur    *cur)
1303 {
1304 	switch (cur->bc_ops->type) {
1305 	case XFS_BTREE_TYPE_MEM:
1306 		return cur->bc_mem.xfbtree->owner;
1307 	case XFS_BTREE_TYPE_INODE:
1308 		return cur->bc_ino.ip->i_ino;
1309 	case XFS_BTREE_TYPE_AG:
1310 		return cur->bc_group->xg_gno;
1311 	default:
1312 		ASSERT(0);
1313 		return 0;
1314 	}
1315 }
1316 
1317 void
1318 xfs_btree_init_block_cur(
1319 	struct xfs_btree_cur	*cur,
1320 	struct xfs_buf		*bp,
1321 	int			level,
1322 	int			numrecs)
1323 {
1324 	xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs,
1325 			xfs_btree_owner(cur));
1326 }
1327 
1328 STATIC void
1329 xfs_btree_buf_to_ptr(
1330 	struct xfs_btree_cur	*cur,
1331 	struct xfs_buf		*bp,
1332 	union xfs_btree_ptr	*ptr)
1333 {
1334 	switch (cur->bc_ops->type) {
1335 	case XFS_BTREE_TYPE_AG:
1336 		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1337 					xfs_buf_daddr(bp)));
1338 		break;
1339 	case XFS_BTREE_TYPE_INODE:
1340 		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1341 					xfs_buf_daddr(bp)));
1342 		break;
1343 	case XFS_BTREE_TYPE_MEM:
1344 		ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp)));
1345 		break;
1346 	}
1347 }
1348 
1349 static inline void
1350 xfs_btree_set_refs(
1351 	struct xfs_btree_cur	*cur,
1352 	struct xfs_buf		*bp)
1353 {
1354 	xfs_buf_set_ref(bp, cur->bc_ops->lru_refs);
1355 }
1356 
1357 int
1358 xfs_btree_get_buf_block(
1359 	struct xfs_btree_cur		*cur,
1360 	const union xfs_btree_ptr	*ptr,
1361 	struct xfs_btree_block		**block,
1362 	struct xfs_buf			**bpp)
1363 {
1364 	xfs_daddr_t			d;
1365 	int				error;
1366 
1367 	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1368 	if (error)
1369 		return error;
1370 	error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d,
1371 			xfs_btree_bbsize(cur), 0, bpp);
1372 	if (error)
1373 		return error;
1374 
1375 	(*bpp)->b_ops = cur->bc_ops->buf_ops;
1376 	*block = XFS_BUF_TO_BLOCK(*bpp);
1377 	return 0;
1378 }
1379 
1380 /*
1381  * Read in the buffer at the given ptr and return the buffer and
1382  * the block pointer within the buffer.
1383  */
1384 int
1385 xfs_btree_read_buf_block(
1386 	struct xfs_btree_cur		*cur,
1387 	const union xfs_btree_ptr	*ptr,
1388 	int				flags,
1389 	struct xfs_btree_block		**block,
1390 	struct xfs_buf			**bpp)
1391 {
1392 	struct xfs_mount	*mp = cur->bc_mp;
1393 	xfs_daddr_t		d;
1394 	int			error;
1395 
1396 	/* need to sort out how callers deal with failures first */
1397 	ASSERT(!(flags & XBF_TRYLOCK));
1398 
1399 	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1400 	if (error)
1401 		return error;
1402 	error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d,
1403 			xfs_btree_bbsize(cur), flags, bpp,
1404 			cur->bc_ops->buf_ops);
1405 	if (xfs_metadata_is_sick(error))
1406 		xfs_btree_mark_sick(cur);
1407 	if (error)
1408 		return error;
1409 
1410 	xfs_btree_set_refs(cur, *bpp);
1411 	*block = XFS_BUF_TO_BLOCK(*bpp);
1412 	return 0;
1413 }
1414 
1415 /*
1416  * Copy keys from one btree block to another.
1417  */
1418 void
1419 xfs_btree_copy_keys(
1420 	struct xfs_btree_cur		*cur,
1421 	union xfs_btree_key		*dst_key,
1422 	const union xfs_btree_key	*src_key,
1423 	int				numkeys)
1424 {
1425 	ASSERT(numkeys >= 0);
1426 	memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1427 }
1428 
1429 /*
1430  * Copy records from one btree block to another.
1431  */
1432 STATIC void
1433 xfs_btree_copy_recs(
1434 	struct xfs_btree_cur	*cur,
1435 	union xfs_btree_rec	*dst_rec,
1436 	union xfs_btree_rec	*src_rec,
1437 	int			numrecs)
1438 {
1439 	ASSERT(numrecs >= 0);
1440 	memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1441 }
1442 
1443 /*
1444  * Copy block pointers from one btree block to another.
1445  */
1446 void
1447 xfs_btree_copy_ptrs(
1448 	struct xfs_btree_cur	*cur,
1449 	union xfs_btree_ptr	*dst_ptr,
1450 	const union xfs_btree_ptr *src_ptr,
1451 	int			numptrs)
1452 {
1453 	ASSERT(numptrs >= 0);
1454 	memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len);
1455 }
1456 
1457 /*
1458  * Shift keys one index left/right inside a single btree block.
1459  */
1460 STATIC void
1461 xfs_btree_shift_keys(
1462 	struct xfs_btree_cur	*cur,
1463 	union xfs_btree_key	*key,
1464 	int			dir,
1465 	int			numkeys)
1466 {
1467 	char			*dst_key;
1468 
1469 	ASSERT(numkeys >= 0);
1470 	ASSERT(dir == 1 || dir == -1);
1471 
1472 	dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1473 	memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1474 }
1475 
1476 /*
1477  * Shift records one index left/right inside a single btree block.
1478  */
1479 STATIC void
1480 xfs_btree_shift_recs(
1481 	struct xfs_btree_cur	*cur,
1482 	union xfs_btree_rec	*rec,
1483 	int			dir,
1484 	int			numrecs)
1485 {
1486 	char			*dst_rec;
1487 
1488 	ASSERT(numrecs >= 0);
1489 	ASSERT(dir == 1 || dir == -1);
1490 
1491 	dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1492 	memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1493 }
1494 
1495 /*
1496  * Shift block pointers one index left/right inside a single btree block.
1497  */
1498 STATIC void
1499 xfs_btree_shift_ptrs(
1500 	struct xfs_btree_cur	*cur,
1501 	union xfs_btree_ptr	*ptr,
1502 	int			dir,
1503 	int			numptrs)
1504 {
1505 	char			*dst_ptr;
1506 
1507 	ASSERT(numptrs >= 0);
1508 	ASSERT(dir == 1 || dir == -1);
1509 
1510 	dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len);
1511 	memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len);
1512 }
1513 
1514 /*
1515  * Log key values from the btree block.
1516  */
1517 STATIC void
1518 xfs_btree_log_keys(
1519 	struct xfs_btree_cur	*cur,
1520 	struct xfs_buf		*bp,
1521 	int			first,
1522 	int			last)
1523 {
1524 
1525 	if (bp) {
1526 		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1527 		xfs_trans_log_buf(cur->bc_tp, bp,
1528 				  xfs_btree_key_offset(cur, first),
1529 				  xfs_btree_key_offset(cur, last + 1) - 1);
1530 	} else {
1531 		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1532 				xfs_ilog_fbroot(cur->bc_ino.whichfork));
1533 	}
1534 }
1535 
1536 /*
1537  * Log record values from the btree block.
1538  */
1539 void
1540 xfs_btree_log_recs(
1541 	struct xfs_btree_cur	*cur,
1542 	struct xfs_buf		*bp,
1543 	int			first,
1544 	int			last)
1545 {
1546 	if (!bp) {
1547 		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1548 				xfs_ilog_fbroot(cur->bc_ino.whichfork));
1549 		return;
1550 	}
1551 
1552 	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1553 	xfs_trans_log_buf(cur->bc_tp, bp,
1554 			  xfs_btree_rec_offset(cur, first),
1555 			  xfs_btree_rec_offset(cur, last + 1) - 1);
1556 }
1557 
1558 /*
1559  * Log block pointer fields from a btree block (nonleaf).
1560  */
1561 STATIC void
1562 xfs_btree_log_ptrs(
1563 	struct xfs_btree_cur	*cur,	/* btree cursor */
1564 	struct xfs_buf		*bp,	/* buffer containing btree block */
1565 	int			first,	/* index of first pointer to log */
1566 	int			last)	/* index of last pointer to log */
1567 {
1568 
1569 	if (bp) {
1570 		struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
1571 		int			level = xfs_btree_get_level(block);
1572 
1573 		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1574 		xfs_trans_log_buf(cur->bc_tp, bp,
1575 				xfs_btree_ptr_offset(cur, first, level),
1576 				xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1577 	} else {
1578 		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1579 			xfs_ilog_fbroot(cur->bc_ino.whichfork));
1580 	}
1581 
1582 }
1583 
1584 /*
1585  * Log fields from a btree block header.
1586  */
1587 void
1588 xfs_btree_log_block(
1589 	struct xfs_btree_cur	*cur,	/* btree cursor */
1590 	struct xfs_buf		*bp,	/* buffer containing btree block */
1591 	uint32_t		fields)	/* mask of fields: XFS_BB_... */
1592 {
1593 	int			first;	/* first byte offset logged */
1594 	int			last;	/* last byte offset logged */
1595 	static const short	soffsets[] = {	/* table of offsets (short) */
1596 		offsetof(struct xfs_btree_block, bb_magic),
1597 		offsetof(struct xfs_btree_block, bb_level),
1598 		offsetof(struct xfs_btree_block, bb_numrecs),
1599 		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1600 		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1601 		offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1602 		offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1603 		offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1604 		offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1605 		offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1606 		XFS_BTREE_SBLOCK_CRC_LEN
1607 	};
1608 	static const short	loffsets[] = {	/* table of offsets (long) */
1609 		offsetof(struct xfs_btree_block, bb_magic),
1610 		offsetof(struct xfs_btree_block, bb_level),
1611 		offsetof(struct xfs_btree_block, bb_numrecs),
1612 		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1613 		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1614 		offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1615 		offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1616 		offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1617 		offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1618 		offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1619 		offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1620 		XFS_BTREE_LBLOCK_CRC_LEN
1621 	};
1622 
1623 	if (bp) {
1624 		int nbits;
1625 
1626 		if (xfs_has_crc(cur->bc_mp)) {
1627 			/*
1628 			 * We don't log the CRC when updating a btree
1629 			 * block but instead recreate it during log
1630 			 * recovery.  As the log buffers have checksums
1631 			 * of their own this is safe and avoids logging a crc
1632 			 * update in a lot of places.
1633 			 */
1634 			if (fields == XFS_BB_ALL_BITS)
1635 				fields = XFS_BB_ALL_BITS_CRC;
1636 			nbits = XFS_BB_NUM_BITS_CRC;
1637 		} else {
1638 			nbits = XFS_BB_NUM_BITS;
1639 		}
1640 		xfs_btree_offsets(fields,
1641 				  (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ?
1642 					loffsets : soffsets,
1643 				  nbits, &first, &last);
1644 		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1645 		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1646 	} else {
1647 		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1648 			xfs_ilog_fbroot(cur->bc_ino.whichfork));
1649 	}
1650 }
1651 
1652 /*
1653  * Increment cursor by one record at the level.
1654  * For nonzero levels the leaf-ward information is untouched.
1655  */
1656 int						/* error */
1657 xfs_btree_increment(
1658 	struct xfs_btree_cur	*cur,
1659 	int			level,
1660 	int			*stat)		/* success/failure */
1661 {
1662 	struct xfs_btree_block	*block;
1663 	union xfs_btree_ptr	ptr;
1664 	struct xfs_buf		*bp;
1665 	int			error;		/* error return value */
1666 	int			lev;
1667 
1668 	ASSERT(level < cur->bc_nlevels);
1669 
1670 	/* Read-ahead to the right at this level. */
1671 	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1672 
1673 	/* Get a pointer to the btree block. */
1674 	block = xfs_btree_get_block(cur, level, &bp);
1675 
1676 #ifdef DEBUG
1677 	error = xfs_btree_check_block(cur, block, level, bp);
1678 	if (error)
1679 		goto error0;
1680 #endif
1681 
1682 	/* We're done if we remain in the block after the increment. */
1683 	if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block))
1684 		goto out1;
1685 
1686 	/* Fail if we just went off the right edge of the tree. */
1687 	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1688 	if (xfs_btree_ptr_is_null(cur, &ptr))
1689 		goto out0;
1690 
1691 	XFS_BTREE_STATS_INC(cur, increment);
1692 
1693 	/*
1694 	 * March up the tree incrementing pointers.
1695 	 * Stop when we don't go off the right edge of a block.
1696 	 */
1697 	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1698 		block = xfs_btree_get_block(cur, lev, &bp);
1699 
1700 #ifdef DEBUG
1701 		error = xfs_btree_check_block(cur, block, lev, bp);
1702 		if (error)
1703 			goto error0;
1704 #endif
1705 
1706 		if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block))
1707 			break;
1708 
1709 		/* Read-ahead the right block for the next loop. */
1710 		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1711 	}
1712 
1713 	/*
1714 	 * If we went off the root then we are either seriously
1715 	 * confused or have the tree root in an inode.
1716 	 */
1717 	if (lev == cur->bc_nlevels) {
1718 		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
1719 			goto out0;
1720 		ASSERT(0);
1721 		xfs_btree_mark_sick(cur);
1722 		error = -EFSCORRUPTED;
1723 		goto error0;
1724 	}
1725 	ASSERT(lev < cur->bc_nlevels);
1726 
1727 	/*
1728 	 * Now walk back down the tree, fixing up the cursor's buffer
1729 	 * pointers and key numbers.
1730 	 */
1731 	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1732 		union xfs_btree_ptr	*ptrp;
1733 
1734 		ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
1735 		--lev;
1736 		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1737 		if (error)
1738 			goto error0;
1739 
1740 		xfs_btree_setbuf(cur, lev, bp);
1741 		cur->bc_levels[lev].ptr = 1;
1742 	}
1743 out1:
1744 	*stat = 1;
1745 	return 0;
1746 
1747 out0:
1748 	*stat = 0;
1749 	return 0;
1750 
1751 error0:
1752 	return error;
1753 }
1754 
1755 /*
1756  * Decrement cursor by one record at the level.
1757  * For nonzero levels the leaf-ward information is untouched.
1758  */
1759 int						/* error */
1760 xfs_btree_decrement(
1761 	struct xfs_btree_cur	*cur,
1762 	int			level,
1763 	int			*stat)		/* success/failure */
1764 {
1765 	struct xfs_btree_block	*block;
1766 	struct xfs_buf		*bp;
1767 	int			error;		/* error return value */
1768 	int			lev;
1769 	union xfs_btree_ptr	ptr;
1770 
1771 	ASSERT(level < cur->bc_nlevels);
1772 
1773 	/* Read-ahead to the left at this level. */
1774 	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1775 
1776 	/* We're done if we remain in the block after the decrement. */
1777 	if (--cur->bc_levels[level].ptr > 0)
1778 		goto out1;
1779 
1780 	/* Get a pointer to the btree block. */
1781 	block = xfs_btree_get_block(cur, level, &bp);
1782 
1783 #ifdef DEBUG
1784 	error = xfs_btree_check_block(cur, block, level, bp);
1785 	if (error)
1786 		goto error0;
1787 #endif
1788 
1789 	/* Fail if we just went off the left edge of the tree. */
1790 	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1791 	if (xfs_btree_ptr_is_null(cur, &ptr))
1792 		goto out0;
1793 
1794 	XFS_BTREE_STATS_INC(cur, decrement);
1795 
1796 	/*
1797 	 * March up the tree decrementing pointers.
1798 	 * Stop when we don't go off the left edge of a block.
1799 	 */
1800 	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1801 		if (--cur->bc_levels[lev].ptr > 0)
1802 			break;
1803 		/* Read-ahead the left block for the next loop. */
1804 		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1805 	}
1806 
1807 	/*
1808 	 * If we went off the root then we are seriously confused.
1809 	 * or the root of the tree is in an inode.
1810 	 */
1811 	if (lev == cur->bc_nlevels) {
1812 		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
1813 			goto out0;
1814 		ASSERT(0);
1815 		xfs_btree_mark_sick(cur);
1816 		error = -EFSCORRUPTED;
1817 		goto error0;
1818 	}
1819 	ASSERT(lev < cur->bc_nlevels);
1820 
1821 	/*
1822 	 * Now walk back down the tree, fixing up the cursor's buffer
1823 	 * pointers and key numbers.
1824 	 */
1825 	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1826 		union xfs_btree_ptr	*ptrp;
1827 
1828 		ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
1829 		--lev;
1830 		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1831 		if (error)
1832 			goto error0;
1833 		xfs_btree_setbuf(cur, lev, bp);
1834 		cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block);
1835 	}
1836 out1:
1837 	*stat = 1;
1838 	return 0;
1839 
1840 out0:
1841 	*stat = 0;
1842 	return 0;
1843 
1844 error0:
1845 	return error;
1846 }
1847 
1848 /*
1849  * Check the btree block owner now that we have the context to know who the
1850  * real owner is.
1851  */
1852 static inline xfs_failaddr_t
1853 xfs_btree_check_block_owner(
1854 	struct xfs_btree_cur	*cur,
1855 	struct xfs_btree_block	*block)
1856 {
1857 	__u64			owner;
1858 
1859 	if (!xfs_has_crc(cur->bc_mp) ||
1860 	    (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER))
1861 		return NULL;
1862 
1863 	owner = xfs_btree_owner(cur);
1864 	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
1865 		if (be64_to_cpu(block->bb_u.l.bb_owner) != owner)
1866 			return __this_address;
1867 	} else {
1868 		if (be32_to_cpu(block->bb_u.s.bb_owner) != owner)
1869 			return __this_address;
1870 	}
1871 
1872 	return NULL;
1873 }
1874 
1875 int
1876 xfs_btree_lookup_get_block(
1877 	struct xfs_btree_cur		*cur,	/* btree cursor */
1878 	int				level,	/* level in the btree */
1879 	const union xfs_btree_ptr	*pp,	/* ptr to btree block */
1880 	struct xfs_btree_block		**blkp) /* return btree block */
1881 {
1882 	struct xfs_buf		*bp;	/* buffer pointer for btree block */
1883 	xfs_daddr_t		daddr;
1884 	int			error = 0;
1885 
1886 	/* special case the root block if in an inode */
1887 	if (xfs_btree_at_iroot(cur, level)) {
1888 		*blkp = xfs_btree_get_iroot(cur);
1889 		return 0;
1890 	}
1891 
1892 	/*
1893 	 * If the old buffer at this level for the disk address we are
1894 	 * looking for re-use it.
1895 	 *
1896 	 * Otherwise throw it away and get a new one.
1897 	 */
1898 	bp = cur->bc_levels[level].bp;
1899 	error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
1900 	if (error)
1901 		return error;
1902 	if (bp && xfs_buf_daddr(bp) == daddr) {
1903 		*blkp = XFS_BUF_TO_BLOCK(bp);
1904 		return 0;
1905 	}
1906 
1907 	error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1908 	if (error)
1909 		return error;
1910 
1911 	/* Check the inode owner since the verifiers don't. */
1912 	if (xfs_btree_check_block_owner(cur, *blkp) != NULL)
1913 		goto out_bad;
1914 
1915 	/* Did we get the level we were looking for? */
1916 	if (be16_to_cpu((*blkp)->bb_level) != level)
1917 		goto out_bad;
1918 
1919 	/* Check that internal nodes have at least one record. */
1920 	if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
1921 		goto out_bad;
1922 
1923 	xfs_btree_setbuf(cur, level, bp);
1924 	return 0;
1925 
1926 out_bad:
1927 	*blkp = NULL;
1928 	xfs_buf_mark_corrupt(bp);
1929 	xfs_trans_brelse(cur->bc_tp, bp);
1930 	xfs_btree_mark_sick(cur);
1931 	return -EFSCORRUPTED;
1932 }
1933 
1934 /*
1935  * Get current search key.  For level 0 we don't actually have a key
1936  * structure so we make one up from the record.  For all other levels
1937  * we just return the right key.
1938  */
1939 STATIC union xfs_btree_key *
1940 xfs_lookup_get_search_key(
1941 	struct xfs_btree_cur	*cur,
1942 	int			level,
1943 	int			keyno,
1944 	struct xfs_btree_block	*block,
1945 	union xfs_btree_key	*kp)
1946 {
1947 	if (level == 0) {
1948 		cur->bc_ops->init_key_from_rec(kp,
1949 				xfs_btree_rec_addr(cur, keyno, block));
1950 		return kp;
1951 	}
1952 
1953 	return xfs_btree_key_addr(cur, keyno, block);
1954 }
1955 
1956 /*
1957  * Initialize a pointer to the root block.
1958  */
1959 void
1960 xfs_btree_init_ptr_from_cur(
1961 	struct xfs_btree_cur	*cur,
1962 	union xfs_btree_ptr	*ptr)
1963 {
1964 	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
1965 		/*
1966 		 * Inode-rooted btrees call xfs_btree_get_iroot to find the root
1967 		 * in xfs_btree_lookup_get_block and don't need a pointer here.
1968 		 */
1969 		ptr->l = 0;
1970 	} else if (cur->bc_flags & XFS_BTREE_STAGING) {
1971 		ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root);
1972 	} else {
1973 		cur->bc_ops->init_ptr_from_cur(cur, ptr);
1974 	}
1975 }
1976 
1977 /*
1978  * Lookup the record.  The cursor is made to point to it, based on dir.
1979  * stat is set to 0 if can't find any such record, 1 for success.
1980  */
1981 int					/* error */
1982 xfs_btree_lookup(
1983 	struct xfs_btree_cur	*cur,	/* btree cursor */
1984 	xfs_lookup_t		dir,	/* <=, ==, or >= */
1985 	int			*stat)	/* success/failure */
1986 {
1987 	struct xfs_btree_block	*block;	/* current btree block */
1988 	int64_t			diff;	/* difference for the current key */
1989 	int			error;	/* error return value */
1990 	int			keyno;	/* current key number */
1991 	int			level;	/* level in the btree */
1992 	union xfs_btree_ptr	*pp;	/* ptr to btree block */
1993 	union xfs_btree_ptr	ptr;	/* ptr to btree block */
1994 
1995 	XFS_BTREE_STATS_INC(cur, lookup);
1996 
1997 	/* No such thing as a zero-level tree. */
1998 	if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) {
1999 		xfs_btree_mark_sick(cur);
2000 		return -EFSCORRUPTED;
2001 	}
2002 
2003 	block = NULL;
2004 	keyno = 0;
2005 
2006 	/* initialise start pointer from cursor */
2007 	xfs_btree_init_ptr_from_cur(cur, &ptr);
2008 	pp = &ptr;
2009 
2010 	/*
2011 	 * Iterate over each level in the btree, starting at the root.
2012 	 * For each level above the leaves, find the key we need, based
2013 	 * on the lookup record, then follow the corresponding block
2014 	 * pointer down to the next level.
2015 	 */
2016 	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
2017 		/* Get the block we need to do the lookup on. */
2018 		error = xfs_btree_lookup_get_block(cur, level, pp, &block);
2019 		if (error)
2020 			goto error0;
2021 
2022 		if (diff == 0) {
2023 			/*
2024 			 * If we already had a key match at a higher level, we
2025 			 * know we need to use the first entry in this block.
2026 			 */
2027 			keyno = 1;
2028 		} else {
2029 			/* Otherwise search this block. Do a binary search. */
2030 
2031 			int	high;	/* high entry number */
2032 			int	low;	/* low entry number */
2033 
2034 			/* Set low and high entry numbers, 1-based. */
2035 			low = 1;
2036 			high = xfs_btree_get_numrecs(block);
2037 			if (!high) {
2038 				/* Block is empty, must be an empty leaf. */
2039 				if (level != 0 || cur->bc_nlevels != 1) {
2040 					XFS_CORRUPTION_ERROR(__func__,
2041 							XFS_ERRLEVEL_LOW,
2042 							cur->bc_mp, block,
2043 							sizeof(*block));
2044 					xfs_btree_mark_sick(cur);
2045 					return -EFSCORRUPTED;
2046 				}
2047 
2048 				cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE;
2049 				*stat = 0;
2050 				return 0;
2051 			}
2052 
2053 			/* Binary search the block. */
2054 			while (low <= high) {
2055 				union xfs_btree_key	key;
2056 				union xfs_btree_key	*kp;
2057 
2058 				XFS_BTREE_STATS_INC(cur, compare);
2059 
2060 				/* keyno is average of low and high. */
2061 				keyno = (low + high) >> 1;
2062 
2063 				/* Get current search key */
2064 				kp = xfs_lookup_get_search_key(cur, level,
2065 						keyno, block, &key);
2066 
2067 				/*
2068 				 * Compute difference to get next direction:
2069 				 *  - less than, move right
2070 				 *  - greater than, move left
2071 				 *  - equal, we're done
2072 				 */
2073 				diff = cur->bc_ops->key_diff(cur, kp);
2074 				if (diff < 0)
2075 					low = keyno + 1;
2076 				else if (diff > 0)
2077 					high = keyno - 1;
2078 				else
2079 					break;
2080 			}
2081 		}
2082 
2083 		/*
2084 		 * If there are more levels, set up for the next level
2085 		 * by getting the block number and filling in the cursor.
2086 		 */
2087 		if (level > 0) {
2088 			/*
2089 			 * If we moved left, need the previous key number,
2090 			 * unless there isn't one.
2091 			 */
2092 			if (diff > 0 && --keyno < 1)
2093 				keyno = 1;
2094 			pp = xfs_btree_ptr_addr(cur, keyno, block);
2095 
2096 			error = xfs_btree_debug_check_ptr(cur, pp, 0, level);
2097 			if (error)
2098 				goto error0;
2099 
2100 			cur->bc_levels[level].ptr = keyno;
2101 		}
2102 	}
2103 
2104 	/* Done with the search. See if we need to adjust the results. */
2105 	if (dir != XFS_LOOKUP_LE && diff < 0) {
2106 		keyno++;
2107 		/*
2108 		 * If ge search and we went off the end of the block, but it's
2109 		 * not the last block, we're in the wrong block.
2110 		 */
2111 		xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
2112 		if (dir == XFS_LOOKUP_GE &&
2113 		    keyno > xfs_btree_get_numrecs(block) &&
2114 		    !xfs_btree_ptr_is_null(cur, &ptr)) {
2115 			int	i;
2116 
2117 			cur->bc_levels[0].ptr = keyno;
2118 			error = xfs_btree_increment(cur, 0, &i);
2119 			if (error)
2120 				goto error0;
2121 			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
2122 				xfs_btree_mark_sick(cur);
2123 				return -EFSCORRUPTED;
2124 			}
2125 			*stat = 1;
2126 			return 0;
2127 		}
2128 	} else if (dir == XFS_LOOKUP_LE && diff > 0)
2129 		keyno--;
2130 	cur->bc_levels[0].ptr = keyno;
2131 
2132 	/* Return if we succeeded or not. */
2133 	if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
2134 		*stat = 0;
2135 	else if (dir != XFS_LOOKUP_EQ || diff == 0)
2136 		*stat = 1;
2137 	else
2138 		*stat = 0;
2139 	return 0;
2140 
2141 error0:
2142 	return error;
2143 }
2144 
2145 /* Find the high key storage area from a regular key. */
2146 union xfs_btree_key *
2147 xfs_btree_high_key_from_key(
2148 	struct xfs_btree_cur	*cur,
2149 	union xfs_btree_key	*key)
2150 {
2151 	ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING);
2152 	return (union xfs_btree_key *)((char *)key +
2153 			(cur->bc_ops->key_len / 2));
2154 }
2155 
2156 /* Determine the low (and high if overlapped) keys of a leaf block */
2157 STATIC void
2158 xfs_btree_get_leaf_keys(
2159 	struct xfs_btree_cur	*cur,
2160 	struct xfs_btree_block	*block,
2161 	union xfs_btree_key	*key)
2162 {
2163 	union xfs_btree_key	max_hkey;
2164 	union xfs_btree_key	hkey;
2165 	union xfs_btree_rec	*rec;
2166 	union xfs_btree_key	*high;
2167 	int			n;
2168 
2169 	rec = xfs_btree_rec_addr(cur, 1, block);
2170 	cur->bc_ops->init_key_from_rec(key, rec);
2171 
2172 	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
2173 
2174 		cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
2175 		for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2176 			rec = xfs_btree_rec_addr(cur, n, block);
2177 			cur->bc_ops->init_high_key_from_rec(&hkey, rec);
2178 			if (xfs_btree_keycmp_gt(cur, &hkey, &max_hkey))
2179 				max_hkey = hkey;
2180 		}
2181 
2182 		high = xfs_btree_high_key_from_key(cur, key);
2183 		memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
2184 	}
2185 }
2186 
2187 /* Determine the low (and high if overlapped) keys of a node block */
2188 STATIC void
2189 xfs_btree_get_node_keys(
2190 	struct xfs_btree_cur	*cur,
2191 	struct xfs_btree_block	*block,
2192 	union xfs_btree_key	*key)
2193 {
2194 	union xfs_btree_key	*hkey;
2195 	union xfs_btree_key	*max_hkey;
2196 	union xfs_btree_key	*high;
2197 	int			n;
2198 
2199 	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
2200 		memcpy(key, xfs_btree_key_addr(cur, 1, block),
2201 				cur->bc_ops->key_len / 2);
2202 
2203 		max_hkey = xfs_btree_high_key_addr(cur, 1, block);
2204 		for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2205 			hkey = xfs_btree_high_key_addr(cur, n, block);
2206 			if (xfs_btree_keycmp_gt(cur, hkey, max_hkey))
2207 				max_hkey = hkey;
2208 		}
2209 
2210 		high = xfs_btree_high_key_from_key(cur, key);
2211 		memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
2212 	} else {
2213 		memcpy(key, xfs_btree_key_addr(cur, 1, block),
2214 				cur->bc_ops->key_len);
2215 	}
2216 }
2217 
2218 /* Derive the keys for any btree block. */
2219 void
2220 xfs_btree_get_keys(
2221 	struct xfs_btree_cur	*cur,
2222 	struct xfs_btree_block	*block,
2223 	union xfs_btree_key	*key)
2224 {
2225 	if (be16_to_cpu(block->bb_level) == 0)
2226 		xfs_btree_get_leaf_keys(cur, block, key);
2227 	else
2228 		xfs_btree_get_node_keys(cur, block, key);
2229 }
2230 
2231 /*
2232  * Decide if we need to update the parent keys of a btree block.  For
2233  * a standard btree this is only necessary if we're updating the first
2234  * record/key.  For an overlapping btree, we must always update the
2235  * keys because the highest key can be in any of the records or keys
2236  * in the block.
2237  */
2238 static inline bool
2239 xfs_btree_needs_key_update(
2240 	struct xfs_btree_cur	*cur,
2241 	int			ptr)
2242 {
2243 	return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1;
2244 }
2245 
2246 /*
2247  * Update the low and high parent keys of the given level, progressing
2248  * towards the root.  If force_all is false, stop if the keys for a given
2249  * level do not need updating.
2250  */
2251 STATIC int
2252 __xfs_btree_updkeys(
2253 	struct xfs_btree_cur	*cur,
2254 	int			level,
2255 	struct xfs_btree_block	*block,
2256 	struct xfs_buf		*bp0,
2257 	bool			force_all)
2258 {
2259 	union xfs_btree_key	key;	/* keys from current level */
2260 	union xfs_btree_key	*lkey;	/* keys from the next level up */
2261 	union xfs_btree_key	*hkey;
2262 	union xfs_btree_key	*nlkey;	/* keys from the next level up */
2263 	union xfs_btree_key	*nhkey;
2264 	struct xfs_buf		*bp;
2265 	int			ptr;
2266 
2267 	ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING);
2268 
2269 	/* Exit if there aren't any parent levels to update. */
2270 	if (level + 1 >= cur->bc_nlevels)
2271 		return 0;
2272 
2273 	trace_xfs_btree_updkeys(cur, level, bp0);
2274 
2275 	lkey = &key;
2276 	hkey = xfs_btree_high_key_from_key(cur, lkey);
2277 	xfs_btree_get_keys(cur, block, lkey);
2278 	for (level++; level < cur->bc_nlevels; level++) {
2279 #ifdef DEBUG
2280 		int		error;
2281 #endif
2282 		block = xfs_btree_get_block(cur, level, &bp);
2283 		trace_xfs_btree_updkeys(cur, level, bp);
2284 #ifdef DEBUG
2285 		error = xfs_btree_check_block(cur, block, level, bp);
2286 		if (error)
2287 			return error;
2288 #endif
2289 		ptr = cur->bc_levels[level].ptr;
2290 		nlkey = xfs_btree_key_addr(cur, ptr, block);
2291 		nhkey = xfs_btree_high_key_addr(cur, ptr, block);
2292 		if (!force_all &&
2293 		    xfs_btree_keycmp_eq(cur, nlkey, lkey) &&
2294 		    xfs_btree_keycmp_eq(cur, nhkey, hkey))
2295 			break;
2296 		xfs_btree_copy_keys(cur, nlkey, lkey, 1);
2297 		xfs_btree_log_keys(cur, bp, ptr, ptr);
2298 		if (level + 1 >= cur->bc_nlevels)
2299 			break;
2300 		xfs_btree_get_node_keys(cur, block, lkey);
2301 	}
2302 
2303 	return 0;
2304 }
2305 
2306 /* Update all the keys from some level in cursor back to the root. */
2307 STATIC int
2308 xfs_btree_updkeys_force(
2309 	struct xfs_btree_cur	*cur,
2310 	int			level)
2311 {
2312 	struct xfs_buf		*bp;
2313 	struct xfs_btree_block	*block;
2314 
2315 	block = xfs_btree_get_block(cur, level, &bp);
2316 	return __xfs_btree_updkeys(cur, level, block, bp, true);
2317 }
2318 
2319 /*
2320  * Update the parent keys of the given level, progressing towards the root.
2321  */
2322 STATIC int
2323 xfs_btree_update_keys(
2324 	struct xfs_btree_cur	*cur,
2325 	int			level)
2326 {
2327 	struct xfs_btree_block	*block;
2328 	struct xfs_buf		*bp;
2329 	union xfs_btree_key	*kp;
2330 	union xfs_btree_key	key;
2331 	int			ptr;
2332 
2333 	ASSERT(level >= 0);
2334 
2335 	block = xfs_btree_get_block(cur, level, &bp);
2336 	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)
2337 		return __xfs_btree_updkeys(cur, level, block, bp, false);
2338 
2339 	/*
2340 	 * Go up the tree from this level toward the root.
2341 	 * At each level, update the key value to the value input.
2342 	 * Stop when we reach a level where the cursor isn't pointing
2343 	 * at the first entry in the block.
2344 	 */
2345 	xfs_btree_get_keys(cur, block, &key);
2346 	for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
2347 #ifdef DEBUG
2348 		int		error;
2349 #endif
2350 		block = xfs_btree_get_block(cur, level, &bp);
2351 #ifdef DEBUG
2352 		error = xfs_btree_check_block(cur, block, level, bp);
2353 		if (error)
2354 			return error;
2355 #endif
2356 		ptr = cur->bc_levels[level].ptr;
2357 		kp = xfs_btree_key_addr(cur, ptr, block);
2358 		xfs_btree_copy_keys(cur, kp, &key, 1);
2359 		xfs_btree_log_keys(cur, bp, ptr, ptr);
2360 	}
2361 
2362 	return 0;
2363 }
2364 
2365 /*
2366  * Update the record referred to by cur to the value in the
2367  * given record. This either works (return 0) or gets an
2368  * EFSCORRUPTED error.
2369  */
2370 int
2371 xfs_btree_update(
2372 	struct xfs_btree_cur	*cur,
2373 	union xfs_btree_rec	*rec)
2374 {
2375 	struct xfs_btree_block	*block;
2376 	struct xfs_buf		*bp;
2377 	int			error;
2378 	int			ptr;
2379 	union xfs_btree_rec	*rp;
2380 
2381 	/* Pick up the current block. */
2382 	block = xfs_btree_get_block(cur, 0, &bp);
2383 
2384 #ifdef DEBUG
2385 	error = xfs_btree_check_block(cur, block, 0, bp);
2386 	if (error)
2387 		goto error0;
2388 #endif
2389 	/* Get the address of the rec to be updated. */
2390 	ptr = cur->bc_levels[0].ptr;
2391 	rp = xfs_btree_rec_addr(cur, ptr, block);
2392 
2393 	/* Fill in the new contents and log them. */
2394 	xfs_btree_copy_recs(cur, rp, rec, 1);
2395 	xfs_btree_log_recs(cur, bp, ptr, ptr);
2396 
2397 	/* Pass new key value up to our parent. */
2398 	if (xfs_btree_needs_key_update(cur, ptr)) {
2399 		error = xfs_btree_update_keys(cur, 0);
2400 		if (error)
2401 			goto error0;
2402 	}
2403 
2404 	return 0;
2405 
2406 error0:
2407 	return error;
2408 }
2409 
2410 /*
2411  * Move 1 record left from cur/level if possible.
2412  * Update cur to reflect the new path.
2413  */
2414 STATIC int					/* error */
2415 xfs_btree_lshift(
2416 	struct xfs_btree_cur	*cur,
2417 	int			level,
2418 	int			*stat)		/* success/failure */
2419 {
2420 	struct xfs_buf		*lbp;		/* left buffer pointer */
2421 	struct xfs_btree_block	*left;		/* left btree block */
2422 	int			lrecs;		/* left record count */
2423 	struct xfs_buf		*rbp;		/* right buffer pointer */
2424 	struct xfs_btree_block	*right;		/* right btree block */
2425 	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
2426 	int			rrecs;		/* right record count */
2427 	union xfs_btree_ptr	lptr;		/* left btree pointer */
2428 	union xfs_btree_key	*rkp = NULL;	/* right btree key */
2429 	union xfs_btree_ptr	*rpp = NULL;	/* right address pointer */
2430 	union xfs_btree_rec	*rrp = NULL;	/* right record pointer */
2431 	int			error;		/* error return value */
2432 	int			i;
2433 
2434 	if (xfs_btree_at_iroot(cur, level))
2435 		goto out0;
2436 
2437 	/* Set up variables for this block as "right". */
2438 	right = xfs_btree_get_block(cur, level, &rbp);
2439 
2440 #ifdef DEBUG
2441 	error = xfs_btree_check_block(cur, right, level, rbp);
2442 	if (error)
2443 		goto error0;
2444 #endif
2445 
2446 	/* If we've got no left sibling then we can't shift an entry left. */
2447 	xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2448 	if (xfs_btree_ptr_is_null(cur, &lptr))
2449 		goto out0;
2450 
2451 	/*
2452 	 * If the cursor entry is the one that would be moved, don't
2453 	 * do it... it's too complicated.
2454 	 */
2455 	if (cur->bc_levels[level].ptr <= 1)
2456 		goto out0;
2457 
2458 	/* Set up the left neighbor as "left". */
2459 	error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2460 	if (error)
2461 		goto error0;
2462 
2463 	/* If it's full, it can't take another entry. */
2464 	lrecs = xfs_btree_get_numrecs(left);
2465 	if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2466 		goto out0;
2467 
2468 	rrecs = xfs_btree_get_numrecs(right);
2469 
2470 	/*
2471 	 * We add one entry to the left side and remove one for the right side.
2472 	 * Account for it here, the changes will be updated on disk and logged
2473 	 * later.
2474 	 */
2475 	lrecs++;
2476 	rrecs--;
2477 
2478 	XFS_BTREE_STATS_INC(cur, lshift);
2479 	XFS_BTREE_STATS_ADD(cur, moves, 1);
2480 
2481 	/*
2482 	 * If non-leaf, copy a key and a ptr to the left block.
2483 	 * Log the changes to the left block.
2484 	 */
2485 	if (level > 0) {
2486 		/* It's a non-leaf.  Move keys and pointers. */
2487 		union xfs_btree_key	*lkp;	/* left btree key */
2488 		union xfs_btree_ptr	*lpp;	/* left address pointer */
2489 
2490 		lkp = xfs_btree_key_addr(cur, lrecs, left);
2491 		rkp = xfs_btree_key_addr(cur, 1, right);
2492 
2493 		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2494 		rpp = xfs_btree_ptr_addr(cur, 1, right);
2495 
2496 		error = xfs_btree_debug_check_ptr(cur, rpp, 0, level);
2497 		if (error)
2498 			goto error0;
2499 
2500 		xfs_btree_copy_keys(cur, lkp, rkp, 1);
2501 		xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2502 
2503 		xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2504 		xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2505 
2506 		ASSERT(cur->bc_ops->keys_inorder(cur,
2507 			xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2508 	} else {
2509 		/* It's a leaf.  Move records.  */
2510 		union xfs_btree_rec	*lrp;	/* left record pointer */
2511 
2512 		lrp = xfs_btree_rec_addr(cur, lrecs, left);
2513 		rrp = xfs_btree_rec_addr(cur, 1, right);
2514 
2515 		xfs_btree_copy_recs(cur, lrp, rrp, 1);
2516 		xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2517 
2518 		ASSERT(cur->bc_ops->recs_inorder(cur,
2519 			xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2520 	}
2521 
2522 	xfs_btree_set_numrecs(left, lrecs);
2523 	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2524 
2525 	xfs_btree_set_numrecs(right, rrecs);
2526 	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2527 
2528 	/*
2529 	 * Slide the contents of right down one entry.
2530 	 */
2531 	XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2532 	if (level > 0) {
2533 		/* It's a nonleaf. operate on keys and ptrs */
2534 		for (i = 0; i < rrecs; i++) {
2535 			error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
2536 			if (error)
2537 				goto error0;
2538 		}
2539 
2540 		xfs_btree_shift_keys(cur,
2541 				xfs_btree_key_addr(cur, 2, right),
2542 				-1, rrecs);
2543 		xfs_btree_shift_ptrs(cur,
2544 				xfs_btree_ptr_addr(cur, 2, right),
2545 				-1, rrecs);
2546 
2547 		xfs_btree_log_keys(cur, rbp, 1, rrecs);
2548 		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2549 	} else {
2550 		/* It's a leaf. operate on records */
2551 		xfs_btree_shift_recs(cur,
2552 			xfs_btree_rec_addr(cur, 2, right),
2553 			-1, rrecs);
2554 		xfs_btree_log_recs(cur, rbp, 1, rrecs);
2555 	}
2556 
2557 	/*
2558 	 * Using a temporary cursor, update the parent key values of the
2559 	 * block on the left.
2560 	 */
2561 	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
2562 		error = xfs_btree_dup_cursor(cur, &tcur);
2563 		if (error)
2564 			goto error0;
2565 		i = xfs_btree_firstrec(tcur, level);
2566 		if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
2567 			xfs_btree_mark_sick(cur);
2568 			error = -EFSCORRUPTED;
2569 			goto error0;
2570 		}
2571 
2572 		error = xfs_btree_decrement(tcur, level, &i);
2573 		if (error)
2574 			goto error1;
2575 
2576 		/* Update the parent high keys of the left block, if needed. */
2577 		error = xfs_btree_update_keys(tcur, level);
2578 		if (error)
2579 			goto error1;
2580 
2581 		xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2582 	}
2583 
2584 	/* Update the parent keys of the right block. */
2585 	error = xfs_btree_update_keys(cur, level);
2586 	if (error)
2587 		goto error0;
2588 
2589 	/* Slide the cursor value left one. */
2590 	cur->bc_levels[level].ptr--;
2591 
2592 	*stat = 1;
2593 	return 0;
2594 
2595 out0:
2596 	*stat = 0;
2597 	return 0;
2598 
2599 error0:
2600 	return error;
2601 
2602 error1:
2603 	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2604 	return error;
2605 }
2606 
2607 /*
2608  * Move 1 record right from cur/level if possible.
2609  * Update cur to reflect the new path.
2610  */
2611 STATIC int					/* error */
2612 xfs_btree_rshift(
2613 	struct xfs_btree_cur	*cur,
2614 	int			level,
2615 	int			*stat)		/* success/failure */
2616 {
2617 	struct xfs_buf		*lbp;		/* left buffer pointer */
2618 	struct xfs_btree_block	*left;		/* left btree block */
2619 	struct xfs_buf		*rbp;		/* right buffer pointer */
2620 	struct xfs_btree_block	*right;		/* right btree block */
2621 	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
2622 	union xfs_btree_ptr	rptr;		/* right block pointer */
2623 	union xfs_btree_key	*rkp;		/* right btree key */
2624 	int			rrecs;		/* right record count */
2625 	int			lrecs;		/* left record count */
2626 	int			error;		/* error return value */
2627 	int			i;		/* loop counter */
2628 
2629 	if (xfs_btree_at_iroot(cur, level))
2630 		goto out0;
2631 
2632 	/* Set up variables for this block as "left". */
2633 	left = xfs_btree_get_block(cur, level, &lbp);
2634 
2635 #ifdef DEBUG
2636 	error = xfs_btree_check_block(cur, left, level, lbp);
2637 	if (error)
2638 		goto error0;
2639 #endif
2640 
2641 	/* If we've got no right sibling then we can't shift an entry right. */
2642 	xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2643 	if (xfs_btree_ptr_is_null(cur, &rptr))
2644 		goto out0;
2645 
2646 	/*
2647 	 * If the cursor entry is the one that would be moved, don't
2648 	 * do it... it's too complicated.
2649 	 */
2650 	lrecs = xfs_btree_get_numrecs(left);
2651 	if (cur->bc_levels[level].ptr >= lrecs)
2652 		goto out0;
2653 
2654 	/* Set up the right neighbor as "right". */
2655 	error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2656 	if (error)
2657 		goto error0;
2658 
2659 	/* If it's full, it can't take another entry. */
2660 	rrecs = xfs_btree_get_numrecs(right);
2661 	if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2662 		goto out0;
2663 
2664 	XFS_BTREE_STATS_INC(cur, rshift);
2665 	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2666 
2667 	/*
2668 	 * Make a hole at the start of the right neighbor block, then
2669 	 * copy the last left block entry to the hole.
2670 	 */
2671 	if (level > 0) {
2672 		/* It's a nonleaf. make a hole in the keys and ptrs */
2673 		union xfs_btree_key	*lkp;
2674 		union xfs_btree_ptr	*lpp;
2675 		union xfs_btree_ptr	*rpp;
2676 
2677 		lkp = xfs_btree_key_addr(cur, lrecs, left);
2678 		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2679 		rkp = xfs_btree_key_addr(cur, 1, right);
2680 		rpp = xfs_btree_ptr_addr(cur, 1, right);
2681 
2682 		for (i = rrecs - 1; i >= 0; i--) {
2683 			error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
2684 			if (error)
2685 				goto error0;
2686 		}
2687 
2688 		xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2689 		xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2690 
2691 		error = xfs_btree_debug_check_ptr(cur, lpp, 0, level);
2692 		if (error)
2693 			goto error0;
2694 
2695 		/* Now put the new data in, and log it. */
2696 		xfs_btree_copy_keys(cur, rkp, lkp, 1);
2697 		xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2698 
2699 		xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2700 		xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2701 
2702 		ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2703 			xfs_btree_key_addr(cur, 2, right)));
2704 	} else {
2705 		/* It's a leaf. make a hole in the records */
2706 		union xfs_btree_rec	*lrp;
2707 		union xfs_btree_rec	*rrp;
2708 
2709 		lrp = xfs_btree_rec_addr(cur, lrecs, left);
2710 		rrp = xfs_btree_rec_addr(cur, 1, right);
2711 
2712 		xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2713 
2714 		/* Now put the new data in, and log it. */
2715 		xfs_btree_copy_recs(cur, rrp, lrp, 1);
2716 		xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2717 	}
2718 
2719 	/*
2720 	 * Decrement and log left's numrecs, bump and log right's numrecs.
2721 	 */
2722 	xfs_btree_set_numrecs(left, --lrecs);
2723 	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2724 
2725 	xfs_btree_set_numrecs(right, ++rrecs);
2726 	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2727 
2728 	/*
2729 	 * Using a temporary cursor, update the parent key values of the
2730 	 * block on the right.
2731 	 */
2732 	error = xfs_btree_dup_cursor(cur, &tcur);
2733 	if (error)
2734 		goto error0;
2735 	i = xfs_btree_lastrec(tcur, level);
2736 	if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
2737 		xfs_btree_mark_sick(cur);
2738 		error = -EFSCORRUPTED;
2739 		goto error0;
2740 	}
2741 
2742 	error = xfs_btree_increment(tcur, level, &i);
2743 	if (error)
2744 		goto error1;
2745 
2746 	/* Update the parent high keys of the left block, if needed. */
2747 	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
2748 		error = xfs_btree_update_keys(cur, level);
2749 		if (error)
2750 			goto error1;
2751 	}
2752 
2753 	/* Update the parent keys of the right block. */
2754 	error = xfs_btree_update_keys(tcur, level);
2755 	if (error)
2756 		goto error1;
2757 
2758 	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2759 
2760 	*stat = 1;
2761 	return 0;
2762 
2763 out0:
2764 	*stat = 0;
2765 	return 0;
2766 
2767 error0:
2768 	return error;
2769 
2770 error1:
2771 	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2772 	return error;
2773 }
2774 
2775 static inline int
2776 xfs_btree_alloc_block(
2777 	struct xfs_btree_cur		*cur,
2778 	const union xfs_btree_ptr	*hint_block,
2779 	union xfs_btree_ptr		*new_block,
2780 	int				*stat)
2781 {
2782 	int				error;
2783 
2784 	/*
2785 	 * Don't allow block allocation for a staging cursor, because staging
2786 	 * cursors do not support regular btree modifications.
2787 	 *
2788 	 * Bulk loading uses a separate callback to obtain new blocks from a
2789 	 * preallocated list, which prevents ENOSPC failures during loading.
2790 	 */
2791 	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
2792 		ASSERT(0);
2793 		return -EFSCORRUPTED;
2794 	}
2795 
2796 	error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat);
2797 	trace_xfs_btree_alloc_block(cur, new_block, *stat, error);
2798 	return error;
2799 }
2800 
2801 /*
2802  * Split cur/level block in half.
2803  * Return new block number and the key to its first
2804  * record (to be inserted into parent).
2805  */
2806 STATIC int					/* error */
2807 __xfs_btree_split(
2808 	struct xfs_btree_cur	*cur,
2809 	int			level,
2810 	union xfs_btree_ptr	*ptrp,
2811 	union xfs_btree_key	*key,
2812 	struct xfs_btree_cur	**curp,
2813 	int			*stat)		/* success/failure */
2814 {
2815 	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
2816 	struct xfs_buf		*lbp;		/* left buffer pointer */
2817 	struct xfs_btree_block	*left;		/* left btree block */
2818 	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
2819 	struct xfs_buf		*rbp;		/* right buffer pointer */
2820 	struct xfs_btree_block	*right;		/* right btree block */
2821 	union xfs_btree_ptr	rrptr;		/* right-right sibling ptr */
2822 	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
2823 	struct xfs_btree_block	*rrblock;	/* right-right btree block */
2824 	int			lrecs;
2825 	int			rrecs;
2826 	int			src_index;
2827 	int			error;		/* error return value */
2828 	int			i;
2829 
2830 	XFS_BTREE_STATS_INC(cur, split);
2831 
2832 	/* Set up left block (current one). */
2833 	left = xfs_btree_get_block(cur, level, &lbp);
2834 
2835 #ifdef DEBUG
2836 	error = xfs_btree_check_block(cur, left, level, lbp);
2837 	if (error)
2838 		goto error0;
2839 #endif
2840 
2841 	xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2842 
2843 	/* Allocate the new block. If we can't do it, we're toast. Give up. */
2844 	error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat);
2845 	if (error)
2846 		goto error0;
2847 	if (*stat == 0)
2848 		goto out0;
2849 	XFS_BTREE_STATS_INC(cur, alloc);
2850 
2851 	/* Set up the new block as "right". */
2852 	error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
2853 	if (error)
2854 		goto error0;
2855 
2856 	/* Fill in the btree header for the new right block. */
2857 	xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2858 
2859 	/*
2860 	 * Split the entries between the old and the new block evenly.
2861 	 * Make sure that if there's an odd number of entries now, that
2862 	 * each new block will have the same number of entries.
2863 	 */
2864 	lrecs = xfs_btree_get_numrecs(left);
2865 	rrecs = lrecs / 2;
2866 	if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1)
2867 		rrecs++;
2868 	src_index = (lrecs - rrecs + 1);
2869 
2870 	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2871 
2872 	/* Adjust numrecs for the later get_*_keys() calls. */
2873 	lrecs -= rrecs;
2874 	xfs_btree_set_numrecs(left, lrecs);
2875 	xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2876 
2877 	/*
2878 	 * Copy btree block entries from the left block over to the
2879 	 * new block, the right. Update the right block and log the
2880 	 * changes.
2881 	 */
2882 	if (level > 0) {
2883 		/* It's a non-leaf.  Move keys and pointers. */
2884 		union xfs_btree_key	*lkp;	/* left btree key */
2885 		union xfs_btree_ptr	*lpp;	/* left address pointer */
2886 		union xfs_btree_key	*rkp;	/* right btree key */
2887 		union xfs_btree_ptr	*rpp;	/* right address pointer */
2888 
2889 		lkp = xfs_btree_key_addr(cur, src_index, left);
2890 		lpp = xfs_btree_ptr_addr(cur, src_index, left);
2891 		rkp = xfs_btree_key_addr(cur, 1, right);
2892 		rpp = xfs_btree_ptr_addr(cur, 1, right);
2893 
2894 		for (i = src_index; i < rrecs; i++) {
2895 			error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
2896 			if (error)
2897 				goto error0;
2898 		}
2899 
2900 		/* Copy the keys & pointers to the new block. */
2901 		xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2902 		xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2903 
2904 		xfs_btree_log_keys(cur, rbp, 1, rrecs);
2905 		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2906 
2907 		/* Stash the keys of the new block for later insertion. */
2908 		xfs_btree_get_node_keys(cur, right, key);
2909 	} else {
2910 		/* It's a leaf.  Move records.  */
2911 		union xfs_btree_rec	*lrp;	/* left record pointer */
2912 		union xfs_btree_rec	*rrp;	/* right record pointer */
2913 
2914 		lrp = xfs_btree_rec_addr(cur, src_index, left);
2915 		rrp = xfs_btree_rec_addr(cur, 1, right);
2916 
2917 		/* Copy records to the new block. */
2918 		xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2919 		xfs_btree_log_recs(cur, rbp, 1, rrecs);
2920 
2921 		/* Stash the keys of the new block for later insertion. */
2922 		xfs_btree_get_leaf_keys(cur, right, key);
2923 	}
2924 
2925 	/*
2926 	 * Find the left block number by looking in the buffer.
2927 	 * Adjust sibling pointers.
2928 	 */
2929 	xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2930 	xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2931 	xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2932 	xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2933 
2934 	xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2935 	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2936 
2937 	/*
2938 	 * If there's a block to the new block's right, make that block
2939 	 * point back to right instead of to left.
2940 	 */
2941 	if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2942 		error = xfs_btree_read_buf_block(cur, &rrptr,
2943 							0, &rrblock, &rrbp);
2944 		if (error)
2945 			goto error0;
2946 		xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2947 		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2948 	}
2949 
2950 	/* Update the parent high keys of the left block, if needed. */
2951 	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
2952 		error = xfs_btree_update_keys(cur, level);
2953 		if (error)
2954 			goto error0;
2955 	}
2956 
2957 	/*
2958 	 * If the cursor is really in the right block, move it there.
2959 	 * If it's just pointing past the last entry in left, then we'll
2960 	 * insert there, so don't change anything in that case.
2961 	 */
2962 	if (cur->bc_levels[level].ptr > lrecs + 1) {
2963 		xfs_btree_setbuf(cur, level, rbp);
2964 		cur->bc_levels[level].ptr -= lrecs;
2965 	}
2966 	/*
2967 	 * If there are more levels, we'll need another cursor which refers
2968 	 * the right block, no matter where this cursor was.
2969 	 */
2970 	if (level + 1 < cur->bc_nlevels) {
2971 		error = xfs_btree_dup_cursor(cur, curp);
2972 		if (error)
2973 			goto error0;
2974 		(*curp)->bc_levels[level + 1].ptr++;
2975 	}
2976 	*ptrp = rptr;
2977 	*stat = 1;
2978 	return 0;
2979 out0:
2980 	*stat = 0;
2981 	return 0;
2982 
2983 error0:
2984 	return error;
2985 }
2986 
2987 #ifdef __KERNEL__
2988 struct xfs_btree_split_args {
2989 	struct xfs_btree_cur	*cur;
2990 	int			level;
2991 	union xfs_btree_ptr	*ptrp;
2992 	union xfs_btree_key	*key;
2993 	struct xfs_btree_cur	**curp;
2994 	int			*stat;		/* success/failure */
2995 	int			result;
2996 	bool			kswapd;	/* allocation in kswapd context */
2997 	struct completion	*done;
2998 	struct work_struct	work;
2999 };
3000 
3001 /*
3002  * Stack switching interfaces for allocation
3003  */
3004 static void
3005 xfs_btree_split_worker(
3006 	struct work_struct	*work)
3007 {
3008 	struct xfs_btree_split_args	*args = container_of(work,
3009 						struct xfs_btree_split_args, work);
3010 	unsigned long		pflags;
3011 	unsigned long		new_pflags = 0;
3012 
3013 	/*
3014 	 * we are in a transaction context here, but may also be doing work
3015 	 * in kswapd context, and hence we may need to inherit that state
3016 	 * temporarily to ensure that we don't block waiting for memory reclaim
3017 	 * in any way.
3018 	 */
3019 	if (args->kswapd)
3020 		new_pflags |= PF_MEMALLOC | PF_KSWAPD;
3021 
3022 	current_set_flags_nested(&pflags, new_pflags);
3023 	xfs_trans_set_context(args->cur->bc_tp);
3024 
3025 	args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
3026 					 args->key, args->curp, args->stat);
3027 
3028 	xfs_trans_clear_context(args->cur->bc_tp);
3029 	current_restore_flags_nested(&pflags, new_pflags);
3030 
3031 	/*
3032 	 * Do not access args after complete() has run here. We don't own args
3033 	 * and the owner may run and free args before we return here.
3034 	 */
3035 	complete(args->done);
3036 
3037 }
3038 
3039 /*
3040  * BMBT split requests often come in with little stack to work on so we push
3041  * them off to a worker thread so there is lots of stack to use. For the other
3042  * btree types, just call directly to avoid the context switch overhead here.
3043  *
3044  * Care must be taken here - the work queue rescuer thread introduces potential
3045  * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new
3046  * AGFs to allocate blocks. A task being run by the rescuer could attempt to
3047  * lock an AGF that is already locked by a task queued to run by the rescuer,
3048  * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to
3049  * release it until the current thread it is running gains the lock.
3050  *
3051  * To avoid this issue, we only ever queue BMBT splits that don't have an AGF
3052  * already locked to allocate from. The only place that doesn't hold an AGF
3053  * locked is unwritten extent conversion at IO completion, but that has already
3054  * been offloaded to a worker thread and hence has no stack consumption issues
3055  * we have to worry about.
3056  */
3057 STATIC int					/* error */
3058 xfs_btree_split(
3059 	struct xfs_btree_cur	*cur,
3060 	int			level,
3061 	union xfs_btree_ptr	*ptrp,
3062 	union xfs_btree_key	*key,
3063 	struct xfs_btree_cur	**curp,
3064 	int			*stat)		/* success/failure */
3065 {
3066 	struct xfs_btree_split_args	args;
3067 	DECLARE_COMPLETION_ONSTACK(done);
3068 
3069 	if (!xfs_btree_is_bmap(cur->bc_ops) ||
3070 	    cur->bc_tp->t_highest_agno == NULLAGNUMBER)
3071 		return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
3072 
3073 	args.cur = cur;
3074 	args.level = level;
3075 	args.ptrp = ptrp;
3076 	args.key = key;
3077 	args.curp = curp;
3078 	args.stat = stat;
3079 	args.done = &done;
3080 	args.kswapd = current_is_kswapd();
3081 	INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
3082 	queue_work(xfs_alloc_wq, &args.work);
3083 	wait_for_completion(&done);
3084 	destroy_work_on_stack(&args.work);
3085 	return args.result;
3086 }
3087 #else
3088 #define xfs_btree_split	__xfs_btree_split
3089 #endif /* __KERNEL__ */
3090 
3091 /* Move the records from a root leaf block to a separate block. */
3092 STATIC void
3093 xfs_btree_promote_leaf_iroot(
3094 	struct xfs_btree_cur	*cur,
3095 	struct xfs_btree_block	*block,
3096 	struct xfs_buf		*cbp,
3097 	union xfs_btree_ptr	*cptr,
3098 	struct xfs_btree_block	*cblock)
3099 {
3100 	union xfs_btree_rec	*rp;
3101 	union xfs_btree_rec	*crp;
3102 	union xfs_btree_key	*kp;
3103 	union xfs_btree_ptr	*pp;
3104 	struct xfs_btree_block	*broot;
3105 	int			numrecs = xfs_btree_get_numrecs(block);
3106 
3107 	/* Copy the records from the leaf broot into the new child block. */
3108 	rp = xfs_btree_rec_addr(cur, 1, block);
3109 	crp = xfs_btree_rec_addr(cur, 1, cblock);
3110 	xfs_btree_copy_recs(cur, crp, rp, numrecs);
3111 
3112 	/*
3113 	 * Increment the tree height.
3114 	 *
3115 	 * Trickery here: The amount of memory that we need per record for the
3116 	 * ifork's btree root block may change when we convert the broot from a
3117 	 * leaf to a node block.  Free the existing leaf broot so that nobody
3118 	 * thinks we need to migrate node pointers when we realloc the broot
3119 	 * buffer after bumping nlevels.
3120 	 */
3121 	cur->bc_ops->broot_realloc(cur, 0);
3122 	cur->bc_nlevels++;
3123 	cur->bc_levels[1].ptr = 1;
3124 
3125 	/*
3126 	 * Allocate a new node broot and initialize it to point to the new
3127 	 * child block.
3128 	 */
3129 	broot = cur->bc_ops->broot_realloc(cur, 1);
3130 	xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops,
3131 			cur->bc_nlevels - 1, 1, cur->bc_ino.ip->i_ino);
3132 
3133 	pp = xfs_btree_ptr_addr(cur, 1, broot);
3134 	kp = xfs_btree_key_addr(cur, 1, broot);
3135 	xfs_btree_copy_ptrs(cur, pp, cptr, 1);
3136 	xfs_btree_get_keys(cur, cblock, kp);
3137 
3138 	/* Attach the new block to the cursor and log it. */
3139 	xfs_btree_setbuf(cur, 0, cbp);
3140 	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
3141 	xfs_btree_log_recs(cur, cbp, 1, numrecs);
3142 }
3143 
3144 /*
3145  * Move the keys and pointers from a root block to a separate block.
3146  *
3147  * Since the keyptr size does not change, all we have to do is increase the
3148  * tree height, copy the keyptrs to the new internal node (cblock), shrink
3149  * the root, and copy the pointers there.
3150  */
3151 STATIC int
3152 xfs_btree_promote_node_iroot(
3153 	struct xfs_btree_cur	*cur,
3154 	struct xfs_btree_block	*block,
3155 	int			level,
3156 	struct xfs_buf		*cbp,
3157 	union xfs_btree_ptr	*cptr,
3158 	struct xfs_btree_block	*cblock)
3159 {
3160 	union xfs_btree_key	*ckp;
3161 	union xfs_btree_key	*kp;
3162 	union xfs_btree_ptr	*cpp;
3163 	union xfs_btree_ptr	*pp;
3164 	int			i;
3165 	int			error;
3166 	int			numrecs = xfs_btree_get_numrecs(block);
3167 
3168 	/*
3169 	 * Increase tree height, adjusting the root block level to match.
3170 	 * We cannot change the root btree node size until we've copied the
3171 	 * block contents to the new child block.
3172 	 */
3173 	be16_add_cpu(&block->bb_level, 1);
3174 	cur->bc_nlevels++;
3175 	cur->bc_levels[level + 1].ptr = 1;
3176 
3177 	/*
3178 	 * Adjust the root btree record count, then copy the keys from the old
3179 	 * root to the new child block.
3180 	 */
3181 	xfs_btree_set_numrecs(block, 1);
3182 	kp = xfs_btree_key_addr(cur, 1, block);
3183 	ckp = xfs_btree_key_addr(cur, 1, cblock);
3184 	xfs_btree_copy_keys(cur, ckp, kp, numrecs);
3185 
3186 	/* Check the pointers and copy them to the new child block. */
3187 	pp = xfs_btree_ptr_addr(cur, 1, block);
3188 	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3189 	for (i = 0; i < numrecs; i++) {
3190 		error = xfs_btree_debug_check_ptr(cur, pp, i, level);
3191 		if (error)
3192 			return error;
3193 	}
3194 	xfs_btree_copy_ptrs(cur, cpp, pp, numrecs);
3195 
3196 	/*
3197 	 * Set the first keyptr to point to the new child block, then shrink
3198 	 * the memory buffer for the root block.
3199 	 */
3200 	error = xfs_btree_debug_check_ptr(cur, cptr, 0, level);
3201 	if (error)
3202 		return error;
3203 	xfs_btree_copy_ptrs(cur, pp, cptr, 1);
3204 	xfs_btree_get_keys(cur, cblock, kp);
3205 
3206 	cur->bc_ops->broot_realloc(cur, 1);
3207 
3208 	/* Attach the new block to the cursor and log it. */
3209 	xfs_btree_setbuf(cur, level, cbp);
3210 	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
3211 	xfs_btree_log_keys(cur, cbp, 1, numrecs);
3212 	xfs_btree_log_ptrs(cur, cbp, 1, numrecs);
3213 	return 0;
3214 }
3215 
3216 /*
3217  * Copy the old inode root contents into a real block and make the
3218  * broot point to it.
3219  */
3220 int						/* error */
3221 xfs_btree_new_iroot(
3222 	struct xfs_btree_cur	*cur,		/* btree cursor */
3223 	int			*logflags,	/* logging flags for inode */
3224 	int			*stat)		/* return status - 0 fail */
3225 {
3226 	struct xfs_buf		*cbp;		/* buffer for cblock */
3227 	struct xfs_btree_block	*block;		/* btree block */
3228 	struct xfs_btree_block	*cblock;	/* child btree block */
3229 	union xfs_btree_ptr	aptr;
3230 	union xfs_btree_ptr	nptr;		/* new block addr */
3231 	int			level;		/* btree level */
3232 	int			error;		/* error return code */
3233 
3234 	XFS_BTREE_STATS_INC(cur, newroot);
3235 
3236 	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
3237 
3238 	level = cur->bc_nlevels - 1;
3239 
3240 	block = xfs_btree_get_iroot(cur);
3241 	ASSERT(level > 0 || (cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS));
3242 	if (level > 0)
3243 		aptr = *xfs_btree_ptr_addr(cur, 1, block);
3244 	else
3245 		aptr.l = cpu_to_be64(XFS_INO_TO_FSB(cur->bc_mp,
3246 				cur->bc_ino.ip->i_ino));
3247 
3248 	/* Allocate the new block. If we can't do it, we're toast. Give up. */
3249 	error = xfs_btree_alloc_block(cur, &aptr, &nptr, stat);
3250 	if (error)
3251 		goto error0;
3252 	if (*stat == 0)
3253 		return 0;
3254 
3255 	XFS_BTREE_STATS_INC(cur, alloc);
3256 
3257 	/* Copy the root into a real block. */
3258 	error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
3259 	if (error)
3260 		goto error0;
3261 
3262 	/*
3263 	 * we can't just memcpy() the root in for CRC enabled btree blocks.
3264 	 * In that case have to also ensure the blkno remains correct
3265 	 */
3266 	memcpy(cblock, block, xfs_btree_block_len(cur));
3267 	if (xfs_has_crc(cur->bc_mp)) {
3268 		__be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));
3269 		if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
3270 			cblock->bb_u.l.bb_blkno = bno;
3271 		else
3272 			cblock->bb_u.s.bb_blkno = bno;
3273 	}
3274 
3275 	if (level > 0) {
3276 		error = xfs_btree_promote_node_iroot(cur, block, level, cbp,
3277 				&nptr, cblock);
3278 		if (error)
3279 			goto error0;
3280 	} else {
3281 		xfs_btree_promote_leaf_iroot(cur, block, cbp, &nptr, cblock);
3282 	}
3283 
3284 	*logflags |= XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
3285 	*stat = 1;
3286 	return 0;
3287 error0:
3288 	return error;
3289 }
3290 
3291 static void
3292 xfs_btree_set_root(
3293 	struct xfs_btree_cur		*cur,
3294 	const union xfs_btree_ptr	*ptr,
3295 	int				inc)
3296 {
3297 	if (cur->bc_flags & XFS_BTREE_STAGING) {
3298 		/* Update the btree root information for a per-AG fake root. */
3299 		cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s);
3300 		cur->bc_ag.afake->af_levels += inc;
3301 	} else {
3302 		cur->bc_ops->set_root(cur, ptr, inc);
3303 	}
3304 }
3305 
3306 /*
3307  * Allocate a new root block, fill it in.
3308  */
3309 STATIC int				/* error */
3310 xfs_btree_new_root(
3311 	struct xfs_btree_cur	*cur,	/* btree cursor */
3312 	int			*stat)	/* success/failure */
3313 {
3314 	struct xfs_btree_block	*block;	/* one half of the old root block */
3315 	struct xfs_buf		*bp;	/* buffer containing block */
3316 	int			error;	/* error return value */
3317 	struct xfs_buf		*lbp;	/* left buffer pointer */
3318 	struct xfs_btree_block	*left;	/* left btree block */
3319 	struct xfs_buf		*nbp;	/* new (root) buffer */
3320 	struct xfs_btree_block	*new;	/* new (root) btree block */
3321 	int			nptr;	/* new value for key index, 1 or 2 */
3322 	struct xfs_buf		*rbp;	/* right buffer pointer */
3323 	struct xfs_btree_block	*right;	/* right btree block */
3324 	union xfs_btree_ptr	rptr;
3325 	union xfs_btree_ptr	lptr;
3326 
3327 	XFS_BTREE_STATS_INC(cur, newroot);
3328 
3329 	/* initialise our start point from the cursor */
3330 	xfs_btree_init_ptr_from_cur(cur, &rptr);
3331 
3332 	/* Allocate the new block. If we can't do it, we're toast. Give up. */
3333 	error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat);
3334 	if (error)
3335 		goto error0;
3336 	if (*stat == 0)
3337 		goto out0;
3338 	XFS_BTREE_STATS_INC(cur, alloc);
3339 
3340 	/* Set up the new block. */
3341 	error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
3342 	if (error)
3343 		goto error0;
3344 
3345 	/* Set the root in the holding structure  increasing the level by 1. */
3346 	xfs_btree_set_root(cur, &lptr, 1);
3347 
3348 	/*
3349 	 * At the previous root level there are now two blocks: the old root,
3350 	 * and the new block generated when it was split.  We don't know which
3351 	 * one the cursor is pointing at, so we set up variables "left" and
3352 	 * "right" for each case.
3353 	 */
3354 	block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
3355 
3356 #ifdef DEBUG
3357 	error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
3358 	if (error)
3359 		goto error0;
3360 #endif
3361 
3362 	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3363 	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3364 		/* Our block is left, pick up the right block. */
3365 		lbp = bp;
3366 		xfs_btree_buf_to_ptr(cur, lbp, &lptr);
3367 		left = block;
3368 		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3369 		if (error)
3370 			goto error0;
3371 		bp = rbp;
3372 		nptr = 1;
3373 	} else {
3374 		/* Our block is right, pick up the left block. */
3375 		rbp = bp;
3376 		xfs_btree_buf_to_ptr(cur, rbp, &rptr);
3377 		right = block;
3378 		xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
3379 		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3380 		if (error)
3381 			goto error0;
3382 		bp = lbp;
3383 		nptr = 2;
3384 	}
3385 
3386 	/* Fill in the new block's btree header and log it. */
3387 	xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
3388 	xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
3389 	ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
3390 			!xfs_btree_ptr_is_null(cur, &rptr));
3391 
3392 	/* Fill in the key data in the new root. */
3393 	if (xfs_btree_get_level(left) > 0) {
3394 		/*
3395 		 * Get the keys for the left block's keys and put them directly
3396 		 * in the parent block.  Do the same for the right block.
3397 		 */
3398 		xfs_btree_get_node_keys(cur, left,
3399 				xfs_btree_key_addr(cur, 1, new));
3400 		xfs_btree_get_node_keys(cur, right,
3401 				xfs_btree_key_addr(cur, 2, new));
3402 	} else {
3403 		/*
3404 		 * Get the keys for the left block's records and put them
3405 		 * directly in the parent block.  Do the same for the right
3406 		 * block.
3407 		 */
3408 		xfs_btree_get_leaf_keys(cur, left,
3409 			xfs_btree_key_addr(cur, 1, new));
3410 		xfs_btree_get_leaf_keys(cur, right,
3411 			xfs_btree_key_addr(cur, 2, new));
3412 	}
3413 	xfs_btree_log_keys(cur, nbp, 1, 2);
3414 
3415 	/* Fill in the pointer data in the new root. */
3416 	xfs_btree_copy_ptrs(cur,
3417 		xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
3418 	xfs_btree_copy_ptrs(cur,
3419 		xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
3420 	xfs_btree_log_ptrs(cur, nbp, 1, 2);
3421 
3422 	/* Fix up the cursor. */
3423 	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
3424 	cur->bc_levels[cur->bc_nlevels].ptr = nptr;
3425 	cur->bc_nlevels++;
3426 	ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
3427 	*stat = 1;
3428 	return 0;
3429 error0:
3430 	return error;
3431 out0:
3432 	*stat = 0;
3433 	return 0;
3434 }
3435 
3436 STATIC int
3437 xfs_btree_make_block_unfull(
3438 	struct xfs_btree_cur	*cur,	/* btree cursor */
3439 	int			level,	/* btree level */
3440 	int			numrecs,/* # of recs in block */
3441 	int			*oindex,/* old tree index */
3442 	int			*index,	/* new tree index */
3443 	union xfs_btree_ptr	*nptr,	/* new btree ptr */
3444 	struct xfs_btree_cur	**ncur,	/* new btree cursor */
3445 	union xfs_btree_key	*key,	/* key of new block */
3446 	int			*stat)
3447 {
3448 	int			error = 0;
3449 
3450 	if (xfs_btree_at_iroot(cur, level)) {
3451 		struct xfs_inode *ip = cur->bc_ino.ip;
3452 
3453 		if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
3454 			/* A root block that can be made bigger. */
3455 			cur->bc_ops->broot_realloc(cur, numrecs + 1);
3456 			*stat = 1;
3457 		} else {
3458 			/* A root block that needs replacing */
3459 			int	logflags = 0;
3460 
3461 			error = xfs_btree_new_iroot(cur, &logflags, stat);
3462 			if (error || *stat == 0)
3463 				return error;
3464 
3465 			xfs_trans_log_inode(cur->bc_tp, ip, logflags);
3466 		}
3467 
3468 		return 0;
3469 	}
3470 
3471 	/* First, try shifting an entry to the right neighbor. */
3472 	error = xfs_btree_rshift(cur, level, stat);
3473 	if (error || *stat)
3474 		return error;
3475 
3476 	/* Next, try shifting an entry to the left neighbor. */
3477 	error = xfs_btree_lshift(cur, level, stat);
3478 	if (error)
3479 		return error;
3480 
3481 	if (*stat) {
3482 		*oindex = *index = cur->bc_levels[level].ptr;
3483 		return 0;
3484 	}
3485 
3486 	/*
3487 	 * Next, try splitting the current block in half.
3488 	 *
3489 	 * If this works we have to re-set our variables because we
3490 	 * could be in a different block now.
3491 	 */
3492 	error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
3493 	if (error || *stat == 0)
3494 		return error;
3495 
3496 
3497 	*index = cur->bc_levels[level].ptr;
3498 	return 0;
3499 }
3500 
3501 /*
3502  * Insert one record/level.  Return information to the caller
3503  * allowing the next level up to proceed if necessary.
3504  */
3505 STATIC int
3506 xfs_btree_insrec(
3507 	struct xfs_btree_cur	*cur,	/* btree cursor */
3508 	int			level,	/* level to insert record at */
3509 	union xfs_btree_ptr	*ptrp,	/* i/o: block number inserted */
3510 	union xfs_btree_rec	*rec,	/* record to insert */
3511 	union xfs_btree_key	*key,	/* i/o: block key for ptrp */
3512 	struct xfs_btree_cur	**curp,	/* output: new cursor replacing cur */
3513 	int			*stat)	/* success/failure */
3514 {
3515 	struct xfs_btree_block	*block;	/* btree block */
3516 	struct xfs_buf		*bp;	/* buffer for block */
3517 	union xfs_btree_ptr	nptr;	/* new block ptr */
3518 	struct xfs_btree_cur	*ncur = NULL;	/* new btree cursor */
3519 	union xfs_btree_key	nkey;	/* new block key */
3520 	union xfs_btree_key	*lkey;
3521 	int			optr;	/* old key/record index */
3522 	int			ptr;	/* key/record index */
3523 	int			numrecs;/* number of records */
3524 	int			error;	/* error return value */
3525 	int			i;
3526 	xfs_daddr_t		old_bn;
3527 
3528 	ncur = NULL;
3529 	lkey = &nkey;
3530 
3531 	/*
3532 	 * If we have an external root pointer, and we've made it to the
3533 	 * root level, allocate a new root block and we're done.
3534 	 */
3535 	if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE &&
3536 	    level >= cur->bc_nlevels) {
3537 		error = xfs_btree_new_root(cur, stat);
3538 		xfs_btree_set_ptr_null(cur, ptrp);
3539 
3540 		return error;
3541 	}
3542 
3543 	/* If we're off the left edge, return failure. */
3544 	ptr = cur->bc_levels[level].ptr;
3545 	if (ptr == 0) {
3546 		*stat = 0;
3547 		return 0;
3548 	}
3549 
3550 	optr = ptr;
3551 
3552 	XFS_BTREE_STATS_INC(cur, insrec);
3553 
3554 	/* Get pointers to the btree buffer and block. */
3555 	block = xfs_btree_get_block(cur, level, &bp);
3556 	old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL;
3557 	numrecs = xfs_btree_get_numrecs(block);
3558 
3559 #ifdef DEBUG
3560 	error = xfs_btree_check_block(cur, block, level, bp);
3561 	if (error)
3562 		goto error0;
3563 
3564 	/* Check that the new entry is being inserted in the right place. */
3565 	if (ptr <= numrecs) {
3566 		if (level == 0) {
3567 			ASSERT(cur->bc_ops->recs_inorder(cur, rec,
3568 				xfs_btree_rec_addr(cur, ptr, block)));
3569 		} else {
3570 			ASSERT(cur->bc_ops->keys_inorder(cur, key,
3571 				xfs_btree_key_addr(cur, ptr, block)));
3572 		}
3573 	}
3574 #endif
3575 
3576 	/*
3577 	 * If the block is full, we can't insert the new entry until we
3578 	 * make the block un-full.
3579 	 */
3580 	xfs_btree_set_ptr_null(cur, &nptr);
3581 	if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3582 		error = xfs_btree_make_block_unfull(cur, level, numrecs,
3583 					&optr, &ptr, &nptr, &ncur, lkey, stat);
3584 		if (error || *stat == 0)
3585 			goto error0;
3586 	}
3587 
3588 	/*
3589 	 * The current block may have changed if the block was
3590 	 * previously full and we have just made space in it.
3591 	 */
3592 	block = xfs_btree_get_block(cur, level, &bp);
3593 	numrecs = xfs_btree_get_numrecs(block);
3594 
3595 #ifdef DEBUG
3596 	error = xfs_btree_check_block(cur, block, level, bp);
3597 	if (error)
3598 		goto error0;
3599 #endif
3600 
3601 	/*
3602 	 * At this point we know there's room for our new entry in the block
3603 	 * we're pointing at.
3604 	 */
3605 	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3606 
3607 	if (level > 0) {
3608 		/* It's a nonleaf. make a hole in the keys and ptrs */
3609 		union xfs_btree_key	*kp;
3610 		union xfs_btree_ptr	*pp;
3611 
3612 		kp = xfs_btree_key_addr(cur, ptr, block);
3613 		pp = xfs_btree_ptr_addr(cur, ptr, block);
3614 
3615 		for (i = numrecs - ptr; i >= 0; i--) {
3616 			error = xfs_btree_debug_check_ptr(cur, pp, i, level);
3617 			if (error)
3618 				goto error0;
3619 		}
3620 
3621 		xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3622 		xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3623 
3624 		error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level);
3625 		if (error)
3626 			goto error0;
3627 
3628 		/* Now put the new data in, bump numrecs and log it. */
3629 		xfs_btree_copy_keys(cur, kp, key, 1);
3630 		xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3631 		numrecs++;
3632 		xfs_btree_set_numrecs(block, numrecs);
3633 		xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3634 		xfs_btree_log_keys(cur, bp, ptr, numrecs);
3635 #ifdef DEBUG
3636 		if (ptr < numrecs) {
3637 			ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3638 				xfs_btree_key_addr(cur, ptr + 1, block)));
3639 		}
3640 #endif
3641 	} else {
3642 		/* It's a leaf. make a hole in the records */
3643 		union xfs_btree_rec             *rp;
3644 
3645 		rp = xfs_btree_rec_addr(cur, ptr, block);
3646 
3647 		xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3648 
3649 		/* Now put the new data in, bump numrecs and log it. */
3650 		xfs_btree_copy_recs(cur, rp, rec, 1);
3651 		xfs_btree_set_numrecs(block, ++numrecs);
3652 		xfs_btree_log_recs(cur, bp, ptr, numrecs);
3653 #ifdef DEBUG
3654 		if (ptr < numrecs) {
3655 			ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3656 				xfs_btree_rec_addr(cur, ptr + 1, block)));
3657 		}
3658 #endif
3659 	}
3660 
3661 	/* Log the new number of records in the btree header. */
3662 	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3663 
3664 	/*
3665 	 * Update btree keys to reflect the newly added record or keyptr.
3666 	 * There are three cases here to be aware of.  Normally, all we have to
3667 	 * do is walk towards the root, updating keys as necessary.
3668 	 *
3669 	 * If the caller had us target a full block for the insertion, we dealt
3670 	 * with that by calling the _make_block_unfull function.  If the
3671 	 * "make unfull" function splits the block, it'll hand us back the key
3672 	 * and pointer of the new block.  We haven't yet added the new block to
3673 	 * the next level up, so if we decide to add the new record to the new
3674 	 * block (bp->b_bn != old_bn), we have to update the caller's pointer
3675 	 * so that the caller adds the new block with the correct key.
3676 	 *
3677 	 * However, there is a third possibility-- if the selected block is the
3678 	 * root block of an inode-rooted btree and cannot be expanded further,
3679 	 * the "make unfull" function moves the root block contents to a new
3680 	 * block and updates the root block to point to the new block.  In this
3681 	 * case, no block pointer is passed back because the block has already
3682 	 * been added to the btree.  In this case, we need to use the regular
3683 	 * key update function, just like the first case.  This is critical for
3684 	 * overlapping btrees, because the high key must be updated to reflect
3685 	 * the entire tree, not just the subtree accessible through the first
3686 	 * child of the root (which is now two levels down from the root).
3687 	 */
3688 	if (!xfs_btree_ptr_is_null(cur, &nptr) &&
3689 	    bp && xfs_buf_daddr(bp) != old_bn) {
3690 		xfs_btree_get_keys(cur, block, lkey);
3691 	} else if (xfs_btree_needs_key_update(cur, optr)) {
3692 		error = xfs_btree_update_keys(cur, level);
3693 		if (error)
3694 			goto error0;
3695 	}
3696 
3697 	/*
3698 	 * Return the new block number, if any.
3699 	 * If there is one, give back a record value and a cursor too.
3700 	 */
3701 	*ptrp = nptr;
3702 	if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3703 		xfs_btree_copy_keys(cur, key, lkey, 1);
3704 		*curp = ncur;
3705 	}
3706 
3707 	*stat = 1;
3708 	return 0;
3709 
3710 error0:
3711 	if (ncur)
3712 		xfs_btree_del_cursor(ncur, error);
3713 	return error;
3714 }
3715 
3716 /*
3717  * Insert the record at the point referenced by cur.
3718  *
3719  * A multi-level split of the tree on insert will invalidate the original
3720  * cursor.  All callers of this function should assume that the cursor is
3721  * no longer valid and revalidate it.
3722  */
3723 int
3724 xfs_btree_insert(
3725 	struct xfs_btree_cur	*cur,
3726 	int			*stat)
3727 {
3728 	int			error;	/* error return value */
3729 	int			i;	/* result value, 0 for failure */
3730 	int			level;	/* current level number in btree */
3731 	union xfs_btree_ptr	nptr;	/* new block number (split result) */
3732 	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
3733 	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
3734 	union xfs_btree_key	bkey;	/* key of block to insert */
3735 	union xfs_btree_key	*key;
3736 	union xfs_btree_rec	rec;	/* record to insert */
3737 
3738 	level = 0;
3739 	ncur = NULL;
3740 	pcur = cur;
3741 	key = &bkey;
3742 
3743 	xfs_btree_set_ptr_null(cur, &nptr);
3744 
3745 	/* Make a key out of the record data to be inserted, and save it. */
3746 	cur->bc_ops->init_rec_from_cur(cur, &rec);
3747 	cur->bc_ops->init_key_from_rec(key, &rec);
3748 
3749 	/*
3750 	 * Loop going up the tree, starting at the leaf level.
3751 	 * Stop when we don't get a split block, that must mean that
3752 	 * the insert is finished with this level.
3753 	 */
3754 	do {
3755 		/*
3756 		 * Insert nrec/nptr into this level of the tree.
3757 		 * Note if we fail, nptr will be null.
3758 		 */
3759 		error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
3760 				&ncur, &i);
3761 		if (error) {
3762 			if (pcur != cur)
3763 				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3764 			goto error0;
3765 		}
3766 
3767 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3768 			xfs_btree_mark_sick(cur);
3769 			error = -EFSCORRUPTED;
3770 			goto error0;
3771 		}
3772 		level++;
3773 
3774 		/*
3775 		 * See if the cursor we just used is trash.
3776 		 * Can't trash the caller's cursor, but otherwise we should
3777 		 * if ncur is a new cursor or we're about to be done.
3778 		 */
3779 		if (pcur != cur &&
3780 		    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3781 			/* Save the state from the cursor before we trash it */
3782 			if (cur->bc_ops->update_cursor &&
3783 			    !(cur->bc_flags & XFS_BTREE_STAGING))
3784 				cur->bc_ops->update_cursor(pcur, cur);
3785 			cur->bc_nlevels = pcur->bc_nlevels;
3786 			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3787 		}
3788 		/* If we got a new cursor, switch to it. */
3789 		if (ncur) {
3790 			pcur = ncur;
3791 			ncur = NULL;
3792 		}
3793 	} while (!xfs_btree_ptr_is_null(cur, &nptr));
3794 
3795 	*stat = i;
3796 	return 0;
3797 error0:
3798 	return error;
3799 }
3800 
3801 /* Move the records from a child leaf block to the root block. */
3802 STATIC void
3803 xfs_btree_demote_leaf_child(
3804 	struct xfs_btree_cur	*cur,
3805 	struct xfs_btree_block	*cblock,
3806 	int			numrecs)
3807 {
3808 	union xfs_btree_rec	*rp;
3809 	union xfs_btree_rec	*crp;
3810 	struct xfs_btree_block	*broot;
3811 
3812 	/*
3813 	 * Decrease the tree height.
3814 	 *
3815 	 * Trickery here: The amount of memory that we need per record for the
3816 	 * ifork's btree root block may change when we convert the broot from a
3817 	 * node to a leaf.  Free the old node broot so that we can get a fresh
3818 	 * leaf broot.
3819 	 */
3820 	cur->bc_ops->broot_realloc(cur, 0);
3821 	cur->bc_nlevels--;
3822 
3823 	/*
3824 	 * Allocate a new leaf broot and copy the records from the old child.
3825 	 * Detach the old child from the cursor.
3826 	 */
3827 	broot = cur->bc_ops->broot_realloc(cur, numrecs);
3828 	xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, 0, numrecs,
3829 			cur->bc_ino.ip->i_ino);
3830 
3831 	rp = xfs_btree_rec_addr(cur, 1, broot);
3832 	crp = xfs_btree_rec_addr(cur, 1, cblock);
3833 	xfs_btree_copy_recs(cur, rp, crp, numrecs);
3834 
3835 	cur->bc_levels[0].bp = NULL;
3836 }
3837 
3838 /*
3839  * Move the keyptrs from a child node block to the root block.
3840  *
3841  * Since the keyptr size does not change, all we have to do is increase the
3842  * tree height, copy the keyptrs to the new internal node (cblock), shrink
3843  * the root, and copy the pointers there.
3844  */
3845 STATIC int
3846 xfs_btree_demote_node_child(
3847 	struct xfs_btree_cur	*cur,
3848 	struct xfs_btree_block	*cblock,
3849 	int			level,
3850 	int			numrecs)
3851 {
3852 	struct xfs_btree_block	*block;
3853 	union xfs_btree_key	*ckp;
3854 	union xfs_btree_key	*kp;
3855 	union xfs_btree_ptr	*cpp;
3856 	union xfs_btree_ptr	*pp;
3857 	int			i;
3858 	int			error;
3859 
3860 	/*
3861 	 * Adjust the root btree node size and the record count to match the
3862 	 * doomed child so that we can copy the keyptrs ahead of changing the
3863 	 * tree shape.
3864 	 */
3865 	block = cur->bc_ops->broot_realloc(cur, numrecs);
3866 
3867 	xfs_btree_set_numrecs(block, numrecs);
3868 	ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3869 
3870 	/* Copy keys from the doomed block. */
3871 	kp = xfs_btree_key_addr(cur, 1, block);
3872 	ckp = xfs_btree_key_addr(cur, 1, cblock);
3873 	xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3874 
3875 	/* Copy pointers from the doomed block. */
3876 	pp = xfs_btree_ptr_addr(cur, 1, block);
3877 	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3878 	for (i = 0; i < numrecs; i++) {
3879 		error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
3880 		if (error)
3881 			return error;
3882 	}
3883 	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3884 
3885 	/* Decrease tree height, adjusting the root block level to match. */
3886 	cur->bc_levels[level - 1].bp = NULL;
3887 	be16_add_cpu(&block->bb_level, -1);
3888 	cur->bc_nlevels--;
3889 	return 0;
3890 }
3891 
3892 /*
3893  * Try to merge a non-leaf block back into the inode root.
3894  *
3895  * Note: the killroot names comes from the fact that we're effectively
3896  * killing the old root block.  But because we can't just delete the
3897  * inode we have to copy the single block it was pointing to into the
3898  * inode.
3899  */
3900 STATIC int
3901 xfs_btree_kill_iroot(
3902 	struct xfs_btree_cur	*cur)
3903 {
3904 	struct xfs_inode	*ip = cur->bc_ino.ip;
3905 	struct xfs_btree_block	*block;
3906 	struct xfs_btree_block	*cblock;
3907 	struct xfs_buf		*cbp;
3908 	int			level;
3909 	int			numrecs;
3910 	int			error;
3911 #ifdef DEBUG
3912 	union xfs_btree_ptr	ptr;
3913 #endif
3914 
3915 	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
3916 	ASSERT((cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS) ||
3917 	       cur->bc_nlevels > 1);
3918 
3919 	/*
3920 	 * Don't deal with the root block needs to be a leaf case.
3921 	 * We're just going to turn the thing back into extents anyway.
3922 	 */
3923 	level = cur->bc_nlevels - 1;
3924 	if (level == 1 && !(cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS))
3925 		goto out0;
3926 
3927 	/* If we're already a leaf, jump out. */
3928 	if (level == 0)
3929 		goto out0;
3930 
3931 	/*
3932 	 * Give up if the root has multiple children.
3933 	 */
3934 	block = xfs_btree_get_iroot(cur);
3935 	if (xfs_btree_get_numrecs(block) != 1)
3936 		goto out0;
3937 
3938 	cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3939 	numrecs = xfs_btree_get_numrecs(cblock);
3940 
3941 	/*
3942 	 * Only do this if the next level will fit.
3943 	 * Then the data must be copied up to the inode,
3944 	 * instead of freeing the root you free the next level.
3945 	 */
3946 	if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3947 		goto out0;
3948 
3949 	XFS_BTREE_STATS_INC(cur, killroot);
3950 
3951 #ifdef DEBUG
3952 	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3953 	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3954 	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3955 	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3956 #endif
3957 
3958 	if (level > 1) {
3959 		error = xfs_btree_demote_node_child(cur, cblock, level,
3960 				numrecs);
3961 		if (error)
3962 			return error;
3963 	} else
3964 		xfs_btree_demote_leaf_child(cur, cblock, numrecs);
3965 
3966 	error = xfs_btree_free_block(cur, cbp);
3967 	if (error)
3968 		return error;
3969 
3970 	xfs_trans_log_inode(cur->bc_tp, ip,
3971 		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
3972 out0:
3973 	return 0;
3974 }
3975 
3976 /*
3977  * Kill the current root node, and replace it with it's only child node.
3978  */
3979 STATIC int
3980 xfs_btree_kill_root(
3981 	struct xfs_btree_cur	*cur,
3982 	struct xfs_buf		*bp,
3983 	int			level,
3984 	union xfs_btree_ptr	*newroot)
3985 {
3986 	int			error;
3987 
3988 	XFS_BTREE_STATS_INC(cur, killroot);
3989 
3990 	/*
3991 	 * Update the root pointer, decreasing the level by 1 and then
3992 	 * free the old root.
3993 	 */
3994 	xfs_btree_set_root(cur, newroot, -1);
3995 
3996 	error = xfs_btree_free_block(cur, bp);
3997 	if (error)
3998 		return error;
3999 
4000 	cur->bc_levels[level].bp = NULL;
4001 	cur->bc_levels[level].ra = 0;
4002 	cur->bc_nlevels--;
4003 
4004 	return 0;
4005 }
4006 
4007 STATIC int
4008 xfs_btree_dec_cursor(
4009 	struct xfs_btree_cur	*cur,
4010 	int			level,
4011 	int			*stat)
4012 {
4013 	int			error;
4014 	int			i;
4015 
4016 	if (level > 0) {
4017 		error = xfs_btree_decrement(cur, level, &i);
4018 		if (error)
4019 			return error;
4020 	}
4021 
4022 	*stat = 1;
4023 	return 0;
4024 }
4025 
4026 /*
4027  * Single level of the btree record deletion routine.
4028  * Delete record pointed to by cur/level.
4029  * Remove the record from its block then rebalance the tree.
4030  * Return 0 for error, 1 for done, 2 to go on to the next level.
4031  */
4032 STATIC int					/* error */
4033 xfs_btree_delrec(
4034 	struct xfs_btree_cur	*cur,		/* btree cursor */
4035 	int			level,		/* level removing record from */
4036 	int			*stat)		/* fail/done/go-on */
4037 {
4038 	struct xfs_btree_block	*block;		/* btree block */
4039 	union xfs_btree_ptr	cptr;		/* current block ptr */
4040 	struct xfs_buf		*bp;		/* buffer for block */
4041 	int			error;		/* error return value */
4042 	int			i;		/* loop counter */
4043 	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
4044 	struct xfs_buf		*lbp;		/* left buffer pointer */
4045 	struct xfs_btree_block	*left;		/* left btree block */
4046 	int			lrecs = 0;	/* left record count */
4047 	int			ptr;		/* key/record index */
4048 	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
4049 	struct xfs_buf		*rbp;		/* right buffer pointer */
4050 	struct xfs_btree_block	*right;		/* right btree block */
4051 	struct xfs_btree_block	*rrblock;	/* right-right btree block */
4052 	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
4053 	int			rrecs = 0;	/* right record count */
4054 	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
4055 	int			numrecs;	/* temporary numrec count */
4056 
4057 	tcur = NULL;
4058 
4059 	/* Get the index of the entry being deleted, check for nothing there. */
4060 	ptr = cur->bc_levels[level].ptr;
4061 	if (ptr == 0) {
4062 		*stat = 0;
4063 		return 0;
4064 	}
4065 
4066 	/* Get the buffer & block containing the record or key/ptr. */
4067 	block = xfs_btree_get_block(cur, level, &bp);
4068 	numrecs = xfs_btree_get_numrecs(block);
4069 
4070 #ifdef DEBUG
4071 	error = xfs_btree_check_block(cur, block, level, bp);
4072 	if (error)
4073 		goto error0;
4074 #endif
4075 
4076 	/* Fail if we're off the end of the block. */
4077 	if (ptr > numrecs) {
4078 		*stat = 0;
4079 		return 0;
4080 	}
4081 
4082 	XFS_BTREE_STATS_INC(cur, delrec);
4083 	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
4084 
4085 	/* Excise the entries being deleted. */
4086 	if (level > 0) {
4087 		/* It's a nonleaf. operate on keys and ptrs */
4088 		union xfs_btree_key	*lkp;
4089 		union xfs_btree_ptr	*lpp;
4090 
4091 		lkp = xfs_btree_key_addr(cur, ptr + 1, block);
4092 		lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
4093 
4094 		for (i = 0; i < numrecs - ptr; i++) {
4095 			error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
4096 			if (error)
4097 				goto error0;
4098 		}
4099 
4100 		if (ptr < numrecs) {
4101 			xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
4102 			xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
4103 			xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
4104 			xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
4105 		}
4106 	} else {
4107 		/* It's a leaf. operate on records */
4108 		if (ptr < numrecs) {
4109 			xfs_btree_shift_recs(cur,
4110 				xfs_btree_rec_addr(cur, ptr + 1, block),
4111 				-1, numrecs - ptr);
4112 			xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
4113 		}
4114 	}
4115 
4116 	/*
4117 	 * Decrement and log the number of entries in the block.
4118 	 */
4119 	xfs_btree_set_numrecs(block, --numrecs);
4120 	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
4121 
4122 	/*
4123 	 * We're at the root level.  First, shrink the root block in-memory.
4124 	 * Try to get rid of the next level down.  If we can't then there's
4125 	 * nothing left to do.  numrecs was decremented above.
4126 	 */
4127 	if (xfs_btree_at_iroot(cur, level)) {
4128 		cur->bc_ops->broot_realloc(cur, numrecs);
4129 
4130 		error = xfs_btree_kill_iroot(cur);
4131 		if (error)
4132 			goto error0;
4133 
4134 		error = xfs_btree_dec_cursor(cur, level, stat);
4135 		if (error)
4136 			goto error0;
4137 		*stat = 1;
4138 		return 0;
4139 	}
4140 
4141 	/*
4142 	 * If this is the root level, and there's only one entry left, and it's
4143 	 * NOT the leaf level, then we can get rid of this level.
4144 	 */
4145 	if (level == cur->bc_nlevels - 1) {
4146 		if (numrecs == 1 && level > 0) {
4147 			union xfs_btree_ptr	*pp;
4148 			/*
4149 			 * pp is still set to the first pointer in the block.
4150 			 * Make it the new root of the btree.
4151 			 */
4152 			pp = xfs_btree_ptr_addr(cur, 1, block);
4153 			error = xfs_btree_kill_root(cur, bp, level, pp);
4154 			if (error)
4155 				goto error0;
4156 		} else if (level > 0) {
4157 			error = xfs_btree_dec_cursor(cur, level, stat);
4158 			if (error)
4159 				goto error0;
4160 		}
4161 		*stat = 1;
4162 		return 0;
4163 	}
4164 
4165 	/*
4166 	 * If we deleted the leftmost entry in the block, update the
4167 	 * key values above us in the tree.
4168 	 */
4169 	if (xfs_btree_needs_key_update(cur, ptr)) {
4170 		error = xfs_btree_update_keys(cur, level);
4171 		if (error)
4172 			goto error0;
4173 	}
4174 
4175 	/*
4176 	 * If the number of records remaining in the block is at least
4177 	 * the minimum, we're done.
4178 	 */
4179 	if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
4180 		error = xfs_btree_dec_cursor(cur, level, stat);
4181 		if (error)
4182 			goto error0;
4183 		return 0;
4184 	}
4185 
4186 	/*
4187 	 * Otherwise, we have to move some records around to keep the
4188 	 * tree balanced.  Look at the left and right sibling blocks to
4189 	 * see if we can re-balance by moving only one record.
4190 	 */
4191 	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4192 	xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
4193 
4194 	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
4195 		/*
4196 		 * One child of root, need to get a chance to copy its contents
4197 		 * into the root and delete it. Can't go up to next level,
4198 		 * there's nothing to delete there.
4199 		 */
4200 		if (xfs_btree_ptr_is_null(cur, &rptr) &&
4201 		    xfs_btree_ptr_is_null(cur, &lptr) &&
4202 		    level == cur->bc_nlevels - 2) {
4203 			error = xfs_btree_kill_iroot(cur);
4204 			if (!error)
4205 				error = xfs_btree_dec_cursor(cur, level, stat);
4206 			if (error)
4207 				goto error0;
4208 			return 0;
4209 		}
4210 	}
4211 
4212 	ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
4213 	       !xfs_btree_ptr_is_null(cur, &lptr));
4214 
4215 	/*
4216 	 * Duplicate the cursor so our btree manipulations here won't
4217 	 * disrupt the next level up.
4218 	 */
4219 	error = xfs_btree_dup_cursor(cur, &tcur);
4220 	if (error)
4221 		goto error0;
4222 
4223 	/*
4224 	 * If there's a right sibling, see if it's ok to shift an entry
4225 	 * out of it.
4226 	 */
4227 	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
4228 		/*
4229 		 * Move the temp cursor to the last entry in the next block.
4230 		 * Actually any entry but the first would suffice.
4231 		 */
4232 		i = xfs_btree_lastrec(tcur, level);
4233 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
4234 			xfs_btree_mark_sick(cur);
4235 			error = -EFSCORRUPTED;
4236 			goto error0;
4237 		}
4238 
4239 		error = xfs_btree_increment(tcur, level, &i);
4240 		if (error)
4241 			goto error0;
4242 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
4243 			xfs_btree_mark_sick(cur);
4244 			error = -EFSCORRUPTED;
4245 			goto error0;
4246 		}
4247 
4248 		i = xfs_btree_lastrec(tcur, level);
4249 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
4250 			xfs_btree_mark_sick(cur);
4251 			error = -EFSCORRUPTED;
4252 			goto error0;
4253 		}
4254 
4255 		/* Grab a pointer to the block. */
4256 		right = xfs_btree_get_block(tcur, level, &rbp);
4257 #ifdef DEBUG
4258 		error = xfs_btree_check_block(tcur, right, level, rbp);
4259 		if (error)
4260 			goto error0;
4261 #endif
4262 		/* Grab the current block number, for future use. */
4263 		xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
4264 
4265 		/*
4266 		 * If right block is full enough so that removing one entry
4267 		 * won't make it too empty, and left-shifting an entry out
4268 		 * of right to us works, we're done.
4269 		 */
4270 		if (xfs_btree_get_numrecs(right) - 1 >=
4271 		    cur->bc_ops->get_minrecs(tcur, level)) {
4272 			error = xfs_btree_lshift(tcur, level, &i);
4273 			if (error)
4274 				goto error0;
4275 			if (i) {
4276 				ASSERT(xfs_btree_get_numrecs(block) >=
4277 				       cur->bc_ops->get_minrecs(tcur, level));
4278 
4279 				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
4280 				tcur = NULL;
4281 
4282 				error = xfs_btree_dec_cursor(cur, level, stat);
4283 				if (error)
4284 					goto error0;
4285 				return 0;
4286 			}
4287 		}
4288 
4289 		/*
4290 		 * Otherwise, grab the number of records in right for
4291 		 * future reference, and fix up the temp cursor to point
4292 		 * to our block again (last record).
4293 		 */
4294 		rrecs = xfs_btree_get_numrecs(right);
4295 		if (!xfs_btree_ptr_is_null(cur, &lptr)) {
4296 			i = xfs_btree_firstrec(tcur, level);
4297 			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
4298 				xfs_btree_mark_sick(cur);
4299 				error = -EFSCORRUPTED;
4300 				goto error0;
4301 			}
4302 
4303 			error = xfs_btree_decrement(tcur, level, &i);
4304 			if (error)
4305 				goto error0;
4306 			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
4307 				xfs_btree_mark_sick(cur);
4308 				error = -EFSCORRUPTED;
4309 				goto error0;
4310 			}
4311 		}
4312 	}
4313 
4314 	/*
4315 	 * If there's a left sibling, see if it's ok to shift an entry
4316 	 * out of it.
4317 	 */
4318 	if (!xfs_btree_ptr_is_null(cur, &lptr)) {
4319 		/*
4320 		 * Move the temp cursor to the first entry in the
4321 		 * previous block.
4322 		 */
4323 		i = xfs_btree_firstrec(tcur, level);
4324 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
4325 			xfs_btree_mark_sick(cur);
4326 			error = -EFSCORRUPTED;
4327 			goto error0;
4328 		}
4329 
4330 		error = xfs_btree_decrement(tcur, level, &i);
4331 		if (error)
4332 			goto error0;
4333 		i = xfs_btree_firstrec(tcur, level);
4334 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
4335 			xfs_btree_mark_sick(cur);
4336 			error = -EFSCORRUPTED;
4337 			goto error0;
4338 		}
4339 
4340 		/* Grab a pointer to the block. */
4341 		left = xfs_btree_get_block(tcur, level, &lbp);
4342 #ifdef DEBUG
4343 		error = xfs_btree_check_block(cur, left, level, lbp);
4344 		if (error)
4345 			goto error0;
4346 #endif
4347 		/* Grab the current block number, for future use. */
4348 		xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
4349 
4350 		/*
4351 		 * If left block is full enough so that removing one entry
4352 		 * won't make it too empty, and right-shifting an entry out
4353 		 * of left to us works, we're done.
4354 		 */
4355 		if (xfs_btree_get_numrecs(left) - 1 >=
4356 		    cur->bc_ops->get_minrecs(tcur, level)) {
4357 			error = xfs_btree_rshift(tcur, level, &i);
4358 			if (error)
4359 				goto error0;
4360 			if (i) {
4361 				ASSERT(xfs_btree_get_numrecs(block) >=
4362 				       cur->bc_ops->get_minrecs(tcur, level));
4363 				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
4364 				tcur = NULL;
4365 				if (level == 0)
4366 					cur->bc_levels[0].ptr++;
4367 
4368 				*stat = 1;
4369 				return 0;
4370 			}
4371 		}
4372 
4373 		/*
4374 		 * Otherwise, grab the number of records in right for
4375 		 * future reference.
4376 		 */
4377 		lrecs = xfs_btree_get_numrecs(left);
4378 	}
4379 
4380 	/* Delete the temp cursor, we're done with it. */
4381 	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
4382 	tcur = NULL;
4383 
4384 	/* If here, we need to do a join to keep the tree balanced. */
4385 	ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
4386 
4387 	if (!xfs_btree_ptr_is_null(cur, &lptr) &&
4388 	    lrecs + xfs_btree_get_numrecs(block) <=
4389 			cur->bc_ops->get_maxrecs(cur, level)) {
4390 		/*
4391 		 * Set "right" to be the starting block,
4392 		 * "left" to be the left neighbor.
4393 		 */
4394 		rptr = cptr;
4395 		right = block;
4396 		rbp = bp;
4397 		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
4398 		if (error)
4399 			goto error0;
4400 
4401 	/*
4402 	 * If that won't work, see if we can join with the right neighbor block.
4403 	 */
4404 	} else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
4405 		   rrecs + xfs_btree_get_numrecs(block) <=
4406 			cur->bc_ops->get_maxrecs(cur, level)) {
4407 		/*
4408 		 * Set "left" to be the starting block,
4409 		 * "right" to be the right neighbor.
4410 		 */
4411 		lptr = cptr;
4412 		left = block;
4413 		lbp = bp;
4414 		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
4415 		if (error)
4416 			goto error0;
4417 
4418 	/*
4419 	 * Otherwise, we can't fix the imbalance.
4420 	 * Just return.  This is probably a logic error, but it's not fatal.
4421 	 */
4422 	} else {
4423 		error = xfs_btree_dec_cursor(cur, level, stat);
4424 		if (error)
4425 			goto error0;
4426 		return 0;
4427 	}
4428 
4429 	rrecs = xfs_btree_get_numrecs(right);
4430 	lrecs = xfs_btree_get_numrecs(left);
4431 
4432 	/*
4433 	 * We're now going to join "left" and "right" by moving all the stuff
4434 	 * in "right" to "left" and deleting "right".
4435 	 */
4436 	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
4437 	if (level > 0) {
4438 		/* It's a non-leaf.  Move keys and pointers. */
4439 		union xfs_btree_key	*lkp;	/* left btree key */
4440 		union xfs_btree_ptr	*lpp;	/* left address pointer */
4441 		union xfs_btree_key	*rkp;	/* right btree key */
4442 		union xfs_btree_ptr	*rpp;	/* right address pointer */
4443 
4444 		lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
4445 		lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
4446 		rkp = xfs_btree_key_addr(cur, 1, right);
4447 		rpp = xfs_btree_ptr_addr(cur, 1, right);
4448 
4449 		for (i = 1; i < rrecs; i++) {
4450 			error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
4451 			if (error)
4452 				goto error0;
4453 		}
4454 
4455 		xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
4456 		xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
4457 
4458 		xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
4459 		xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
4460 	} else {
4461 		/* It's a leaf.  Move records.  */
4462 		union xfs_btree_rec	*lrp;	/* left record pointer */
4463 		union xfs_btree_rec	*rrp;	/* right record pointer */
4464 
4465 		lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
4466 		rrp = xfs_btree_rec_addr(cur, 1, right);
4467 
4468 		xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
4469 		xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
4470 	}
4471 
4472 	XFS_BTREE_STATS_INC(cur, join);
4473 
4474 	/*
4475 	 * Fix up the number of records and right block pointer in the
4476 	 * surviving block, and log it.
4477 	 */
4478 	xfs_btree_set_numrecs(left, lrecs + rrecs);
4479 	xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
4480 	xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4481 	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
4482 
4483 	/* If there is a right sibling, point it to the remaining block. */
4484 	xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4485 	if (!xfs_btree_ptr_is_null(cur, &cptr)) {
4486 		error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
4487 		if (error)
4488 			goto error0;
4489 		xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
4490 		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
4491 	}
4492 
4493 	/* Free the deleted block. */
4494 	error = xfs_btree_free_block(cur, rbp);
4495 	if (error)
4496 		goto error0;
4497 
4498 	/*
4499 	 * If we joined with the left neighbor, set the buffer in the
4500 	 * cursor to the left block, and fix up the index.
4501 	 */
4502 	if (bp != lbp) {
4503 		cur->bc_levels[level].bp = lbp;
4504 		cur->bc_levels[level].ptr += lrecs;
4505 		cur->bc_levels[level].ra = 0;
4506 	}
4507 	/*
4508 	 * If we joined with the right neighbor and there's a level above
4509 	 * us, increment the cursor at that level.
4510 	 */
4511 	else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE ||
4512 		 level + 1 < cur->bc_nlevels) {
4513 		error = xfs_btree_increment(cur, level + 1, &i);
4514 		if (error)
4515 			goto error0;
4516 	}
4517 
4518 	/*
4519 	 * Readjust the ptr at this level if it's not a leaf, since it's
4520 	 * still pointing at the deletion point, which makes the cursor
4521 	 * inconsistent.  If this makes the ptr 0, the caller fixes it up.
4522 	 * We can't use decrement because it would change the next level up.
4523 	 */
4524 	if (level > 0)
4525 		cur->bc_levels[level].ptr--;
4526 
4527 	/*
4528 	 * We combined blocks, so we have to update the parent keys if the
4529 	 * btree supports overlapped intervals.  However,
4530 	 * bc_levels[level + 1].ptr points to the old block so that the caller
4531 	 * knows which record to delete.  Therefore, the caller must be savvy
4532 	 * enough to call updkeys for us if we return stat == 2.  The other
4533 	 * exit points from this function don't require deletions further up
4534 	 * the tree, so they can call updkeys directly.
4535 	 */
4536 
4537 	/* Return value means the next level up has something to do. */
4538 	*stat = 2;
4539 	return 0;
4540 
4541 error0:
4542 	if (tcur)
4543 		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
4544 	return error;
4545 }
4546 
4547 /*
4548  * Delete the record pointed to by cur.
4549  * The cursor refers to the place where the record was (could be inserted)
4550  * when the operation returns.
4551  */
4552 int					/* error */
4553 xfs_btree_delete(
4554 	struct xfs_btree_cur	*cur,
4555 	int			*stat)	/* success/failure */
4556 {
4557 	int			error;	/* error return value */
4558 	int			level;
4559 	int			i;
4560 	bool			joined = false;
4561 
4562 	/*
4563 	 * Go up the tree, starting at leaf level.
4564 	 *
4565 	 * If 2 is returned then a join was done; go to the next level.
4566 	 * Otherwise we are done.
4567 	 */
4568 	for (level = 0, i = 2; i == 2; level++) {
4569 		error = xfs_btree_delrec(cur, level, &i);
4570 		if (error)
4571 			goto error0;
4572 		if (i == 2)
4573 			joined = true;
4574 	}
4575 
4576 	/*
4577 	 * If we combined blocks as part of deleting the record, delrec won't
4578 	 * have updated the parent high keys so we have to do that here.
4579 	 */
4580 	if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) {
4581 		error = xfs_btree_updkeys_force(cur, 0);
4582 		if (error)
4583 			goto error0;
4584 	}
4585 
4586 	if (i == 0) {
4587 		for (level = 1; level < cur->bc_nlevels; level++) {
4588 			if (cur->bc_levels[level].ptr == 0) {
4589 				error = xfs_btree_decrement(cur, level, &i);
4590 				if (error)
4591 					goto error0;
4592 				break;
4593 			}
4594 		}
4595 	}
4596 
4597 	*stat = i;
4598 	return 0;
4599 error0:
4600 	return error;
4601 }
4602 
4603 /*
4604  * Get the data from the pointed-to record.
4605  */
4606 int					/* error */
4607 xfs_btree_get_rec(
4608 	struct xfs_btree_cur	*cur,	/* btree cursor */
4609 	union xfs_btree_rec	**recp,	/* output: btree record */
4610 	int			*stat)	/* output: success/failure */
4611 {
4612 	struct xfs_btree_block	*block;	/* btree block */
4613 	struct xfs_buf		*bp;	/* buffer pointer */
4614 	int			ptr;	/* record number */
4615 #ifdef DEBUG
4616 	int			error;	/* error return value */
4617 #endif
4618 
4619 	ptr = cur->bc_levels[0].ptr;
4620 	block = xfs_btree_get_block(cur, 0, &bp);
4621 
4622 #ifdef DEBUG
4623 	error = xfs_btree_check_block(cur, block, 0, bp);
4624 	if (error)
4625 		return error;
4626 #endif
4627 
4628 	/*
4629 	 * Off the right end or left end, return failure.
4630 	 */
4631 	if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
4632 		*stat = 0;
4633 		return 0;
4634 	}
4635 
4636 	/*
4637 	 * Point to the record and extract its data.
4638 	 */
4639 	*recp = xfs_btree_rec_addr(cur, ptr, block);
4640 	*stat = 1;
4641 	return 0;
4642 }
4643 
4644 /* Visit a block in a btree. */
4645 STATIC int
4646 xfs_btree_visit_block(
4647 	struct xfs_btree_cur		*cur,
4648 	int				level,
4649 	xfs_btree_visit_blocks_fn	fn,
4650 	void				*data)
4651 {
4652 	struct xfs_btree_block		*block;
4653 	struct xfs_buf			*bp;
4654 	union xfs_btree_ptr		rptr, bufptr;
4655 	int				error;
4656 
4657 	/* do right sibling readahead */
4658 	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4659 	block = xfs_btree_get_block(cur, level, &bp);
4660 
4661 	/* process the block */
4662 	error = fn(cur, level, data);
4663 	if (error)
4664 		return error;
4665 
4666 	/* now read rh sibling block for next iteration */
4667 	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4668 	if (xfs_btree_ptr_is_null(cur, &rptr))
4669 		return -ENOENT;
4670 
4671 	/*
4672 	 * We only visit blocks once in this walk, so we have to avoid the
4673 	 * internal xfs_btree_lookup_get_block() optimisation where it will
4674 	 * return the same block without checking if the right sibling points
4675 	 * back to us and creates a cyclic reference in the btree.
4676 	 */
4677 	xfs_btree_buf_to_ptr(cur, bp, &bufptr);
4678 	if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) {
4679 		xfs_btree_mark_sick(cur);
4680 		return -EFSCORRUPTED;
4681 	}
4682 
4683 	return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4684 }
4685 
4686 
4687 /* Visit every block in a btree. */
4688 int
4689 xfs_btree_visit_blocks(
4690 	struct xfs_btree_cur		*cur,
4691 	xfs_btree_visit_blocks_fn	fn,
4692 	unsigned int			flags,
4693 	void				*data)
4694 {
4695 	union xfs_btree_ptr		lptr;
4696 	int				level;
4697 	struct xfs_btree_block		*block = NULL;
4698 	int				error = 0;
4699 
4700 	xfs_btree_init_ptr_from_cur(cur, &lptr);
4701 
4702 	/* for each level */
4703 	for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4704 		/* grab the left hand block */
4705 		error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4706 		if (error)
4707 			return error;
4708 
4709 		/* readahead the left most block for the next level down */
4710 		if (level > 0) {
4711 			union xfs_btree_ptr     *ptr;
4712 
4713 			ptr = xfs_btree_ptr_addr(cur, 1, block);
4714 			xfs_btree_readahead_ptr(cur, ptr, 1);
4715 
4716 			/* save for the next iteration of the loop */
4717 			xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
4718 
4719 			if (!(flags & XFS_BTREE_VISIT_LEAVES))
4720 				continue;
4721 		} else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
4722 			continue;
4723 		}
4724 
4725 		/* for each buffer in the level */
4726 		do {
4727 			error = xfs_btree_visit_block(cur, level, fn, data);
4728 		} while (!error);
4729 
4730 		if (error != -ENOENT)
4731 			return error;
4732 	}
4733 
4734 	return 0;
4735 }
4736 
4737 /*
4738  * Change the owner of a btree.
4739  *
4740  * The mechanism we use here is ordered buffer logging. Because we don't know
4741  * how many buffers were are going to need to modify, we don't really want to
4742  * have to make transaction reservations for the worst case of every buffer in a
4743  * full size btree as that may be more space that we can fit in the log....
4744  *
4745  * We do the btree walk in the most optimal manner possible - we have sibling
4746  * pointers so we can just walk all the blocks on each level from left to right
4747  * in a single pass, and then move to the next level and do the same. We can
4748  * also do readahead on the sibling pointers to get IO moving more quickly,
4749  * though for slow disks this is unlikely to make much difference to performance
4750  * as the amount of CPU work we have to do before moving to the next block is
4751  * relatively small.
4752  *
4753  * For each btree block that we load, modify the owner appropriately, set the
4754  * buffer as an ordered buffer and log it appropriately. We need to ensure that
4755  * we mark the region we change dirty so that if the buffer is relogged in
4756  * a subsequent transaction the changes we make here as an ordered buffer are
4757  * correctly relogged in that transaction.  If we are in recovery context, then
4758  * just queue the modified buffer as delayed write buffer so the transaction
4759  * recovery completion writes the changes to disk.
4760  */
4761 struct xfs_btree_block_change_owner_info {
4762 	uint64_t		new_owner;
4763 	struct list_head	*buffer_list;
4764 };
4765 
4766 static int
4767 xfs_btree_block_change_owner(
4768 	struct xfs_btree_cur	*cur,
4769 	int			level,
4770 	void			*data)
4771 {
4772 	struct xfs_btree_block_change_owner_info	*bbcoi = data;
4773 	struct xfs_btree_block	*block;
4774 	struct xfs_buf		*bp;
4775 
4776 	/* modify the owner */
4777 	block = xfs_btree_get_block(cur, level, &bp);
4778 	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
4779 		if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
4780 			return 0;
4781 		block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
4782 	} else {
4783 		if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
4784 			return 0;
4785 		block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
4786 	}
4787 
4788 	/*
4789 	 * If the block is a root block hosted in an inode, we might not have a
4790 	 * buffer pointer here and we shouldn't attempt to log the change as the
4791 	 * information is already held in the inode and discarded when the root
4792 	 * block is formatted into the on-disk inode fork. We still change it,
4793 	 * though, so everything is consistent in memory.
4794 	 */
4795 	if (!bp) {
4796 		ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
4797 		ASSERT(level == cur->bc_nlevels - 1);
4798 		return 0;
4799 	}
4800 
4801 	if (cur->bc_tp) {
4802 		if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
4803 			xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4804 			return -EAGAIN;
4805 		}
4806 	} else {
4807 		xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
4808 	}
4809 
4810 	return 0;
4811 }
4812 
4813 int
4814 xfs_btree_change_owner(
4815 	struct xfs_btree_cur	*cur,
4816 	uint64_t		new_owner,
4817 	struct list_head	*buffer_list)
4818 {
4819 	struct xfs_btree_block_change_owner_info	bbcoi;
4820 
4821 	bbcoi.new_owner = new_owner;
4822 	bbcoi.buffer_list = buffer_list;
4823 
4824 	return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
4825 			XFS_BTREE_VISIT_ALL, &bbcoi);
4826 }
4827 
4828 /* Verify the v5 fields of a long-format btree block. */
4829 xfs_failaddr_t
4830 xfs_btree_fsblock_v5hdr_verify(
4831 	struct xfs_buf		*bp,
4832 	uint64_t		owner)
4833 {
4834 	struct xfs_mount	*mp = bp->b_mount;
4835 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4836 
4837 	if (!xfs_has_crc(mp))
4838 		return __this_address;
4839 	if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
4840 		return __this_address;
4841 	if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
4842 		return __this_address;
4843 	if (owner != XFS_RMAP_OWN_UNKNOWN &&
4844 	    be64_to_cpu(block->bb_u.l.bb_owner) != owner)
4845 		return __this_address;
4846 	return NULL;
4847 }
4848 
4849 /* Verify a long-format btree block. */
4850 xfs_failaddr_t
4851 xfs_btree_fsblock_verify(
4852 	struct xfs_buf		*bp,
4853 	unsigned int		max_recs)
4854 {
4855 	struct xfs_mount	*mp = bp->b_mount;
4856 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4857 	xfs_fsblock_t		fsb;
4858 	xfs_failaddr_t		fa;
4859 
4860 	ASSERT(!xfs_buftarg_is_mem(bp->b_target));
4861 
4862 	/* numrecs verification */
4863 	if (be16_to_cpu(block->bb_numrecs) > max_recs)
4864 		return __this_address;
4865 
4866 	/* sibling pointer verification */
4867 	fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
4868 	fa = xfs_btree_check_fsblock_siblings(mp, fsb,
4869 			block->bb_u.l.bb_leftsib);
4870 	if (!fa)
4871 		fa = xfs_btree_check_fsblock_siblings(mp, fsb,
4872 				block->bb_u.l.bb_rightsib);
4873 	return fa;
4874 }
4875 
4876 /* Verify an in-memory btree block. */
4877 xfs_failaddr_t
4878 xfs_btree_memblock_verify(
4879 	struct xfs_buf		*bp,
4880 	unsigned int		max_recs)
4881 {
4882 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4883 	struct xfs_buftarg	*btp = bp->b_target;
4884 	xfs_failaddr_t		fa;
4885 	xfbno_t			bno;
4886 
4887 	ASSERT(xfs_buftarg_is_mem(bp->b_target));
4888 
4889 	/* numrecs verification */
4890 	if (be16_to_cpu(block->bb_numrecs) > max_recs)
4891 		return __this_address;
4892 
4893 	/* sibling pointer verification */
4894 	bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
4895 	fa = xfs_btree_check_memblock_siblings(btp, bno,
4896 			block->bb_u.l.bb_leftsib);
4897 	if (fa)
4898 		return fa;
4899 	fa = xfs_btree_check_memblock_siblings(btp, bno,
4900 			block->bb_u.l.bb_rightsib);
4901 	if (fa)
4902 		return fa;
4903 
4904 	return NULL;
4905 }
4906 /**
4907  * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format
4908  *				      btree block
4909  *
4910  * @bp: buffer containing the btree block
4911  */
4912 xfs_failaddr_t
4913 xfs_btree_agblock_v5hdr_verify(
4914 	struct xfs_buf		*bp)
4915 {
4916 	struct xfs_mount	*mp = bp->b_mount;
4917 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4918 	struct xfs_perag	*pag = bp->b_pag;
4919 
4920 	if (!xfs_has_crc(mp))
4921 		return __this_address;
4922 	if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4923 		return __this_address;
4924 	if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
4925 		return __this_address;
4926 	if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag_agno(pag))
4927 		return __this_address;
4928 	return NULL;
4929 }
4930 
4931 /**
4932  * xfs_btree_agblock_verify() -- verify a short-format btree block
4933  *
4934  * @bp: buffer containing the btree block
4935  * @max_recs: maximum records allowed in this btree node
4936  */
4937 xfs_failaddr_t
4938 xfs_btree_agblock_verify(
4939 	struct xfs_buf		*bp,
4940 	unsigned int		max_recs)
4941 {
4942 	struct xfs_mount	*mp = bp->b_mount;
4943 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4944 	xfs_agblock_t		agbno;
4945 	xfs_failaddr_t		fa;
4946 
4947 	ASSERT(!xfs_buftarg_is_mem(bp->b_target));
4948 
4949 	/* numrecs verification */
4950 	if (be16_to_cpu(block->bb_numrecs) > max_recs)
4951 		return __this_address;
4952 
4953 	/* sibling pointer verification */
4954 	agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
4955 	fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
4956 			block->bb_u.s.bb_leftsib);
4957 	if (!fa)
4958 		fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
4959 				block->bb_u.s.bb_rightsib);
4960 	return fa;
4961 }
4962 
4963 /*
4964  * For the given limits on leaf and keyptr records per block, calculate the
4965  * height of the tree needed to index the number of leaf records.
4966  */
4967 unsigned int
4968 xfs_btree_compute_maxlevels(
4969 	const unsigned int	*limits,
4970 	unsigned long long	records)
4971 {
4972 	unsigned long long	level_blocks = howmany_64(records, limits[0]);
4973 	unsigned int		height = 1;
4974 
4975 	while (level_blocks > 1) {
4976 		level_blocks = howmany_64(level_blocks, limits[1]);
4977 		height++;
4978 	}
4979 
4980 	return height;
4981 }
4982 
4983 /*
4984  * For the given limits on leaf and keyptr records per block, calculate the
4985  * number of blocks needed to index the given number of leaf records.
4986  */
4987 unsigned long long
4988 xfs_btree_calc_size(
4989 	const unsigned int	*limits,
4990 	unsigned long long	records)
4991 {
4992 	unsigned long long	level_blocks = howmany_64(records, limits[0]);
4993 	unsigned long long	blocks = level_blocks;
4994 
4995 	while (level_blocks > 1) {
4996 		level_blocks = howmany_64(level_blocks, limits[1]);
4997 		blocks += level_blocks;
4998 	}
4999 
5000 	return blocks;
5001 }
5002 
5003 /*
5004  * Given a number of available blocks for the btree to consume with records and
5005  * pointers, calculate the height of the tree needed to index all the records
5006  * that space can hold based on the number of pointers each interior node
5007  * holds.
5008  *
5009  * We start by assuming a single level tree consumes a single block, then track
5010  * the number of blocks each node level consumes until we no longer have space
5011  * to store the next node level. At this point, we are indexing all the leaf
5012  * blocks in the space, and there's no more free space to split the tree any
5013  * further. That's our maximum btree height.
5014  */
5015 unsigned int
5016 xfs_btree_space_to_height(
5017 	const unsigned int	*limits,
5018 	unsigned long long	leaf_blocks)
5019 {
5020 	/*
5021 	 * The root btree block can have fewer than minrecs pointers in it
5022 	 * because the tree might not be big enough to require that amount of
5023 	 * fanout. Hence it has a minimum size of 2 pointers, not limits[1].
5024 	 */
5025 	unsigned long long	node_blocks = 2;
5026 	unsigned long long	blocks_left = leaf_blocks - 1;
5027 	unsigned int		height = 1;
5028 
5029 	if (leaf_blocks < 1)
5030 		return 0;
5031 
5032 	while (node_blocks < blocks_left) {
5033 		blocks_left -= node_blocks;
5034 		node_blocks *= limits[1];
5035 		height++;
5036 	}
5037 
5038 	return height;
5039 }
5040 
5041 /*
5042  * Query a regular btree for all records overlapping a given interval.
5043  * Start with a LE lookup of the key of low_rec and return all records
5044  * until we find a record with a key greater than the key of high_rec.
5045  */
5046 STATIC int
5047 xfs_btree_simple_query_range(
5048 	struct xfs_btree_cur		*cur,
5049 	const union xfs_btree_key	*low_key,
5050 	const union xfs_btree_key	*high_key,
5051 	xfs_btree_query_range_fn	fn,
5052 	void				*priv)
5053 {
5054 	union xfs_btree_rec		*recp;
5055 	union xfs_btree_key		rec_key;
5056 	int				stat;
5057 	bool				firstrec = true;
5058 	int				error;
5059 
5060 	ASSERT(cur->bc_ops->init_high_key_from_rec);
5061 	ASSERT(cur->bc_ops->diff_two_keys);
5062 
5063 	/*
5064 	 * Find the leftmost record.  The btree cursor must be set
5065 	 * to the low record used to generate low_key.
5066 	 */
5067 	stat = 0;
5068 	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
5069 	if (error)
5070 		goto out;
5071 
5072 	/* Nothing?  See if there's anything to the right. */
5073 	if (!stat) {
5074 		error = xfs_btree_increment(cur, 0, &stat);
5075 		if (error)
5076 			goto out;
5077 	}
5078 
5079 	while (stat) {
5080 		/* Find the record. */
5081 		error = xfs_btree_get_rec(cur, &recp, &stat);
5082 		if (error || !stat)
5083 			break;
5084 
5085 		/* Skip if low_key > high_key(rec). */
5086 		if (firstrec) {
5087 			cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
5088 			firstrec = false;
5089 			if (xfs_btree_keycmp_gt(cur, low_key, &rec_key))
5090 				goto advloop;
5091 		}
5092 
5093 		/* Stop if low_key(rec) > high_key. */
5094 		cur->bc_ops->init_key_from_rec(&rec_key, recp);
5095 		if (xfs_btree_keycmp_gt(cur, &rec_key, high_key))
5096 			break;
5097 
5098 		/* Callback */
5099 		error = fn(cur, recp, priv);
5100 		if (error)
5101 			break;
5102 
5103 advloop:
5104 		/* Move on to the next record. */
5105 		error = xfs_btree_increment(cur, 0, &stat);
5106 		if (error)
5107 			break;
5108 	}
5109 
5110 out:
5111 	return error;
5112 }
5113 
5114 /*
5115  * Query an overlapped interval btree for all records overlapping a given
5116  * interval.  This function roughly follows the algorithm given in
5117  * "Interval Trees" of _Introduction to Algorithms_, which is section
5118  * 14.3 in the 2nd and 3rd editions.
5119  *
5120  * First, generate keys for the low and high records passed in.
5121  *
5122  * For any leaf node, generate the high and low keys for the record.
5123  * If the record keys overlap with the query low/high keys, pass the
5124  * record to the function iterator.
5125  *
5126  * For any internal node, compare the low and high keys of each
5127  * pointer against the query low/high keys.  If there's an overlap,
5128  * follow the pointer.
5129  *
5130  * As an optimization, we stop scanning a block when we find a low key
5131  * that is greater than the query's high key.
5132  */
5133 STATIC int
5134 xfs_btree_overlapped_query_range(
5135 	struct xfs_btree_cur		*cur,
5136 	const union xfs_btree_key	*low_key,
5137 	const union xfs_btree_key	*high_key,
5138 	xfs_btree_query_range_fn	fn,
5139 	void				*priv)
5140 {
5141 	union xfs_btree_ptr		ptr;
5142 	union xfs_btree_ptr		*pp;
5143 	union xfs_btree_key		rec_key;
5144 	union xfs_btree_key		rec_hkey;
5145 	union xfs_btree_key		*lkp;
5146 	union xfs_btree_key		*hkp;
5147 	union xfs_btree_rec		*recp;
5148 	struct xfs_btree_block		*block;
5149 	int				level;
5150 	struct xfs_buf			*bp;
5151 	int				i;
5152 	int				error;
5153 
5154 	/* Load the root of the btree. */
5155 	level = cur->bc_nlevels - 1;
5156 	xfs_btree_init_ptr_from_cur(cur, &ptr);
5157 	error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
5158 	if (error)
5159 		return error;
5160 	xfs_btree_get_block(cur, level, &bp);
5161 	trace_xfs_btree_overlapped_query_range(cur, level, bp);
5162 #ifdef DEBUG
5163 	error = xfs_btree_check_block(cur, block, level, bp);
5164 	if (error)
5165 		goto out;
5166 #endif
5167 	cur->bc_levels[level].ptr = 1;
5168 
5169 	while (level < cur->bc_nlevels) {
5170 		block = xfs_btree_get_block(cur, level, &bp);
5171 
5172 		/* End of node, pop back towards the root. */
5173 		if (cur->bc_levels[level].ptr >
5174 					be16_to_cpu(block->bb_numrecs)) {
5175 pop_up:
5176 			if (level < cur->bc_nlevels - 1)
5177 				cur->bc_levels[level + 1].ptr++;
5178 			level++;
5179 			continue;
5180 		}
5181 
5182 		if (level == 0) {
5183 			/* Handle a leaf node. */
5184 			recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
5185 					block);
5186 
5187 			cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
5188 			cur->bc_ops->init_key_from_rec(&rec_key, recp);
5189 
5190 			/*
5191 			 * If (query's high key < record's low key), then there
5192 			 * are no more interesting records in this block.  Pop
5193 			 * up to the leaf level to find more record blocks.
5194 			 *
5195 			 * If (record's high key >= query's low key) and
5196 			 *    (query's high key >= record's low key), then
5197 			 * this record overlaps the query range; callback.
5198 			 */
5199 			if (xfs_btree_keycmp_lt(cur, high_key, &rec_key))
5200 				goto pop_up;
5201 			if (xfs_btree_keycmp_ge(cur, &rec_hkey, low_key)) {
5202 				error = fn(cur, recp, priv);
5203 				if (error)
5204 					break;
5205 			}
5206 			cur->bc_levels[level].ptr++;
5207 			continue;
5208 		}
5209 
5210 		/* Handle an internal node. */
5211 		lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
5212 		hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr,
5213 				block);
5214 		pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
5215 
5216 		/*
5217 		 * If (query's high key < pointer's low key), then there are no
5218 		 * more interesting keys in this block.  Pop up one leaf level
5219 		 * to continue looking for records.
5220 		 *
5221 		 * If (pointer's high key >= query's low key) and
5222 		 *    (query's high key >= pointer's low key), then
5223 		 * this record overlaps the query range; follow pointer.
5224 		 */
5225 		if (xfs_btree_keycmp_lt(cur, high_key, lkp))
5226 			goto pop_up;
5227 		if (xfs_btree_keycmp_ge(cur, hkp, low_key)) {
5228 			level--;
5229 			error = xfs_btree_lookup_get_block(cur, level, pp,
5230 					&block);
5231 			if (error)
5232 				goto out;
5233 			xfs_btree_get_block(cur, level, &bp);
5234 			trace_xfs_btree_overlapped_query_range(cur, level, bp);
5235 #ifdef DEBUG
5236 			error = xfs_btree_check_block(cur, block, level, bp);
5237 			if (error)
5238 				goto out;
5239 #endif
5240 			cur->bc_levels[level].ptr = 1;
5241 			continue;
5242 		}
5243 		cur->bc_levels[level].ptr++;
5244 	}
5245 
5246 out:
5247 	/*
5248 	 * If we don't end this function with the cursor pointing at a record
5249 	 * block, a subsequent non-error cursor deletion will not release
5250 	 * node-level buffers, causing a buffer leak.  This is quite possible
5251 	 * with a zero-results range query, so release the buffers if we
5252 	 * failed to return any results.
5253 	 */
5254 	if (cur->bc_levels[0].bp == NULL) {
5255 		for (i = 0; i < cur->bc_nlevels; i++) {
5256 			if (cur->bc_levels[i].bp) {
5257 				xfs_trans_brelse(cur->bc_tp,
5258 						cur->bc_levels[i].bp);
5259 				cur->bc_levels[i].bp = NULL;
5260 				cur->bc_levels[i].ptr = 0;
5261 				cur->bc_levels[i].ra = 0;
5262 			}
5263 		}
5264 	}
5265 
5266 	return error;
5267 }
5268 
5269 static inline void
5270 xfs_btree_key_from_irec(
5271 	struct xfs_btree_cur		*cur,
5272 	union xfs_btree_key		*key,
5273 	const union xfs_btree_irec	*irec)
5274 {
5275 	union xfs_btree_rec		rec;
5276 
5277 	cur->bc_rec = *irec;
5278 	cur->bc_ops->init_rec_from_cur(cur, &rec);
5279 	cur->bc_ops->init_key_from_rec(key, &rec);
5280 }
5281 
5282 /*
5283  * Query a btree for all records overlapping a given interval of keys.  The
5284  * supplied function will be called with each record found; return one of the
5285  * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
5286  * code.  This function returns -ECANCELED, zero, or a negative error code.
5287  */
5288 int
5289 xfs_btree_query_range(
5290 	struct xfs_btree_cur		*cur,
5291 	const union xfs_btree_irec	*low_rec,
5292 	const union xfs_btree_irec	*high_rec,
5293 	xfs_btree_query_range_fn	fn,
5294 	void				*priv)
5295 {
5296 	union xfs_btree_key		low_key;
5297 	union xfs_btree_key		high_key;
5298 
5299 	/* Find the keys of both ends of the interval. */
5300 	xfs_btree_key_from_irec(cur, &high_key, high_rec);
5301 	xfs_btree_key_from_irec(cur, &low_key, low_rec);
5302 
5303 	/* Enforce low key <= high key. */
5304 	if (!xfs_btree_keycmp_le(cur, &low_key, &high_key))
5305 		return -EINVAL;
5306 
5307 	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
5308 		return xfs_btree_simple_query_range(cur, &low_key,
5309 				&high_key, fn, priv);
5310 	return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
5311 			fn, priv);
5312 }
5313 
5314 /* Query a btree for all records. */
5315 int
5316 xfs_btree_query_all(
5317 	struct xfs_btree_cur		*cur,
5318 	xfs_btree_query_range_fn	fn,
5319 	void				*priv)
5320 {
5321 	union xfs_btree_key		low_key;
5322 	union xfs_btree_key		high_key;
5323 
5324 	memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
5325 	memset(&low_key, 0, sizeof(low_key));
5326 	memset(&high_key, 0xFF, sizeof(high_key));
5327 
5328 	return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
5329 }
5330 
5331 static int
5332 xfs_btree_count_blocks_helper(
5333 	struct xfs_btree_cur	*cur,
5334 	int			level,
5335 	void			*data)
5336 {
5337 	xfs_filblks_t		*blocks = data;
5338 	(*blocks)++;
5339 
5340 	return 0;
5341 }
5342 
5343 /* Count the blocks in a btree and return the result in *blocks. */
5344 int
5345 xfs_btree_count_blocks(
5346 	struct xfs_btree_cur	*cur,
5347 	xfs_filblks_t		*blocks)
5348 {
5349 	*blocks = 0;
5350 	return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
5351 			XFS_BTREE_VISIT_ALL, blocks);
5352 }
5353 
5354 /* Compare two btree pointers. */
5355 int64_t
5356 xfs_btree_diff_two_ptrs(
5357 	struct xfs_btree_cur		*cur,
5358 	const union xfs_btree_ptr	*a,
5359 	const union xfs_btree_ptr	*b)
5360 {
5361 	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
5362 		return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
5363 	return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
5364 }
5365 
5366 struct xfs_btree_has_records {
5367 	/* Keys for the start and end of the range we want to know about. */
5368 	union xfs_btree_key		start_key;
5369 	union xfs_btree_key		end_key;
5370 
5371 	/* Mask for key comparisons, if desired. */
5372 	const union xfs_btree_key	*key_mask;
5373 
5374 	/* Highest record key we've seen so far. */
5375 	union xfs_btree_key		high_key;
5376 
5377 	enum xbtree_recpacking		outcome;
5378 };
5379 
5380 STATIC int
5381 xfs_btree_has_records_helper(
5382 	struct xfs_btree_cur		*cur,
5383 	const union xfs_btree_rec	*rec,
5384 	void				*priv)
5385 {
5386 	union xfs_btree_key		rec_key;
5387 	union xfs_btree_key		rec_high_key;
5388 	struct xfs_btree_has_records	*info = priv;
5389 	enum xbtree_key_contig		key_contig;
5390 
5391 	cur->bc_ops->init_key_from_rec(&rec_key, rec);
5392 
5393 	if (info->outcome == XBTREE_RECPACKING_EMPTY) {
5394 		info->outcome = XBTREE_RECPACKING_SPARSE;
5395 
5396 		/*
5397 		 * If the first record we find does not overlap the start key,
5398 		 * then there is a hole at the start of the search range.
5399 		 * Classify this as sparse and stop immediately.
5400 		 */
5401 		if (xfs_btree_masked_keycmp_lt(cur, &info->start_key, &rec_key,
5402 					info->key_mask))
5403 			return -ECANCELED;
5404 	} else {
5405 		/*
5406 		 * If a subsequent record does not overlap with the any record
5407 		 * we've seen so far, there is a hole in the middle of the
5408 		 * search range.  Classify this as sparse and stop.
5409 		 * If the keys overlap and this btree does not allow overlap,
5410 		 * signal corruption.
5411 		 */
5412 		key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key,
5413 					&rec_key, info->key_mask);
5414 		if (key_contig == XBTREE_KEY_OVERLAP &&
5415 				!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
5416 			return -EFSCORRUPTED;
5417 		if (key_contig == XBTREE_KEY_GAP)
5418 			return -ECANCELED;
5419 	}
5420 
5421 	/*
5422 	 * If high_key(rec) is larger than any other high key we've seen,
5423 	 * remember it for later.
5424 	 */
5425 	cur->bc_ops->init_high_key_from_rec(&rec_high_key, rec);
5426 	if (xfs_btree_masked_keycmp_gt(cur, &rec_high_key, &info->high_key,
5427 				info->key_mask))
5428 		info->high_key = rec_high_key; /* struct copy */
5429 
5430 	return 0;
5431 }
5432 
5433 /*
5434  * Scan part of the keyspace of a btree and tell us if that keyspace does not
5435  * map to any records; is fully mapped to records; or is partially mapped to
5436  * records.  This is the btree record equivalent to determining if a file is
5437  * sparse.
5438  *
5439  * For most btree types, the record scan should use all available btree key
5440  * fields to compare the keys encountered.  These callers should pass NULL for
5441  * @mask.  However, some callers (e.g.  scanning physical space in the rmapbt)
5442  * want to ignore some part of the btree record keyspace when performing the
5443  * comparison.  These callers should pass in a union xfs_btree_key object with
5444  * the fields that *should* be a part of the comparison set to any nonzero
5445  * value, and the rest zeroed.
5446  */
5447 int
5448 xfs_btree_has_records(
5449 	struct xfs_btree_cur		*cur,
5450 	const union xfs_btree_irec	*low,
5451 	const union xfs_btree_irec	*high,
5452 	const union xfs_btree_key	*mask,
5453 	enum xbtree_recpacking		*outcome)
5454 {
5455 	struct xfs_btree_has_records	info = {
5456 		.outcome		= XBTREE_RECPACKING_EMPTY,
5457 		.key_mask		= mask,
5458 	};
5459 	int				error;
5460 
5461 	/* Not all btrees support this operation. */
5462 	if (!cur->bc_ops->keys_contiguous) {
5463 		ASSERT(0);
5464 		return -EOPNOTSUPP;
5465 	}
5466 
5467 	xfs_btree_key_from_irec(cur, &info.start_key, low);
5468 	xfs_btree_key_from_irec(cur, &info.end_key, high);
5469 
5470 	error = xfs_btree_query_range(cur, low, high,
5471 			xfs_btree_has_records_helper, &info);
5472 	if (error == -ECANCELED)
5473 		goto out;
5474 	if (error)
5475 		return error;
5476 
5477 	if (info.outcome == XBTREE_RECPACKING_EMPTY)
5478 		goto out;
5479 
5480 	/*
5481 	 * If the largest high_key(rec) we saw during the walk is greater than
5482 	 * the end of the search range, classify this as full.  Otherwise,
5483 	 * there is a hole at the end of the search range.
5484 	 */
5485 	if (xfs_btree_masked_keycmp_ge(cur, &info.high_key, &info.end_key,
5486 				mask))
5487 		info.outcome = XBTREE_RECPACKING_FULL;
5488 
5489 out:
5490 	*outcome = info.outcome;
5491 	return 0;
5492 }
5493 
5494 /* Are there more records in this btree? */
5495 bool
5496 xfs_btree_has_more_records(
5497 	struct xfs_btree_cur	*cur)
5498 {
5499 	struct xfs_btree_block	*block;
5500 	struct xfs_buf		*bp;
5501 
5502 	block = xfs_btree_get_block(cur, 0, &bp);
5503 
5504 	/* There are still records in this block. */
5505 	if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block))
5506 		return true;
5507 
5508 	/* There are more record blocks. */
5509 	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
5510 		return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
5511 	else
5512 		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
5513 }
5514 
5515 /* Set up all the btree cursor caches. */
5516 int __init
5517 xfs_btree_init_cur_caches(void)
5518 {
5519 	int		error;
5520 
5521 	error = xfs_allocbt_init_cur_cache();
5522 	if (error)
5523 		return error;
5524 	error = xfs_inobt_init_cur_cache();
5525 	if (error)
5526 		goto err;
5527 	error = xfs_bmbt_init_cur_cache();
5528 	if (error)
5529 		goto err;
5530 	error = xfs_rmapbt_init_cur_cache();
5531 	if (error)
5532 		goto err;
5533 	error = xfs_refcountbt_init_cur_cache();
5534 	if (error)
5535 		goto err;
5536 	error = xfs_rtrmapbt_init_cur_cache();
5537 	if (error)
5538 		goto err;
5539 	error = xfs_rtrefcountbt_init_cur_cache();
5540 	if (error)
5541 		goto err;
5542 
5543 	return 0;
5544 err:
5545 	xfs_btree_destroy_cur_caches();
5546 	return error;
5547 }
5548 
5549 /* Destroy all the btree cursor caches, if they've been allocated. */
5550 void
5551 xfs_btree_destroy_cur_caches(void)
5552 {
5553 	xfs_allocbt_destroy_cur_cache();
5554 	xfs_inobt_destroy_cur_cache();
5555 	xfs_bmbt_destroy_cur_cache();
5556 	xfs_rmapbt_destroy_cur_cache();
5557 	xfs_refcountbt_destroy_cur_cache();
5558 	xfs_rtrmapbt_destroy_cur_cache();
5559 	xfs_rtrefcountbt_destroy_cur_cache();
5560 }
5561 
5562 /* Move the btree cursor before the first record. */
5563 int
5564 xfs_btree_goto_left_edge(
5565 	struct xfs_btree_cur	*cur)
5566 {
5567 	int			stat = 0;
5568 	int			error;
5569 
5570 	memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
5571 	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
5572 	if (error)
5573 		return error;
5574 	if (!stat)
5575 		return 0;
5576 
5577 	error = xfs_btree_decrement(cur, 0, &stat);
5578 	if (error)
5579 		return error;
5580 	if (stat != 0) {
5581 		ASSERT(0);
5582 		xfs_btree_mark_sick(cur);
5583 		return -EFSCORRUPTED;
5584 	}
5585 
5586 	return 0;
5587 }
5588 
5589 /* Allocate a block for an inode-rooted metadata btree. */
5590 int
5591 xfs_btree_alloc_metafile_block(
5592 	struct xfs_btree_cur		*cur,
5593 	const union xfs_btree_ptr	*start,
5594 	union xfs_btree_ptr		*new,
5595 	int				*stat)
5596 {
5597 	struct xfs_alloc_arg		args = {
5598 		.mp			= cur->bc_mp,
5599 		.tp			= cur->bc_tp,
5600 		.resv			= XFS_AG_RESV_METAFILE,
5601 		.minlen			= 1,
5602 		.maxlen			= 1,
5603 		.prod			= 1,
5604 	};
5605 	struct xfs_inode		*ip = cur->bc_ino.ip;
5606 	int				error;
5607 
5608 	ASSERT(xfs_is_metadir_inode(ip));
5609 
5610 	xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, cur->bc_ino.whichfork);
5611 	error = xfs_alloc_vextent_start_ag(&args,
5612 			XFS_INO_TO_FSB(cur->bc_mp, ip->i_ino));
5613 	if (error)
5614 		return error;
5615 	if (args.fsbno == NULLFSBLOCK) {
5616 		*stat = 0;
5617 		return 0;
5618 	}
5619 	ASSERT(args.len == 1);
5620 
5621 	xfs_metafile_resv_alloc_space(ip, &args);
5622 
5623 	new->l = cpu_to_be64(args.fsbno);
5624 	*stat = 1;
5625 	return 0;
5626 }
5627 
5628 /* Free a block from an inode-rooted metadata btree. */
5629 int
5630 xfs_btree_free_metafile_block(
5631 	struct xfs_btree_cur	*cur,
5632 	struct xfs_buf		*bp)
5633 {
5634 	struct xfs_owner_info	oinfo;
5635 	struct xfs_mount	*mp = cur->bc_mp;
5636 	struct xfs_inode	*ip = cur->bc_ino.ip;
5637 	struct xfs_trans	*tp = cur->bc_tp;
5638 	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
5639 	int			error;
5640 
5641 	ASSERT(xfs_is_metadir_inode(ip));
5642 
5643 	xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
5644 	error = xfs_free_extent_later(tp, fsbno, 1, &oinfo, XFS_AG_RESV_METAFILE,
5645 			0);
5646 	if (error)
5647 		return error;
5648 
5649 	xfs_metafile_resv_free_space(ip, tp, 1);
5650 	return 0;
5651 }
5652