xref: /linux/fs/xfs/libxfs/xfs_btree.c (revision d19e470b6605c900db21fc7b34c66b6891a79983)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_bit.h"
13 #include "xfs_mount.h"
14 #include "xfs_inode.h"
15 #include "xfs_trans.h"
16 #include "xfs_buf_item.h"
17 #include "xfs_btree.h"
18 #include "xfs_errortag.h"
19 #include "xfs_error.h"
20 #include "xfs_trace.h"
21 #include "xfs_alloc.h"
22 #include "xfs_log.h"
23 
24 /*
25  * Cursor allocation zone.
26  */
27 kmem_zone_t	*xfs_btree_cur_zone;
28 
29 /*
30  * Btree magic numbers.
31  */
32 static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
33 	{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
34 	  XFS_FIBT_MAGIC, 0 },
35 	{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
36 	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
37 	  XFS_REFC_CRC_MAGIC }
38 };
39 
40 uint32_t
41 xfs_btree_magic(
42 	int			crc,
43 	xfs_btnum_t		btnum)
44 {
45 	uint32_t		magic = xfs_magics[crc][btnum];
46 
47 	/* Ensure we asked for crc for crc-only magics. */
48 	ASSERT(magic != 0);
49 	return magic;
50 }
51 
52 /*
53  * Check a long btree block header.  Return the address of the failing check,
54  * or NULL if everything is ok.
55  */
56 xfs_failaddr_t
57 __xfs_btree_check_lblock(
58 	struct xfs_btree_cur	*cur,
59 	struct xfs_btree_block	*block,
60 	int			level,
61 	struct xfs_buf		*bp)
62 {
63 	struct xfs_mount	*mp = cur->bc_mp;
64 	xfs_btnum_t		btnum = cur->bc_btnum;
65 	int			crc = xfs_sb_version_hascrc(&mp->m_sb);
66 
67 	if (crc) {
68 		if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
69 			return __this_address;
70 		if (block->bb_u.l.bb_blkno !=
71 		    cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
72 			return __this_address;
73 		if (block->bb_u.l.bb_pad != cpu_to_be32(0))
74 			return __this_address;
75 	}
76 
77 	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
78 		return __this_address;
79 	if (be16_to_cpu(block->bb_level) != level)
80 		return __this_address;
81 	if (be16_to_cpu(block->bb_numrecs) >
82 	    cur->bc_ops->get_maxrecs(cur, level))
83 		return __this_address;
84 	if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
85 	    !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
86 			level + 1))
87 		return __this_address;
88 	if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
89 	    !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
90 			level + 1))
91 		return __this_address;
92 
93 	return NULL;
94 }
95 
96 /* Check a long btree block header. */
97 static int
98 xfs_btree_check_lblock(
99 	struct xfs_btree_cur	*cur,
100 	struct xfs_btree_block	*block,
101 	int			level,
102 	struct xfs_buf		*bp)
103 {
104 	struct xfs_mount	*mp = cur->bc_mp;
105 	xfs_failaddr_t		fa;
106 
107 	fa = __xfs_btree_check_lblock(cur, block, level, bp);
108 	if (XFS_IS_CORRUPT(mp, fa != NULL) ||
109 	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) {
110 		if (bp)
111 			trace_xfs_btree_corrupt(bp, _RET_IP_);
112 		return -EFSCORRUPTED;
113 	}
114 	return 0;
115 }
116 
117 /*
118  * Check a short btree block header.  Return the address of the failing check,
119  * or NULL if everything is ok.
120  */
121 xfs_failaddr_t
122 __xfs_btree_check_sblock(
123 	struct xfs_btree_cur	*cur,
124 	struct xfs_btree_block	*block,
125 	int			level,
126 	struct xfs_buf		*bp)
127 {
128 	struct xfs_mount	*mp = cur->bc_mp;
129 	xfs_btnum_t		btnum = cur->bc_btnum;
130 	int			crc = xfs_sb_version_hascrc(&mp->m_sb);
131 
132 	if (crc) {
133 		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
134 			return __this_address;
135 		if (block->bb_u.s.bb_blkno !=
136 		    cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
137 			return __this_address;
138 	}
139 
140 	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
141 		return __this_address;
142 	if (be16_to_cpu(block->bb_level) != level)
143 		return __this_address;
144 	if (be16_to_cpu(block->bb_numrecs) >
145 	    cur->bc_ops->get_maxrecs(cur, level))
146 		return __this_address;
147 	if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
148 	    !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
149 			level + 1))
150 		return __this_address;
151 	if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
152 	    !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
153 			level + 1))
154 		return __this_address;
155 
156 	return NULL;
157 }
158 
159 /* Check a short btree block header. */
160 STATIC int
161 xfs_btree_check_sblock(
162 	struct xfs_btree_cur	*cur,
163 	struct xfs_btree_block	*block,
164 	int			level,
165 	struct xfs_buf		*bp)
166 {
167 	struct xfs_mount	*mp = cur->bc_mp;
168 	xfs_failaddr_t		fa;
169 
170 	fa = __xfs_btree_check_sblock(cur, block, level, bp);
171 	if (XFS_IS_CORRUPT(mp, fa != NULL) ||
172 	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) {
173 		if (bp)
174 			trace_xfs_btree_corrupt(bp, _RET_IP_);
175 		return -EFSCORRUPTED;
176 	}
177 	return 0;
178 }
179 
180 /*
181  * Debug routine: check that block header is ok.
182  */
183 int
184 xfs_btree_check_block(
185 	struct xfs_btree_cur	*cur,	/* btree cursor */
186 	struct xfs_btree_block	*block,	/* generic btree block pointer */
187 	int			level,	/* level of the btree block */
188 	struct xfs_buf		*bp)	/* buffer containing block, if any */
189 {
190 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
191 		return xfs_btree_check_lblock(cur, block, level, bp);
192 	else
193 		return xfs_btree_check_sblock(cur, block, level, bp);
194 }
195 
196 /* Check that this long pointer is valid and points within the fs. */
197 bool
198 xfs_btree_check_lptr(
199 	struct xfs_btree_cur	*cur,
200 	xfs_fsblock_t		fsbno,
201 	int			level)
202 {
203 	if (level <= 0)
204 		return false;
205 	return xfs_verify_fsbno(cur->bc_mp, fsbno);
206 }
207 
208 /* Check that this short pointer is valid and points within the AG. */
209 bool
210 xfs_btree_check_sptr(
211 	struct xfs_btree_cur	*cur,
212 	xfs_agblock_t		agbno,
213 	int			level)
214 {
215 	if (level <= 0)
216 		return false;
217 	return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno);
218 }
219 
220 /*
221  * Check that a given (indexed) btree pointer at a certain level of a
222  * btree is valid and doesn't point past where it should.
223  */
224 static int
225 xfs_btree_check_ptr(
226 	struct xfs_btree_cur	*cur,
227 	union xfs_btree_ptr	*ptr,
228 	int			index,
229 	int			level)
230 {
231 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
232 		if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
233 				level))
234 			return 0;
235 		xfs_err(cur->bc_mp,
236 "Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
237 				cur->bc_private.b.ip->i_ino,
238 				cur->bc_private.b.whichfork, cur->bc_btnum,
239 				level, index);
240 	} else {
241 		if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
242 				level))
243 			return 0;
244 		xfs_err(cur->bc_mp,
245 "AG %u: Corrupt btree %d pointer at level %d index %d.",
246 				cur->bc_private.a.agno, cur->bc_btnum,
247 				level, index);
248 	}
249 
250 	return -EFSCORRUPTED;
251 }
252 
253 #ifdef DEBUG
254 # define xfs_btree_debug_check_ptr	xfs_btree_check_ptr
255 #else
256 # define xfs_btree_debug_check_ptr(...)	(0)
257 #endif
258 
259 /*
260  * Calculate CRC on the whole btree block and stuff it into the
261  * long-form btree header.
262  *
263  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
264  * it into the buffer so recovery knows what the last modification was that made
265  * it to disk.
266  */
267 void
268 xfs_btree_lblock_calc_crc(
269 	struct xfs_buf		*bp)
270 {
271 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
272 	struct xfs_buf_log_item	*bip = bp->b_log_item;
273 
274 	if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
275 		return;
276 	if (bip)
277 		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
278 	xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
279 }
280 
281 bool
282 xfs_btree_lblock_verify_crc(
283 	struct xfs_buf		*bp)
284 {
285 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
286 	struct xfs_mount	*mp = bp->b_mount;
287 
288 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
289 		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
290 			return false;
291 		return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
292 	}
293 
294 	return true;
295 }
296 
297 /*
298  * Calculate CRC on the whole btree block and stuff it into the
299  * short-form btree header.
300  *
301  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
302  * it into the buffer so recovery knows what the last modification was that made
303  * it to disk.
304  */
305 void
306 xfs_btree_sblock_calc_crc(
307 	struct xfs_buf		*bp)
308 {
309 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
310 	struct xfs_buf_log_item	*bip = bp->b_log_item;
311 
312 	if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
313 		return;
314 	if (bip)
315 		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
316 	xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
317 }
318 
319 bool
320 xfs_btree_sblock_verify_crc(
321 	struct xfs_buf		*bp)
322 {
323 	struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
324 	struct xfs_mount	*mp = bp->b_mount;
325 
326 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
327 		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
328 			return false;
329 		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
330 	}
331 
332 	return true;
333 }
334 
335 static int
336 xfs_btree_free_block(
337 	struct xfs_btree_cur	*cur,
338 	struct xfs_buf		*bp)
339 {
340 	int			error;
341 
342 	error = cur->bc_ops->free_block(cur, bp);
343 	if (!error) {
344 		xfs_trans_binval(cur->bc_tp, bp);
345 		XFS_BTREE_STATS_INC(cur, free);
346 	}
347 	return error;
348 }
349 
350 /*
351  * Delete the btree cursor.
352  */
353 void
354 xfs_btree_del_cursor(
355 	xfs_btree_cur_t	*cur,		/* btree cursor */
356 	int		error)		/* del because of error */
357 {
358 	int		i;		/* btree level */
359 
360 	/*
361 	 * Clear the buffer pointers, and release the buffers.
362 	 * If we're doing this in the face of an error, we
363 	 * need to make sure to inspect all of the entries
364 	 * in the bc_bufs array for buffers to be unlocked.
365 	 * This is because some of the btree code works from
366 	 * level n down to 0, and if we get an error along
367 	 * the way we won't have initialized all the entries
368 	 * down to 0.
369 	 */
370 	for (i = 0; i < cur->bc_nlevels; i++) {
371 		if (cur->bc_bufs[i])
372 			xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
373 		else if (!error)
374 			break;
375 	}
376 	/*
377 	 * Can't free a bmap cursor without having dealt with the
378 	 * allocated indirect blocks' accounting.
379 	 */
380 	ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
381 	       cur->bc_private.b.allocated == 0);
382 	/*
383 	 * Free the cursor.
384 	 */
385 	kmem_cache_free(xfs_btree_cur_zone, cur);
386 }
387 
388 /*
389  * Duplicate the btree cursor.
390  * Allocate a new one, copy the record, re-get the buffers.
391  */
392 int					/* error */
393 xfs_btree_dup_cursor(
394 	xfs_btree_cur_t	*cur,		/* input cursor */
395 	xfs_btree_cur_t	**ncur)		/* output cursor */
396 {
397 	xfs_buf_t	*bp;		/* btree block's buffer pointer */
398 	int		error;		/* error return value */
399 	int		i;		/* level number of btree block */
400 	xfs_mount_t	*mp;		/* mount structure for filesystem */
401 	xfs_btree_cur_t	*new;		/* new cursor value */
402 	xfs_trans_t	*tp;		/* transaction pointer, can be NULL */
403 
404 	tp = cur->bc_tp;
405 	mp = cur->bc_mp;
406 
407 	/*
408 	 * Allocate a new cursor like the old one.
409 	 */
410 	new = cur->bc_ops->dup_cursor(cur);
411 
412 	/*
413 	 * Copy the record currently in the cursor.
414 	 */
415 	new->bc_rec = cur->bc_rec;
416 
417 	/*
418 	 * For each level current, re-get the buffer and copy the ptr value.
419 	 */
420 	for (i = 0; i < new->bc_nlevels; i++) {
421 		new->bc_ptrs[i] = cur->bc_ptrs[i];
422 		new->bc_ra[i] = cur->bc_ra[i];
423 		bp = cur->bc_bufs[i];
424 		if (bp) {
425 			error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
426 						   XFS_BUF_ADDR(bp), mp->m_bsize,
427 						   0, &bp,
428 						   cur->bc_ops->buf_ops);
429 			if (error) {
430 				xfs_btree_del_cursor(new, error);
431 				*ncur = NULL;
432 				return error;
433 			}
434 		}
435 		new->bc_bufs[i] = bp;
436 	}
437 	*ncur = new;
438 	return 0;
439 }
440 
441 /*
442  * XFS btree block layout and addressing:
443  *
444  * There are two types of blocks in the btree: leaf and non-leaf blocks.
445  *
446  * The leaf record start with a header then followed by records containing
447  * the values.  A non-leaf block also starts with the same header, and
448  * then first contains lookup keys followed by an equal number of pointers
449  * to the btree blocks at the previous level.
450  *
451  *		+--------+-------+-------+-------+-------+-------+-------+
452  * Leaf:	| header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
453  *		+--------+-------+-------+-------+-------+-------+-------+
454  *
455  *		+--------+-------+-------+-------+-------+-------+-------+
456  * Non-Leaf:	| header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
457  *		+--------+-------+-------+-------+-------+-------+-------+
458  *
459  * The header is called struct xfs_btree_block for reasons better left unknown
460  * and comes in different versions for short (32bit) and long (64bit) block
461  * pointers.  The record and key structures are defined by the btree instances
462  * and opaque to the btree core.  The block pointers are simple disk endian
463  * integers, available in a short (32bit) and long (64bit) variant.
464  *
465  * The helpers below calculate the offset of a given record, key or pointer
466  * into a btree block (xfs_btree_*_offset) or return a pointer to the given
467  * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
468  * inside the btree block is done using indices starting at one, not zero!
469  *
470  * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
471  * overlapping intervals.  In such a tree, records are still sorted lowest to
472  * highest and indexed by the smallest key value that refers to the record.
473  * However, nodes are different: each pointer has two associated keys -- one
474  * indexing the lowest key available in the block(s) below (the same behavior
475  * as the key in a regular btree) and another indexing the highest key
476  * available in the block(s) below.  Because records are /not/ sorted by the
477  * highest key, all leaf block updates require us to compute the highest key
478  * that matches any record in the leaf and to recursively update the high keys
479  * in the nodes going further up in the tree, if necessary.  Nodes look like
480  * this:
481  *
482  *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
483  * Non-Leaf:	| header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
484  *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
485  *
486  * To perform an interval query on an overlapped tree, perform the usual
487  * depth-first search and use the low and high keys to decide if we can skip
488  * that particular node.  If a leaf node is reached, return the records that
489  * intersect the interval.  Note that an interval query may return numerous
490  * entries.  For a non-overlapped tree, simply search for the record associated
491  * with the lowest key and iterate forward until a non-matching record is
492  * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
493  * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
494  * more detail.
495  *
496  * Why do we care about overlapping intervals?  Let's say you have a bunch of
497  * reverse mapping records on a reflink filesystem:
498  *
499  * 1: +- file A startblock B offset C length D -----------+
500  * 2:      +- file E startblock F offset G length H --------------+
501  * 3:      +- file I startblock F offset J length K --+
502  * 4:                                                        +- file L... --+
503  *
504  * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
505  * we'd simply increment the length of record 1.  But how do we find the record
506  * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
507  * record 3 because the keys are ordered first by startblock.  An interval
508  * query would return records 1 and 2 because they both overlap (B+D-1), and
509  * from that we can pick out record 1 as the appropriate left neighbor.
510  *
511  * In the non-overlapped case you can do a LE lookup and decrement the cursor
512  * because a record's interval must end before the next record.
513  */
514 
515 /*
516  * Return size of the btree block header for this btree instance.
517  */
518 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
519 {
520 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
521 		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
522 			return XFS_BTREE_LBLOCK_CRC_LEN;
523 		return XFS_BTREE_LBLOCK_LEN;
524 	}
525 	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
526 		return XFS_BTREE_SBLOCK_CRC_LEN;
527 	return XFS_BTREE_SBLOCK_LEN;
528 }
529 
530 /*
531  * Return size of btree block pointers for this btree instance.
532  */
533 static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
534 {
535 	return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
536 		sizeof(__be64) : sizeof(__be32);
537 }
538 
539 /*
540  * Calculate offset of the n-th record in a btree block.
541  */
542 STATIC size_t
543 xfs_btree_rec_offset(
544 	struct xfs_btree_cur	*cur,
545 	int			n)
546 {
547 	return xfs_btree_block_len(cur) +
548 		(n - 1) * cur->bc_ops->rec_len;
549 }
550 
551 /*
552  * Calculate offset of the n-th key in a btree block.
553  */
554 STATIC size_t
555 xfs_btree_key_offset(
556 	struct xfs_btree_cur	*cur,
557 	int			n)
558 {
559 	return xfs_btree_block_len(cur) +
560 		(n - 1) * cur->bc_ops->key_len;
561 }
562 
563 /*
564  * Calculate offset of the n-th high key in a btree block.
565  */
566 STATIC size_t
567 xfs_btree_high_key_offset(
568 	struct xfs_btree_cur	*cur,
569 	int			n)
570 {
571 	return xfs_btree_block_len(cur) +
572 		(n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
573 }
574 
575 /*
576  * Calculate offset of the n-th block pointer in a btree block.
577  */
578 STATIC size_t
579 xfs_btree_ptr_offset(
580 	struct xfs_btree_cur	*cur,
581 	int			n,
582 	int			level)
583 {
584 	return xfs_btree_block_len(cur) +
585 		cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
586 		(n - 1) * xfs_btree_ptr_len(cur);
587 }
588 
589 /*
590  * Return a pointer to the n-th record in the btree block.
591  */
592 union xfs_btree_rec *
593 xfs_btree_rec_addr(
594 	struct xfs_btree_cur	*cur,
595 	int			n,
596 	struct xfs_btree_block	*block)
597 {
598 	return (union xfs_btree_rec *)
599 		((char *)block + xfs_btree_rec_offset(cur, n));
600 }
601 
602 /*
603  * Return a pointer to the n-th key in the btree block.
604  */
605 union xfs_btree_key *
606 xfs_btree_key_addr(
607 	struct xfs_btree_cur	*cur,
608 	int			n,
609 	struct xfs_btree_block	*block)
610 {
611 	return (union xfs_btree_key *)
612 		((char *)block + xfs_btree_key_offset(cur, n));
613 }
614 
615 /*
616  * Return a pointer to the n-th high key in the btree block.
617  */
618 union xfs_btree_key *
619 xfs_btree_high_key_addr(
620 	struct xfs_btree_cur	*cur,
621 	int			n,
622 	struct xfs_btree_block	*block)
623 {
624 	return (union xfs_btree_key *)
625 		((char *)block + xfs_btree_high_key_offset(cur, n));
626 }
627 
628 /*
629  * Return a pointer to the n-th block pointer in the btree block.
630  */
631 union xfs_btree_ptr *
632 xfs_btree_ptr_addr(
633 	struct xfs_btree_cur	*cur,
634 	int			n,
635 	struct xfs_btree_block	*block)
636 {
637 	int			level = xfs_btree_get_level(block);
638 
639 	ASSERT(block->bb_level != 0);
640 
641 	return (union xfs_btree_ptr *)
642 		((char *)block + xfs_btree_ptr_offset(cur, n, level));
643 }
644 
645 /*
646  * Get the root block which is stored in the inode.
647  *
648  * For now this btree implementation assumes the btree root is always
649  * stored in the if_broot field of an inode fork.
650  */
651 STATIC struct xfs_btree_block *
652 xfs_btree_get_iroot(
653 	struct xfs_btree_cur	*cur)
654 {
655 	struct xfs_ifork	*ifp;
656 
657 	ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
658 	return (struct xfs_btree_block *)ifp->if_broot;
659 }
660 
661 /*
662  * Retrieve the block pointer from the cursor at the given level.
663  * This may be an inode btree root or from a buffer.
664  */
665 struct xfs_btree_block *		/* generic btree block pointer */
666 xfs_btree_get_block(
667 	struct xfs_btree_cur	*cur,	/* btree cursor */
668 	int			level,	/* level in btree */
669 	struct xfs_buf		**bpp)	/* buffer containing the block */
670 {
671 	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
672 	    (level == cur->bc_nlevels - 1)) {
673 		*bpp = NULL;
674 		return xfs_btree_get_iroot(cur);
675 	}
676 
677 	*bpp = cur->bc_bufs[level];
678 	return XFS_BUF_TO_BLOCK(*bpp);
679 }
680 
681 /*
682  * Get a buffer for the block, return it with no data read.
683  * Long-form addressing.
684  */
685 xfs_buf_t *				/* buffer for fsbno */
686 xfs_btree_get_bufl(
687 	xfs_mount_t	*mp,		/* file system mount point */
688 	xfs_trans_t	*tp,		/* transaction pointer */
689 	xfs_fsblock_t	fsbno)		/* file system block number */
690 {
691 	xfs_daddr_t		d;		/* real disk block address */
692 
693 	ASSERT(fsbno != NULLFSBLOCK);
694 	d = XFS_FSB_TO_DADDR(mp, fsbno);
695 	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
696 }
697 
698 /*
699  * Get a buffer for the block, return it with no data read.
700  * Short-form addressing.
701  */
702 xfs_buf_t *				/* buffer for agno/agbno */
703 xfs_btree_get_bufs(
704 	xfs_mount_t	*mp,		/* file system mount point */
705 	xfs_trans_t	*tp,		/* transaction pointer */
706 	xfs_agnumber_t	agno,		/* allocation group number */
707 	xfs_agblock_t	agbno)		/* allocation group block number */
708 {
709 	xfs_daddr_t		d;		/* real disk block address */
710 
711 	ASSERT(agno != NULLAGNUMBER);
712 	ASSERT(agbno != NULLAGBLOCK);
713 	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
714 	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
715 }
716 
717 /*
718  * Change the cursor to point to the first record at the given level.
719  * Other levels are unaffected.
720  */
721 STATIC int				/* success=1, failure=0 */
722 xfs_btree_firstrec(
723 	xfs_btree_cur_t		*cur,	/* btree cursor */
724 	int			level)	/* level to change */
725 {
726 	struct xfs_btree_block	*block;	/* generic btree block pointer */
727 	xfs_buf_t		*bp;	/* buffer containing block */
728 
729 	/*
730 	 * Get the block pointer for this level.
731 	 */
732 	block = xfs_btree_get_block(cur, level, &bp);
733 	if (xfs_btree_check_block(cur, block, level, bp))
734 		return 0;
735 	/*
736 	 * It's empty, there is no such record.
737 	 */
738 	if (!block->bb_numrecs)
739 		return 0;
740 	/*
741 	 * Set the ptr value to 1, that's the first record/key.
742 	 */
743 	cur->bc_ptrs[level] = 1;
744 	return 1;
745 }
746 
747 /*
748  * Change the cursor to point to the last record in the current block
749  * at the given level.  Other levels are unaffected.
750  */
751 STATIC int				/* success=1, failure=0 */
752 xfs_btree_lastrec(
753 	xfs_btree_cur_t		*cur,	/* btree cursor */
754 	int			level)	/* level to change */
755 {
756 	struct xfs_btree_block	*block;	/* generic btree block pointer */
757 	xfs_buf_t		*bp;	/* buffer containing block */
758 
759 	/*
760 	 * Get the block pointer for this level.
761 	 */
762 	block = xfs_btree_get_block(cur, level, &bp);
763 	if (xfs_btree_check_block(cur, block, level, bp))
764 		return 0;
765 	/*
766 	 * It's empty, there is no such record.
767 	 */
768 	if (!block->bb_numrecs)
769 		return 0;
770 	/*
771 	 * Set the ptr value to numrecs, that's the last record/key.
772 	 */
773 	cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
774 	return 1;
775 }
776 
777 /*
778  * Compute first and last byte offsets for the fields given.
779  * Interprets the offsets table, which contains struct field offsets.
780  */
781 void
782 xfs_btree_offsets(
783 	int64_t		fields,		/* bitmask of fields */
784 	const short	*offsets,	/* table of field offsets */
785 	int		nbits,		/* number of bits to inspect */
786 	int		*first,		/* output: first byte offset */
787 	int		*last)		/* output: last byte offset */
788 {
789 	int		i;		/* current bit number */
790 	int64_t		imask;		/* mask for current bit number */
791 
792 	ASSERT(fields != 0);
793 	/*
794 	 * Find the lowest bit, so the first byte offset.
795 	 */
796 	for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
797 		if (imask & fields) {
798 			*first = offsets[i];
799 			break;
800 		}
801 	}
802 	/*
803 	 * Find the highest bit, so the last byte offset.
804 	 */
805 	for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
806 		if (imask & fields) {
807 			*last = offsets[i + 1] - 1;
808 			break;
809 		}
810 	}
811 }
812 
813 /*
814  * Get a buffer for the block, return it read in.
815  * Long-form addressing.
816  */
817 int
818 xfs_btree_read_bufl(
819 	struct xfs_mount	*mp,		/* file system mount point */
820 	struct xfs_trans	*tp,		/* transaction pointer */
821 	xfs_fsblock_t		fsbno,		/* file system block number */
822 	struct xfs_buf		**bpp,		/* buffer for fsbno */
823 	int			refval,		/* ref count value for buffer */
824 	const struct xfs_buf_ops *ops)
825 {
826 	struct xfs_buf		*bp;		/* return value */
827 	xfs_daddr_t		d;		/* real disk block address */
828 	int			error;
829 
830 	if (!xfs_verify_fsbno(mp, fsbno))
831 		return -EFSCORRUPTED;
832 	d = XFS_FSB_TO_DADDR(mp, fsbno);
833 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
834 				   mp->m_bsize, 0, &bp, ops);
835 	if (error)
836 		return error;
837 	if (bp)
838 		xfs_buf_set_ref(bp, refval);
839 	*bpp = bp;
840 	return 0;
841 }
842 
843 /*
844  * Read-ahead the block, don't wait for it, don't return a buffer.
845  * Long-form addressing.
846  */
847 /* ARGSUSED */
848 void
849 xfs_btree_reada_bufl(
850 	struct xfs_mount	*mp,		/* file system mount point */
851 	xfs_fsblock_t		fsbno,		/* file system block number */
852 	xfs_extlen_t		count,		/* count of filesystem blocks */
853 	const struct xfs_buf_ops *ops)
854 {
855 	xfs_daddr_t		d;
856 
857 	ASSERT(fsbno != NULLFSBLOCK);
858 	d = XFS_FSB_TO_DADDR(mp, fsbno);
859 	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
860 }
861 
862 /*
863  * Read-ahead the block, don't wait for it, don't return a buffer.
864  * Short-form addressing.
865  */
866 /* ARGSUSED */
867 void
868 xfs_btree_reada_bufs(
869 	struct xfs_mount	*mp,		/* file system mount point */
870 	xfs_agnumber_t		agno,		/* allocation group number */
871 	xfs_agblock_t		agbno,		/* allocation group block number */
872 	xfs_extlen_t		count,		/* count of filesystem blocks */
873 	const struct xfs_buf_ops *ops)
874 {
875 	xfs_daddr_t		d;
876 
877 	ASSERT(agno != NULLAGNUMBER);
878 	ASSERT(agbno != NULLAGBLOCK);
879 	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
880 	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
881 }
882 
883 STATIC int
884 xfs_btree_readahead_lblock(
885 	struct xfs_btree_cur	*cur,
886 	int			lr,
887 	struct xfs_btree_block	*block)
888 {
889 	int			rval = 0;
890 	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
891 	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
892 
893 	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
894 		xfs_btree_reada_bufl(cur->bc_mp, left, 1,
895 				     cur->bc_ops->buf_ops);
896 		rval++;
897 	}
898 
899 	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
900 		xfs_btree_reada_bufl(cur->bc_mp, right, 1,
901 				     cur->bc_ops->buf_ops);
902 		rval++;
903 	}
904 
905 	return rval;
906 }
907 
908 STATIC int
909 xfs_btree_readahead_sblock(
910 	struct xfs_btree_cur	*cur,
911 	int			lr,
912 	struct xfs_btree_block *block)
913 {
914 	int			rval = 0;
915 	xfs_agblock_t		left = be32_to_cpu(block->bb_u.s.bb_leftsib);
916 	xfs_agblock_t		right = be32_to_cpu(block->bb_u.s.bb_rightsib);
917 
918 
919 	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
920 		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
921 				     left, 1, cur->bc_ops->buf_ops);
922 		rval++;
923 	}
924 
925 	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
926 		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
927 				     right, 1, cur->bc_ops->buf_ops);
928 		rval++;
929 	}
930 
931 	return rval;
932 }
933 
934 /*
935  * Read-ahead btree blocks, at the given level.
936  * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
937  */
938 STATIC int
939 xfs_btree_readahead(
940 	struct xfs_btree_cur	*cur,		/* btree cursor */
941 	int			lev,		/* level in btree */
942 	int			lr)		/* left/right bits */
943 {
944 	struct xfs_btree_block	*block;
945 
946 	/*
947 	 * No readahead needed if we are at the root level and the
948 	 * btree root is stored in the inode.
949 	 */
950 	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
951 	    (lev == cur->bc_nlevels - 1))
952 		return 0;
953 
954 	if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
955 		return 0;
956 
957 	cur->bc_ra[lev] |= lr;
958 	block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
959 
960 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
961 		return xfs_btree_readahead_lblock(cur, lr, block);
962 	return xfs_btree_readahead_sblock(cur, lr, block);
963 }
964 
965 STATIC int
966 xfs_btree_ptr_to_daddr(
967 	struct xfs_btree_cur	*cur,
968 	union xfs_btree_ptr	*ptr,
969 	xfs_daddr_t		*daddr)
970 {
971 	xfs_fsblock_t		fsbno;
972 	xfs_agblock_t		agbno;
973 	int			error;
974 
975 	error = xfs_btree_check_ptr(cur, ptr, 0, 1);
976 	if (error)
977 		return error;
978 
979 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
980 		fsbno = be64_to_cpu(ptr->l);
981 		*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
982 	} else {
983 		agbno = be32_to_cpu(ptr->s);
984 		*daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
985 				agbno);
986 	}
987 
988 	return 0;
989 }
990 
991 /*
992  * Readahead @count btree blocks at the given @ptr location.
993  *
994  * We don't need to care about long or short form btrees here as we have a
995  * method of converting the ptr directly to a daddr available to us.
996  */
997 STATIC void
998 xfs_btree_readahead_ptr(
999 	struct xfs_btree_cur	*cur,
1000 	union xfs_btree_ptr	*ptr,
1001 	xfs_extlen_t		count)
1002 {
1003 	xfs_daddr_t		daddr;
1004 
1005 	if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
1006 		return;
1007 	xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
1008 			  cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
1009 }
1010 
1011 /*
1012  * Set the buffer for level "lev" in the cursor to bp, releasing
1013  * any previous buffer.
1014  */
1015 STATIC void
1016 xfs_btree_setbuf(
1017 	xfs_btree_cur_t		*cur,	/* btree cursor */
1018 	int			lev,	/* level in btree */
1019 	xfs_buf_t		*bp)	/* new buffer to set */
1020 {
1021 	struct xfs_btree_block	*b;	/* btree block */
1022 
1023 	if (cur->bc_bufs[lev])
1024 		xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
1025 	cur->bc_bufs[lev] = bp;
1026 	cur->bc_ra[lev] = 0;
1027 
1028 	b = XFS_BUF_TO_BLOCK(bp);
1029 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1030 		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
1031 			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1032 		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
1033 			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1034 	} else {
1035 		if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
1036 			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1037 		if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
1038 			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1039 	}
1040 }
1041 
1042 bool
1043 xfs_btree_ptr_is_null(
1044 	struct xfs_btree_cur	*cur,
1045 	union xfs_btree_ptr	*ptr)
1046 {
1047 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1048 		return ptr->l == cpu_to_be64(NULLFSBLOCK);
1049 	else
1050 		return ptr->s == cpu_to_be32(NULLAGBLOCK);
1051 }
1052 
1053 STATIC void
1054 xfs_btree_set_ptr_null(
1055 	struct xfs_btree_cur	*cur,
1056 	union xfs_btree_ptr	*ptr)
1057 {
1058 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1059 		ptr->l = cpu_to_be64(NULLFSBLOCK);
1060 	else
1061 		ptr->s = cpu_to_be32(NULLAGBLOCK);
1062 }
1063 
1064 /*
1065  * Get/set/init sibling pointers
1066  */
1067 void
1068 xfs_btree_get_sibling(
1069 	struct xfs_btree_cur	*cur,
1070 	struct xfs_btree_block	*block,
1071 	union xfs_btree_ptr	*ptr,
1072 	int			lr)
1073 {
1074 	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1075 
1076 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1077 		if (lr == XFS_BB_RIGHTSIB)
1078 			ptr->l = block->bb_u.l.bb_rightsib;
1079 		else
1080 			ptr->l = block->bb_u.l.bb_leftsib;
1081 	} else {
1082 		if (lr == XFS_BB_RIGHTSIB)
1083 			ptr->s = block->bb_u.s.bb_rightsib;
1084 		else
1085 			ptr->s = block->bb_u.s.bb_leftsib;
1086 	}
1087 }
1088 
1089 STATIC void
1090 xfs_btree_set_sibling(
1091 	struct xfs_btree_cur	*cur,
1092 	struct xfs_btree_block	*block,
1093 	union xfs_btree_ptr	*ptr,
1094 	int			lr)
1095 {
1096 	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1097 
1098 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1099 		if (lr == XFS_BB_RIGHTSIB)
1100 			block->bb_u.l.bb_rightsib = ptr->l;
1101 		else
1102 			block->bb_u.l.bb_leftsib = ptr->l;
1103 	} else {
1104 		if (lr == XFS_BB_RIGHTSIB)
1105 			block->bb_u.s.bb_rightsib = ptr->s;
1106 		else
1107 			block->bb_u.s.bb_leftsib = ptr->s;
1108 	}
1109 }
1110 
1111 void
1112 xfs_btree_init_block_int(
1113 	struct xfs_mount	*mp,
1114 	struct xfs_btree_block	*buf,
1115 	xfs_daddr_t		blkno,
1116 	xfs_btnum_t		btnum,
1117 	__u16			level,
1118 	__u16			numrecs,
1119 	__u64			owner,
1120 	unsigned int		flags)
1121 {
1122 	int			crc = xfs_sb_version_hascrc(&mp->m_sb);
1123 	__u32			magic = xfs_btree_magic(crc, btnum);
1124 
1125 	buf->bb_magic = cpu_to_be32(magic);
1126 	buf->bb_level = cpu_to_be16(level);
1127 	buf->bb_numrecs = cpu_to_be16(numrecs);
1128 
1129 	if (flags & XFS_BTREE_LONG_PTRS) {
1130 		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1131 		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1132 		if (crc) {
1133 			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1134 			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1135 			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1136 			buf->bb_u.l.bb_pad = 0;
1137 			buf->bb_u.l.bb_lsn = 0;
1138 		}
1139 	} else {
1140 		/* owner is a 32 bit value on short blocks */
1141 		__u32 __owner = (__u32)owner;
1142 
1143 		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1144 		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1145 		if (crc) {
1146 			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1147 			buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1148 			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1149 			buf->bb_u.s.bb_lsn = 0;
1150 		}
1151 	}
1152 }
1153 
1154 void
1155 xfs_btree_init_block(
1156 	struct xfs_mount *mp,
1157 	struct xfs_buf	*bp,
1158 	xfs_btnum_t	btnum,
1159 	__u16		level,
1160 	__u16		numrecs,
1161 	__u64		owner)
1162 {
1163 	xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1164 				 btnum, level, numrecs, owner, 0);
1165 }
1166 
1167 STATIC void
1168 xfs_btree_init_block_cur(
1169 	struct xfs_btree_cur	*cur,
1170 	struct xfs_buf		*bp,
1171 	int			level,
1172 	int			numrecs)
1173 {
1174 	__u64			owner;
1175 
1176 	/*
1177 	 * we can pull the owner from the cursor right now as the different
1178 	 * owners align directly with the pointer size of the btree. This may
1179 	 * change in future, but is safe for current users of the generic btree
1180 	 * code.
1181 	 */
1182 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1183 		owner = cur->bc_private.b.ip->i_ino;
1184 	else
1185 		owner = cur->bc_private.a.agno;
1186 
1187 	xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1188 				 cur->bc_btnum, level, numrecs,
1189 				 owner, cur->bc_flags);
1190 }
1191 
1192 /*
1193  * Return true if ptr is the last record in the btree and
1194  * we need to track updates to this record.  The decision
1195  * will be further refined in the update_lastrec method.
1196  */
1197 STATIC int
1198 xfs_btree_is_lastrec(
1199 	struct xfs_btree_cur	*cur,
1200 	struct xfs_btree_block	*block,
1201 	int			level)
1202 {
1203 	union xfs_btree_ptr	ptr;
1204 
1205 	if (level > 0)
1206 		return 0;
1207 	if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1208 		return 0;
1209 
1210 	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1211 	if (!xfs_btree_ptr_is_null(cur, &ptr))
1212 		return 0;
1213 	return 1;
1214 }
1215 
1216 STATIC void
1217 xfs_btree_buf_to_ptr(
1218 	struct xfs_btree_cur	*cur,
1219 	struct xfs_buf		*bp,
1220 	union xfs_btree_ptr	*ptr)
1221 {
1222 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1223 		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1224 					XFS_BUF_ADDR(bp)));
1225 	else {
1226 		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1227 					XFS_BUF_ADDR(bp)));
1228 	}
1229 }
1230 
1231 STATIC void
1232 xfs_btree_set_refs(
1233 	struct xfs_btree_cur	*cur,
1234 	struct xfs_buf		*bp)
1235 {
1236 	switch (cur->bc_btnum) {
1237 	case XFS_BTNUM_BNO:
1238 	case XFS_BTNUM_CNT:
1239 		xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1240 		break;
1241 	case XFS_BTNUM_INO:
1242 	case XFS_BTNUM_FINO:
1243 		xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1244 		break;
1245 	case XFS_BTNUM_BMAP:
1246 		xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1247 		break;
1248 	case XFS_BTNUM_RMAP:
1249 		xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
1250 		break;
1251 	case XFS_BTNUM_REFC:
1252 		xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
1253 		break;
1254 	default:
1255 		ASSERT(0);
1256 	}
1257 }
1258 
1259 STATIC int
1260 xfs_btree_get_buf_block(
1261 	struct xfs_btree_cur	*cur,
1262 	union xfs_btree_ptr	*ptr,
1263 	struct xfs_btree_block	**block,
1264 	struct xfs_buf		**bpp)
1265 {
1266 	struct xfs_mount	*mp = cur->bc_mp;
1267 	xfs_daddr_t		d;
1268 	int			error;
1269 
1270 	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1271 	if (error)
1272 		return error;
1273 	*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1274 				 mp->m_bsize, 0);
1275 
1276 	if (!*bpp)
1277 		return -ENOMEM;
1278 
1279 	(*bpp)->b_ops = cur->bc_ops->buf_ops;
1280 	*block = XFS_BUF_TO_BLOCK(*bpp);
1281 	return 0;
1282 }
1283 
1284 /*
1285  * Read in the buffer at the given ptr and return the buffer and
1286  * the block pointer within the buffer.
1287  */
1288 STATIC int
1289 xfs_btree_read_buf_block(
1290 	struct xfs_btree_cur	*cur,
1291 	union xfs_btree_ptr	*ptr,
1292 	int			flags,
1293 	struct xfs_btree_block	**block,
1294 	struct xfs_buf		**bpp)
1295 {
1296 	struct xfs_mount	*mp = cur->bc_mp;
1297 	xfs_daddr_t		d;
1298 	int			error;
1299 
1300 	/* need to sort out how callers deal with failures first */
1301 	ASSERT(!(flags & XBF_TRYLOCK));
1302 
1303 	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1304 	if (error)
1305 		return error;
1306 	error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1307 				   mp->m_bsize, flags, bpp,
1308 				   cur->bc_ops->buf_ops);
1309 	if (error)
1310 		return error;
1311 
1312 	xfs_btree_set_refs(cur, *bpp);
1313 	*block = XFS_BUF_TO_BLOCK(*bpp);
1314 	return 0;
1315 }
1316 
1317 /*
1318  * Copy keys from one btree block to another.
1319  */
1320 STATIC void
1321 xfs_btree_copy_keys(
1322 	struct xfs_btree_cur	*cur,
1323 	union xfs_btree_key	*dst_key,
1324 	union xfs_btree_key	*src_key,
1325 	int			numkeys)
1326 {
1327 	ASSERT(numkeys >= 0);
1328 	memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1329 }
1330 
1331 /*
1332  * Copy records from one btree block to another.
1333  */
1334 STATIC void
1335 xfs_btree_copy_recs(
1336 	struct xfs_btree_cur	*cur,
1337 	union xfs_btree_rec	*dst_rec,
1338 	union xfs_btree_rec	*src_rec,
1339 	int			numrecs)
1340 {
1341 	ASSERT(numrecs >= 0);
1342 	memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1343 }
1344 
1345 /*
1346  * Copy block pointers from one btree block to another.
1347  */
1348 STATIC void
1349 xfs_btree_copy_ptrs(
1350 	struct xfs_btree_cur	*cur,
1351 	union xfs_btree_ptr	*dst_ptr,
1352 	union xfs_btree_ptr	*src_ptr,
1353 	int			numptrs)
1354 {
1355 	ASSERT(numptrs >= 0);
1356 	memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1357 }
1358 
1359 /*
1360  * Shift keys one index left/right inside a single btree block.
1361  */
1362 STATIC void
1363 xfs_btree_shift_keys(
1364 	struct xfs_btree_cur	*cur,
1365 	union xfs_btree_key	*key,
1366 	int			dir,
1367 	int			numkeys)
1368 {
1369 	char			*dst_key;
1370 
1371 	ASSERT(numkeys >= 0);
1372 	ASSERT(dir == 1 || dir == -1);
1373 
1374 	dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1375 	memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1376 }
1377 
1378 /*
1379  * Shift records one index left/right inside a single btree block.
1380  */
1381 STATIC void
1382 xfs_btree_shift_recs(
1383 	struct xfs_btree_cur	*cur,
1384 	union xfs_btree_rec	*rec,
1385 	int			dir,
1386 	int			numrecs)
1387 {
1388 	char			*dst_rec;
1389 
1390 	ASSERT(numrecs >= 0);
1391 	ASSERT(dir == 1 || dir == -1);
1392 
1393 	dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1394 	memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1395 }
1396 
1397 /*
1398  * Shift block pointers one index left/right inside a single btree block.
1399  */
1400 STATIC void
1401 xfs_btree_shift_ptrs(
1402 	struct xfs_btree_cur	*cur,
1403 	union xfs_btree_ptr	*ptr,
1404 	int			dir,
1405 	int			numptrs)
1406 {
1407 	char			*dst_ptr;
1408 
1409 	ASSERT(numptrs >= 0);
1410 	ASSERT(dir == 1 || dir == -1);
1411 
1412 	dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1413 	memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1414 }
1415 
1416 /*
1417  * Log key values from the btree block.
1418  */
1419 STATIC void
1420 xfs_btree_log_keys(
1421 	struct xfs_btree_cur	*cur,
1422 	struct xfs_buf		*bp,
1423 	int			first,
1424 	int			last)
1425 {
1426 
1427 	if (bp) {
1428 		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1429 		xfs_trans_log_buf(cur->bc_tp, bp,
1430 				  xfs_btree_key_offset(cur, first),
1431 				  xfs_btree_key_offset(cur, last + 1) - 1);
1432 	} else {
1433 		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1434 				xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1435 	}
1436 }
1437 
1438 /*
1439  * Log record values from the btree block.
1440  */
1441 void
1442 xfs_btree_log_recs(
1443 	struct xfs_btree_cur	*cur,
1444 	struct xfs_buf		*bp,
1445 	int			first,
1446 	int			last)
1447 {
1448 
1449 	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1450 	xfs_trans_log_buf(cur->bc_tp, bp,
1451 			  xfs_btree_rec_offset(cur, first),
1452 			  xfs_btree_rec_offset(cur, last + 1) - 1);
1453 
1454 }
1455 
1456 /*
1457  * Log block pointer fields from a btree block (nonleaf).
1458  */
1459 STATIC void
1460 xfs_btree_log_ptrs(
1461 	struct xfs_btree_cur	*cur,	/* btree cursor */
1462 	struct xfs_buf		*bp,	/* buffer containing btree block */
1463 	int			first,	/* index of first pointer to log */
1464 	int			last)	/* index of last pointer to log */
1465 {
1466 
1467 	if (bp) {
1468 		struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
1469 		int			level = xfs_btree_get_level(block);
1470 
1471 		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1472 		xfs_trans_log_buf(cur->bc_tp, bp,
1473 				xfs_btree_ptr_offset(cur, first, level),
1474 				xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1475 	} else {
1476 		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1477 			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1478 	}
1479 
1480 }
1481 
1482 /*
1483  * Log fields from a btree block header.
1484  */
1485 void
1486 xfs_btree_log_block(
1487 	struct xfs_btree_cur	*cur,	/* btree cursor */
1488 	struct xfs_buf		*bp,	/* buffer containing btree block */
1489 	int			fields)	/* mask of fields: XFS_BB_... */
1490 {
1491 	int			first;	/* first byte offset logged */
1492 	int			last;	/* last byte offset logged */
1493 	static const short	soffsets[] = {	/* table of offsets (short) */
1494 		offsetof(struct xfs_btree_block, bb_magic),
1495 		offsetof(struct xfs_btree_block, bb_level),
1496 		offsetof(struct xfs_btree_block, bb_numrecs),
1497 		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1498 		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1499 		offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1500 		offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1501 		offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1502 		offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1503 		offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1504 		XFS_BTREE_SBLOCK_CRC_LEN
1505 	};
1506 	static const short	loffsets[] = {	/* table of offsets (long) */
1507 		offsetof(struct xfs_btree_block, bb_magic),
1508 		offsetof(struct xfs_btree_block, bb_level),
1509 		offsetof(struct xfs_btree_block, bb_numrecs),
1510 		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1511 		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1512 		offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1513 		offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1514 		offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1515 		offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1516 		offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1517 		offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1518 		XFS_BTREE_LBLOCK_CRC_LEN
1519 	};
1520 
1521 	if (bp) {
1522 		int nbits;
1523 
1524 		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1525 			/*
1526 			 * We don't log the CRC when updating a btree
1527 			 * block but instead recreate it during log
1528 			 * recovery.  As the log buffers have checksums
1529 			 * of their own this is safe and avoids logging a crc
1530 			 * update in a lot of places.
1531 			 */
1532 			if (fields == XFS_BB_ALL_BITS)
1533 				fields = XFS_BB_ALL_BITS_CRC;
1534 			nbits = XFS_BB_NUM_BITS_CRC;
1535 		} else {
1536 			nbits = XFS_BB_NUM_BITS;
1537 		}
1538 		xfs_btree_offsets(fields,
1539 				  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1540 					loffsets : soffsets,
1541 				  nbits, &first, &last);
1542 		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1543 		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1544 	} else {
1545 		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1546 			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1547 	}
1548 }
1549 
1550 /*
1551  * Increment cursor by one record at the level.
1552  * For nonzero levels the leaf-ward information is untouched.
1553  */
1554 int						/* error */
1555 xfs_btree_increment(
1556 	struct xfs_btree_cur	*cur,
1557 	int			level,
1558 	int			*stat)		/* success/failure */
1559 {
1560 	struct xfs_btree_block	*block;
1561 	union xfs_btree_ptr	ptr;
1562 	struct xfs_buf		*bp;
1563 	int			error;		/* error return value */
1564 	int			lev;
1565 
1566 	ASSERT(level < cur->bc_nlevels);
1567 
1568 	/* Read-ahead to the right at this level. */
1569 	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1570 
1571 	/* Get a pointer to the btree block. */
1572 	block = xfs_btree_get_block(cur, level, &bp);
1573 
1574 #ifdef DEBUG
1575 	error = xfs_btree_check_block(cur, block, level, bp);
1576 	if (error)
1577 		goto error0;
1578 #endif
1579 
1580 	/* We're done if we remain in the block after the increment. */
1581 	if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1582 		goto out1;
1583 
1584 	/* Fail if we just went off the right edge of the tree. */
1585 	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1586 	if (xfs_btree_ptr_is_null(cur, &ptr))
1587 		goto out0;
1588 
1589 	XFS_BTREE_STATS_INC(cur, increment);
1590 
1591 	/*
1592 	 * March up the tree incrementing pointers.
1593 	 * Stop when we don't go off the right edge of a block.
1594 	 */
1595 	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1596 		block = xfs_btree_get_block(cur, lev, &bp);
1597 
1598 #ifdef DEBUG
1599 		error = xfs_btree_check_block(cur, block, lev, bp);
1600 		if (error)
1601 			goto error0;
1602 #endif
1603 
1604 		if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1605 			break;
1606 
1607 		/* Read-ahead the right block for the next loop. */
1608 		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1609 	}
1610 
1611 	/*
1612 	 * If we went off the root then we are either seriously
1613 	 * confused or have the tree root in an inode.
1614 	 */
1615 	if (lev == cur->bc_nlevels) {
1616 		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1617 			goto out0;
1618 		ASSERT(0);
1619 		error = -EFSCORRUPTED;
1620 		goto error0;
1621 	}
1622 	ASSERT(lev < cur->bc_nlevels);
1623 
1624 	/*
1625 	 * Now walk back down the tree, fixing up the cursor's buffer
1626 	 * pointers and key numbers.
1627 	 */
1628 	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1629 		union xfs_btree_ptr	*ptrp;
1630 
1631 		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1632 		--lev;
1633 		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1634 		if (error)
1635 			goto error0;
1636 
1637 		xfs_btree_setbuf(cur, lev, bp);
1638 		cur->bc_ptrs[lev] = 1;
1639 	}
1640 out1:
1641 	*stat = 1;
1642 	return 0;
1643 
1644 out0:
1645 	*stat = 0;
1646 	return 0;
1647 
1648 error0:
1649 	return error;
1650 }
1651 
1652 /*
1653  * Decrement cursor by one record at the level.
1654  * For nonzero levels the leaf-ward information is untouched.
1655  */
1656 int						/* error */
1657 xfs_btree_decrement(
1658 	struct xfs_btree_cur	*cur,
1659 	int			level,
1660 	int			*stat)		/* success/failure */
1661 {
1662 	struct xfs_btree_block	*block;
1663 	xfs_buf_t		*bp;
1664 	int			error;		/* error return value */
1665 	int			lev;
1666 	union xfs_btree_ptr	ptr;
1667 
1668 	ASSERT(level < cur->bc_nlevels);
1669 
1670 	/* Read-ahead to the left at this level. */
1671 	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1672 
1673 	/* We're done if we remain in the block after the decrement. */
1674 	if (--cur->bc_ptrs[level] > 0)
1675 		goto out1;
1676 
1677 	/* Get a pointer to the btree block. */
1678 	block = xfs_btree_get_block(cur, level, &bp);
1679 
1680 #ifdef DEBUG
1681 	error = xfs_btree_check_block(cur, block, level, bp);
1682 	if (error)
1683 		goto error0;
1684 #endif
1685 
1686 	/* Fail if we just went off the left edge of the tree. */
1687 	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1688 	if (xfs_btree_ptr_is_null(cur, &ptr))
1689 		goto out0;
1690 
1691 	XFS_BTREE_STATS_INC(cur, decrement);
1692 
1693 	/*
1694 	 * March up the tree decrementing pointers.
1695 	 * Stop when we don't go off the left edge of a block.
1696 	 */
1697 	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1698 		if (--cur->bc_ptrs[lev] > 0)
1699 			break;
1700 		/* Read-ahead the left block for the next loop. */
1701 		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1702 	}
1703 
1704 	/*
1705 	 * If we went off the root then we are seriously confused.
1706 	 * or the root of the tree is in an inode.
1707 	 */
1708 	if (lev == cur->bc_nlevels) {
1709 		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1710 			goto out0;
1711 		ASSERT(0);
1712 		error = -EFSCORRUPTED;
1713 		goto error0;
1714 	}
1715 	ASSERT(lev < cur->bc_nlevels);
1716 
1717 	/*
1718 	 * Now walk back down the tree, fixing up the cursor's buffer
1719 	 * pointers and key numbers.
1720 	 */
1721 	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1722 		union xfs_btree_ptr	*ptrp;
1723 
1724 		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1725 		--lev;
1726 		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1727 		if (error)
1728 			goto error0;
1729 		xfs_btree_setbuf(cur, lev, bp);
1730 		cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1731 	}
1732 out1:
1733 	*stat = 1;
1734 	return 0;
1735 
1736 out0:
1737 	*stat = 0;
1738 	return 0;
1739 
1740 error0:
1741 	return error;
1742 }
1743 
1744 int
1745 xfs_btree_lookup_get_block(
1746 	struct xfs_btree_cur	*cur,	/* btree cursor */
1747 	int			level,	/* level in the btree */
1748 	union xfs_btree_ptr	*pp,	/* ptr to btree block */
1749 	struct xfs_btree_block	**blkp) /* return btree block */
1750 {
1751 	struct xfs_buf		*bp;	/* buffer pointer for btree block */
1752 	xfs_daddr_t		daddr;
1753 	int			error = 0;
1754 
1755 	/* special case the root block if in an inode */
1756 	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1757 	    (level == cur->bc_nlevels - 1)) {
1758 		*blkp = xfs_btree_get_iroot(cur);
1759 		return 0;
1760 	}
1761 
1762 	/*
1763 	 * If the old buffer at this level for the disk address we are
1764 	 * looking for re-use it.
1765 	 *
1766 	 * Otherwise throw it away and get a new one.
1767 	 */
1768 	bp = cur->bc_bufs[level];
1769 	error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
1770 	if (error)
1771 		return error;
1772 	if (bp && XFS_BUF_ADDR(bp) == daddr) {
1773 		*blkp = XFS_BUF_TO_BLOCK(bp);
1774 		return 0;
1775 	}
1776 
1777 	error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1778 	if (error)
1779 		return error;
1780 
1781 	/* Check the inode owner since the verifiers don't. */
1782 	if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
1783 	    !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
1784 	    (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
1785 	    be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
1786 			cur->bc_private.b.ip->i_ino)
1787 		goto out_bad;
1788 
1789 	/* Did we get the level we were looking for? */
1790 	if (be16_to_cpu((*blkp)->bb_level) != level)
1791 		goto out_bad;
1792 
1793 	/* Check that internal nodes have at least one record. */
1794 	if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
1795 		goto out_bad;
1796 
1797 	xfs_btree_setbuf(cur, level, bp);
1798 	return 0;
1799 
1800 out_bad:
1801 	*blkp = NULL;
1802 	xfs_buf_corruption_error(bp);
1803 	xfs_trans_brelse(cur->bc_tp, bp);
1804 	return -EFSCORRUPTED;
1805 }
1806 
1807 /*
1808  * Get current search key.  For level 0 we don't actually have a key
1809  * structure so we make one up from the record.  For all other levels
1810  * we just return the right key.
1811  */
1812 STATIC union xfs_btree_key *
1813 xfs_lookup_get_search_key(
1814 	struct xfs_btree_cur	*cur,
1815 	int			level,
1816 	int			keyno,
1817 	struct xfs_btree_block	*block,
1818 	union xfs_btree_key	*kp)
1819 {
1820 	if (level == 0) {
1821 		cur->bc_ops->init_key_from_rec(kp,
1822 				xfs_btree_rec_addr(cur, keyno, block));
1823 		return kp;
1824 	}
1825 
1826 	return xfs_btree_key_addr(cur, keyno, block);
1827 }
1828 
1829 /*
1830  * Lookup the record.  The cursor is made to point to it, based on dir.
1831  * stat is set to 0 if can't find any such record, 1 for success.
1832  */
1833 int					/* error */
1834 xfs_btree_lookup(
1835 	struct xfs_btree_cur	*cur,	/* btree cursor */
1836 	xfs_lookup_t		dir,	/* <=, ==, or >= */
1837 	int			*stat)	/* success/failure */
1838 {
1839 	struct xfs_btree_block	*block;	/* current btree block */
1840 	int64_t			diff;	/* difference for the current key */
1841 	int			error;	/* error return value */
1842 	int			keyno;	/* current key number */
1843 	int			level;	/* level in the btree */
1844 	union xfs_btree_ptr	*pp;	/* ptr to btree block */
1845 	union xfs_btree_ptr	ptr;	/* ptr to btree block */
1846 
1847 	XFS_BTREE_STATS_INC(cur, lookup);
1848 
1849 	/* No such thing as a zero-level tree. */
1850 	if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0))
1851 		return -EFSCORRUPTED;
1852 
1853 	block = NULL;
1854 	keyno = 0;
1855 
1856 	/* initialise start pointer from cursor */
1857 	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1858 	pp = &ptr;
1859 
1860 	/*
1861 	 * Iterate over each level in the btree, starting at the root.
1862 	 * For each level above the leaves, find the key we need, based
1863 	 * on the lookup record, then follow the corresponding block
1864 	 * pointer down to the next level.
1865 	 */
1866 	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1867 		/* Get the block we need to do the lookup on. */
1868 		error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1869 		if (error)
1870 			goto error0;
1871 
1872 		if (diff == 0) {
1873 			/*
1874 			 * If we already had a key match at a higher level, we
1875 			 * know we need to use the first entry in this block.
1876 			 */
1877 			keyno = 1;
1878 		} else {
1879 			/* Otherwise search this block. Do a binary search. */
1880 
1881 			int	high;	/* high entry number */
1882 			int	low;	/* low entry number */
1883 
1884 			/* Set low and high entry numbers, 1-based. */
1885 			low = 1;
1886 			high = xfs_btree_get_numrecs(block);
1887 			if (!high) {
1888 				/* Block is empty, must be an empty leaf. */
1889 				if (level != 0 || cur->bc_nlevels != 1) {
1890 					XFS_CORRUPTION_ERROR(__func__,
1891 							XFS_ERRLEVEL_LOW,
1892 							cur->bc_mp, block,
1893 							sizeof(*block));
1894 					return -EFSCORRUPTED;
1895 				}
1896 
1897 				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1898 				*stat = 0;
1899 				return 0;
1900 			}
1901 
1902 			/* Binary search the block. */
1903 			while (low <= high) {
1904 				union xfs_btree_key	key;
1905 				union xfs_btree_key	*kp;
1906 
1907 				XFS_BTREE_STATS_INC(cur, compare);
1908 
1909 				/* keyno is average of low and high. */
1910 				keyno = (low + high) >> 1;
1911 
1912 				/* Get current search key */
1913 				kp = xfs_lookup_get_search_key(cur, level,
1914 						keyno, block, &key);
1915 
1916 				/*
1917 				 * Compute difference to get next direction:
1918 				 *  - less than, move right
1919 				 *  - greater than, move left
1920 				 *  - equal, we're done
1921 				 */
1922 				diff = cur->bc_ops->key_diff(cur, kp);
1923 				if (diff < 0)
1924 					low = keyno + 1;
1925 				else if (diff > 0)
1926 					high = keyno - 1;
1927 				else
1928 					break;
1929 			}
1930 		}
1931 
1932 		/*
1933 		 * If there are more levels, set up for the next level
1934 		 * by getting the block number and filling in the cursor.
1935 		 */
1936 		if (level > 0) {
1937 			/*
1938 			 * If we moved left, need the previous key number,
1939 			 * unless there isn't one.
1940 			 */
1941 			if (diff > 0 && --keyno < 1)
1942 				keyno = 1;
1943 			pp = xfs_btree_ptr_addr(cur, keyno, block);
1944 
1945 			error = xfs_btree_debug_check_ptr(cur, pp, 0, level);
1946 			if (error)
1947 				goto error0;
1948 
1949 			cur->bc_ptrs[level] = keyno;
1950 		}
1951 	}
1952 
1953 	/* Done with the search. See if we need to adjust the results. */
1954 	if (dir != XFS_LOOKUP_LE && diff < 0) {
1955 		keyno++;
1956 		/*
1957 		 * If ge search and we went off the end of the block, but it's
1958 		 * not the last block, we're in the wrong block.
1959 		 */
1960 		xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1961 		if (dir == XFS_LOOKUP_GE &&
1962 		    keyno > xfs_btree_get_numrecs(block) &&
1963 		    !xfs_btree_ptr_is_null(cur, &ptr)) {
1964 			int	i;
1965 
1966 			cur->bc_ptrs[0] = keyno;
1967 			error = xfs_btree_increment(cur, 0, &i);
1968 			if (error)
1969 				goto error0;
1970 			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
1971 				return -EFSCORRUPTED;
1972 			*stat = 1;
1973 			return 0;
1974 		}
1975 	} else if (dir == XFS_LOOKUP_LE && diff > 0)
1976 		keyno--;
1977 	cur->bc_ptrs[0] = keyno;
1978 
1979 	/* Return if we succeeded or not. */
1980 	if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1981 		*stat = 0;
1982 	else if (dir != XFS_LOOKUP_EQ || diff == 0)
1983 		*stat = 1;
1984 	else
1985 		*stat = 0;
1986 	return 0;
1987 
1988 error0:
1989 	return error;
1990 }
1991 
1992 /* Find the high key storage area from a regular key. */
1993 union xfs_btree_key *
1994 xfs_btree_high_key_from_key(
1995 	struct xfs_btree_cur	*cur,
1996 	union xfs_btree_key	*key)
1997 {
1998 	ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
1999 	return (union xfs_btree_key *)((char *)key +
2000 			(cur->bc_ops->key_len / 2));
2001 }
2002 
2003 /* Determine the low (and high if overlapped) keys of a leaf block */
2004 STATIC void
2005 xfs_btree_get_leaf_keys(
2006 	struct xfs_btree_cur	*cur,
2007 	struct xfs_btree_block	*block,
2008 	union xfs_btree_key	*key)
2009 {
2010 	union xfs_btree_key	max_hkey;
2011 	union xfs_btree_key	hkey;
2012 	union xfs_btree_rec	*rec;
2013 	union xfs_btree_key	*high;
2014 	int			n;
2015 
2016 	rec = xfs_btree_rec_addr(cur, 1, block);
2017 	cur->bc_ops->init_key_from_rec(key, rec);
2018 
2019 	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2020 
2021 		cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
2022 		for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2023 			rec = xfs_btree_rec_addr(cur, n, block);
2024 			cur->bc_ops->init_high_key_from_rec(&hkey, rec);
2025 			if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
2026 					> 0)
2027 				max_hkey = hkey;
2028 		}
2029 
2030 		high = xfs_btree_high_key_from_key(cur, key);
2031 		memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
2032 	}
2033 }
2034 
2035 /* Determine the low (and high if overlapped) keys of a node block */
2036 STATIC void
2037 xfs_btree_get_node_keys(
2038 	struct xfs_btree_cur	*cur,
2039 	struct xfs_btree_block	*block,
2040 	union xfs_btree_key	*key)
2041 {
2042 	union xfs_btree_key	*hkey;
2043 	union xfs_btree_key	*max_hkey;
2044 	union xfs_btree_key	*high;
2045 	int			n;
2046 
2047 	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2048 		memcpy(key, xfs_btree_key_addr(cur, 1, block),
2049 				cur->bc_ops->key_len / 2);
2050 
2051 		max_hkey = xfs_btree_high_key_addr(cur, 1, block);
2052 		for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2053 			hkey = xfs_btree_high_key_addr(cur, n, block);
2054 			if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
2055 				max_hkey = hkey;
2056 		}
2057 
2058 		high = xfs_btree_high_key_from_key(cur, key);
2059 		memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
2060 	} else {
2061 		memcpy(key, xfs_btree_key_addr(cur, 1, block),
2062 				cur->bc_ops->key_len);
2063 	}
2064 }
2065 
2066 /* Derive the keys for any btree block. */
2067 void
2068 xfs_btree_get_keys(
2069 	struct xfs_btree_cur	*cur,
2070 	struct xfs_btree_block	*block,
2071 	union xfs_btree_key	*key)
2072 {
2073 	if (be16_to_cpu(block->bb_level) == 0)
2074 		xfs_btree_get_leaf_keys(cur, block, key);
2075 	else
2076 		xfs_btree_get_node_keys(cur, block, key);
2077 }
2078 
2079 /*
2080  * Decide if we need to update the parent keys of a btree block.  For
2081  * a standard btree this is only necessary if we're updating the first
2082  * record/key.  For an overlapping btree, we must always update the
2083  * keys because the highest key can be in any of the records or keys
2084  * in the block.
2085  */
2086 static inline bool
2087 xfs_btree_needs_key_update(
2088 	struct xfs_btree_cur	*cur,
2089 	int			ptr)
2090 {
2091 	return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
2092 }
2093 
2094 /*
2095  * Update the low and high parent keys of the given level, progressing
2096  * towards the root.  If force_all is false, stop if the keys for a given
2097  * level do not need updating.
2098  */
2099 STATIC int
2100 __xfs_btree_updkeys(
2101 	struct xfs_btree_cur	*cur,
2102 	int			level,
2103 	struct xfs_btree_block	*block,
2104 	struct xfs_buf		*bp0,
2105 	bool			force_all)
2106 {
2107 	union xfs_btree_key	key;	/* keys from current level */
2108 	union xfs_btree_key	*lkey;	/* keys from the next level up */
2109 	union xfs_btree_key	*hkey;
2110 	union xfs_btree_key	*nlkey;	/* keys from the next level up */
2111 	union xfs_btree_key	*nhkey;
2112 	struct xfs_buf		*bp;
2113 	int			ptr;
2114 
2115 	ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2116 
2117 	/* Exit if there aren't any parent levels to update. */
2118 	if (level + 1 >= cur->bc_nlevels)
2119 		return 0;
2120 
2121 	trace_xfs_btree_updkeys(cur, level, bp0);
2122 
2123 	lkey = &key;
2124 	hkey = xfs_btree_high_key_from_key(cur, lkey);
2125 	xfs_btree_get_keys(cur, block, lkey);
2126 	for (level++; level < cur->bc_nlevels; level++) {
2127 #ifdef DEBUG
2128 		int		error;
2129 #endif
2130 		block = xfs_btree_get_block(cur, level, &bp);
2131 		trace_xfs_btree_updkeys(cur, level, bp);
2132 #ifdef DEBUG
2133 		error = xfs_btree_check_block(cur, block, level, bp);
2134 		if (error)
2135 			return error;
2136 #endif
2137 		ptr = cur->bc_ptrs[level];
2138 		nlkey = xfs_btree_key_addr(cur, ptr, block);
2139 		nhkey = xfs_btree_high_key_addr(cur, ptr, block);
2140 		if (!force_all &&
2141 		    !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
2142 		      cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
2143 			break;
2144 		xfs_btree_copy_keys(cur, nlkey, lkey, 1);
2145 		xfs_btree_log_keys(cur, bp, ptr, ptr);
2146 		if (level + 1 >= cur->bc_nlevels)
2147 			break;
2148 		xfs_btree_get_node_keys(cur, block, lkey);
2149 	}
2150 
2151 	return 0;
2152 }
2153 
2154 /* Update all the keys from some level in cursor back to the root. */
2155 STATIC int
2156 xfs_btree_updkeys_force(
2157 	struct xfs_btree_cur	*cur,
2158 	int			level)
2159 {
2160 	struct xfs_buf		*bp;
2161 	struct xfs_btree_block	*block;
2162 
2163 	block = xfs_btree_get_block(cur, level, &bp);
2164 	return __xfs_btree_updkeys(cur, level, block, bp, true);
2165 }
2166 
2167 /*
2168  * Update the parent keys of the given level, progressing towards the root.
2169  */
2170 STATIC int
2171 xfs_btree_update_keys(
2172 	struct xfs_btree_cur	*cur,
2173 	int			level)
2174 {
2175 	struct xfs_btree_block	*block;
2176 	struct xfs_buf		*bp;
2177 	union xfs_btree_key	*kp;
2178 	union xfs_btree_key	key;
2179 	int			ptr;
2180 
2181 	ASSERT(level >= 0);
2182 
2183 	block = xfs_btree_get_block(cur, level, &bp);
2184 	if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
2185 		return __xfs_btree_updkeys(cur, level, block, bp, false);
2186 
2187 	/*
2188 	 * Go up the tree from this level toward the root.
2189 	 * At each level, update the key value to the value input.
2190 	 * Stop when we reach a level where the cursor isn't pointing
2191 	 * at the first entry in the block.
2192 	 */
2193 	xfs_btree_get_keys(cur, block, &key);
2194 	for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
2195 #ifdef DEBUG
2196 		int		error;
2197 #endif
2198 		block = xfs_btree_get_block(cur, level, &bp);
2199 #ifdef DEBUG
2200 		error = xfs_btree_check_block(cur, block, level, bp);
2201 		if (error)
2202 			return error;
2203 #endif
2204 		ptr = cur->bc_ptrs[level];
2205 		kp = xfs_btree_key_addr(cur, ptr, block);
2206 		xfs_btree_copy_keys(cur, kp, &key, 1);
2207 		xfs_btree_log_keys(cur, bp, ptr, ptr);
2208 	}
2209 
2210 	return 0;
2211 }
2212 
2213 /*
2214  * Update the record referred to by cur to the value in the
2215  * given record. This either works (return 0) or gets an
2216  * EFSCORRUPTED error.
2217  */
2218 int
2219 xfs_btree_update(
2220 	struct xfs_btree_cur	*cur,
2221 	union xfs_btree_rec	*rec)
2222 {
2223 	struct xfs_btree_block	*block;
2224 	struct xfs_buf		*bp;
2225 	int			error;
2226 	int			ptr;
2227 	union xfs_btree_rec	*rp;
2228 
2229 	/* Pick up the current block. */
2230 	block = xfs_btree_get_block(cur, 0, &bp);
2231 
2232 #ifdef DEBUG
2233 	error = xfs_btree_check_block(cur, block, 0, bp);
2234 	if (error)
2235 		goto error0;
2236 #endif
2237 	/* Get the address of the rec to be updated. */
2238 	ptr = cur->bc_ptrs[0];
2239 	rp = xfs_btree_rec_addr(cur, ptr, block);
2240 
2241 	/* Fill in the new contents and log them. */
2242 	xfs_btree_copy_recs(cur, rp, rec, 1);
2243 	xfs_btree_log_recs(cur, bp, ptr, ptr);
2244 
2245 	/*
2246 	 * If we are tracking the last record in the tree and
2247 	 * we are at the far right edge of the tree, update it.
2248 	 */
2249 	if (xfs_btree_is_lastrec(cur, block, 0)) {
2250 		cur->bc_ops->update_lastrec(cur, block, rec,
2251 					    ptr, LASTREC_UPDATE);
2252 	}
2253 
2254 	/* Pass new key value up to our parent. */
2255 	if (xfs_btree_needs_key_update(cur, ptr)) {
2256 		error = xfs_btree_update_keys(cur, 0);
2257 		if (error)
2258 			goto error0;
2259 	}
2260 
2261 	return 0;
2262 
2263 error0:
2264 	return error;
2265 }
2266 
2267 /*
2268  * Move 1 record left from cur/level if possible.
2269  * Update cur to reflect the new path.
2270  */
2271 STATIC int					/* error */
2272 xfs_btree_lshift(
2273 	struct xfs_btree_cur	*cur,
2274 	int			level,
2275 	int			*stat)		/* success/failure */
2276 {
2277 	struct xfs_buf		*lbp;		/* left buffer pointer */
2278 	struct xfs_btree_block	*left;		/* left btree block */
2279 	int			lrecs;		/* left record count */
2280 	struct xfs_buf		*rbp;		/* right buffer pointer */
2281 	struct xfs_btree_block	*right;		/* right btree block */
2282 	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
2283 	int			rrecs;		/* right record count */
2284 	union xfs_btree_ptr	lptr;		/* left btree pointer */
2285 	union xfs_btree_key	*rkp = NULL;	/* right btree key */
2286 	union xfs_btree_ptr	*rpp = NULL;	/* right address pointer */
2287 	union xfs_btree_rec	*rrp = NULL;	/* right record pointer */
2288 	int			error;		/* error return value */
2289 	int			i;
2290 
2291 	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2292 	    level == cur->bc_nlevels - 1)
2293 		goto out0;
2294 
2295 	/* Set up variables for this block as "right". */
2296 	right = xfs_btree_get_block(cur, level, &rbp);
2297 
2298 #ifdef DEBUG
2299 	error = xfs_btree_check_block(cur, right, level, rbp);
2300 	if (error)
2301 		goto error0;
2302 #endif
2303 
2304 	/* If we've got no left sibling then we can't shift an entry left. */
2305 	xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2306 	if (xfs_btree_ptr_is_null(cur, &lptr))
2307 		goto out0;
2308 
2309 	/*
2310 	 * If the cursor entry is the one that would be moved, don't
2311 	 * do it... it's too complicated.
2312 	 */
2313 	if (cur->bc_ptrs[level] <= 1)
2314 		goto out0;
2315 
2316 	/* Set up the left neighbor as "left". */
2317 	error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2318 	if (error)
2319 		goto error0;
2320 
2321 	/* If it's full, it can't take another entry. */
2322 	lrecs = xfs_btree_get_numrecs(left);
2323 	if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2324 		goto out0;
2325 
2326 	rrecs = xfs_btree_get_numrecs(right);
2327 
2328 	/*
2329 	 * We add one entry to the left side and remove one for the right side.
2330 	 * Account for it here, the changes will be updated on disk and logged
2331 	 * later.
2332 	 */
2333 	lrecs++;
2334 	rrecs--;
2335 
2336 	XFS_BTREE_STATS_INC(cur, lshift);
2337 	XFS_BTREE_STATS_ADD(cur, moves, 1);
2338 
2339 	/*
2340 	 * If non-leaf, copy a key and a ptr to the left block.
2341 	 * Log the changes to the left block.
2342 	 */
2343 	if (level > 0) {
2344 		/* It's a non-leaf.  Move keys and pointers. */
2345 		union xfs_btree_key	*lkp;	/* left btree key */
2346 		union xfs_btree_ptr	*lpp;	/* left address pointer */
2347 
2348 		lkp = xfs_btree_key_addr(cur, lrecs, left);
2349 		rkp = xfs_btree_key_addr(cur, 1, right);
2350 
2351 		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2352 		rpp = xfs_btree_ptr_addr(cur, 1, right);
2353 
2354 		error = xfs_btree_debug_check_ptr(cur, rpp, 0, level);
2355 		if (error)
2356 			goto error0;
2357 
2358 		xfs_btree_copy_keys(cur, lkp, rkp, 1);
2359 		xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2360 
2361 		xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2362 		xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2363 
2364 		ASSERT(cur->bc_ops->keys_inorder(cur,
2365 			xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2366 	} else {
2367 		/* It's a leaf.  Move records.  */
2368 		union xfs_btree_rec	*lrp;	/* left record pointer */
2369 
2370 		lrp = xfs_btree_rec_addr(cur, lrecs, left);
2371 		rrp = xfs_btree_rec_addr(cur, 1, right);
2372 
2373 		xfs_btree_copy_recs(cur, lrp, rrp, 1);
2374 		xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2375 
2376 		ASSERT(cur->bc_ops->recs_inorder(cur,
2377 			xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2378 	}
2379 
2380 	xfs_btree_set_numrecs(left, lrecs);
2381 	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2382 
2383 	xfs_btree_set_numrecs(right, rrecs);
2384 	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2385 
2386 	/*
2387 	 * Slide the contents of right down one entry.
2388 	 */
2389 	XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2390 	if (level > 0) {
2391 		/* It's a nonleaf. operate on keys and ptrs */
2392 		int			i;		/* loop index */
2393 
2394 		for (i = 0; i < rrecs; i++) {
2395 			error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
2396 			if (error)
2397 				goto error0;
2398 		}
2399 
2400 		xfs_btree_shift_keys(cur,
2401 				xfs_btree_key_addr(cur, 2, right),
2402 				-1, rrecs);
2403 		xfs_btree_shift_ptrs(cur,
2404 				xfs_btree_ptr_addr(cur, 2, right),
2405 				-1, rrecs);
2406 
2407 		xfs_btree_log_keys(cur, rbp, 1, rrecs);
2408 		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2409 	} else {
2410 		/* It's a leaf. operate on records */
2411 		xfs_btree_shift_recs(cur,
2412 			xfs_btree_rec_addr(cur, 2, right),
2413 			-1, rrecs);
2414 		xfs_btree_log_recs(cur, rbp, 1, rrecs);
2415 	}
2416 
2417 	/*
2418 	 * Using a temporary cursor, update the parent key values of the
2419 	 * block on the left.
2420 	 */
2421 	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2422 		error = xfs_btree_dup_cursor(cur, &tcur);
2423 		if (error)
2424 			goto error0;
2425 		i = xfs_btree_firstrec(tcur, level);
2426 		if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
2427 			error = -EFSCORRUPTED;
2428 			goto error0;
2429 		}
2430 
2431 		error = xfs_btree_decrement(tcur, level, &i);
2432 		if (error)
2433 			goto error1;
2434 
2435 		/* Update the parent high keys of the left block, if needed. */
2436 		error = xfs_btree_update_keys(tcur, level);
2437 		if (error)
2438 			goto error1;
2439 
2440 		xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2441 	}
2442 
2443 	/* Update the parent keys of the right block. */
2444 	error = xfs_btree_update_keys(cur, level);
2445 	if (error)
2446 		goto error0;
2447 
2448 	/* Slide the cursor value left one. */
2449 	cur->bc_ptrs[level]--;
2450 
2451 	*stat = 1;
2452 	return 0;
2453 
2454 out0:
2455 	*stat = 0;
2456 	return 0;
2457 
2458 error0:
2459 	return error;
2460 
2461 error1:
2462 	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2463 	return error;
2464 }
2465 
2466 /*
2467  * Move 1 record right from cur/level if possible.
2468  * Update cur to reflect the new path.
2469  */
2470 STATIC int					/* error */
2471 xfs_btree_rshift(
2472 	struct xfs_btree_cur	*cur,
2473 	int			level,
2474 	int			*stat)		/* success/failure */
2475 {
2476 	struct xfs_buf		*lbp;		/* left buffer pointer */
2477 	struct xfs_btree_block	*left;		/* left btree block */
2478 	struct xfs_buf		*rbp;		/* right buffer pointer */
2479 	struct xfs_btree_block	*right;		/* right btree block */
2480 	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
2481 	union xfs_btree_ptr	rptr;		/* right block pointer */
2482 	union xfs_btree_key	*rkp;		/* right btree key */
2483 	int			rrecs;		/* right record count */
2484 	int			lrecs;		/* left record count */
2485 	int			error;		/* error return value */
2486 	int			i;		/* loop counter */
2487 
2488 	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2489 	    (level == cur->bc_nlevels - 1))
2490 		goto out0;
2491 
2492 	/* Set up variables for this block as "left". */
2493 	left = xfs_btree_get_block(cur, level, &lbp);
2494 
2495 #ifdef DEBUG
2496 	error = xfs_btree_check_block(cur, left, level, lbp);
2497 	if (error)
2498 		goto error0;
2499 #endif
2500 
2501 	/* If we've got no right sibling then we can't shift an entry right. */
2502 	xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2503 	if (xfs_btree_ptr_is_null(cur, &rptr))
2504 		goto out0;
2505 
2506 	/*
2507 	 * If the cursor entry is the one that would be moved, don't
2508 	 * do it... it's too complicated.
2509 	 */
2510 	lrecs = xfs_btree_get_numrecs(left);
2511 	if (cur->bc_ptrs[level] >= lrecs)
2512 		goto out0;
2513 
2514 	/* Set up the right neighbor as "right". */
2515 	error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2516 	if (error)
2517 		goto error0;
2518 
2519 	/* If it's full, it can't take another entry. */
2520 	rrecs = xfs_btree_get_numrecs(right);
2521 	if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2522 		goto out0;
2523 
2524 	XFS_BTREE_STATS_INC(cur, rshift);
2525 	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2526 
2527 	/*
2528 	 * Make a hole at the start of the right neighbor block, then
2529 	 * copy the last left block entry to the hole.
2530 	 */
2531 	if (level > 0) {
2532 		/* It's a nonleaf. make a hole in the keys and ptrs */
2533 		union xfs_btree_key	*lkp;
2534 		union xfs_btree_ptr	*lpp;
2535 		union xfs_btree_ptr	*rpp;
2536 
2537 		lkp = xfs_btree_key_addr(cur, lrecs, left);
2538 		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2539 		rkp = xfs_btree_key_addr(cur, 1, right);
2540 		rpp = xfs_btree_ptr_addr(cur, 1, right);
2541 
2542 		for (i = rrecs - 1; i >= 0; i--) {
2543 			error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
2544 			if (error)
2545 				goto error0;
2546 		}
2547 
2548 		xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2549 		xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2550 
2551 		error = xfs_btree_debug_check_ptr(cur, lpp, 0, level);
2552 		if (error)
2553 			goto error0;
2554 
2555 		/* Now put the new data in, and log it. */
2556 		xfs_btree_copy_keys(cur, rkp, lkp, 1);
2557 		xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2558 
2559 		xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2560 		xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2561 
2562 		ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2563 			xfs_btree_key_addr(cur, 2, right)));
2564 	} else {
2565 		/* It's a leaf. make a hole in the records */
2566 		union xfs_btree_rec	*lrp;
2567 		union xfs_btree_rec	*rrp;
2568 
2569 		lrp = xfs_btree_rec_addr(cur, lrecs, left);
2570 		rrp = xfs_btree_rec_addr(cur, 1, right);
2571 
2572 		xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2573 
2574 		/* Now put the new data in, and log it. */
2575 		xfs_btree_copy_recs(cur, rrp, lrp, 1);
2576 		xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2577 	}
2578 
2579 	/*
2580 	 * Decrement and log left's numrecs, bump and log right's numrecs.
2581 	 */
2582 	xfs_btree_set_numrecs(left, --lrecs);
2583 	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2584 
2585 	xfs_btree_set_numrecs(right, ++rrecs);
2586 	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2587 
2588 	/*
2589 	 * Using a temporary cursor, update the parent key values of the
2590 	 * block on the right.
2591 	 */
2592 	error = xfs_btree_dup_cursor(cur, &tcur);
2593 	if (error)
2594 		goto error0;
2595 	i = xfs_btree_lastrec(tcur, level);
2596 	if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
2597 		error = -EFSCORRUPTED;
2598 		goto error0;
2599 	}
2600 
2601 	error = xfs_btree_increment(tcur, level, &i);
2602 	if (error)
2603 		goto error1;
2604 
2605 	/* Update the parent high keys of the left block, if needed. */
2606 	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2607 		error = xfs_btree_update_keys(cur, level);
2608 		if (error)
2609 			goto error1;
2610 	}
2611 
2612 	/* Update the parent keys of the right block. */
2613 	error = xfs_btree_update_keys(tcur, level);
2614 	if (error)
2615 		goto error1;
2616 
2617 	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2618 
2619 	*stat = 1;
2620 	return 0;
2621 
2622 out0:
2623 	*stat = 0;
2624 	return 0;
2625 
2626 error0:
2627 	return error;
2628 
2629 error1:
2630 	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2631 	return error;
2632 }
2633 
2634 /*
2635  * Split cur/level block in half.
2636  * Return new block number and the key to its first
2637  * record (to be inserted into parent).
2638  */
2639 STATIC int					/* error */
2640 __xfs_btree_split(
2641 	struct xfs_btree_cur	*cur,
2642 	int			level,
2643 	union xfs_btree_ptr	*ptrp,
2644 	union xfs_btree_key	*key,
2645 	struct xfs_btree_cur	**curp,
2646 	int			*stat)		/* success/failure */
2647 {
2648 	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
2649 	struct xfs_buf		*lbp;		/* left buffer pointer */
2650 	struct xfs_btree_block	*left;		/* left btree block */
2651 	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
2652 	struct xfs_buf		*rbp;		/* right buffer pointer */
2653 	struct xfs_btree_block	*right;		/* right btree block */
2654 	union xfs_btree_ptr	rrptr;		/* right-right sibling ptr */
2655 	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
2656 	struct xfs_btree_block	*rrblock;	/* right-right btree block */
2657 	int			lrecs;
2658 	int			rrecs;
2659 	int			src_index;
2660 	int			error;		/* error return value */
2661 	int			i;
2662 
2663 	XFS_BTREE_STATS_INC(cur, split);
2664 
2665 	/* Set up left block (current one). */
2666 	left = xfs_btree_get_block(cur, level, &lbp);
2667 
2668 #ifdef DEBUG
2669 	error = xfs_btree_check_block(cur, left, level, lbp);
2670 	if (error)
2671 		goto error0;
2672 #endif
2673 
2674 	xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2675 
2676 	/* Allocate the new block. If we can't do it, we're toast. Give up. */
2677 	error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2678 	if (error)
2679 		goto error0;
2680 	if (*stat == 0)
2681 		goto out0;
2682 	XFS_BTREE_STATS_INC(cur, alloc);
2683 
2684 	/* Set up the new block as "right". */
2685 	error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
2686 	if (error)
2687 		goto error0;
2688 
2689 	/* Fill in the btree header for the new right block. */
2690 	xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2691 
2692 	/*
2693 	 * Split the entries between the old and the new block evenly.
2694 	 * Make sure that if there's an odd number of entries now, that
2695 	 * each new block will have the same number of entries.
2696 	 */
2697 	lrecs = xfs_btree_get_numrecs(left);
2698 	rrecs = lrecs / 2;
2699 	if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2700 		rrecs++;
2701 	src_index = (lrecs - rrecs + 1);
2702 
2703 	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2704 
2705 	/* Adjust numrecs for the later get_*_keys() calls. */
2706 	lrecs -= rrecs;
2707 	xfs_btree_set_numrecs(left, lrecs);
2708 	xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2709 
2710 	/*
2711 	 * Copy btree block entries from the left block over to the
2712 	 * new block, the right. Update the right block and log the
2713 	 * changes.
2714 	 */
2715 	if (level > 0) {
2716 		/* It's a non-leaf.  Move keys and pointers. */
2717 		union xfs_btree_key	*lkp;	/* left btree key */
2718 		union xfs_btree_ptr	*lpp;	/* left address pointer */
2719 		union xfs_btree_key	*rkp;	/* right btree key */
2720 		union xfs_btree_ptr	*rpp;	/* right address pointer */
2721 
2722 		lkp = xfs_btree_key_addr(cur, src_index, left);
2723 		lpp = xfs_btree_ptr_addr(cur, src_index, left);
2724 		rkp = xfs_btree_key_addr(cur, 1, right);
2725 		rpp = xfs_btree_ptr_addr(cur, 1, right);
2726 
2727 		for (i = src_index; i < rrecs; i++) {
2728 			error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
2729 			if (error)
2730 				goto error0;
2731 		}
2732 
2733 		/* Copy the keys & pointers to the new block. */
2734 		xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2735 		xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2736 
2737 		xfs_btree_log_keys(cur, rbp, 1, rrecs);
2738 		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2739 
2740 		/* Stash the keys of the new block for later insertion. */
2741 		xfs_btree_get_node_keys(cur, right, key);
2742 	} else {
2743 		/* It's a leaf.  Move records.  */
2744 		union xfs_btree_rec	*lrp;	/* left record pointer */
2745 		union xfs_btree_rec	*rrp;	/* right record pointer */
2746 
2747 		lrp = xfs_btree_rec_addr(cur, src_index, left);
2748 		rrp = xfs_btree_rec_addr(cur, 1, right);
2749 
2750 		/* Copy records to the new block. */
2751 		xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2752 		xfs_btree_log_recs(cur, rbp, 1, rrecs);
2753 
2754 		/* Stash the keys of the new block for later insertion. */
2755 		xfs_btree_get_leaf_keys(cur, right, key);
2756 	}
2757 
2758 	/*
2759 	 * Find the left block number by looking in the buffer.
2760 	 * Adjust sibling pointers.
2761 	 */
2762 	xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2763 	xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2764 	xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2765 	xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2766 
2767 	xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2768 	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2769 
2770 	/*
2771 	 * If there's a block to the new block's right, make that block
2772 	 * point back to right instead of to left.
2773 	 */
2774 	if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2775 		error = xfs_btree_read_buf_block(cur, &rrptr,
2776 							0, &rrblock, &rrbp);
2777 		if (error)
2778 			goto error0;
2779 		xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2780 		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2781 	}
2782 
2783 	/* Update the parent high keys of the left block, if needed. */
2784 	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2785 		error = xfs_btree_update_keys(cur, level);
2786 		if (error)
2787 			goto error0;
2788 	}
2789 
2790 	/*
2791 	 * If the cursor is really in the right block, move it there.
2792 	 * If it's just pointing past the last entry in left, then we'll
2793 	 * insert there, so don't change anything in that case.
2794 	 */
2795 	if (cur->bc_ptrs[level] > lrecs + 1) {
2796 		xfs_btree_setbuf(cur, level, rbp);
2797 		cur->bc_ptrs[level] -= lrecs;
2798 	}
2799 	/*
2800 	 * If there are more levels, we'll need another cursor which refers
2801 	 * the right block, no matter where this cursor was.
2802 	 */
2803 	if (level + 1 < cur->bc_nlevels) {
2804 		error = xfs_btree_dup_cursor(cur, curp);
2805 		if (error)
2806 			goto error0;
2807 		(*curp)->bc_ptrs[level + 1]++;
2808 	}
2809 	*ptrp = rptr;
2810 	*stat = 1;
2811 	return 0;
2812 out0:
2813 	*stat = 0;
2814 	return 0;
2815 
2816 error0:
2817 	return error;
2818 }
2819 
2820 struct xfs_btree_split_args {
2821 	struct xfs_btree_cur	*cur;
2822 	int			level;
2823 	union xfs_btree_ptr	*ptrp;
2824 	union xfs_btree_key	*key;
2825 	struct xfs_btree_cur	**curp;
2826 	int			*stat;		/* success/failure */
2827 	int			result;
2828 	bool			kswapd;	/* allocation in kswapd context */
2829 	struct completion	*done;
2830 	struct work_struct	work;
2831 };
2832 
2833 /*
2834  * Stack switching interfaces for allocation
2835  */
2836 static void
2837 xfs_btree_split_worker(
2838 	struct work_struct	*work)
2839 {
2840 	struct xfs_btree_split_args	*args = container_of(work,
2841 						struct xfs_btree_split_args, work);
2842 	unsigned long		pflags;
2843 	unsigned long		new_pflags = PF_MEMALLOC_NOFS;
2844 
2845 	/*
2846 	 * we are in a transaction context here, but may also be doing work
2847 	 * in kswapd context, and hence we may need to inherit that state
2848 	 * temporarily to ensure that we don't block waiting for memory reclaim
2849 	 * in any way.
2850 	 */
2851 	if (args->kswapd)
2852 		new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2853 
2854 	current_set_flags_nested(&pflags, new_pflags);
2855 
2856 	args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2857 					 args->key, args->curp, args->stat);
2858 	complete(args->done);
2859 
2860 	current_restore_flags_nested(&pflags, new_pflags);
2861 }
2862 
2863 /*
2864  * BMBT split requests often come in with little stack to work on. Push
2865  * them off to a worker thread so there is lots of stack to use. For the other
2866  * btree types, just call directly to avoid the context switch overhead here.
2867  */
2868 STATIC int					/* error */
2869 xfs_btree_split(
2870 	struct xfs_btree_cur	*cur,
2871 	int			level,
2872 	union xfs_btree_ptr	*ptrp,
2873 	union xfs_btree_key	*key,
2874 	struct xfs_btree_cur	**curp,
2875 	int			*stat)		/* success/failure */
2876 {
2877 	struct xfs_btree_split_args	args;
2878 	DECLARE_COMPLETION_ONSTACK(done);
2879 
2880 	if (cur->bc_btnum != XFS_BTNUM_BMAP)
2881 		return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2882 
2883 	args.cur = cur;
2884 	args.level = level;
2885 	args.ptrp = ptrp;
2886 	args.key = key;
2887 	args.curp = curp;
2888 	args.stat = stat;
2889 	args.done = &done;
2890 	args.kswapd = current_is_kswapd();
2891 	INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2892 	queue_work(xfs_alloc_wq, &args.work);
2893 	wait_for_completion(&done);
2894 	destroy_work_on_stack(&args.work);
2895 	return args.result;
2896 }
2897 
2898 
2899 /*
2900  * Copy the old inode root contents into a real block and make the
2901  * broot point to it.
2902  */
2903 int						/* error */
2904 xfs_btree_new_iroot(
2905 	struct xfs_btree_cur	*cur,		/* btree cursor */
2906 	int			*logflags,	/* logging flags for inode */
2907 	int			*stat)		/* return status - 0 fail */
2908 {
2909 	struct xfs_buf		*cbp;		/* buffer for cblock */
2910 	struct xfs_btree_block	*block;		/* btree block */
2911 	struct xfs_btree_block	*cblock;	/* child btree block */
2912 	union xfs_btree_key	*ckp;		/* child key pointer */
2913 	union xfs_btree_ptr	*cpp;		/* child ptr pointer */
2914 	union xfs_btree_key	*kp;		/* pointer to btree key */
2915 	union xfs_btree_ptr	*pp;		/* pointer to block addr */
2916 	union xfs_btree_ptr	nptr;		/* new block addr */
2917 	int			level;		/* btree level */
2918 	int			error;		/* error return code */
2919 	int			i;		/* loop counter */
2920 
2921 	XFS_BTREE_STATS_INC(cur, newroot);
2922 
2923 	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2924 
2925 	level = cur->bc_nlevels - 1;
2926 
2927 	block = xfs_btree_get_iroot(cur);
2928 	pp = xfs_btree_ptr_addr(cur, 1, block);
2929 
2930 	/* Allocate the new block. If we can't do it, we're toast. Give up. */
2931 	error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2932 	if (error)
2933 		goto error0;
2934 	if (*stat == 0)
2935 		return 0;
2936 
2937 	XFS_BTREE_STATS_INC(cur, alloc);
2938 
2939 	/* Copy the root into a real block. */
2940 	error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
2941 	if (error)
2942 		goto error0;
2943 
2944 	/*
2945 	 * we can't just memcpy() the root in for CRC enabled btree blocks.
2946 	 * In that case have to also ensure the blkno remains correct
2947 	 */
2948 	memcpy(cblock, block, xfs_btree_block_len(cur));
2949 	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2950 		if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2951 			cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2952 		else
2953 			cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2954 	}
2955 
2956 	be16_add_cpu(&block->bb_level, 1);
2957 	xfs_btree_set_numrecs(block, 1);
2958 	cur->bc_nlevels++;
2959 	cur->bc_ptrs[level + 1] = 1;
2960 
2961 	kp = xfs_btree_key_addr(cur, 1, block);
2962 	ckp = xfs_btree_key_addr(cur, 1, cblock);
2963 	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2964 
2965 	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2966 	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2967 		error = xfs_btree_debug_check_ptr(cur, pp, i, level);
2968 		if (error)
2969 			goto error0;
2970 	}
2971 
2972 	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2973 
2974 	error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
2975 	if (error)
2976 		goto error0;
2977 
2978 	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2979 
2980 	xfs_iroot_realloc(cur->bc_private.b.ip,
2981 			  1 - xfs_btree_get_numrecs(cblock),
2982 			  cur->bc_private.b.whichfork);
2983 
2984 	xfs_btree_setbuf(cur, level, cbp);
2985 
2986 	/*
2987 	 * Do all this logging at the end so that
2988 	 * the root is at the right level.
2989 	 */
2990 	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2991 	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2992 	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2993 
2994 	*logflags |=
2995 		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
2996 	*stat = 1;
2997 	return 0;
2998 error0:
2999 	return error;
3000 }
3001 
3002 /*
3003  * Allocate a new root block, fill it in.
3004  */
3005 STATIC int				/* error */
3006 xfs_btree_new_root(
3007 	struct xfs_btree_cur	*cur,	/* btree cursor */
3008 	int			*stat)	/* success/failure */
3009 {
3010 	struct xfs_btree_block	*block;	/* one half of the old root block */
3011 	struct xfs_buf		*bp;	/* buffer containing block */
3012 	int			error;	/* error return value */
3013 	struct xfs_buf		*lbp;	/* left buffer pointer */
3014 	struct xfs_btree_block	*left;	/* left btree block */
3015 	struct xfs_buf		*nbp;	/* new (root) buffer */
3016 	struct xfs_btree_block	*new;	/* new (root) btree block */
3017 	int			nptr;	/* new value for key index, 1 or 2 */
3018 	struct xfs_buf		*rbp;	/* right buffer pointer */
3019 	struct xfs_btree_block	*right;	/* right btree block */
3020 	union xfs_btree_ptr	rptr;
3021 	union xfs_btree_ptr	lptr;
3022 
3023 	XFS_BTREE_STATS_INC(cur, newroot);
3024 
3025 	/* initialise our start point from the cursor */
3026 	cur->bc_ops->init_ptr_from_cur(cur, &rptr);
3027 
3028 	/* Allocate the new block. If we can't do it, we're toast. Give up. */
3029 	error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
3030 	if (error)
3031 		goto error0;
3032 	if (*stat == 0)
3033 		goto out0;
3034 	XFS_BTREE_STATS_INC(cur, alloc);
3035 
3036 	/* Set up the new block. */
3037 	error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
3038 	if (error)
3039 		goto error0;
3040 
3041 	/* Set the root in the holding structure  increasing the level by 1. */
3042 	cur->bc_ops->set_root(cur, &lptr, 1);
3043 
3044 	/*
3045 	 * At the previous root level there are now two blocks: the old root,
3046 	 * and the new block generated when it was split.  We don't know which
3047 	 * one the cursor is pointing at, so we set up variables "left" and
3048 	 * "right" for each case.
3049 	 */
3050 	block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
3051 
3052 #ifdef DEBUG
3053 	error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
3054 	if (error)
3055 		goto error0;
3056 #endif
3057 
3058 	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3059 	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3060 		/* Our block is left, pick up the right block. */
3061 		lbp = bp;
3062 		xfs_btree_buf_to_ptr(cur, lbp, &lptr);
3063 		left = block;
3064 		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3065 		if (error)
3066 			goto error0;
3067 		bp = rbp;
3068 		nptr = 1;
3069 	} else {
3070 		/* Our block is right, pick up the left block. */
3071 		rbp = bp;
3072 		xfs_btree_buf_to_ptr(cur, rbp, &rptr);
3073 		right = block;
3074 		xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
3075 		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3076 		if (error)
3077 			goto error0;
3078 		bp = lbp;
3079 		nptr = 2;
3080 	}
3081 
3082 	/* Fill in the new block's btree header and log it. */
3083 	xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
3084 	xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
3085 	ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
3086 			!xfs_btree_ptr_is_null(cur, &rptr));
3087 
3088 	/* Fill in the key data in the new root. */
3089 	if (xfs_btree_get_level(left) > 0) {
3090 		/*
3091 		 * Get the keys for the left block's keys and put them directly
3092 		 * in the parent block.  Do the same for the right block.
3093 		 */
3094 		xfs_btree_get_node_keys(cur, left,
3095 				xfs_btree_key_addr(cur, 1, new));
3096 		xfs_btree_get_node_keys(cur, right,
3097 				xfs_btree_key_addr(cur, 2, new));
3098 	} else {
3099 		/*
3100 		 * Get the keys for the left block's records and put them
3101 		 * directly in the parent block.  Do the same for the right
3102 		 * block.
3103 		 */
3104 		xfs_btree_get_leaf_keys(cur, left,
3105 			xfs_btree_key_addr(cur, 1, new));
3106 		xfs_btree_get_leaf_keys(cur, right,
3107 			xfs_btree_key_addr(cur, 2, new));
3108 	}
3109 	xfs_btree_log_keys(cur, nbp, 1, 2);
3110 
3111 	/* Fill in the pointer data in the new root. */
3112 	xfs_btree_copy_ptrs(cur,
3113 		xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
3114 	xfs_btree_copy_ptrs(cur,
3115 		xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
3116 	xfs_btree_log_ptrs(cur, nbp, 1, 2);
3117 
3118 	/* Fix up the cursor. */
3119 	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
3120 	cur->bc_ptrs[cur->bc_nlevels] = nptr;
3121 	cur->bc_nlevels++;
3122 	*stat = 1;
3123 	return 0;
3124 error0:
3125 	return error;
3126 out0:
3127 	*stat = 0;
3128 	return 0;
3129 }
3130 
3131 STATIC int
3132 xfs_btree_make_block_unfull(
3133 	struct xfs_btree_cur	*cur,	/* btree cursor */
3134 	int			level,	/* btree level */
3135 	int			numrecs,/* # of recs in block */
3136 	int			*oindex,/* old tree index */
3137 	int			*index,	/* new tree index */
3138 	union xfs_btree_ptr	*nptr,	/* new btree ptr */
3139 	struct xfs_btree_cur	**ncur,	/* new btree cursor */
3140 	union xfs_btree_key	*key,	/* key of new block */
3141 	int			*stat)
3142 {
3143 	int			error = 0;
3144 
3145 	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3146 	    level == cur->bc_nlevels - 1) {
3147 		struct xfs_inode *ip = cur->bc_private.b.ip;
3148 
3149 		if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
3150 			/* A root block that can be made bigger. */
3151 			xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
3152 			*stat = 1;
3153 		} else {
3154 			/* A root block that needs replacing */
3155 			int	logflags = 0;
3156 
3157 			error = xfs_btree_new_iroot(cur, &logflags, stat);
3158 			if (error || *stat == 0)
3159 				return error;
3160 
3161 			xfs_trans_log_inode(cur->bc_tp, ip, logflags);
3162 		}
3163 
3164 		return 0;
3165 	}
3166 
3167 	/* First, try shifting an entry to the right neighbor. */
3168 	error = xfs_btree_rshift(cur, level, stat);
3169 	if (error || *stat)
3170 		return error;
3171 
3172 	/* Next, try shifting an entry to the left neighbor. */
3173 	error = xfs_btree_lshift(cur, level, stat);
3174 	if (error)
3175 		return error;
3176 
3177 	if (*stat) {
3178 		*oindex = *index = cur->bc_ptrs[level];
3179 		return 0;
3180 	}
3181 
3182 	/*
3183 	 * Next, try splitting the current block in half.
3184 	 *
3185 	 * If this works we have to re-set our variables because we
3186 	 * could be in a different block now.
3187 	 */
3188 	error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
3189 	if (error || *stat == 0)
3190 		return error;
3191 
3192 
3193 	*index = cur->bc_ptrs[level];
3194 	return 0;
3195 }
3196 
3197 /*
3198  * Insert one record/level.  Return information to the caller
3199  * allowing the next level up to proceed if necessary.
3200  */
3201 STATIC int
3202 xfs_btree_insrec(
3203 	struct xfs_btree_cur	*cur,	/* btree cursor */
3204 	int			level,	/* level to insert record at */
3205 	union xfs_btree_ptr	*ptrp,	/* i/o: block number inserted */
3206 	union xfs_btree_rec	*rec,	/* record to insert */
3207 	union xfs_btree_key	*key,	/* i/o: block key for ptrp */
3208 	struct xfs_btree_cur	**curp,	/* output: new cursor replacing cur */
3209 	int			*stat)	/* success/failure */
3210 {
3211 	struct xfs_btree_block	*block;	/* btree block */
3212 	struct xfs_buf		*bp;	/* buffer for block */
3213 	union xfs_btree_ptr	nptr;	/* new block ptr */
3214 	struct xfs_btree_cur	*ncur;	/* new btree cursor */
3215 	union xfs_btree_key	nkey;	/* new block key */
3216 	union xfs_btree_key	*lkey;
3217 	int			optr;	/* old key/record index */
3218 	int			ptr;	/* key/record index */
3219 	int			numrecs;/* number of records */
3220 	int			error;	/* error return value */
3221 	int			i;
3222 	xfs_daddr_t		old_bn;
3223 
3224 	ncur = NULL;
3225 	lkey = &nkey;
3226 
3227 	/*
3228 	 * If we have an external root pointer, and we've made it to the
3229 	 * root level, allocate a new root block and we're done.
3230 	 */
3231 	if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3232 	    (level >= cur->bc_nlevels)) {
3233 		error = xfs_btree_new_root(cur, stat);
3234 		xfs_btree_set_ptr_null(cur, ptrp);
3235 
3236 		return error;
3237 	}
3238 
3239 	/* If we're off the left edge, return failure. */
3240 	ptr = cur->bc_ptrs[level];
3241 	if (ptr == 0) {
3242 		*stat = 0;
3243 		return 0;
3244 	}
3245 
3246 	optr = ptr;
3247 
3248 	XFS_BTREE_STATS_INC(cur, insrec);
3249 
3250 	/* Get pointers to the btree buffer and block. */
3251 	block = xfs_btree_get_block(cur, level, &bp);
3252 	old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
3253 	numrecs = xfs_btree_get_numrecs(block);
3254 
3255 #ifdef DEBUG
3256 	error = xfs_btree_check_block(cur, block, level, bp);
3257 	if (error)
3258 		goto error0;
3259 
3260 	/* Check that the new entry is being inserted in the right place. */
3261 	if (ptr <= numrecs) {
3262 		if (level == 0) {
3263 			ASSERT(cur->bc_ops->recs_inorder(cur, rec,
3264 				xfs_btree_rec_addr(cur, ptr, block)));
3265 		} else {
3266 			ASSERT(cur->bc_ops->keys_inorder(cur, key,
3267 				xfs_btree_key_addr(cur, ptr, block)));
3268 		}
3269 	}
3270 #endif
3271 
3272 	/*
3273 	 * If the block is full, we can't insert the new entry until we
3274 	 * make the block un-full.
3275 	 */
3276 	xfs_btree_set_ptr_null(cur, &nptr);
3277 	if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3278 		error = xfs_btree_make_block_unfull(cur, level, numrecs,
3279 					&optr, &ptr, &nptr, &ncur, lkey, stat);
3280 		if (error || *stat == 0)
3281 			goto error0;
3282 	}
3283 
3284 	/*
3285 	 * The current block may have changed if the block was
3286 	 * previously full and we have just made space in it.
3287 	 */
3288 	block = xfs_btree_get_block(cur, level, &bp);
3289 	numrecs = xfs_btree_get_numrecs(block);
3290 
3291 #ifdef DEBUG
3292 	error = xfs_btree_check_block(cur, block, level, bp);
3293 	if (error)
3294 		return error;
3295 #endif
3296 
3297 	/*
3298 	 * At this point we know there's room for our new entry in the block
3299 	 * we're pointing at.
3300 	 */
3301 	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3302 
3303 	if (level > 0) {
3304 		/* It's a nonleaf. make a hole in the keys and ptrs */
3305 		union xfs_btree_key	*kp;
3306 		union xfs_btree_ptr	*pp;
3307 
3308 		kp = xfs_btree_key_addr(cur, ptr, block);
3309 		pp = xfs_btree_ptr_addr(cur, ptr, block);
3310 
3311 		for (i = numrecs - ptr; i >= 0; i--) {
3312 			error = xfs_btree_debug_check_ptr(cur, pp, i, level);
3313 			if (error)
3314 				return error;
3315 		}
3316 
3317 		xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3318 		xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3319 
3320 		error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level);
3321 		if (error)
3322 			goto error0;
3323 
3324 		/* Now put the new data in, bump numrecs and log it. */
3325 		xfs_btree_copy_keys(cur, kp, key, 1);
3326 		xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3327 		numrecs++;
3328 		xfs_btree_set_numrecs(block, numrecs);
3329 		xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3330 		xfs_btree_log_keys(cur, bp, ptr, numrecs);
3331 #ifdef DEBUG
3332 		if (ptr < numrecs) {
3333 			ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3334 				xfs_btree_key_addr(cur, ptr + 1, block)));
3335 		}
3336 #endif
3337 	} else {
3338 		/* It's a leaf. make a hole in the records */
3339 		union xfs_btree_rec             *rp;
3340 
3341 		rp = xfs_btree_rec_addr(cur, ptr, block);
3342 
3343 		xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3344 
3345 		/* Now put the new data in, bump numrecs and log it. */
3346 		xfs_btree_copy_recs(cur, rp, rec, 1);
3347 		xfs_btree_set_numrecs(block, ++numrecs);
3348 		xfs_btree_log_recs(cur, bp, ptr, numrecs);
3349 #ifdef DEBUG
3350 		if (ptr < numrecs) {
3351 			ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3352 				xfs_btree_rec_addr(cur, ptr + 1, block)));
3353 		}
3354 #endif
3355 	}
3356 
3357 	/* Log the new number of records in the btree header. */
3358 	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3359 
3360 	/*
3361 	 * If we just inserted into a new tree block, we have to
3362 	 * recalculate nkey here because nkey is out of date.
3363 	 *
3364 	 * Otherwise we're just updating an existing block (having shoved
3365 	 * some records into the new tree block), so use the regular key
3366 	 * update mechanism.
3367 	 */
3368 	if (bp && bp->b_bn != old_bn) {
3369 		xfs_btree_get_keys(cur, block, lkey);
3370 	} else if (xfs_btree_needs_key_update(cur, optr)) {
3371 		error = xfs_btree_update_keys(cur, level);
3372 		if (error)
3373 			goto error0;
3374 	}
3375 
3376 	/*
3377 	 * If we are tracking the last record in the tree and
3378 	 * we are at the far right edge of the tree, update it.
3379 	 */
3380 	if (xfs_btree_is_lastrec(cur, block, level)) {
3381 		cur->bc_ops->update_lastrec(cur, block, rec,
3382 					    ptr, LASTREC_INSREC);
3383 	}
3384 
3385 	/*
3386 	 * Return the new block number, if any.
3387 	 * If there is one, give back a record value and a cursor too.
3388 	 */
3389 	*ptrp = nptr;
3390 	if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3391 		xfs_btree_copy_keys(cur, key, lkey, 1);
3392 		*curp = ncur;
3393 	}
3394 
3395 	*stat = 1;
3396 	return 0;
3397 
3398 error0:
3399 	return error;
3400 }
3401 
3402 /*
3403  * Insert the record at the point referenced by cur.
3404  *
3405  * A multi-level split of the tree on insert will invalidate the original
3406  * cursor.  All callers of this function should assume that the cursor is
3407  * no longer valid and revalidate it.
3408  */
3409 int
3410 xfs_btree_insert(
3411 	struct xfs_btree_cur	*cur,
3412 	int			*stat)
3413 {
3414 	int			error;	/* error return value */
3415 	int			i;	/* result value, 0 for failure */
3416 	int			level;	/* current level number in btree */
3417 	union xfs_btree_ptr	nptr;	/* new block number (split result) */
3418 	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
3419 	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
3420 	union xfs_btree_key	bkey;	/* key of block to insert */
3421 	union xfs_btree_key	*key;
3422 	union xfs_btree_rec	rec;	/* record to insert */
3423 
3424 	level = 0;
3425 	ncur = NULL;
3426 	pcur = cur;
3427 	key = &bkey;
3428 
3429 	xfs_btree_set_ptr_null(cur, &nptr);
3430 
3431 	/* Make a key out of the record data to be inserted, and save it. */
3432 	cur->bc_ops->init_rec_from_cur(cur, &rec);
3433 	cur->bc_ops->init_key_from_rec(key, &rec);
3434 
3435 	/*
3436 	 * Loop going up the tree, starting at the leaf level.
3437 	 * Stop when we don't get a split block, that must mean that
3438 	 * the insert is finished with this level.
3439 	 */
3440 	do {
3441 		/*
3442 		 * Insert nrec/nptr into this level of the tree.
3443 		 * Note if we fail, nptr will be null.
3444 		 */
3445 		error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
3446 				&ncur, &i);
3447 		if (error) {
3448 			if (pcur != cur)
3449 				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3450 			goto error0;
3451 		}
3452 
3453 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3454 			error = -EFSCORRUPTED;
3455 			goto error0;
3456 		}
3457 		level++;
3458 
3459 		/*
3460 		 * See if the cursor we just used is trash.
3461 		 * Can't trash the caller's cursor, but otherwise we should
3462 		 * if ncur is a new cursor or we're about to be done.
3463 		 */
3464 		if (pcur != cur &&
3465 		    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3466 			/* Save the state from the cursor before we trash it */
3467 			if (cur->bc_ops->update_cursor)
3468 				cur->bc_ops->update_cursor(pcur, cur);
3469 			cur->bc_nlevels = pcur->bc_nlevels;
3470 			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3471 		}
3472 		/* If we got a new cursor, switch to it. */
3473 		if (ncur) {
3474 			pcur = ncur;
3475 			ncur = NULL;
3476 		}
3477 	} while (!xfs_btree_ptr_is_null(cur, &nptr));
3478 
3479 	*stat = i;
3480 	return 0;
3481 error0:
3482 	return error;
3483 }
3484 
3485 /*
3486  * Try to merge a non-leaf block back into the inode root.
3487  *
3488  * Note: the killroot names comes from the fact that we're effectively
3489  * killing the old root block.  But because we can't just delete the
3490  * inode we have to copy the single block it was pointing to into the
3491  * inode.
3492  */
3493 STATIC int
3494 xfs_btree_kill_iroot(
3495 	struct xfs_btree_cur	*cur)
3496 {
3497 	int			whichfork = cur->bc_private.b.whichfork;
3498 	struct xfs_inode	*ip = cur->bc_private.b.ip;
3499 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
3500 	struct xfs_btree_block	*block;
3501 	struct xfs_btree_block	*cblock;
3502 	union xfs_btree_key	*kp;
3503 	union xfs_btree_key	*ckp;
3504 	union xfs_btree_ptr	*pp;
3505 	union xfs_btree_ptr	*cpp;
3506 	struct xfs_buf		*cbp;
3507 	int			level;
3508 	int			index;
3509 	int			numrecs;
3510 	int			error;
3511 #ifdef DEBUG
3512 	union xfs_btree_ptr	ptr;
3513 #endif
3514 	int			i;
3515 
3516 	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3517 	ASSERT(cur->bc_nlevels > 1);
3518 
3519 	/*
3520 	 * Don't deal with the root block needs to be a leaf case.
3521 	 * We're just going to turn the thing back into extents anyway.
3522 	 */
3523 	level = cur->bc_nlevels - 1;
3524 	if (level == 1)
3525 		goto out0;
3526 
3527 	/*
3528 	 * Give up if the root has multiple children.
3529 	 */
3530 	block = xfs_btree_get_iroot(cur);
3531 	if (xfs_btree_get_numrecs(block) != 1)
3532 		goto out0;
3533 
3534 	cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3535 	numrecs = xfs_btree_get_numrecs(cblock);
3536 
3537 	/*
3538 	 * Only do this if the next level will fit.
3539 	 * Then the data must be copied up to the inode,
3540 	 * instead of freeing the root you free the next level.
3541 	 */
3542 	if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3543 		goto out0;
3544 
3545 	XFS_BTREE_STATS_INC(cur, killroot);
3546 
3547 #ifdef DEBUG
3548 	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3549 	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3550 	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3551 	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3552 #endif
3553 
3554 	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3555 	if (index) {
3556 		xfs_iroot_realloc(cur->bc_private.b.ip, index,
3557 				  cur->bc_private.b.whichfork);
3558 		block = ifp->if_broot;
3559 	}
3560 
3561 	be16_add_cpu(&block->bb_numrecs, index);
3562 	ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3563 
3564 	kp = xfs_btree_key_addr(cur, 1, block);
3565 	ckp = xfs_btree_key_addr(cur, 1, cblock);
3566 	xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3567 
3568 	pp = xfs_btree_ptr_addr(cur, 1, block);
3569 	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3570 
3571 	for (i = 0; i < numrecs; i++) {
3572 		error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
3573 		if (error)
3574 			return error;
3575 	}
3576 
3577 	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3578 
3579 	error = xfs_btree_free_block(cur, cbp);
3580 	if (error)
3581 		return error;
3582 
3583 	cur->bc_bufs[level - 1] = NULL;
3584 	be16_add_cpu(&block->bb_level, -1);
3585 	xfs_trans_log_inode(cur->bc_tp, ip,
3586 		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3587 	cur->bc_nlevels--;
3588 out0:
3589 	return 0;
3590 }
3591 
3592 /*
3593  * Kill the current root node, and replace it with it's only child node.
3594  */
3595 STATIC int
3596 xfs_btree_kill_root(
3597 	struct xfs_btree_cur	*cur,
3598 	struct xfs_buf		*bp,
3599 	int			level,
3600 	union xfs_btree_ptr	*newroot)
3601 {
3602 	int			error;
3603 
3604 	XFS_BTREE_STATS_INC(cur, killroot);
3605 
3606 	/*
3607 	 * Update the root pointer, decreasing the level by 1 and then
3608 	 * free the old root.
3609 	 */
3610 	cur->bc_ops->set_root(cur, newroot, -1);
3611 
3612 	error = xfs_btree_free_block(cur, bp);
3613 	if (error)
3614 		return error;
3615 
3616 	cur->bc_bufs[level] = NULL;
3617 	cur->bc_ra[level] = 0;
3618 	cur->bc_nlevels--;
3619 
3620 	return 0;
3621 }
3622 
3623 STATIC int
3624 xfs_btree_dec_cursor(
3625 	struct xfs_btree_cur	*cur,
3626 	int			level,
3627 	int			*stat)
3628 {
3629 	int			error;
3630 	int			i;
3631 
3632 	if (level > 0) {
3633 		error = xfs_btree_decrement(cur, level, &i);
3634 		if (error)
3635 			return error;
3636 	}
3637 
3638 	*stat = 1;
3639 	return 0;
3640 }
3641 
3642 /*
3643  * Single level of the btree record deletion routine.
3644  * Delete record pointed to by cur/level.
3645  * Remove the record from its block then rebalance the tree.
3646  * Return 0 for error, 1 for done, 2 to go on to the next level.
3647  */
3648 STATIC int					/* error */
3649 xfs_btree_delrec(
3650 	struct xfs_btree_cur	*cur,		/* btree cursor */
3651 	int			level,		/* level removing record from */
3652 	int			*stat)		/* fail/done/go-on */
3653 {
3654 	struct xfs_btree_block	*block;		/* btree block */
3655 	union xfs_btree_ptr	cptr;		/* current block ptr */
3656 	struct xfs_buf		*bp;		/* buffer for block */
3657 	int			error;		/* error return value */
3658 	int			i;		/* loop counter */
3659 	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
3660 	struct xfs_buf		*lbp;		/* left buffer pointer */
3661 	struct xfs_btree_block	*left;		/* left btree block */
3662 	int			lrecs = 0;	/* left record count */
3663 	int			ptr;		/* key/record index */
3664 	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
3665 	struct xfs_buf		*rbp;		/* right buffer pointer */
3666 	struct xfs_btree_block	*right;		/* right btree block */
3667 	struct xfs_btree_block	*rrblock;	/* right-right btree block */
3668 	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
3669 	int			rrecs = 0;	/* right record count */
3670 	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
3671 	int			numrecs;	/* temporary numrec count */
3672 
3673 	tcur = NULL;
3674 
3675 	/* Get the index of the entry being deleted, check for nothing there. */
3676 	ptr = cur->bc_ptrs[level];
3677 	if (ptr == 0) {
3678 		*stat = 0;
3679 		return 0;
3680 	}
3681 
3682 	/* Get the buffer & block containing the record or key/ptr. */
3683 	block = xfs_btree_get_block(cur, level, &bp);
3684 	numrecs = xfs_btree_get_numrecs(block);
3685 
3686 #ifdef DEBUG
3687 	error = xfs_btree_check_block(cur, block, level, bp);
3688 	if (error)
3689 		goto error0;
3690 #endif
3691 
3692 	/* Fail if we're off the end of the block. */
3693 	if (ptr > numrecs) {
3694 		*stat = 0;
3695 		return 0;
3696 	}
3697 
3698 	XFS_BTREE_STATS_INC(cur, delrec);
3699 	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3700 
3701 	/* Excise the entries being deleted. */
3702 	if (level > 0) {
3703 		/* It's a nonleaf. operate on keys and ptrs */
3704 		union xfs_btree_key	*lkp;
3705 		union xfs_btree_ptr	*lpp;
3706 
3707 		lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3708 		lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3709 
3710 		for (i = 0; i < numrecs - ptr; i++) {
3711 			error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
3712 			if (error)
3713 				goto error0;
3714 		}
3715 
3716 		if (ptr < numrecs) {
3717 			xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3718 			xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3719 			xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3720 			xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3721 		}
3722 	} else {
3723 		/* It's a leaf. operate on records */
3724 		if (ptr < numrecs) {
3725 			xfs_btree_shift_recs(cur,
3726 				xfs_btree_rec_addr(cur, ptr + 1, block),
3727 				-1, numrecs - ptr);
3728 			xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3729 		}
3730 	}
3731 
3732 	/*
3733 	 * Decrement and log the number of entries in the block.
3734 	 */
3735 	xfs_btree_set_numrecs(block, --numrecs);
3736 	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3737 
3738 	/*
3739 	 * If we are tracking the last record in the tree and
3740 	 * we are at the far right edge of the tree, update it.
3741 	 */
3742 	if (xfs_btree_is_lastrec(cur, block, level)) {
3743 		cur->bc_ops->update_lastrec(cur, block, NULL,
3744 					    ptr, LASTREC_DELREC);
3745 	}
3746 
3747 	/*
3748 	 * We're at the root level.  First, shrink the root block in-memory.
3749 	 * Try to get rid of the next level down.  If we can't then there's
3750 	 * nothing left to do.
3751 	 */
3752 	if (level == cur->bc_nlevels - 1) {
3753 		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3754 			xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3755 					  cur->bc_private.b.whichfork);
3756 
3757 			error = xfs_btree_kill_iroot(cur);
3758 			if (error)
3759 				goto error0;
3760 
3761 			error = xfs_btree_dec_cursor(cur, level, stat);
3762 			if (error)
3763 				goto error0;
3764 			*stat = 1;
3765 			return 0;
3766 		}
3767 
3768 		/*
3769 		 * If this is the root level, and there's only one entry left,
3770 		 * and it's NOT the leaf level, then we can get rid of this
3771 		 * level.
3772 		 */
3773 		if (numrecs == 1 && level > 0) {
3774 			union xfs_btree_ptr	*pp;
3775 			/*
3776 			 * pp is still set to the first pointer in the block.
3777 			 * Make it the new root of the btree.
3778 			 */
3779 			pp = xfs_btree_ptr_addr(cur, 1, block);
3780 			error = xfs_btree_kill_root(cur, bp, level, pp);
3781 			if (error)
3782 				goto error0;
3783 		} else if (level > 0) {
3784 			error = xfs_btree_dec_cursor(cur, level, stat);
3785 			if (error)
3786 				goto error0;
3787 		}
3788 		*stat = 1;
3789 		return 0;
3790 	}
3791 
3792 	/*
3793 	 * If we deleted the leftmost entry in the block, update the
3794 	 * key values above us in the tree.
3795 	 */
3796 	if (xfs_btree_needs_key_update(cur, ptr)) {
3797 		error = xfs_btree_update_keys(cur, level);
3798 		if (error)
3799 			goto error0;
3800 	}
3801 
3802 	/*
3803 	 * If the number of records remaining in the block is at least
3804 	 * the minimum, we're done.
3805 	 */
3806 	if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3807 		error = xfs_btree_dec_cursor(cur, level, stat);
3808 		if (error)
3809 			goto error0;
3810 		return 0;
3811 	}
3812 
3813 	/*
3814 	 * Otherwise, we have to move some records around to keep the
3815 	 * tree balanced.  Look at the left and right sibling blocks to
3816 	 * see if we can re-balance by moving only one record.
3817 	 */
3818 	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3819 	xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3820 
3821 	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3822 		/*
3823 		 * One child of root, need to get a chance to copy its contents
3824 		 * into the root and delete it. Can't go up to next level,
3825 		 * there's nothing to delete there.
3826 		 */
3827 		if (xfs_btree_ptr_is_null(cur, &rptr) &&
3828 		    xfs_btree_ptr_is_null(cur, &lptr) &&
3829 		    level == cur->bc_nlevels - 2) {
3830 			error = xfs_btree_kill_iroot(cur);
3831 			if (!error)
3832 				error = xfs_btree_dec_cursor(cur, level, stat);
3833 			if (error)
3834 				goto error0;
3835 			return 0;
3836 		}
3837 	}
3838 
3839 	ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3840 	       !xfs_btree_ptr_is_null(cur, &lptr));
3841 
3842 	/*
3843 	 * Duplicate the cursor so our btree manipulations here won't
3844 	 * disrupt the next level up.
3845 	 */
3846 	error = xfs_btree_dup_cursor(cur, &tcur);
3847 	if (error)
3848 		goto error0;
3849 
3850 	/*
3851 	 * If there's a right sibling, see if it's ok to shift an entry
3852 	 * out of it.
3853 	 */
3854 	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3855 		/*
3856 		 * Move the temp cursor to the last entry in the next block.
3857 		 * Actually any entry but the first would suffice.
3858 		 */
3859 		i = xfs_btree_lastrec(tcur, level);
3860 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3861 			error = -EFSCORRUPTED;
3862 			goto error0;
3863 		}
3864 
3865 		error = xfs_btree_increment(tcur, level, &i);
3866 		if (error)
3867 			goto error0;
3868 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3869 			error = -EFSCORRUPTED;
3870 			goto error0;
3871 		}
3872 
3873 		i = xfs_btree_lastrec(tcur, level);
3874 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3875 			error = -EFSCORRUPTED;
3876 			goto error0;
3877 		}
3878 
3879 		/* Grab a pointer to the block. */
3880 		right = xfs_btree_get_block(tcur, level, &rbp);
3881 #ifdef DEBUG
3882 		error = xfs_btree_check_block(tcur, right, level, rbp);
3883 		if (error)
3884 			goto error0;
3885 #endif
3886 		/* Grab the current block number, for future use. */
3887 		xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3888 
3889 		/*
3890 		 * If right block is full enough so that removing one entry
3891 		 * won't make it too empty, and left-shifting an entry out
3892 		 * of right to us works, we're done.
3893 		 */
3894 		if (xfs_btree_get_numrecs(right) - 1 >=
3895 		    cur->bc_ops->get_minrecs(tcur, level)) {
3896 			error = xfs_btree_lshift(tcur, level, &i);
3897 			if (error)
3898 				goto error0;
3899 			if (i) {
3900 				ASSERT(xfs_btree_get_numrecs(block) >=
3901 				       cur->bc_ops->get_minrecs(tcur, level));
3902 
3903 				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3904 				tcur = NULL;
3905 
3906 				error = xfs_btree_dec_cursor(cur, level, stat);
3907 				if (error)
3908 					goto error0;
3909 				return 0;
3910 			}
3911 		}
3912 
3913 		/*
3914 		 * Otherwise, grab the number of records in right for
3915 		 * future reference, and fix up the temp cursor to point
3916 		 * to our block again (last record).
3917 		 */
3918 		rrecs = xfs_btree_get_numrecs(right);
3919 		if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3920 			i = xfs_btree_firstrec(tcur, level);
3921 			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3922 				error = -EFSCORRUPTED;
3923 				goto error0;
3924 			}
3925 
3926 			error = xfs_btree_decrement(tcur, level, &i);
3927 			if (error)
3928 				goto error0;
3929 			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3930 				error = -EFSCORRUPTED;
3931 				goto error0;
3932 			}
3933 		}
3934 	}
3935 
3936 	/*
3937 	 * If there's a left sibling, see if it's ok to shift an entry
3938 	 * out of it.
3939 	 */
3940 	if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3941 		/*
3942 		 * Move the temp cursor to the first entry in the
3943 		 * previous block.
3944 		 */
3945 		i = xfs_btree_firstrec(tcur, level);
3946 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3947 			error = -EFSCORRUPTED;
3948 			goto error0;
3949 		}
3950 
3951 		error = xfs_btree_decrement(tcur, level, &i);
3952 		if (error)
3953 			goto error0;
3954 		i = xfs_btree_firstrec(tcur, level);
3955 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3956 			error = -EFSCORRUPTED;
3957 			goto error0;
3958 		}
3959 
3960 		/* Grab a pointer to the block. */
3961 		left = xfs_btree_get_block(tcur, level, &lbp);
3962 #ifdef DEBUG
3963 		error = xfs_btree_check_block(cur, left, level, lbp);
3964 		if (error)
3965 			goto error0;
3966 #endif
3967 		/* Grab the current block number, for future use. */
3968 		xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3969 
3970 		/*
3971 		 * If left block is full enough so that removing one entry
3972 		 * won't make it too empty, and right-shifting an entry out
3973 		 * of left to us works, we're done.
3974 		 */
3975 		if (xfs_btree_get_numrecs(left) - 1 >=
3976 		    cur->bc_ops->get_minrecs(tcur, level)) {
3977 			error = xfs_btree_rshift(tcur, level, &i);
3978 			if (error)
3979 				goto error0;
3980 			if (i) {
3981 				ASSERT(xfs_btree_get_numrecs(block) >=
3982 				       cur->bc_ops->get_minrecs(tcur, level));
3983 				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3984 				tcur = NULL;
3985 				if (level == 0)
3986 					cur->bc_ptrs[0]++;
3987 
3988 				*stat = 1;
3989 				return 0;
3990 			}
3991 		}
3992 
3993 		/*
3994 		 * Otherwise, grab the number of records in right for
3995 		 * future reference.
3996 		 */
3997 		lrecs = xfs_btree_get_numrecs(left);
3998 	}
3999 
4000 	/* Delete the temp cursor, we're done with it. */
4001 	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
4002 	tcur = NULL;
4003 
4004 	/* If here, we need to do a join to keep the tree balanced. */
4005 	ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
4006 
4007 	if (!xfs_btree_ptr_is_null(cur, &lptr) &&
4008 	    lrecs + xfs_btree_get_numrecs(block) <=
4009 			cur->bc_ops->get_maxrecs(cur, level)) {
4010 		/*
4011 		 * Set "right" to be the starting block,
4012 		 * "left" to be the left neighbor.
4013 		 */
4014 		rptr = cptr;
4015 		right = block;
4016 		rbp = bp;
4017 		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
4018 		if (error)
4019 			goto error0;
4020 
4021 	/*
4022 	 * If that won't work, see if we can join with the right neighbor block.
4023 	 */
4024 	} else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
4025 		   rrecs + xfs_btree_get_numrecs(block) <=
4026 			cur->bc_ops->get_maxrecs(cur, level)) {
4027 		/*
4028 		 * Set "left" to be the starting block,
4029 		 * "right" to be the right neighbor.
4030 		 */
4031 		lptr = cptr;
4032 		left = block;
4033 		lbp = bp;
4034 		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
4035 		if (error)
4036 			goto error0;
4037 
4038 	/*
4039 	 * Otherwise, we can't fix the imbalance.
4040 	 * Just return.  This is probably a logic error, but it's not fatal.
4041 	 */
4042 	} else {
4043 		error = xfs_btree_dec_cursor(cur, level, stat);
4044 		if (error)
4045 			goto error0;
4046 		return 0;
4047 	}
4048 
4049 	rrecs = xfs_btree_get_numrecs(right);
4050 	lrecs = xfs_btree_get_numrecs(left);
4051 
4052 	/*
4053 	 * We're now going to join "left" and "right" by moving all the stuff
4054 	 * in "right" to "left" and deleting "right".
4055 	 */
4056 	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
4057 	if (level > 0) {
4058 		/* It's a non-leaf.  Move keys and pointers. */
4059 		union xfs_btree_key	*lkp;	/* left btree key */
4060 		union xfs_btree_ptr	*lpp;	/* left address pointer */
4061 		union xfs_btree_key	*rkp;	/* right btree key */
4062 		union xfs_btree_ptr	*rpp;	/* right address pointer */
4063 
4064 		lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
4065 		lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
4066 		rkp = xfs_btree_key_addr(cur, 1, right);
4067 		rpp = xfs_btree_ptr_addr(cur, 1, right);
4068 
4069 		for (i = 1; i < rrecs; i++) {
4070 			error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
4071 			if (error)
4072 				goto error0;
4073 		}
4074 
4075 		xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
4076 		xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
4077 
4078 		xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
4079 		xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
4080 	} else {
4081 		/* It's a leaf.  Move records.  */
4082 		union xfs_btree_rec	*lrp;	/* left record pointer */
4083 		union xfs_btree_rec	*rrp;	/* right record pointer */
4084 
4085 		lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
4086 		rrp = xfs_btree_rec_addr(cur, 1, right);
4087 
4088 		xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
4089 		xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
4090 	}
4091 
4092 	XFS_BTREE_STATS_INC(cur, join);
4093 
4094 	/*
4095 	 * Fix up the number of records and right block pointer in the
4096 	 * surviving block, and log it.
4097 	 */
4098 	xfs_btree_set_numrecs(left, lrecs + rrecs);
4099 	xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
4100 	xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4101 	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
4102 
4103 	/* If there is a right sibling, point it to the remaining block. */
4104 	xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4105 	if (!xfs_btree_ptr_is_null(cur, &cptr)) {
4106 		error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
4107 		if (error)
4108 			goto error0;
4109 		xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
4110 		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
4111 	}
4112 
4113 	/* Free the deleted block. */
4114 	error = xfs_btree_free_block(cur, rbp);
4115 	if (error)
4116 		goto error0;
4117 
4118 	/*
4119 	 * If we joined with the left neighbor, set the buffer in the
4120 	 * cursor to the left block, and fix up the index.
4121 	 */
4122 	if (bp != lbp) {
4123 		cur->bc_bufs[level] = lbp;
4124 		cur->bc_ptrs[level] += lrecs;
4125 		cur->bc_ra[level] = 0;
4126 	}
4127 	/*
4128 	 * If we joined with the right neighbor and there's a level above
4129 	 * us, increment the cursor at that level.
4130 	 */
4131 	else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
4132 		   (level + 1 < cur->bc_nlevels)) {
4133 		error = xfs_btree_increment(cur, level + 1, &i);
4134 		if (error)
4135 			goto error0;
4136 	}
4137 
4138 	/*
4139 	 * Readjust the ptr at this level if it's not a leaf, since it's
4140 	 * still pointing at the deletion point, which makes the cursor
4141 	 * inconsistent.  If this makes the ptr 0, the caller fixes it up.
4142 	 * We can't use decrement because it would change the next level up.
4143 	 */
4144 	if (level > 0)
4145 		cur->bc_ptrs[level]--;
4146 
4147 	/*
4148 	 * We combined blocks, so we have to update the parent keys if the
4149 	 * btree supports overlapped intervals.  However, bc_ptrs[level + 1]
4150 	 * points to the old block so that the caller knows which record to
4151 	 * delete.  Therefore, the caller must be savvy enough to call updkeys
4152 	 * for us if we return stat == 2.  The other exit points from this
4153 	 * function don't require deletions further up the tree, so they can
4154 	 * call updkeys directly.
4155 	 */
4156 
4157 	/* Return value means the next level up has something to do. */
4158 	*stat = 2;
4159 	return 0;
4160 
4161 error0:
4162 	if (tcur)
4163 		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
4164 	return error;
4165 }
4166 
4167 /*
4168  * Delete the record pointed to by cur.
4169  * The cursor refers to the place where the record was (could be inserted)
4170  * when the operation returns.
4171  */
4172 int					/* error */
4173 xfs_btree_delete(
4174 	struct xfs_btree_cur	*cur,
4175 	int			*stat)	/* success/failure */
4176 {
4177 	int			error;	/* error return value */
4178 	int			level;
4179 	int			i;
4180 	bool			joined = false;
4181 
4182 	/*
4183 	 * Go up the tree, starting at leaf level.
4184 	 *
4185 	 * If 2 is returned then a join was done; go to the next level.
4186 	 * Otherwise we are done.
4187 	 */
4188 	for (level = 0, i = 2; i == 2; level++) {
4189 		error = xfs_btree_delrec(cur, level, &i);
4190 		if (error)
4191 			goto error0;
4192 		if (i == 2)
4193 			joined = true;
4194 	}
4195 
4196 	/*
4197 	 * If we combined blocks as part of deleting the record, delrec won't
4198 	 * have updated the parent high keys so we have to do that here.
4199 	 */
4200 	if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
4201 		error = xfs_btree_updkeys_force(cur, 0);
4202 		if (error)
4203 			goto error0;
4204 	}
4205 
4206 	if (i == 0) {
4207 		for (level = 1; level < cur->bc_nlevels; level++) {
4208 			if (cur->bc_ptrs[level] == 0) {
4209 				error = xfs_btree_decrement(cur, level, &i);
4210 				if (error)
4211 					goto error0;
4212 				break;
4213 			}
4214 		}
4215 	}
4216 
4217 	*stat = i;
4218 	return 0;
4219 error0:
4220 	return error;
4221 }
4222 
4223 /*
4224  * Get the data from the pointed-to record.
4225  */
4226 int					/* error */
4227 xfs_btree_get_rec(
4228 	struct xfs_btree_cur	*cur,	/* btree cursor */
4229 	union xfs_btree_rec	**recp,	/* output: btree record */
4230 	int			*stat)	/* output: success/failure */
4231 {
4232 	struct xfs_btree_block	*block;	/* btree block */
4233 	struct xfs_buf		*bp;	/* buffer pointer */
4234 	int			ptr;	/* record number */
4235 #ifdef DEBUG
4236 	int			error;	/* error return value */
4237 #endif
4238 
4239 	ptr = cur->bc_ptrs[0];
4240 	block = xfs_btree_get_block(cur, 0, &bp);
4241 
4242 #ifdef DEBUG
4243 	error = xfs_btree_check_block(cur, block, 0, bp);
4244 	if (error)
4245 		return error;
4246 #endif
4247 
4248 	/*
4249 	 * Off the right end or left end, return failure.
4250 	 */
4251 	if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
4252 		*stat = 0;
4253 		return 0;
4254 	}
4255 
4256 	/*
4257 	 * Point to the record and extract its data.
4258 	 */
4259 	*recp = xfs_btree_rec_addr(cur, ptr, block);
4260 	*stat = 1;
4261 	return 0;
4262 }
4263 
4264 /* Visit a block in a btree. */
4265 STATIC int
4266 xfs_btree_visit_block(
4267 	struct xfs_btree_cur		*cur,
4268 	int				level,
4269 	xfs_btree_visit_blocks_fn	fn,
4270 	void				*data)
4271 {
4272 	struct xfs_btree_block		*block;
4273 	struct xfs_buf			*bp;
4274 	union xfs_btree_ptr		rptr;
4275 	int				error;
4276 
4277 	/* do right sibling readahead */
4278 	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4279 	block = xfs_btree_get_block(cur, level, &bp);
4280 
4281 	/* process the block */
4282 	error = fn(cur, level, data);
4283 	if (error)
4284 		return error;
4285 
4286 	/* now read rh sibling block for next iteration */
4287 	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4288 	if (xfs_btree_ptr_is_null(cur, &rptr))
4289 		return -ENOENT;
4290 
4291 	return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4292 }
4293 
4294 
4295 /* Visit every block in a btree. */
4296 int
4297 xfs_btree_visit_blocks(
4298 	struct xfs_btree_cur		*cur,
4299 	xfs_btree_visit_blocks_fn	fn,
4300 	unsigned int			flags,
4301 	void				*data)
4302 {
4303 	union xfs_btree_ptr		lptr;
4304 	int				level;
4305 	struct xfs_btree_block		*block = NULL;
4306 	int				error = 0;
4307 
4308 	cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4309 
4310 	/* for each level */
4311 	for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4312 		/* grab the left hand block */
4313 		error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4314 		if (error)
4315 			return error;
4316 
4317 		/* readahead the left most block for the next level down */
4318 		if (level > 0) {
4319 			union xfs_btree_ptr     *ptr;
4320 
4321 			ptr = xfs_btree_ptr_addr(cur, 1, block);
4322 			xfs_btree_readahead_ptr(cur, ptr, 1);
4323 
4324 			/* save for the next iteration of the loop */
4325 			xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
4326 
4327 			if (!(flags & XFS_BTREE_VISIT_LEAVES))
4328 				continue;
4329 		} else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
4330 			continue;
4331 		}
4332 
4333 		/* for each buffer in the level */
4334 		do {
4335 			error = xfs_btree_visit_block(cur, level, fn, data);
4336 		} while (!error);
4337 
4338 		if (error != -ENOENT)
4339 			return error;
4340 	}
4341 
4342 	return 0;
4343 }
4344 
4345 /*
4346  * Change the owner of a btree.
4347  *
4348  * The mechanism we use here is ordered buffer logging. Because we don't know
4349  * how many buffers were are going to need to modify, we don't really want to
4350  * have to make transaction reservations for the worst case of every buffer in a
4351  * full size btree as that may be more space that we can fit in the log....
4352  *
4353  * We do the btree walk in the most optimal manner possible - we have sibling
4354  * pointers so we can just walk all the blocks on each level from left to right
4355  * in a single pass, and then move to the next level and do the same. We can
4356  * also do readahead on the sibling pointers to get IO moving more quickly,
4357  * though for slow disks this is unlikely to make much difference to performance
4358  * as the amount of CPU work we have to do before moving to the next block is
4359  * relatively small.
4360  *
4361  * For each btree block that we load, modify the owner appropriately, set the
4362  * buffer as an ordered buffer and log it appropriately. We need to ensure that
4363  * we mark the region we change dirty so that if the buffer is relogged in
4364  * a subsequent transaction the changes we make here as an ordered buffer are
4365  * correctly relogged in that transaction.  If we are in recovery context, then
4366  * just queue the modified buffer as delayed write buffer so the transaction
4367  * recovery completion writes the changes to disk.
4368  */
4369 struct xfs_btree_block_change_owner_info {
4370 	uint64_t		new_owner;
4371 	struct list_head	*buffer_list;
4372 };
4373 
4374 static int
4375 xfs_btree_block_change_owner(
4376 	struct xfs_btree_cur	*cur,
4377 	int			level,
4378 	void			*data)
4379 {
4380 	struct xfs_btree_block_change_owner_info	*bbcoi = data;
4381 	struct xfs_btree_block	*block;
4382 	struct xfs_buf		*bp;
4383 
4384 	/* modify the owner */
4385 	block = xfs_btree_get_block(cur, level, &bp);
4386 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
4387 		if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
4388 			return 0;
4389 		block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
4390 	} else {
4391 		if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
4392 			return 0;
4393 		block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
4394 	}
4395 
4396 	/*
4397 	 * If the block is a root block hosted in an inode, we might not have a
4398 	 * buffer pointer here and we shouldn't attempt to log the change as the
4399 	 * information is already held in the inode and discarded when the root
4400 	 * block is formatted into the on-disk inode fork. We still change it,
4401 	 * though, so everything is consistent in memory.
4402 	 */
4403 	if (!bp) {
4404 		ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4405 		ASSERT(level == cur->bc_nlevels - 1);
4406 		return 0;
4407 	}
4408 
4409 	if (cur->bc_tp) {
4410 		if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
4411 			xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4412 			return -EAGAIN;
4413 		}
4414 	} else {
4415 		xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
4416 	}
4417 
4418 	return 0;
4419 }
4420 
4421 int
4422 xfs_btree_change_owner(
4423 	struct xfs_btree_cur	*cur,
4424 	uint64_t		new_owner,
4425 	struct list_head	*buffer_list)
4426 {
4427 	struct xfs_btree_block_change_owner_info	bbcoi;
4428 
4429 	bbcoi.new_owner = new_owner;
4430 	bbcoi.buffer_list = buffer_list;
4431 
4432 	return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
4433 			XFS_BTREE_VISIT_ALL, &bbcoi);
4434 }
4435 
4436 /* Verify the v5 fields of a long-format btree block. */
4437 xfs_failaddr_t
4438 xfs_btree_lblock_v5hdr_verify(
4439 	struct xfs_buf		*bp,
4440 	uint64_t		owner)
4441 {
4442 	struct xfs_mount	*mp = bp->b_mount;
4443 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4444 
4445 	if (!xfs_sb_version_hascrc(&mp->m_sb))
4446 		return __this_address;
4447 	if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
4448 		return __this_address;
4449 	if (block->bb_u.l.bb_blkno != cpu_to_be64(bp->b_bn))
4450 		return __this_address;
4451 	if (owner != XFS_RMAP_OWN_UNKNOWN &&
4452 	    be64_to_cpu(block->bb_u.l.bb_owner) != owner)
4453 		return __this_address;
4454 	return NULL;
4455 }
4456 
4457 /* Verify a long-format btree block. */
4458 xfs_failaddr_t
4459 xfs_btree_lblock_verify(
4460 	struct xfs_buf		*bp,
4461 	unsigned int		max_recs)
4462 {
4463 	struct xfs_mount	*mp = bp->b_mount;
4464 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4465 
4466 	/* numrecs verification */
4467 	if (be16_to_cpu(block->bb_numrecs) > max_recs)
4468 		return __this_address;
4469 
4470 	/* sibling pointer verification */
4471 	if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
4472 	    !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
4473 		return __this_address;
4474 	if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
4475 	    !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
4476 		return __this_address;
4477 
4478 	return NULL;
4479 }
4480 
4481 /**
4482  * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
4483  *				      btree block
4484  *
4485  * @bp: buffer containing the btree block
4486  */
4487 xfs_failaddr_t
4488 xfs_btree_sblock_v5hdr_verify(
4489 	struct xfs_buf		*bp)
4490 {
4491 	struct xfs_mount	*mp = bp->b_mount;
4492 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4493 	struct xfs_perag	*pag = bp->b_pag;
4494 
4495 	if (!xfs_sb_version_hascrc(&mp->m_sb))
4496 		return __this_address;
4497 	if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4498 		return __this_address;
4499 	if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
4500 		return __this_address;
4501 	if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
4502 		return __this_address;
4503 	return NULL;
4504 }
4505 
4506 /**
4507  * xfs_btree_sblock_verify() -- verify a short-format btree block
4508  *
4509  * @bp: buffer containing the btree block
4510  * @max_recs: maximum records allowed in this btree node
4511  */
4512 xfs_failaddr_t
4513 xfs_btree_sblock_verify(
4514 	struct xfs_buf		*bp,
4515 	unsigned int		max_recs)
4516 {
4517 	struct xfs_mount	*mp = bp->b_mount;
4518 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
4519 	xfs_agblock_t		agno;
4520 
4521 	/* numrecs verification */
4522 	if (be16_to_cpu(block->bb_numrecs) > max_recs)
4523 		return __this_address;
4524 
4525 	/* sibling pointer verification */
4526 	agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp));
4527 	if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
4528 	    !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
4529 		return __this_address;
4530 	if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
4531 	    !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
4532 		return __this_address;
4533 
4534 	return NULL;
4535 }
4536 
4537 /*
4538  * Calculate the number of btree levels needed to store a given number of
4539  * records in a short-format btree.
4540  */
4541 uint
4542 xfs_btree_compute_maxlevels(
4543 	uint			*limits,
4544 	unsigned long		len)
4545 {
4546 	uint			level;
4547 	unsigned long		maxblocks;
4548 
4549 	maxblocks = (len + limits[0] - 1) / limits[0];
4550 	for (level = 1; maxblocks > 1; level++)
4551 		maxblocks = (maxblocks + limits[1] - 1) / limits[1];
4552 	return level;
4553 }
4554 
4555 /*
4556  * Query a regular btree for all records overlapping a given interval.
4557  * Start with a LE lookup of the key of low_rec and return all records
4558  * until we find a record with a key greater than the key of high_rec.
4559  */
4560 STATIC int
4561 xfs_btree_simple_query_range(
4562 	struct xfs_btree_cur		*cur,
4563 	union xfs_btree_key		*low_key,
4564 	union xfs_btree_key		*high_key,
4565 	xfs_btree_query_range_fn	fn,
4566 	void				*priv)
4567 {
4568 	union xfs_btree_rec		*recp;
4569 	union xfs_btree_key		rec_key;
4570 	int64_t				diff;
4571 	int				stat;
4572 	bool				firstrec = true;
4573 	int				error;
4574 
4575 	ASSERT(cur->bc_ops->init_high_key_from_rec);
4576 	ASSERT(cur->bc_ops->diff_two_keys);
4577 
4578 	/*
4579 	 * Find the leftmost record.  The btree cursor must be set
4580 	 * to the low record used to generate low_key.
4581 	 */
4582 	stat = 0;
4583 	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
4584 	if (error)
4585 		goto out;
4586 
4587 	/* Nothing?  See if there's anything to the right. */
4588 	if (!stat) {
4589 		error = xfs_btree_increment(cur, 0, &stat);
4590 		if (error)
4591 			goto out;
4592 	}
4593 
4594 	while (stat) {
4595 		/* Find the record. */
4596 		error = xfs_btree_get_rec(cur, &recp, &stat);
4597 		if (error || !stat)
4598 			break;
4599 
4600 		/* Skip if high_key(rec) < low_key. */
4601 		if (firstrec) {
4602 			cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
4603 			firstrec = false;
4604 			diff = cur->bc_ops->diff_two_keys(cur, low_key,
4605 					&rec_key);
4606 			if (diff > 0)
4607 				goto advloop;
4608 		}
4609 
4610 		/* Stop if high_key < low_key(rec). */
4611 		cur->bc_ops->init_key_from_rec(&rec_key, recp);
4612 		diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
4613 		if (diff > 0)
4614 			break;
4615 
4616 		/* Callback */
4617 		error = fn(cur, recp, priv);
4618 		if (error)
4619 			break;
4620 
4621 advloop:
4622 		/* Move on to the next record. */
4623 		error = xfs_btree_increment(cur, 0, &stat);
4624 		if (error)
4625 			break;
4626 	}
4627 
4628 out:
4629 	return error;
4630 }
4631 
4632 /*
4633  * Query an overlapped interval btree for all records overlapping a given
4634  * interval.  This function roughly follows the algorithm given in
4635  * "Interval Trees" of _Introduction to Algorithms_, which is section
4636  * 14.3 in the 2nd and 3rd editions.
4637  *
4638  * First, generate keys for the low and high records passed in.
4639  *
4640  * For any leaf node, generate the high and low keys for the record.
4641  * If the record keys overlap with the query low/high keys, pass the
4642  * record to the function iterator.
4643  *
4644  * For any internal node, compare the low and high keys of each
4645  * pointer against the query low/high keys.  If there's an overlap,
4646  * follow the pointer.
4647  *
4648  * As an optimization, we stop scanning a block when we find a low key
4649  * that is greater than the query's high key.
4650  */
4651 STATIC int
4652 xfs_btree_overlapped_query_range(
4653 	struct xfs_btree_cur		*cur,
4654 	union xfs_btree_key		*low_key,
4655 	union xfs_btree_key		*high_key,
4656 	xfs_btree_query_range_fn	fn,
4657 	void				*priv)
4658 {
4659 	union xfs_btree_ptr		ptr;
4660 	union xfs_btree_ptr		*pp;
4661 	union xfs_btree_key		rec_key;
4662 	union xfs_btree_key		rec_hkey;
4663 	union xfs_btree_key		*lkp;
4664 	union xfs_btree_key		*hkp;
4665 	union xfs_btree_rec		*recp;
4666 	struct xfs_btree_block		*block;
4667 	int64_t				ldiff;
4668 	int64_t				hdiff;
4669 	int				level;
4670 	struct xfs_buf			*bp;
4671 	int				i;
4672 	int				error;
4673 
4674 	/* Load the root of the btree. */
4675 	level = cur->bc_nlevels - 1;
4676 	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
4677 	error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
4678 	if (error)
4679 		return error;
4680 	xfs_btree_get_block(cur, level, &bp);
4681 	trace_xfs_btree_overlapped_query_range(cur, level, bp);
4682 #ifdef DEBUG
4683 	error = xfs_btree_check_block(cur, block, level, bp);
4684 	if (error)
4685 		goto out;
4686 #endif
4687 	cur->bc_ptrs[level] = 1;
4688 
4689 	while (level < cur->bc_nlevels) {
4690 		block = xfs_btree_get_block(cur, level, &bp);
4691 
4692 		/* End of node, pop back towards the root. */
4693 		if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
4694 pop_up:
4695 			if (level < cur->bc_nlevels - 1)
4696 				cur->bc_ptrs[level + 1]++;
4697 			level++;
4698 			continue;
4699 		}
4700 
4701 		if (level == 0) {
4702 			/* Handle a leaf node. */
4703 			recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
4704 
4705 			cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
4706 			ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
4707 					low_key);
4708 
4709 			cur->bc_ops->init_key_from_rec(&rec_key, recp);
4710 			hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
4711 					&rec_key);
4712 
4713 			/*
4714 			 * If (record's high key >= query's low key) and
4715 			 *    (query's high key >= record's low key), then
4716 			 * this record overlaps the query range; callback.
4717 			 */
4718 			if (ldiff >= 0 && hdiff >= 0) {
4719 				error = fn(cur, recp, priv);
4720 				if (error)
4721 					break;
4722 			} else if (hdiff < 0) {
4723 				/* Record is larger than high key; pop. */
4724 				goto pop_up;
4725 			}
4726 			cur->bc_ptrs[level]++;
4727 			continue;
4728 		}
4729 
4730 		/* Handle an internal node. */
4731 		lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
4732 		hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
4733 		pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
4734 
4735 		ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
4736 		hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
4737 
4738 		/*
4739 		 * If (pointer's high key >= query's low key) and
4740 		 *    (query's high key >= pointer's low key), then
4741 		 * this record overlaps the query range; follow pointer.
4742 		 */
4743 		if (ldiff >= 0 && hdiff >= 0) {
4744 			level--;
4745 			error = xfs_btree_lookup_get_block(cur, level, pp,
4746 					&block);
4747 			if (error)
4748 				goto out;
4749 			xfs_btree_get_block(cur, level, &bp);
4750 			trace_xfs_btree_overlapped_query_range(cur, level, bp);
4751 #ifdef DEBUG
4752 			error = xfs_btree_check_block(cur, block, level, bp);
4753 			if (error)
4754 				goto out;
4755 #endif
4756 			cur->bc_ptrs[level] = 1;
4757 			continue;
4758 		} else if (hdiff < 0) {
4759 			/* The low key is larger than the upper range; pop. */
4760 			goto pop_up;
4761 		}
4762 		cur->bc_ptrs[level]++;
4763 	}
4764 
4765 out:
4766 	/*
4767 	 * If we don't end this function with the cursor pointing at a record
4768 	 * block, a subsequent non-error cursor deletion will not release
4769 	 * node-level buffers, causing a buffer leak.  This is quite possible
4770 	 * with a zero-results range query, so release the buffers if we
4771 	 * failed to return any results.
4772 	 */
4773 	if (cur->bc_bufs[0] == NULL) {
4774 		for (i = 0; i < cur->bc_nlevels; i++) {
4775 			if (cur->bc_bufs[i]) {
4776 				xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
4777 				cur->bc_bufs[i] = NULL;
4778 				cur->bc_ptrs[i] = 0;
4779 				cur->bc_ra[i] = 0;
4780 			}
4781 		}
4782 	}
4783 
4784 	return error;
4785 }
4786 
4787 /*
4788  * Query a btree for all records overlapping a given interval of keys.  The
4789  * supplied function will be called with each record found; return one of the
4790  * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
4791  * code.  This function returns -ECANCELED, zero, or a negative error code.
4792  */
4793 int
4794 xfs_btree_query_range(
4795 	struct xfs_btree_cur		*cur,
4796 	union xfs_btree_irec		*low_rec,
4797 	union xfs_btree_irec		*high_rec,
4798 	xfs_btree_query_range_fn	fn,
4799 	void				*priv)
4800 {
4801 	union xfs_btree_rec		rec;
4802 	union xfs_btree_key		low_key;
4803 	union xfs_btree_key		high_key;
4804 
4805 	/* Find the keys of both ends of the interval. */
4806 	cur->bc_rec = *high_rec;
4807 	cur->bc_ops->init_rec_from_cur(cur, &rec);
4808 	cur->bc_ops->init_key_from_rec(&high_key, &rec);
4809 
4810 	cur->bc_rec = *low_rec;
4811 	cur->bc_ops->init_rec_from_cur(cur, &rec);
4812 	cur->bc_ops->init_key_from_rec(&low_key, &rec);
4813 
4814 	/* Enforce low key < high key. */
4815 	if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
4816 		return -EINVAL;
4817 
4818 	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
4819 		return xfs_btree_simple_query_range(cur, &low_key,
4820 				&high_key, fn, priv);
4821 	return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
4822 			fn, priv);
4823 }
4824 
4825 /* Query a btree for all records. */
4826 int
4827 xfs_btree_query_all(
4828 	struct xfs_btree_cur		*cur,
4829 	xfs_btree_query_range_fn	fn,
4830 	void				*priv)
4831 {
4832 	union xfs_btree_key		low_key;
4833 	union xfs_btree_key		high_key;
4834 
4835 	memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
4836 	memset(&low_key, 0, sizeof(low_key));
4837 	memset(&high_key, 0xFF, sizeof(high_key));
4838 
4839 	return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
4840 }
4841 
4842 /*
4843  * Calculate the number of blocks needed to store a given number of records
4844  * in a short-format (per-AG metadata) btree.
4845  */
4846 unsigned long long
4847 xfs_btree_calc_size(
4848 	uint			*limits,
4849 	unsigned long long	len)
4850 {
4851 	int			level;
4852 	int			maxrecs;
4853 	unsigned long long	rval;
4854 
4855 	maxrecs = limits[0];
4856 	for (level = 0, rval = 0; len > 1; level++) {
4857 		len += maxrecs - 1;
4858 		do_div(len, maxrecs);
4859 		maxrecs = limits[1];
4860 		rval += len;
4861 	}
4862 	return rval;
4863 }
4864 
4865 static int
4866 xfs_btree_count_blocks_helper(
4867 	struct xfs_btree_cur	*cur,
4868 	int			level,
4869 	void			*data)
4870 {
4871 	xfs_extlen_t		*blocks = data;
4872 	(*blocks)++;
4873 
4874 	return 0;
4875 }
4876 
4877 /* Count the blocks in a btree and return the result in *blocks. */
4878 int
4879 xfs_btree_count_blocks(
4880 	struct xfs_btree_cur	*cur,
4881 	xfs_extlen_t		*blocks)
4882 {
4883 	*blocks = 0;
4884 	return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
4885 			XFS_BTREE_VISIT_ALL, blocks);
4886 }
4887 
4888 /* Compare two btree pointers. */
4889 int64_t
4890 xfs_btree_diff_two_ptrs(
4891 	struct xfs_btree_cur		*cur,
4892 	const union xfs_btree_ptr	*a,
4893 	const union xfs_btree_ptr	*b)
4894 {
4895 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4896 		return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
4897 	return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
4898 }
4899 
4900 /* If there's an extent, we're done. */
4901 STATIC int
4902 xfs_btree_has_record_helper(
4903 	struct xfs_btree_cur		*cur,
4904 	union xfs_btree_rec		*rec,
4905 	void				*priv)
4906 {
4907 	return -ECANCELED;
4908 }
4909 
4910 /* Is there a record covering a given range of keys? */
4911 int
4912 xfs_btree_has_record(
4913 	struct xfs_btree_cur	*cur,
4914 	union xfs_btree_irec	*low,
4915 	union xfs_btree_irec	*high,
4916 	bool			*exists)
4917 {
4918 	int			error;
4919 
4920 	error = xfs_btree_query_range(cur, low, high,
4921 			&xfs_btree_has_record_helper, NULL);
4922 	if (error == -ECANCELED) {
4923 		*exists = true;
4924 		return 0;
4925 	}
4926 	*exists = false;
4927 	return error;
4928 }
4929 
4930 /* Are there more records in this btree? */
4931 bool
4932 xfs_btree_has_more_records(
4933 	struct xfs_btree_cur	*cur)
4934 {
4935 	struct xfs_btree_block	*block;
4936 	struct xfs_buf		*bp;
4937 
4938 	block = xfs_btree_get_block(cur, 0, &bp);
4939 
4940 	/* There are still records in this block. */
4941 	if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block))
4942 		return true;
4943 
4944 	/* There are more record blocks. */
4945 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4946 		return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
4947 	else
4948 		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
4949 }
4950