xref: /linux/fs/xfs/libxfs/xfs_rtrmap_btree.c (revision 3f2a5ba784b808109cac0aac921213e43143a216)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_bit.h"
13 #include "xfs_sb.h"
14 #include "xfs_mount.h"
15 #include "xfs_defer.h"
16 #include "xfs_inode.h"
17 #include "xfs_trans.h"
18 #include "xfs_alloc.h"
19 #include "xfs_btree.h"
20 #include "xfs_btree_staging.h"
21 #include "xfs_metafile.h"
22 #include "xfs_rmap.h"
23 #include "xfs_rtrmap_btree.h"
24 #include "xfs_trace.h"
25 #include "xfs_cksum.h"
26 #include "xfs_error.h"
27 #include "xfs_extent_busy.h"
28 #include "xfs_rtgroup.h"
29 #include "xfs_bmap.h"
30 #include "xfs_health.h"
31 #include "xfs_buf_mem.h"
32 #include "xfs_btree_mem.h"
33 
34 static struct kmem_cache	*xfs_rtrmapbt_cur_cache;
35 
36 /*
37  * Realtime Reverse Map btree.
38  *
39  * This is a btree used to track the owner(s) of a given extent in the realtime
40  * device.  See the comments in xfs_rmap_btree.c for more information.
41  *
42  * This tree is basically the same as the regular rmap btree except that it
43  * is rooted in an inode and does not live in free space.
44  */
45 
46 static struct xfs_btree_cur *
47 xfs_rtrmapbt_dup_cursor(
48 	struct xfs_btree_cur	*cur)
49 {
50 	return xfs_rtrmapbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group));
51 }
52 
53 STATIC int
54 xfs_rtrmapbt_get_minrecs(
55 	struct xfs_btree_cur	*cur,
56 	int			level)
57 {
58 	if (level == cur->bc_nlevels - 1) {
59 		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
60 
61 		return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
62 				level == 0) / 2;
63 	}
64 
65 	return cur->bc_mp->m_rtrmap_mnr[level != 0];
66 }
67 
68 STATIC int
69 xfs_rtrmapbt_get_maxrecs(
70 	struct xfs_btree_cur	*cur,
71 	int			level)
72 {
73 	if (level == cur->bc_nlevels - 1) {
74 		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
75 
76 		return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
77 				level == 0);
78 	}
79 
80 	return cur->bc_mp->m_rtrmap_mxr[level != 0];
81 }
82 
83 /* Calculate number of records in the ondisk realtime rmap btree inode root. */
84 unsigned int
85 xfs_rtrmapbt_droot_maxrecs(
86 	unsigned int		blocklen,
87 	bool			leaf)
88 {
89 	blocklen -= sizeof(struct xfs_rtrmap_root);
90 
91 	if (leaf)
92 		return blocklen / sizeof(struct xfs_rmap_rec);
93 	return blocklen / (2 * sizeof(struct xfs_rmap_key) +
94 			sizeof(xfs_rtrmap_ptr_t));
95 }
96 
97 /*
98  * Get the maximum records we could store in the on-disk format.
99  *
100  * For non-root nodes this is equivalent to xfs_rtrmapbt_get_maxrecs, but
101  * for the root node this checks the available space in the dinode fork
102  * so that we can resize the in-memory buffer to match it.  After a
103  * resize to the maximum size this function returns the same value
104  * as xfs_rtrmapbt_get_maxrecs for the root node, too.
105  */
106 STATIC int
107 xfs_rtrmapbt_get_dmaxrecs(
108 	struct xfs_btree_cur	*cur,
109 	int			level)
110 {
111 	if (level != cur->bc_nlevels - 1)
112 		return cur->bc_mp->m_rtrmap_mxr[level != 0];
113 	return xfs_rtrmapbt_droot_maxrecs(cur->bc_ino.forksize, level == 0);
114 }
115 
116 /*
117  * Convert the ondisk record's offset field into the ondisk key's offset field.
118  * Fork and bmbt are significant parts of the rmap record key, but written
119  * status is merely a record attribute.
120  */
121 static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
122 {
123 	return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
124 }
125 
126 STATIC void
127 xfs_rtrmapbt_init_key_from_rec(
128 	union xfs_btree_key		*key,
129 	const union xfs_btree_rec	*rec)
130 {
131 	key->rmap.rm_startblock = rec->rmap.rm_startblock;
132 	key->rmap.rm_owner = rec->rmap.rm_owner;
133 	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
134 }
135 
136 STATIC void
137 xfs_rtrmapbt_init_high_key_from_rec(
138 	union xfs_btree_key		*key,
139 	const union xfs_btree_rec	*rec)
140 {
141 	uint64_t			off;
142 	int				adj;
143 
144 	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
145 
146 	key->rmap.rm_startblock = rec->rmap.rm_startblock;
147 	be32_add_cpu(&key->rmap.rm_startblock, adj);
148 	key->rmap.rm_owner = rec->rmap.rm_owner;
149 	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
150 	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
151 	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
152 		return;
153 	off = be64_to_cpu(key->rmap.rm_offset);
154 	off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
155 	key->rmap.rm_offset = cpu_to_be64(off);
156 }
157 
158 STATIC void
159 xfs_rtrmapbt_init_rec_from_cur(
160 	struct xfs_btree_cur	*cur,
161 	union xfs_btree_rec	*rec)
162 {
163 	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
164 	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
165 	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
166 	rec->rmap.rm_offset = cpu_to_be64(
167 			xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
168 }
169 
170 STATIC void
171 xfs_rtrmapbt_init_ptr_from_cur(
172 	struct xfs_btree_cur	*cur,
173 	union xfs_btree_ptr	*ptr)
174 {
175 	ptr->l = 0;
176 }
177 
178 /*
179  * Mask the appropriate parts of the ondisk key field for a key comparison.
180  * Fork and bmbt are significant parts of the rmap record key, but written
181  * status is merely a record attribute.
182  */
183 static inline uint64_t offset_keymask(uint64_t offset)
184 {
185 	return offset & ~XFS_RMAP_OFF_UNWRITTEN;
186 }
187 
188 STATIC int
189 xfs_rtrmapbt_cmp_key_with_cur(
190 	struct xfs_btree_cur		*cur,
191 	const union xfs_btree_key	*key)
192 {
193 	struct xfs_rmap_irec		*rec = &cur->bc_rec.r;
194 	const struct xfs_rmap_key	*kp = &key->rmap;
195 
196 	return cmp_int(be32_to_cpu(kp->rm_startblock), rec->rm_startblock) ?:
197 	       cmp_int(be64_to_cpu(kp->rm_owner), rec->rm_owner) ?:
198 	       cmp_int(offset_keymask(be64_to_cpu(kp->rm_offset)),
199 		       offset_keymask(xfs_rmap_irec_offset_pack(rec)));
200 }
201 
202 STATIC int
203 xfs_rtrmapbt_cmp_two_keys(
204 	struct xfs_btree_cur		*cur,
205 	const union xfs_btree_key	*k1,
206 	const union xfs_btree_key	*k2,
207 	const union xfs_btree_key	*mask)
208 {
209 	const struct xfs_rmap_key	*kp1 = &k1->rmap;
210 	const struct xfs_rmap_key	*kp2 = &k2->rmap;
211 	int				d;
212 
213 	/* Doesn't make sense to mask off the physical space part */
214 	ASSERT(!mask || mask->rmap.rm_startblock);
215 
216 	d = cmp_int(be32_to_cpu(kp1->rm_startblock),
217 		    be32_to_cpu(kp2->rm_startblock));
218 	if (d)
219 		return d;
220 
221 	if (!mask || mask->rmap.rm_owner) {
222 		d = cmp_int(be64_to_cpu(kp1->rm_owner),
223 			    be64_to_cpu(kp2->rm_owner));
224 		if (d)
225 			return d;
226 	}
227 
228 	if (!mask || mask->rmap.rm_offset) {
229 		/* Doesn't make sense to allow offset but not owner */
230 		ASSERT(!mask || mask->rmap.rm_owner);
231 
232 		d = cmp_int(offset_keymask(be64_to_cpu(kp1->rm_offset)),
233 			    offset_keymask(be64_to_cpu(kp2->rm_offset)));
234 		if (d)
235 			return d;
236 	}
237 
238 	return 0;
239 }
240 
241 static xfs_failaddr_t
242 xfs_rtrmapbt_verify(
243 	struct xfs_buf		*bp)
244 {
245 	struct xfs_mount	*mp = bp->b_target->bt_mount;
246 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
247 	xfs_failaddr_t		fa;
248 	int			level;
249 
250 	if (!xfs_verify_magic(bp, block->bb_magic))
251 		return __this_address;
252 
253 	if (!xfs_has_rmapbt(mp))
254 		return __this_address;
255 	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
256 	if (fa)
257 		return fa;
258 	level = be16_to_cpu(block->bb_level);
259 	if (level > mp->m_rtrmap_maxlevels)
260 		return __this_address;
261 
262 	return xfs_btree_fsblock_verify(bp, mp->m_rtrmap_mxr[level != 0]);
263 }
264 
265 static void
266 xfs_rtrmapbt_read_verify(
267 	struct xfs_buf	*bp)
268 {
269 	xfs_failaddr_t	fa;
270 
271 	if (!xfs_btree_fsblock_verify_crc(bp))
272 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
273 	else {
274 		fa = xfs_rtrmapbt_verify(bp);
275 		if (fa)
276 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
277 	}
278 
279 	if (bp->b_error)
280 		trace_xfs_btree_corrupt(bp, _RET_IP_);
281 }
282 
283 static void
284 xfs_rtrmapbt_write_verify(
285 	struct xfs_buf	*bp)
286 {
287 	xfs_failaddr_t	fa;
288 
289 	fa = xfs_rtrmapbt_verify(bp);
290 	if (fa) {
291 		trace_xfs_btree_corrupt(bp, _RET_IP_);
292 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
293 		return;
294 	}
295 	xfs_btree_fsblock_calc_crc(bp);
296 
297 }
298 
299 const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = {
300 	.name			= "xfs_rtrmapbt",
301 	.magic			= { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) },
302 	.verify_read		= xfs_rtrmapbt_read_verify,
303 	.verify_write		= xfs_rtrmapbt_write_verify,
304 	.verify_struct		= xfs_rtrmapbt_verify,
305 };
306 
307 STATIC int
308 xfs_rtrmapbt_keys_inorder(
309 	struct xfs_btree_cur		*cur,
310 	const union xfs_btree_key	*k1,
311 	const union xfs_btree_key	*k2)
312 {
313 	uint32_t			x;
314 	uint32_t			y;
315 	uint64_t			a;
316 	uint64_t			b;
317 
318 	x = be32_to_cpu(k1->rmap.rm_startblock);
319 	y = be32_to_cpu(k2->rmap.rm_startblock);
320 	if (x < y)
321 		return 1;
322 	else if (x > y)
323 		return 0;
324 	a = be64_to_cpu(k1->rmap.rm_owner);
325 	b = be64_to_cpu(k2->rmap.rm_owner);
326 	if (a < b)
327 		return 1;
328 	else if (a > b)
329 		return 0;
330 	a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
331 	b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
332 	if (a <= b)
333 		return 1;
334 	return 0;
335 }
336 
337 STATIC int
338 xfs_rtrmapbt_recs_inorder(
339 	struct xfs_btree_cur		*cur,
340 	const union xfs_btree_rec	*r1,
341 	const union xfs_btree_rec	*r2)
342 {
343 	uint32_t			x;
344 	uint32_t			y;
345 	uint64_t			a;
346 	uint64_t			b;
347 
348 	x = be32_to_cpu(r1->rmap.rm_startblock);
349 	y = be32_to_cpu(r2->rmap.rm_startblock);
350 	if (x < y)
351 		return 1;
352 	else if (x > y)
353 		return 0;
354 	a = be64_to_cpu(r1->rmap.rm_owner);
355 	b = be64_to_cpu(r2->rmap.rm_owner);
356 	if (a < b)
357 		return 1;
358 	else if (a > b)
359 		return 0;
360 	a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
361 	b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
362 	if (a <= b)
363 		return 1;
364 	return 0;
365 }
366 
367 STATIC enum xbtree_key_contig
368 xfs_rtrmapbt_keys_contiguous(
369 	struct xfs_btree_cur		*cur,
370 	const union xfs_btree_key	*key1,
371 	const union xfs_btree_key	*key2,
372 	const union xfs_btree_key	*mask)
373 {
374 	ASSERT(!mask || mask->rmap.rm_startblock);
375 
376 	/*
377 	 * We only support checking contiguity of the physical space component.
378 	 * If any callers ever need more specificity than that, they'll have to
379 	 * implement it here.
380 	 */
381 	ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));
382 
383 	return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
384 				 be32_to_cpu(key2->rmap.rm_startblock));
385 }
386 
387 static inline void
388 xfs_rtrmapbt_move_ptrs(
389 	struct xfs_mount	*mp,
390 	struct xfs_btree_block	*broot,
391 	short			old_size,
392 	size_t			new_size,
393 	unsigned int		numrecs)
394 {
395 	void			*dptr;
396 	void			*sptr;
397 
398 	sptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, old_size);
399 	dptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, new_size);
400 	memmove(dptr, sptr, numrecs * sizeof(xfs_rtrmap_ptr_t));
401 }
402 
403 static struct xfs_btree_block *
404 xfs_rtrmapbt_broot_realloc(
405 	struct xfs_btree_cur	*cur,
406 	unsigned int		new_numrecs)
407 {
408 	struct xfs_mount	*mp = cur->bc_mp;
409 	struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
410 	struct xfs_btree_block	*broot;
411 	unsigned int		new_size;
412 	unsigned int		old_size = ifp->if_broot_bytes;
413 	const unsigned int	level = cur->bc_nlevels - 1;
414 
415 	new_size = xfs_rtrmap_broot_space_calc(mp, level, new_numrecs);
416 
417 	/* Handle the nop case quietly. */
418 	if (new_size == old_size)
419 		return ifp->if_broot;
420 
421 	if (new_size > old_size) {
422 		unsigned int	old_numrecs;
423 
424 		/*
425 		 * If there wasn't any memory allocated before, just allocate
426 		 * it now and get out.
427 		 */
428 		if (old_size == 0)
429 			return xfs_broot_realloc(ifp, new_size);
430 
431 		/*
432 		 * If there is already an existing if_broot, then we need to
433 		 * realloc it and possibly move the node block pointers because
434 		 * those are not butted up against the btree block header.
435 		 */
436 		old_numrecs = xfs_rtrmapbt_maxrecs(mp, old_size, level == 0);
437 		broot = xfs_broot_realloc(ifp, new_size);
438 		if (level > 0)
439 			xfs_rtrmapbt_move_ptrs(mp, broot, old_size, new_size,
440 					old_numrecs);
441 		goto out_broot;
442 	}
443 
444 	/*
445 	 * We're reducing numrecs.  If we're going all the way to zero, just
446 	 * free the block.
447 	 */
448 	ASSERT(ifp->if_broot != NULL && old_size > 0);
449 	if (new_size == 0)
450 		return xfs_broot_realloc(ifp, 0);
451 
452 	/*
453 	 * Shrink the btree root by possibly moving the rtrmapbt pointers,
454 	 * since they are not butted up against the btree block header.  Then
455 	 * reallocate broot.
456 	 */
457 	if (level > 0)
458 		xfs_rtrmapbt_move_ptrs(mp, ifp->if_broot, old_size, new_size,
459 				new_numrecs);
460 	broot = xfs_broot_realloc(ifp, new_size);
461 
462 out_broot:
463 	ASSERT(xfs_rtrmap_droot_space(broot) <=
464 	       xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork));
465 	return broot;
466 }
467 
468 const struct xfs_btree_ops xfs_rtrmapbt_ops = {
469 	.name			= "rtrmap",
470 	.type			= XFS_BTREE_TYPE_INODE,
471 	.geom_flags		= XFS_BTGEO_OVERLAPPING |
472 				  XFS_BTGEO_IROOT_RECORDS,
473 
474 	.rec_len		= sizeof(struct xfs_rmap_rec),
475 	/* Overlapping btree; 2 keys per pointer. */
476 	.key_len		= 2 * sizeof(struct xfs_rmap_key),
477 	.ptr_len		= XFS_BTREE_LONG_PTR_LEN,
478 
479 	.lru_refs		= XFS_RMAP_BTREE_REF,
480 	.statoff		= XFS_STATS_CALC_INDEX(xs_rtrmap_2),
481 	.sick_mask		= XFS_SICK_RG_RMAPBT,
482 
483 	.dup_cursor		= xfs_rtrmapbt_dup_cursor,
484 	.alloc_block		= xfs_btree_alloc_metafile_block,
485 	.free_block		= xfs_btree_free_metafile_block,
486 	.get_minrecs		= xfs_rtrmapbt_get_minrecs,
487 	.get_maxrecs		= xfs_rtrmapbt_get_maxrecs,
488 	.get_dmaxrecs		= xfs_rtrmapbt_get_dmaxrecs,
489 	.init_key_from_rec	= xfs_rtrmapbt_init_key_from_rec,
490 	.init_high_key_from_rec	= xfs_rtrmapbt_init_high_key_from_rec,
491 	.init_rec_from_cur	= xfs_rtrmapbt_init_rec_from_cur,
492 	.init_ptr_from_cur	= xfs_rtrmapbt_init_ptr_from_cur,
493 	.cmp_key_with_cur	= xfs_rtrmapbt_cmp_key_with_cur,
494 	.buf_ops		= &xfs_rtrmapbt_buf_ops,
495 	.cmp_two_keys		= xfs_rtrmapbt_cmp_two_keys,
496 	.keys_inorder		= xfs_rtrmapbt_keys_inorder,
497 	.recs_inorder		= xfs_rtrmapbt_recs_inorder,
498 	.keys_contiguous	= xfs_rtrmapbt_keys_contiguous,
499 	.broot_realloc		= xfs_rtrmapbt_broot_realloc,
500 };
501 
502 /* Allocate a new rt rmap btree cursor. */
503 struct xfs_btree_cur *
504 xfs_rtrmapbt_init_cursor(
505 	struct xfs_trans	*tp,
506 	struct xfs_rtgroup	*rtg)
507 {
508 	struct xfs_inode	*ip = rtg_rmap(rtg);
509 	struct xfs_mount	*mp = rtg_mount(rtg);
510 	struct xfs_btree_cur	*cur;
511 
512 	xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
513 
514 	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_ops,
515 			mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache);
516 
517 	cur->bc_ino.ip = ip;
518 	cur->bc_group = xfs_group_hold(rtg_group(rtg));
519 	cur->bc_ino.whichfork = XFS_DATA_FORK;
520 	cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1;
521 	cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK);
522 
523 	return cur;
524 }
525 
526 #ifdef CONFIG_XFS_BTREE_IN_MEM
527 /*
528  * Validate an in-memory realtime rmap btree block.  Callers are allowed to
529  * generate an in-memory btree even if the ondisk feature is not enabled.
530  */
531 static xfs_failaddr_t
532 xfs_rtrmapbt_mem_verify(
533 	struct xfs_buf		*bp)
534 {
535 	struct xfs_mount	*mp = bp->b_mount;
536 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
537 	xfs_failaddr_t		fa;
538 	unsigned int		level;
539 	unsigned int		maxrecs;
540 
541 	if (!xfs_verify_magic(bp, block->bb_magic))
542 		return __this_address;
543 
544 	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
545 	if (fa)
546 		return fa;
547 
548 	level = be16_to_cpu(block->bb_level);
549 	if (xfs_has_rmapbt(mp)) {
550 		if (level >= mp->m_rtrmap_maxlevels)
551 			return __this_address;
552 	} else {
553 		if (level >= xfs_rtrmapbt_maxlevels_ondisk())
554 			return __this_address;
555 	}
556 
557 	maxrecs = xfs_rtrmapbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0);
558 	return xfs_btree_memblock_verify(bp, maxrecs);
559 }
560 
561 static void
562 xfs_rtrmapbt_mem_rw_verify(
563 	struct xfs_buf	*bp)
564 {
565 	xfs_failaddr_t	fa = xfs_rtrmapbt_mem_verify(bp);
566 
567 	if (fa)
568 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
569 }
570 
571 /* skip crc checks on in-memory btrees to save time */
572 static const struct xfs_buf_ops xfs_rtrmapbt_mem_buf_ops = {
573 	.name			= "xfs_rtrmapbt_mem",
574 	.magic			= { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) },
575 	.verify_read		= xfs_rtrmapbt_mem_rw_verify,
576 	.verify_write		= xfs_rtrmapbt_mem_rw_verify,
577 	.verify_struct		= xfs_rtrmapbt_mem_verify,
578 };
579 
580 const struct xfs_btree_ops xfs_rtrmapbt_mem_ops = {
581 	.type			= XFS_BTREE_TYPE_MEM,
582 	.geom_flags		= XFS_BTGEO_OVERLAPPING,
583 
584 	.rec_len		= sizeof(struct xfs_rmap_rec),
585 	/* Overlapping btree; 2 keys per pointer. */
586 	.key_len		= 2 * sizeof(struct xfs_rmap_key),
587 	.ptr_len		= XFS_BTREE_LONG_PTR_LEN,
588 
589 	.lru_refs		= XFS_RMAP_BTREE_REF,
590 	.statoff		= XFS_STATS_CALC_INDEX(xs_rtrmap_mem_2),
591 
592 	.dup_cursor		= xfbtree_dup_cursor,
593 	.set_root		= xfbtree_set_root,
594 	.alloc_block		= xfbtree_alloc_block,
595 	.free_block		= xfbtree_free_block,
596 	.get_minrecs		= xfbtree_get_minrecs,
597 	.get_maxrecs		= xfbtree_get_maxrecs,
598 	.init_key_from_rec	= xfs_rtrmapbt_init_key_from_rec,
599 	.init_high_key_from_rec	= xfs_rtrmapbt_init_high_key_from_rec,
600 	.init_rec_from_cur	= xfs_rtrmapbt_init_rec_from_cur,
601 	.init_ptr_from_cur	= xfbtree_init_ptr_from_cur,
602 	.cmp_key_with_cur	= xfs_rtrmapbt_cmp_key_with_cur,
603 	.buf_ops		= &xfs_rtrmapbt_mem_buf_ops,
604 	.cmp_two_keys		= xfs_rtrmapbt_cmp_two_keys,
605 	.keys_inorder		= xfs_rtrmapbt_keys_inorder,
606 	.recs_inorder		= xfs_rtrmapbt_recs_inorder,
607 	.keys_contiguous	= xfs_rtrmapbt_keys_contiguous,
608 };
609 
610 /* Create a cursor for an in-memory btree. */
611 struct xfs_btree_cur *
612 xfs_rtrmapbt_mem_cursor(
613 	struct xfs_rtgroup	*rtg,
614 	struct xfs_trans	*tp,
615 	struct xfbtree		*xfbt)
616 {
617 	struct xfs_mount	*mp = rtg_mount(rtg);
618 	struct xfs_btree_cur	*cur;
619 
620 	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_mem_ops,
621 			mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache);
622 	cur->bc_mem.xfbtree = xfbt;
623 	cur->bc_nlevels = xfbt->nlevels;
624 	cur->bc_group = xfs_group_hold(rtg_group(rtg));
625 	return cur;
626 }
627 
628 /* Create an in-memory realtime rmap btree. */
629 int
630 xfs_rtrmapbt_mem_init(
631 	struct xfs_mount	*mp,
632 	struct xfbtree		*xfbt,
633 	struct xfs_buftarg	*btp,
634 	xfs_rgnumber_t		rgno)
635 {
636 	xfbt->owner = rgno;
637 	return xfbtree_init(mp, xfbt, btp, &xfs_rtrmapbt_mem_ops);
638 }
639 #endif /* CONFIG_XFS_BTREE_IN_MEM */
640 
641 /*
642  * Install a new rt reverse mapping btree root.  Caller is responsible for
643  * invalidating and freeing the old btree blocks.
644  */
645 void
646 xfs_rtrmapbt_commit_staged_btree(
647 	struct xfs_btree_cur	*cur,
648 	struct xfs_trans	*tp)
649 {
650 	struct xbtree_ifakeroot	*ifake = cur->bc_ino.ifake;
651 	struct xfs_ifork	*ifp;
652 	int			flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;
653 
654 	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
655 	ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE);
656 
657 	/*
658 	 * Free any resources hanging off the real fork, then shallow-copy the
659 	 * staging fork's contents into the real fork to transfer everything
660 	 * we just built.
661 	 */
662 	ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK);
663 	xfs_idestroy_fork(ifp);
664 	memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));
665 
666 	cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno;
667 	xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
668 	xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK);
669 }
670 
671 /* Calculate number of records in a rt reverse mapping btree block. */
672 static inline unsigned int
673 xfs_rtrmapbt_block_maxrecs(
674 	unsigned int		blocklen,
675 	bool			leaf)
676 {
677 	if (leaf)
678 		return blocklen / sizeof(struct xfs_rmap_rec);
679 	return blocklen /
680 		(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rtrmap_ptr_t));
681 }
682 
683 /*
684  * Calculate number of records in an rt reverse mapping btree block.
685  */
686 unsigned int
687 xfs_rtrmapbt_maxrecs(
688 	struct xfs_mount	*mp,
689 	unsigned int		blocklen,
690 	bool			leaf)
691 {
692 	blocklen -= XFS_RTRMAP_BLOCK_LEN;
693 	return xfs_rtrmapbt_block_maxrecs(blocklen, leaf);
694 }
695 
696 /* Compute the max possible height for realtime reverse mapping btrees. */
697 unsigned int
698 xfs_rtrmapbt_maxlevels_ondisk(void)
699 {
700 	unsigned long long	max_dblocks;
701 	unsigned int		minrecs[2];
702 	unsigned int		blocklen;
703 
704 	blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
705 
706 	minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2;
707 	minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2;
708 
709 	/*
710 	 * Compute the asymptotic maxlevels for an rtrmapbt on any rtreflink fs.
711 	 *
712 	 * On a reflink filesystem, each block in an rtgroup can have up to
713 	 * 2^32 (per the refcount record format) owners, which means that
714 	 * theoretically we could face up to 2^64 rmap records.  However, we're
715 	 * likely to run out of blocks in the data device long before that
716 	 * happens, which means that we must compute the max height based on
717 	 * what the btree will look like if it consumes almost all the blocks
718 	 * in the data device due to maximal sharing factor.
719 	 */
720 	max_dblocks = -1U; /* max ag count */
721 	max_dblocks *= XFS_MAX_CRC_AG_BLOCKS;
722 	return xfs_btree_space_to_height(minrecs, max_dblocks);
723 }
724 
725 int __init
726 xfs_rtrmapbt_init_cur_cache(void)
727 {
728 	xfs_rtrmapbt_cur_cache = kmem_cache_create("xfs_rtrmapbt_cur",
729 			xfs_btree_cur_sizeof(xfs_rtrmapbt_maxlevels_ondisk()),
730 			0, 0, NULL);
731 
732 	if (!xfs_rtrmapbt_cur_cache)
733 		return -ENOMEM;
734 	return 0;
735 }
736 
737 void
738 xfs_rtrmapbt_destroy_cur_cache(void)
739 {
740 	kmem_cache_destroy(xfs_rtrmapbt_cur_cache);
741 	xfs_rtrmapbt_cur_cache = NULL;
742 }
743 
744 /* Compute the maximum height of an rt reverse mapping btree. */
745 void
746 xfs_rtrmapbt_compute_maxlevels(
747 	struct xfs_mount	*mp)
748 {
749 	unsigned int		d_maxlevels, r_maxlevels;
750 
751 	if (!xfs_has_rtrmapbt(mp)) {
752 		mp->m_rtrmap_maxlevels = 0;
753 		return;
754 	}
755 
756 	/*
757 	 * The realtime rmapbt lives on the data device, which means that its
758 	 * maximum height is constrained by the size of the data device and
759 	 * the height required to store one rmap record for each block in an
760 	 * rt group.
761 	 *
762 	 * On a reflink filesystem, each rt block can have up to 2^32 (per the
763 	 * refcount record format) owners, which means that theoretically we
764 	 * could face up to 2^64 rmap records.  This makes the computation of
765 	 * maxlevels based on record count meaningless, so we only consider the
766 	 * size of the data device.
767 	 */
768 	d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr,
769 				mp->m_sb.sb_dblocks);
770 	if (xfs_has_rtreflink(mp)) {
771 		mp->m_rtrmap_maxlevels = d_maxlevels + 1;
772 		return;
773 	}
774 
775 	r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr,
776 				mp->m_groups[XG_TYPE_RTG].blocks);
777 
778 	/* Add one level to handle the inode root level. */
779 	mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
780 }
781 
782 /* Calculate the rtrmap btree size for some records. */
783 unsigned long long
784 xfs_rtrmapbt_calc_size(
785 	struct xfs_mount	*mp,
786 	unsigned long long	len)
787 {
788 	return xfs_btree_calc_size(mp->m_rtrmap_mnr, len);
789 }
790 
791 /*
792  * Calculate the maximum rmap btree size.
793  */
794 static unsigned long long
795 xfs_rtrmapbt_max_size(
796 	struct xfs_mount	*mp,
797 	xfs_rtblock_t		rtblocks)
798 {
799 	/* Bail out if we're uninitialized, which can happen in mkfs. */
800 	if (mp->m_rtrmap_mxr[0] == 0)
801 		return 0;
802 
803 	return xfs_rtrmapbt_calc_size(mp, rtblocks);
804 }
805 
806 /*
807  * Figure out how many blocks to reserve and how many are used by this btree.
808  */
809 xfs_filblks_t
810 xfs_rtrmapbt_calc_reserves(
811 	struct xfs_mount	*mp)
812 {
813 	uint32_t		blocks = mp->m_groups[XG_TYPE_RTG].blocks;
814 
815 	if (!xfs_has_rtrmapbt(mp))
816 		return 0;
817 
818 	/* Reserve 1% of the rtgroup or enough for 1 block per record. */
819 	return max_t(xfs_filblks_t, blocks / 100,
820 			xfs_rtrmapbt_max_size(mp, blocks));
821 }
822 
823 /* Convert on-disk form of btree root to in-memory form. */
824 STATIC void
825 xfs_rtrmapbt_from_disk(
826 	struct xfs_inode	*ip,
827 	struct xfs_rtrmap_root	*dblock,
828 	unsigned int		dblocklen,
829 	struct xfs_btree_block	*rblock)
830 {
831 	struct xfs_mount	*mp = ip->i_mount;
832 	struct xfs_rmap_key	*fkp;
833 	__be64			*fpp;
834 	struct xfs_rmap_key	*tkp;
835 	__be64			*tpp;
836 	struct xfs_rmap_rec	*frp;
837 	struct xfs_rmap_rec	*trp;
838 	unsigned int		rblocklen = xfs_rtrmap_broot_space(mp, dblock);
839 	unsigned int		numrecs;
840 	unsigned int		maxrecs;
841 
842 	xfs_btree_init_block(mp, rblock, &xfs_rtrmapbt_ops, 0, 0, ip->i_ino);
843 
844 	rblock->bb_level = dblock->bb_level;
845 	rblock->bb_numrecs = dblock->bb_numrecs;
846 	numrecs = be16_to_cpu(dblock->bb_numrecs);
847 
848 	if (be16_to_cpu(rblock->bb_level) > 0) {
849 		maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false);
850 		fkp = xfs_rtrmap_droot_key_addr(dblock, 1);
851 		tkp = xfs_rtrmap_key_addr(rblock, 1);
852 		fpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs);
853 		tpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
854 		memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
855 		memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
856 	} else {
857 		frp = xfs_rtrmap_droot_rec_addr(dblock, 1);
858 		trp = xfs_rtrmap_rec_addr(rblock, 1);
859 		memcpy(trp, frp, sizeof(*frp) * numrecs);
860 	}
861 }
862 
863 /* Load a realtime reverse mapping btree root in from disk. */
864 int
865 xfs_iformat_rtrmap(
866 	struct xfs_inode	*ip,
867 	struct xfs_dinode	*dip)
868 {
869 	struct xfs_mount	*mp = ip->i_mount;
870 	struct xfs_rtrmap_root	*dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
871 	struct xfs_btree_block	*broot;
872 	unsigned int		numrecs;
873 	unsigned int		level;
874 	int			dsize;
875 
876 	/*
877 	 * growfs must create the rtrmap inodes before adding a realtime volume
878 	 * to the filesystem, so we cannot use the rtrmapbt predicate here.
879 	 */
880 	if (!xfs_has_rmapbt(ip->i_mount)) {
881 		xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
882 		return -EFSCORRUPTED;
883 	}
884 
885 	dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK);
886 	numrecs = be16_to_cpu(dfp->bb_numrecs);
887 	level = be16_to_cpu(dfp->bb_level);
888 
889 	if (level > mp->m_rtrmap_maxlevels ||
890 	    xfs_rtrmap_droot_space_calc(level, numrecs) > dsize) {
891 		xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
892 		return -EFSCORRUPTED;
893 	}
894 
895 	broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK),
896 			xfs_rtrmap_broot_space_calc(mp, level, numrecs));
897 	if (broot)
898 		xfs_rtrmapbt_from_disk(ip, dfp, dsize, broot);
899 	return 0;
900 }
901 
902 /* Convert in-memory form of btree root to on-disk form. */
903 void
904 xfs_rtrmapbt_to_disk(
905 	struct xfs_mount	*mp,
906 	struct xfs_btree_block	*rblock,
907 	unsigned int		rblocklen,
908 	struct xfs_rtrmap_root	*dblock,
909 	unsigned int		dblocklen)
910 {
911 	struct xfs_rmap_key	*fkp;
912 	__be64			*fpp;
913 	struct xfs_rmap_key	*tkp;
914 	__be64			*tpp;
915 	struct xfs_rmap_rec	*frp;
916 	struct xfs_rmap_rec	*trp;
917 	unsigned int		numrecs;
918 	unsigned int		maxrecs;
919 
920 	ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTRMAP_CRC_MAGIC));
921 	ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid));
922 	ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL));
923 	ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
924 	ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
925 
926 	dblock->bb_level = rblock->bb_level;
927 	dblock->bb_numrecs = rblock->bb_numrecs;
928 	numrecs = be16_to_cpu(rblock->bb_numrecs);
929 
930 	if (be16_to_cpu(rblock->bb_level) > 0) {
931 		maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false);
932 		fkp = xfs_rtrmap_key_addr(rblock, 1);
933 		tkp = xfs_rtrmap_droot_key_addr(dblock, 1);
934 		fpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
935 		tpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs);
936 		memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
937 		memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
938 	} else {
939 		frp = xfs_rtrmap_rec_addr(rblock, 1);
940 		trp = xfs_rtrmap_droot_rec_addr(dblock, 1);
941 		memcpy(trp, frp, sizeof(*frp) * numrecs);
942 	}
943 }
944 
945 /* Flush a realtime reverse mapping btree root out to disk. */
946 void
947 xfs_iflush_rtrmap(
948 	struct xfs_inode	*ip,
949 	struct xfs_dinode	*dip)
950 {
951 	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
952 	struct xfs_rtrmap_root	*dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
953 
954 	ASSERT(ifp->if_broot != NULL);
955 	ASSERT(ifp->if_broot_bytes > 0);
956 	ASSERT(xfs_rtrmap_droot_space(ifp->if_broot) <=
957 			xfs_inode_fork_size(ip, XFS_DATA_FORK));
958 	xfs_rtrmapbt_to_disk(ip->i_mount, ifp->if_broot, ifp->if_broot_bytes,
959 			dfp, XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK));
960 }
961 
962 /*
963  * Create a realtime rmap btree inode.
964  */
965 int
966 xfs_rtrmapbt_create(
967 	struct xfs_rtgroup	*rtg,
968 	struct xfs_inode	*ip,
969 	struct xfs_trans	*tp,
970 	bool			init)
971 {
972 	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
973 	struct xfs_mount	*mp = ip->i_mount;
974 	struct xfs_btree_block	*broot;
975 
976 	ifp->if_format = XFS_DINODE_FMT_META_BTREE;
977 	ASSERT(ifp->if_broot_bytes == 0);
978 	ASSERT(ifp->if_bytes == 0);
979 
980 	/* Initialize the empty incore btree root. */
981 	broot = xfs_broot_realloc(ifp, xfs_rtrmap_broot_space_calc(mp, 0, 0));
982 	if (broot)
983 		xfs_btree_init_block(mp, broot, &xfs_rtrmapbt_ops, 0, 0,
984 				ip->i_ino);
985 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT);
986 
987 	return 0;
988 }
989 
990 /*
991  * Initialize an rmap for a realtime superblock using the potentially updated
992  * rt geometry in the provided @mp.
993  */
994 int
995 xfs_rtrmapbt_init_rtsb(
996 	struct xfs_mount	*mp,
997 	struct xfs_rtgroup	*rtg,
998 	struct xfs_trans	*tp)
999 {
1000 	struct xfs_rmap_irec	rmap = {
1001 		.rm_blockcount	= mp->m_sb.sb_rextsize,
1002 		.rm_owner	= XFS_RMAP_OWN_FS,
1003 	};
1004 	struct xfs_btree_cur	*cur;
1005 	int			error;
1006 
1007 	ASSERT(xfs_has_rtsb(mp));
1008 	ASSERT(rtg_rgno(rtg) == 0);
1009 
1010 	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
1011 	error = xfs_rmap_map_raw(cur, &rmap);
1012 	xfs_btree_del_cursor(cur, error);
1013 	return error;
1014 }
1015 
1016 /*
1017  * Return the highest rgbno currently tracked by the rmap for this rtg.
1018  */
1019 xfs_rgblock_t
1020 xfs_rtrmap_highest_rgbno(
1021 	struct xfs_rtgroup	*rtg)
1022 {
1023 	struct xfs_btree_block	*block = rtg_rmap(rtg)->i_df.if_broot;
1024 	union xfs_btree_key	key = {};
1025 	struct xfs_btree_cur	*cur;
1026 
1027 	if (block->bb_numrecs == 0)
1028 		return NULLRGBLOCK;
1029 	cur = xfs_rtrmapbt_init_cursor(NULL, rtg);
1030 	xfs_btree_get_keys(cur, block, &key);
1031 	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1032 	return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock);
1033 }
1034