xref: /linux/fs/xfs/libxfs/xfs_rtrmap_btree.c (revision 7f81907b7e3f93dfed2e903af52659baa4944341)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_bit.h"
13 #include "xfs_sb.h"
14 #include "xfs_mount.h"
15 #include "xfs_defer.h"
16 #include "xfs_inode.h"
17 #include "xfs_trans.h"
18 #include "xfs_alloc.h"
19 #include "xfs_btree.h"
20 #include "xfs_btree_staging.h"
21 #include "xfs_metafile.h"
22 #include "xfs_rmap.h"
23 #include "xfs_rtrmap_btree.h"
24 #include "xfs_trace.h"
25 #include "xfs_cksum.h"
26 #include "xfs_error.h"
27 #include "xfs_extent_busy.h"
28 #include "xfs_rtgroup.h"
29 #include "xfs_bmap.h"
30 #include "xfs_health.h"
31 #include "xfs_buf_mem.h"
32 #include "xfs_btree_mem.h"
33 
34 static struct kmem_cache	*xfs_rtrmapbt_cur_cache;
35 
36 /*
37  * Realtime Reverse Map btree.
38  *
39  * This is a btree used to track the owner(s) of a given extent in the realtime
40  * device.  See the comments in xfs_rmap_btree.c for more information.
41  *
42  * This tree is basically the same as the regular rmap btree except that it
43  * is rooted in an inode and does not live in free space.
44  */
45 
46 static struct xfs_btree_cur *
47 xfs_rtrmapbt_dup_cursor(
48 	struct xfs_btree_cur	*cur)
49 {
50 	return xfs_rtrmapbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group));
51 }
52 
53 STATIC int
54 xfs_rtrmapbt_get_minrecs(
55 	struct xfs_btree_cur	*cur,
56 	int			level)
57 {
58 	if (level == cur->bc_nlevels - 1) {
59 		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
60 
61 		return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
62 				level == 0) / 2;
63 	}
64 
65 	return cur->bc_mp->m_rtrmap_mnr[level != 0];
66 }
67 
68 STATIC int
69 xfs_rtrmapbt_get_maxrecs(
70 	struct xfs_btree_cur	*cur,
71 	int			level)
72 {
73 	if (level == cur->bc_nlevels - 1) {
74 		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
75 
76 		return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
77 				level == 0);
78 	}
79 
80 	return cur->bc_mp->m_rtrmap_mxr[level != 0];
81 }
82 
83 /* Calculate number of records in the ondisk realtime rmap btree inode root. */
84 unsigned int
85 xfs_rtrmapbt_droot_maxrecs(
86 	unsigned int		blocklen,
87 	bool			leaf)
88 {
89 	blocklen -= sizeof(struct xfs_rtrmap_root);
90 
91 	if (leaf)
92 		return blocklen / sizeof(struct xfs_rmap_rec);
93 	return blocklen / (2 * sizeof(struct xfs_rmap_key) +
94 			sizeof(xfs_rtrmap_ptr_t));
95 }
96 
97 /*
98  * Get the maximum records we could store in the on-disk format.
99  *
100  * For non-root nodes this is equivalent to xfs_rtrmapbt_get_maxrecs, but
101  * for the root node this checks the available space in the dinode fork
102  * so that we can resize the in-memory buffer to match it.  After a
103  * resize to the maximum size this function returns the same value
104  * as xfs_rtrmapbt_get_maxrecs for the root node, too.
105  */
106 STATIC int
107 xfs_rtrmapbt_get_dmaxrecs(
108 	struct xfs_btree_cur	*cur,
109 	int			level)
110 {
111 	if (level != cur->bc_nlevels - 1)
112 		return cur->bc_mp->m_rtrmap_mxr[level != 0];
113 	return xfs_rtrmapbt_droot_maxrecs(cur->bc_ino.forksize, level == 0);
114 }
115 
116 /*
117  * Convert the ondisk record's offset field into the ondisk key's offset field.
118  * Fork and bmbt are significant parts of the rmap record key, but written
119  * status is merely a record attribute.
120  */
121 static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
122 {
123 	return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
124 }
125 
126 STATIC void
127 xfs_rtrmapbt_init_key_from_rec(
128 	union xfs_btree_key		*key,
129 	const union xfs_btree_rec	*rec)
130 {
131 	key->rmap.rm_startblock = rec->rmap.rm_startblock;
132 	key->rmap.rm_owner = rec->rmap.rm_owner;
133 	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
134 }
135 
136 STATIC void
137 xfs_rtrmapbt_init_high_key_from_rec(
138 	union xfs_btree_key		*key,
139 	const union xfs_btree_rec	*rec)
140 {
141 	uint64_t			off;
142 	int				adj;
143 
144 	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
145 
146 	key->rmap.rm_startblock = rec->rmap.rm_startblock;
147 	be32_add_cpu(&key->rmap.rm_startblock, adj);
148 	key->rmap.rm_owner = rec->rmap.rm_owner;
149 	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
150 	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
151 	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
152 		return;
153 	off = be64_to_cpu(key->rmap.rm_offset);
154 	off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
155 	key->rmap.rm_offset = cpu_to_be64(off);
156 }
157 
158 STATIC void
159 xfs_rtrmapbt_init_rec_from_cur(
160 	struct xfs_btree_cur	*cur,
161 	union xfs_btree_rec	*rec)
162 {
163 	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
164 	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
165 	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
166 	rec->rmap.rm_offset = cpu_to_be64(
167 			xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
168 }
169 
170 STATIC void
171 xfs_rtrmapbt_init_ptr_from_cur(
172 	struct xfs_btree_cur	*cur,
173 	union xfs_btree_ptr	*ptr)
174 {
175 	ptr->l = 0;
176 }
177 
178 /*
179  * Mask the appropriate parts of the ondisk key field for a key comparison.
180  * Fork and bmbt are significant parts of the rmap record key, but written
181  * status is merely a record attribute.
182  */
183 static inline uint64_t offset_keymask(uint64_t offset)
184 {
185 	return offset & ~XFS_RMAP_OFF_UNWRITTEN;
186 }
187 
188 STATIC int64_t
189 xfs_rtrmapbt_key_diff(
190 	struct xfs_btree_cur		*cur,
191 	const union xfs_btree_key	*key)
192 {
193 	struct xfs_rmap_irec		*rec = &cur->bc_rec.r;
194 	const struct xfs_rmap_key	*kp = &key->rmap;
195 	__u64				x, y;
196 	int64_t				d;
197 
198 	d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
199 	if (d)
200 		return d;
201 
202 	x = be64_to_cpu(kp->rm_owner);
203 	y = rec->rm_owner;
204 	if (x > y)
205 		return 1;
206 	else if (y > x)
207 		return -1;
208 
209 	x = offset_keymask(be64_to_cpu(kp->rm_offset));
210 	y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
211 	if (x > y)
212 		return 1;
213 	else if (y > x)
214 		return -1;
215 	return 0;
216 }
217 
218 STATIC int64_t
219 xfs_rtrmapbt_diff_two_keys(
220 	struct xfs_btree_cur		*cur,
221 	const union xfs_btree_key	*k1,
222 	const union xfs_btree_key	*k2,
223 	const union xfs_btree_key	*mask)
224 {
225 	const struct xfs_rmap_key	*kp1 = &k1->rmap;
226 	const struct xfs_rmap_key	*kp2 = &k2->rmap;
227 	int64_t				d;
228 	__u64				x, y;
229 
230 	/* Doesn't make sense to mask off the physical space part */
231 	ASSERT(!mask || mask->rmap.rm_startblock);
232 
233 	d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
234 		     be32_to_cpu(kp2->rm_startblock);
235 	if (d)
236 		return d;
237 
238 	if (!mask || mask->rmap.rm_owner) {
239 		x = be64_to_cpu(kp1->rm_owner);
240 		y = be64_to_cpu(kp2->rm_owner);
241 		if (x > y)
242 			return 1;
243 		else if (y > x)
244 			return -1;
245 	}
246 
247 	if (!mask || mask->rmap.rm_offset) {
248 		/* Doesn't make sense to allow offset but not owner */
249 		ASSERT(!mask || mask->rmap.rm_owner);
250 
251 		x = offset_keymask(be64_to_cpu(kp1->rm_offset));
252 		y = offset_keymask(be64_to_cpu(kp2->rm_offset));
253 		if (x > y)
254 			return 1;
255 		else if (y > x)
256 			return -1;
257 	}
258 
259 	return 0;
260 }
261 
262 static xfs_failaddr_t
263 xfs_rtrmapbt_verify(
264 	struct xfs_buf		*bp)
265 {
266 	struct xfs_mount	*mp = bp->b_target->bt_mount;
267 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
268 	xfs_failaddr_t		fa;
269 	int			level;
270 
271 	if (!xfs_verify_magic(bp, block->bb_magic))
272 		return __this_address;
273 
274 	if (!xfs_has_rmapbt(mp))
275 		return __this_address;
276 	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
277 	if (fa)
278 		return fa;
279 	level = be16_to_cpu(block->bb_level);
280 	if (level > mp->m_rtrmap_maxlevels)
281 		return __this_address;
282 
283 	return xfs_btree_fsblock_verify(bp, mp->m_rtrmap_mxr[level != 0]);
284 }
285 
286 static void
287 xfs_rtrmapbt_read_verify(
288 	struct xfs_buf	*bp)
289 {
290 	xfs_failaddr_t	fa;
291 
292 	if (!xfs_btree_fsblock_verify_crc(bp))
293 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
294 	else {
295 		fa = xfs_rtrmapbt_verify(bp);
296 		if (fa)
297 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
298 	}
299 
300 	if (bp->b_error)
301 		trace_xfs_btree_corrupt(bp, _RET_IP_);
302 }
303 
304 static void
305 xfs_rtrmapbt_write_verify(
306 	struct xfs_buf	*bp)
307 {
308 	xfs_failaddr_t	fa;
309 
310 	fa = xfs_rtrmapbt_verify(bp);
311 	if (fa) {
312 		trace_xfs_btree_corrupt(bp, _RET_IP_);
313 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
314 		return;
315 	}
316 	xfs_btree_fsblock_calc_crc(bp);
317 
318 }
319 
320 const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = {
321 	.name			= "xfs_rtrmapbt",
322 	.magic			= { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) },
323 	.verify_read		= xfs_rtrmapbt_read_verify,
324 	.verify_write		= xfs_rtrmapbt_write_verify,
325 	.verify_struct		= xfs_rtrmapbt_verify,
326 };
327 
328 STATIC int
329 xfs_rtrmapbt_keys_inorder(
330 	struct xfs_btree_cur		*cur,
331 	const union xfs_btree_key	*k1,
332 	const union xfs_btree_key	*k2)
333 {
334 	uint32_t			x;
335 	uint32_t			y;
336 	uint64_t			a;
337 	uint64_t			b;
338 
339 	x = be32_to_cpu(k1->rmap.rm_startblock);
340 	y = be32_to_cpu(k2->rmap.rm_startblock);
341 	if (x < y)
342 		return 1;
343 	else if (x > y)
344 		return 0;
345 	a = be64_to_cpu(k1->rmap.rm_owner);
346 	b = be64_to_cpu(k2->rmap.rm_owner);
347 	if (a < b)
348 		return 1;
349 	else if (a > b)
350 		return 0;
351 	a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
352 	b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
353 	if (a <= b)
354 		return 1;
355 	return 0;
356 }
357 
358 STATIC int
359 xfs_rtrmapbt_recs_inorder(
360 	struct xfs_btree_cur		*cur,
361 	const union xfs_btree_rec	*r1,
362 	const union xfs_btree_rec	*r2)
363 {
364 	uint32_t			x;
365 	uint32_t			y;
366 	uint64_t			a;
367 	uint64_t			b;
368 
369 	x = be32_to_cpu(r1->rmap.rm_startblock);
370 	y = be32_to_cpu(r2->rmap.rm_startblock);
371 	if (x < y)
372 		return 1;
373 	else if (x > y)
374 		return 0;
375 	a = be64_to_cpu(r1->rmap.rm_owner);
376 	b = be64_to_cpu(r2->rmap.rm_owner);
377 	if (a < b)
378 		return 1;
379 	else if (a > b)
380 		return 0;
381 	a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
382 	b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
383 	if (a <= b)
384 		return 1;
385 	return 0;
386 }
387 
388 STATIC enum xbtree_key_contig
389 xfs_rtrmapbt_keys_contiguous(
390 	struct xfs_btree_cur		*cur,
391 	const union xfs_btree_key	*key1,
392 	const union xfs_btree_key	*key2,
393 	const union xfs_btree_key	*mask)
394 {
395 	ASSERT(!mask || mask->rmap.rm_startblock);
396 
397 	/*
398 	 * We only support checking contiguity of the physical space component.
399 	 * If any callers ever need more specificity than that, they'll have to
400 	 * implement it here.
401 	 */
402 	ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));
403 
404 	return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
405 				 be32_to_cpu(key2->rmap.rm_startblock));
406 }
407 
408 static inline void
409 xfs_rtrmapbt_move_ptrs(
410 	struct xfs_mount	*mp,
411 	struct xfs_btree_block	*broot,
412 	short			old_size,
413 	size_t			new_size,
414 	unsigned int		numrecs)
415 {
416 	void			*dptr;
417 	void			*sptr;
418 
419 	sptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, old_size);
420 	dptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, new_size);
421 	memmove(dptr, sptr, numrecs * sizeof(xfs_rtrmap_ptr_t));
422 }
423 
424 static struct xfs_btree_block *
425 xfs_rtrmapbt_broot_realloc(
426 	struct xfs_btree_cur	*cur,
427 	unsigned int		new_numrecs)
428 {
429 	struct xfs_mount	*mp = cur->bc_mp;
430 	struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
431 	struct xfs_btree_block	*broot;
432 	unsigned int		new_size;
433 	unsigned int		old_size = ifp->if_broot_bytes;
434 	const unsigned int	level = cur->bc_nlevels - 1;
435 
436 	new_size = xfs_rtrmap_broot_space_calc(mp, level, new_numrecs);
437 
438 	/* Handle the nop case quietly. */
439 	if (new_size == old_size)
440 		return ifp->if_broot;
441 
442 	if (new_size > old_size) {
443 		unsigned int	old_numrecs;
444 
445 		/*
446 		 * If there wasn't any memory allocated before, just allocate
447 		 * it now and get out.
448 		 */
449 		if (old_size == 0)
450 			return xfs_broot_realloc(ifp, new_size);
451 
452 		/*
453 		 * If there is already an existing if_broot, then we need to
454 		 * realloc it and possibly move the node block pointers because
455 		 * those are not butted up against the btree block header.
456 		 */
457 		old_numrecs = xfs_rtrmapbt_maxrecs(mp, old_size, level == 0);
458 		broot = xfs_broot_realloc(ifp, new_size);
459 		if (level > 0)
460 			xfs_rtrmapbt_move_ptrs(mp, broot, old_size, new_size,
461 					old_numrecs);
462 		goto out_broot;
463 	}
464 
465 	/*
466 	 * We're reducing numrecs.  If we're going all the way to zero, just
467 	 * free the block.
468 	 */
469 	ASSERT(ifp->if_broot != NULL && old_size > 0);
470 	if (new_size == 0)
471 		return xfs_broot_realloc(ifp, 0);
472 
473 	/*
474 	 * Shrink the btree root by possibly moving the rtrmapbt pointers,
475 	 * since they are not butted up against the btree block header.  Then
476 	 * reallocate broot.
477 	 */
478 	if (level > 0)
479 		xfs_rtrmapbt_move_ptrs(mp, ifp->if_broot, old_size, new_size,
480 				new_numrecs);
481 	broot = xfs_broot_realloc(ifp, new_size);
482 
483 out_broot:
484 	ASSERT(xfs_rtrmap_droot_space(broot) <=
485 	       xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork));
486 	return broot;
487 }
488 
489 const struct xfs_btree_ops xfs_rtrmapbt_ops = {
490 	.name			= "rtrmap",
491 	.type			= XFS_BTREE_TYPE_INODE,
492 	.geom_flags		= XFS_BTGEO_OVERLAPPING |
493 				  XFS_BTGEO_IROOT_RECORDS,
494 
495 	.rec_len		= sizeof(struct xfs_rmap_rec),
496 	/* Overlapping btree; 2 keys per pointer. */
497 	.key_len		= 2 * sizeof(struct xfs_rmap_key),
498 	.ptr_len		= XFS_BTREE_LONG_PTR_LEN,
499 
500 	.lru_refs		= XFS_RMAP_BTREE_REF,
501 	.statoff		= XFS_STATS_CALC_INDEX(xs_rtrmap_2),
502 	.sick_mask		= XFS_SICK_RG_RMAPBT,
503 
504 	.dup_cursor		= xfs_rtrmapbt_dup_cursor,
505 	.alloc_block		= xfs_btree_alloc_metafile_block,
506 	.free_block		= xfs_btree_free_metafile_block,
507 	.get_minrecs		= xfs_rtrmapbt_get_minrecs,
508 	.get_maxrecs		= xfs_rtrmapbt_get_maxrecs,
509 	.get_dmaxrecs		= xfs_rtrmapbt_get_dmaxrecs,
510 	.init_key_from_rec	= xfs_rtrmapbt_init_key_from_rec,
511 	.init_high_key_from_rec	= xfs_rtrmapbt_init_high_key_from_rec,
512 	.init_rec_from_cur	= xfs_rtrmapbt_init_rec_from_cur,
513 	.init_ptr_from_cur	= xfs_rtrmapbt_init_ptr_from_cur,
514 	.key_diff		= xfs_rtrmapbt_key_diff,
515 	.buf_ops		= &xfs_rtrmapbt_buf_ops,
516 	.diff_two_keys		= xfs_rtrmapbt_diff_two_keys,
517 	.keys_inorder		= xfs_rtrmapbt_keys_inorder,
518 	.recs_inorder		= xfs_rtrmapbt_recs_inorder,
519 	.keys_contiguous	= xfs_rtrmapbt_keys_contiguous,
520 	.broot_realloc		= xfs_rtrmapbt_broot_realloc,
521 };
522 
523 /* Allocate a new rt rmap btree cursor. */
524 struct xfs_btree_cur *
525 xfs_rtrmapbt_init_cursor(
526 	struct xfs_trans	*tp,
527 	struct xfs_rtgroup	*rtg)
528 {
529 	struct xfs_inode	*ip = rtg_rmap(rtg);
530 	struct xfs_mount	*mp = rtg_mount(rtg);
531 	struct xfs_btree_cur	*cur;
532 
533 	xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
534 
535 	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_ops,
536 			mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache);
537 
538 	cur->bc_ino.ip = ip;
539 	cur->bc_group = xfs_group_hold(rtg_group(rtg));
540 	cur->bc_ino.whichfork = XFS_DATA_FORK;
541 	cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1;
542 	cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK);
543 
544 	return cur;
545 }
546 
547 #ifdef CONFIG_XFS_BTREE_IN_MEM
548 /*
549  * Validate an in-memory realtime rmap btree block.  Callers are allowed to
550  * generate an in-memory btree even if the ondisk feature is not enabled.
551  */
552 static xfs_failaddr_t
553 xfs_rtrmapbt_mem_verify(
554 	struct xfs_buf		*bp)
555 {
556 	struct xfs_mount	*mp = bp->b_mount;
557 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
558 	xfs_failaddr_t		fa;
559 	unsigned int		level;
560 	unsigned int		maxrecs;
561 
562 	if (!xfs_verify_magic(bp, block->bb_magic))
563 		return __this_address;
564 
565 	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
566 	if (fa)
567 		return fa;
568 
569 	level = be16_to_cpu(block->bb_level);
570 	if (xfs_has_rmapbt(mp)) {
571 		if (level >= mp->m_rtrmap_maxlevels)
572 			return __this_address;
573 	} else {
574 		if (level >= xfs_rtrmapbt_maxlevels_ondisk())
575 			return __this_address;
576 	}
577 
578 	maxrecs = xfs_rtrmapbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0);
579 	return xfs_btree_memblock_verify(bp, maxrecs);
580 }
581 
582 static void
583 xfs_rtrmapbt_mem_rw_verify(
584 	struct xfs_buf	*bp)
585 {
586 	xfs_failaddr_t	fa = xfs_rtrmapbt_mem_verify(bp);
587 
588 	if (fa)
589 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
590 }
591 
592 /* skip crc checks on in-memory btrees to save time */
593 static const struct xfs_buf_ops xfs_rtrmapbt_mem_buf_ops = {
594 	.name			= "xfs_rtrmapbt_mem",
595 	.magic			= { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) },
596 	.verify_read		= xfs_rtrmapbt_mem_rw_verify,
597 	.verify_write		= xfs_rtrmapbt_mem_rw_verify,
598 	.verify_struct		= xfs_rtrmapbt_mem_verify,
599 };
600 
601 const struct xfs_btree_ops xfs_rtrmapbt_mem_ops = {
602 	.type			= XFS_BTREE_TYPE_MEM,
603 	.geom_flags		= XFS_BTGEO_OVERLAPPING,
604 
605 	.rec_len		= sizeof(struct xfs_rmap_rec),
606 	/* Overlapping btree; 2 keys per pointer. */
607 	.key_len		= 2 * sizeof(struct xfs_rmap_key),
608 	.ptr_len		= XFS_BTREE_LONG_PTR_LEN,
609 
610 	.lru_refs		= XFS_RMAP_BTREE_REF,
611 	.statoff		= XFS_STATS_CALC_INDEX(xs_rtrmap_mem_2),
612 
613 	.dup_cursor		= xfbtree_dup_cursor,
614 	.set_root		= xfbtree_set_root,
615 	.alloc_block		= xfbtree_alloc_block,
616 	.free_block		= xfbtree_free_block,
617 	.get_minrecs		= xfbtree_get_minrecs,
618 	.get_maxrecs		= xfbtree_get_maxrecs,
619 	.init_key_from_rec	= xfs_rtrmapbt_init_key_from_rec,
620 	.init_high_key_from_rec	= xfs_rtrmapbt_init_high_key_from_rec,
621 	.init_rec_from_cur	= xfs_rtrmapbt_init_rec_from_cur,
622 	.init_ptr_from_cur	= xfbtree_init_ptr_from_cur,
623 	.key_diff		= xfs_rtrmapbt_key_diff,
624 	.buf_ops		= &xfs_rtrmapbt_mem_buf_ops,
625 	.diff_two_keys		= xfs_rtrmapbt_diff_two_keys,
626 	.keys_inorder		= xfs_rtrmapbt_keys_inorder,
627 	.recs_inorder		= xfs_rtrmapbt_recs_inorder,
628 	.keys_contiguous	= xfs_rtrmapbt_keys_contiguous,
629 };
630 
631 /* Create a cursor for an in-memory btree. */
632 struct xfs_btree_cur *
633 xfs_rtrmapbt_mem_cursor(
634 	struct xfs_rtgroup	*rtg,
635 	struct xfs_trans	*tp,
636 	struct xfbtree		*xfbt)
637 {
638 	struct xfs_mount	*mp = rtg_mount(rtg);
639 	struct xfs_btree_cur	*cur;
640 
641 	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_mem_ops,
642 			mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache);
643 	cur->bc_mem.xfbtree = xfbt;
644 	cur->bc_nlevels = xfbt->nlevels;
645 	cur->bc_group = xfs_group_hold(rtg_group(rtg));
646 	return cur;
647 }
648 
649 /* Create an in-memory realtime rmap btree. */
650 int
651 xfs_rtrmapbt_mem_init(
652 	struct xfs_mount	*mp,
653 	struct xfbtree		*xfbt,
654 	struct xfs_buftarg	*btp,
655 	xfs_rgnumber_t		rgno)
656 {
657 	xfbt->owner = rgno;
658 	return xfbtree_init(mp, xfbt, btp, &xfs_rtrmapbt_mem_ops);
659 }
660 #endif /* CONFIG_XFS_BTREE_IN_MEM */
661 
662 /*
663  * Install a new rt reverse mapping btree root.  Caller is responsible for
664  * invalidating and freeing the old btree blocks.
665  */
666 void
667 xfs_rtrmapbt_commit_staged_btree(
668 	struct xfs_btree_cur	*cur,
669 	struct xfs_trans	*tp)
670 {
671 	struct xbtree_ifakeroot	*ifake = cur->bc_ino.ifake;
672 	struct xfs_ifork	*ifp;
673 	int			flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;
674 
675 	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
676 	ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE);
677 
678 	/*
679 	 * Free any resources hanging off the real fork, then shallow-copy the
680 	 * staging fork's contents into the real fork to transfer everything
681 	 * we just built.
682 	 */
683 	ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK);
684 	xfs_idestroy_fork(ifp);
685 	memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));
686 
687 	cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno;
688 	xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
689 	xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK);
690 }
691 
692 /* Calculate number of records in a rt reverse mapping btree block. */
693 static inline unsigned int
694 xfs_rtrmapbt_block_maxrecs(
695 	unsigned int		blocklen,
696 	bool			leaf)
697 {
698 	if (leaf)
699 		return blocklen / sizeof(struct xfs_rmap_rec);
700 	return blocklen /
701 		(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rtrmap_ptr_t));
702 }
703 
704 /*
705  * Calculate number of records in an rt reverse mapping btree block.
706  */
707 unsigned int
708 xfs_rtrmapbt_maxrecs(
709 	struct xfs_mount	*mp,
710 	unsigned int		blocklen,
711 	bool			leaf)
712 {
713 	blocklen -= XFS_RTRMAP_BLOCK_LEN;
714 	return xfs_rtrmapbt_block_maxrecs(blocklen, leaf);
715 }
716 
717 /* Compute the max possible height for realtime reverse mapping btrees. */
718 unsigned int
719 xfs_rtrmapbt_maxlevels_ondisk(void)
720 {
721 	unsigned long long	max_dblocks;
722 	unsigned int		minrecs[2];
723 	unsigned int		blocklen;
724 
725 	blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
726 
727 	minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2;
728 	minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2;
729 
730 	/*
731 	 * Compute the asymptotic maxlevels for an rtrmapbt on any rtreflink fs.
732 	 *
733 	 * On a reflink filesystem, each block in an rtgroup can have up to
734 	 * 2^32 (per the refcount record format) owners, which means that
735 	 * theoretically we could face up to 2^64 rmap records.  However, we're
736 	 * likely to run out of blocks in the data device long before that
737 	 * happens, which means that we must compute the max height based on
738 	 * what the btree will look like if it consumes almost all the blocks
739 	 * in the data device due to maximal sharing factor.
740 	 */
741 	max_dblocks = -1U; /* max ag count */
742 	max_dblocks *= XFS_MAX_CRC_AG_BLOCKS;
743 	return xfs_btree_space_to_height(minrecs, max_dblocks);
744 }
745 
746 int __init
747 xfs_rtrmapbt_init_cur_cache(void)
748 {
749 	xfs_rtrmapbt_cur_cache = kmem_cache_create("xfs_rtrmapbt_cur",
750 			xfs_btree_cur_sizeof(xfs_rtrmapbt_maxlevels_ondisk()),
751 			0, 0, NULL);
752 
753 	if (!xfs_rtrmapbt_cur_cache)
754 		return -ENOMEM;
755 	return 0;
756 }
757 
758 void
759 xfs_rtrmapbt_destroy_cur_cache(void)
760 {
761 	kmem_cache_destroy(xfs_rtrmapbt_cur_cache);
762 	xfs_rtrmapbt_cur_cache = NULL;
763 }
764 
765 /* Compute the maximum height of an rt reverse mapping btree. */
766 void
767 xfs_rtrmapbt_compute_maxlevels(
768 	struct xfs_mount	*mp)
769 {
770 	unsigned int		d_maxlevels, r_maxlevels;
771 
772 	if (!xfs_has_rtrmapbt(mp)) {
773 		mp->m_rtrmap_maxlevels = 0;
774 		return;
775 	}
776 
777 	/*
778 	 * The realtime rmapbt lives on the data device, which means that its
779 	 * maximum height is constrained by the size of the data device and
780 	 * the height required to store one rmap record for each block in an
781 	 * rt group.
782 	 *
783 	 * On a reflink filesystem, each rt block can have up to 2^32 (per the
784 	 * refcount record format) owners, which means that theoretically we
785 	 * could face up to 2^64 rmap records.  This makes the computation of
786 	 * maxlevels based on record count meaningless, so we only consider the
787 	 * size of the data device.
788 	 */
789 	d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr,
790 				mp->m_sb.sb_dblocks);
791 	if (xfs_has_rtreflink(mp)) {
792 		mp->m_rtrmap_maxlevels = d_maxlevels + 1;
793 		return;
794 	}
795 
796 	r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr,
797 				mp->m_groups[XG_TYPE_RTG].blocks);
798 
799 	/* Add one level to handle the inode root level. */
800 	mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
801 }
802 
803 /* Calculate the rtrmap btree size for some records. */
804 unsigned long long
805 xfs_rtrmapbt_calc_size(
806 	struct xfs_mount	*mp,
807 	unsigned long long	len)
808 {
809 	return xfs_btree_calc_size(mp->m_rtrmap_mnr, len);
810 }
811 
812 /*
813  * Calculate the maximum rmap btree size.
814  */
815 static unsigned long long
816 xfs_rtrmapbt_max_size(
817 	struct xfs_mount	*mp,
818 	xfs_rtblock_t		rtblocks)
819 {
820 	/* Bail out if we're uninitialized, which can happen in mkfs. */
821 	if (mp->m_rtrmap_mxr[0] == 0)
822 		return 0;
823 
824 	return xfs_rtrmapbt_calc_size(mp, rtblocks);
825 }
826 
827 /*
828  * Figure out how many blocks to reserve and how many are used by this btree.
829  */
830 xfs_filblks_t
831 xfs_rtrmapbt_calc_reserves(
832 	struct xfs_mount	*mp)
833 {
834 	uint32_t		blocks = mp->m_groups[XG_TYPE_RTG].blocks;
835 
836 	if (!xfs_has_rtrmapbt(mp))
837 		return 0;
838 
839 	/* Reserve 1% of the rtgroup or enough for 1 block per record. */
840 	return max_t(xfs_filblks_t, blocks / 100,
841 			xfs_rtrmapbt_max_size(mp, blocks));
842 }
843 
844 /* Convert on-disk form of btree root to in-memory form. */
845 STATIC void
846 xfs_rtrmapbt_from_disk(
847 	struct xfs_inode	*ip,
848 	struct xfs_rtrmap_root	*dblock,
849 	unsigned int		dblocklen,
850 	struct xfs_btree_block	*rblock)
851 {
852 	struct xfs_mount	*mp = ip->i_mount;
853 	struct xfs_rmap_key	*fkp;
854 	__be64			*fpp;
855 	struct xfs_rmap_key	*tkp;
856 	__be64			*tpp;
857 	struct xfs_rmap_rec	*frp;
858 	struct xfs_rmap_rec	*trp;
859 	unsigned int		rblocklen = xfs_rtrmap_broot_space(mp, dblock);
860 	unsigned int		numrecs;
861 	unsigned int		maxrecs;
862 
863 	xfs_btree_init_block(mp, rblock, &xfs_rtrmapbt_ops, 0, 0, ip->i_ino);
864 
865 	rblock->bb_level = dblock->bb_level;
866 	rblock->bb_numrecs = dblock->bb_numrecs;
867 	numrecs = be16_to_cpu(dblock->bb_numrecs);
868 
869 	if (be16_to_cpu(rblock->bb_level) > 0) {
870 		maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false);
871 		fkp = xfs_rtrmap_droot_key_addr(dblock, 1);
872 		tkp = xfs_rtrmap_key_addr(rblock, 1);
873 		fpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs);
874 		tpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
875 		memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
876 		memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
877 	} else {
878 		frp = xfs_rtrmap_droot_rec_addr(dblock, 1);
879 		trp = xfs_rtrmap_rec_addr(rblock, 1);
880 		memcpy(trp, frp, sizeof(*frp) * numrecs);
881 	}
882 }
883 
884 /* Load a realtime reverse mapping btree root in from disk. */
885 int
886 xfs_iformat_rtrmap(
887 	struct xfs_inode	*ip,
888 	struct xfs_dinode	*dip)
889 {
890 	struct xfs_mount	*mp = ip->i_mount;
891 	struct xfs_rtrmap_root	*dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
892 	struct xfs_btree_block	*broot;
893 	unsigned int		numrecs;
894 	unsigned int		level;
895 	int			dsize;
896 
897 	/*
898 	 * growfs must create the rtrmap inodes before adding a realtime volume
899 	 * to the filesystem, so we cannot use the rtrmapbt predicate here.
900 	 */
901 	if (!xfs_has_rmapbt(ip->i_mount)) {
902 		xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
903 		return -EFSCORRUPTED;
904 	}
905 
906 	dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK);
907 	numrecs = be16_to_cpu(dfp->bb_numrecs);
908 	level = be16_to_cpu(dfp->bb_level);
909 
910 	if (level > mp->m_rtrmap_maxlevels ||
911 	    xfs_rtrmap_droot_space_calc(level, numrecs) > dsize) {
912 		xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
913 		return -EFSCORRUPTED;
914 	}
915 
916 	broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK),
917 			xfs_rtrmap_broot_space_calc(mp, level, numrecs));
918 	if (broot)
919 		xfs_rtrmapbt_from_disk(ip, dfp, dsize, broot);
920 	return 0;
921 }
922 
923 /* Convert in-memory form of btree root to on-disk form. */
924 void
925 xfs_rtrmapbt_to_disk(
926 	struct xfs_mount	*mp,
927 	struct xfs_btree_block	*rblock,
928 	unsigned int		rblocklen,
929 	struct xfs_rtrmap_root	*dblock,
930 	unsigned int		dblocklen)
931 {
932 	struct xfs_rmap_key	*fkp;
933 	__be64			*fpp;
934 	struct xfs_rmap_key	*tkp;
935 	__be64			*tpp;
936 	struct xfs_rmap_rec	*frp;
937 	struct xfs_rmap_rec	*trp;
938 	unsigned int		numrecs;
939 	unsigned int		maxrecs;
940 
941 	ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTRMAP_CRC_MAGIC));
942 	ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid));
943 	ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL));
944 	ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
945 	ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
946 
947 	dblock->bb_level = rblock->bb_level;
948 	dblock->bb_numrecs = rblock->bb_numrecs;
949 	numrecs = be16_to_cpu(rblock->bb_numrecs);
950 
951 	if (be16_to_cpu(rblock->bb_level) > 0) {
952 		maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false);
953 		fkp = xfs_rtrmap_key_addr(rblock, 1);
954 		tkp = xfs_rtrmap_droot_key_addr(dblock, 1);
955 		fpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
956 		tpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs);
957 		memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs);
958 		memcpy(tpp, fpp, sizeof(*fpp) * numrecs);
959 	} else {
960 		frp = xfs_rtrmap_rec_addr(rblock, 1);
961 		trp = xfs_rtrmap_droot_rec_addr(dblock, 1);
962 		memcpy(trp, frp, sizeof(*frp) * numrecs);
963 	}
964 }
965 
966 /* Flush a realtime reverse mapping btree root out to disk. */
967 void
968 xfs_iflush_rtrmap(
969 	struct xfs_inode	*ip,
970 	struct xfs_dinode	*dip)
971 {
972 	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
973 	struct xfs_rtrmap_root	*dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
974 
975 	ASSERT(ifp->if_broot != NULL);
976 	ASSERT(ifp->if_broot_bytes > 0);
977 	ASSERT(xfs_rtrmap_droot_space(ifp->if_broot) <=
978 			xfs_inode_fork_size(ip, XFS_DATA_FORK));
979 	xfs_rtrmapbt_to_disk(ip->i_mount, ifp->if_broot, ifp->if_broot_bytes,
980 			dfp, XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK));
981 }
982 
983 /*
984  * Create a realtime rmap btree inode.
985  */
986 int
987 xfs_rtrmapbt_create(
988 	struct xfs_rtgroup	*rtg,
989 	struct xfs_inode	*ip,
990 	struct xfs_trans	*tp,
991 	bool			init)
992 {
993 	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
994 	struct xfs_mount	*mp = ip->i_mount;
995 	struct xfs_btree_block	*broot;
996 
997 	ifp->if_format = XFS_DINODE_FMT_META_BTREE;
998 	ASSERT(ifp->if_broot_bytes == 0);
999 	ASSERT(ifp->if_bytes == 0);
1000 
1001 	/* Initialize the empty incore btree root. */
1002 	broot = xfs_broot_realloc(ifp, xfs_rtrmap_broot_space_calc(mp, 0, 0));
1003 	if (broot)
1004 		xfs_btree_init_block(mp, broot, &xfs_rtrmapbt_ops, 0, 0,
1005 				ip->i_ino);
1006 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT);
1007 
1008 	return 0;
1009 }
1010 
1011 /*
1012  * Initialize an rmap for a realtime superblock using the potentially updated
1013  * rt geometry in the provided @mp.
1014  */
1015 int
1016 xfs_rtrmapbt_init_rtsb(
1017 	struct xfs_mount	*mp,
1018 	struct xfs_rtgroup	*rtg,
1019 	struct xfs_trans	*tp)
1020 {
1021 	struct xfs_rmap_irec	rmap = {
1022 		.rm_blockcount	= mp->m_sb.sb_rextsize,
1023 		.rm_owner	= XFS_RMAP_OWN_FS,
1024 	};
1025 	struct xfs_btree_cur	*cur;
1026 	int			error;
1027 
1028 	ASSERT(xfs_has_rtsb(mp));
1029 	ASSERT(rtg_rgno(rtg) == 0);
1030 
1031 	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
1032 	error = xfs_rmap_map_raw(cur, &rmap);
1033 	xfs_btree_del_cursor(cur, error);
1034 	return error;
1035 }
1036 
1037 /*
1038  * Return the highest rgbno currently tracked by the rmap for this rtg.
1039  */
1040 xfs_rgblock_t
1041 xfs_rtrmap_highest_rgbno(
1042 	struct xfs_rtgroup	*rtg)
1043 {
1044 	struct xfs_btree_block	*block = rtg_rmap(rtg)->i_df.if_broot;
1045 	union xfs_btree_key	key = {};
1046 	struct xfs_btree_cur	*cur;
1047 
1048 	if (block->bb_numrecs == 0)
1049 		return NULLRGBLOCK;
1050 	cur = xfs_rtrmapbt_init_cursor(NULL, rtg);
1051 	xfs_btree_get_keys(cur, block, &key);
1052 	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1053 	return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock);
1054 }
1055