// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_metafile.h"
#include "xfs_rmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_rtgroup.h"
#include "xfs_bmap.h"

static struct kmem_cache	*xfs_rtrmapbt_cur_cache;

/*
 * Realtime Reverse Map btree.
 *
 * This is a btree used to track the owner(s) of a given extent in the realtime
 * device.  See the comments in xfs_rmap_btree.c for more information.
 *
 * This tree is basically the same as the regular rmap btree except that it
 * is rooted in an inode and does not live in free space.
 */

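/* Duplicate an existing cursor for the same tree and rtgroup. */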
static struct xfs_btree_cur *
xfs_rtrmapbt_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	return xfs_rtrmapbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group));
}

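/*
 * Return the minimum number of records for a block at the given btree level.
 * The root lives in the inode fork, so its minimum is derived from the
 * current size of the incore root rather than from a full disk block.
 */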
STATIC int
xfs_rtrmapbt_get_minrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	if (level == cur->bc_nlevels - 1) {
		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);

		return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
				level == 0) / 2;
	}

	return cur->bc_mp->m_rtrmap_mnr[level != 0];
}

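/* Return the maximum number of records for a block at the given btree level. */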
STATIC int
xfs_rtrmapbt_get_maxrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	if (level == cur->bc_nlevels - 1) {
		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);

		return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
				level == 0);
	}

	return cur->bc_mp->m_rtrmap_mxr[level != 0];
}

/*
 * Convert the ondisk record's offset field into the ondisk key's offset field.
 * Fork and bmbt are significant parts of the rmap record key, but written
 * status is merely a record attribute.
 */
static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
{
	return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
}

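/* Initialize a (low) search key from the given ondisk record. */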
STATIC void
xfs_rtrmapbt_init_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
}

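/*
 * Initialize the high key of the range described by an ondisk record: the
 * low key with the startblock (and, for inode owners, the file offset)
 * advanced by blockcount - 1.  As a hypothetical example, a record mapping
 * 8 blocks at startblock 100 and offset 16 yields a high key with
 * startblock 107 and offset 23.
 */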
STATIC void
xfs_rtrmapbt_init_high_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	uint64_t			off;
	int				adj;

	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;

	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	be32_add_cpu(&key->rmap.rm_startblock, adj);
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
		return;
	off = be64_to_cpu(key->rmap.rm_offset);
	off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
	key->rmap.rm_offset = cpu_to_be64(off);
}

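/* Initialize an ondisk record from the cursor's incore record. */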
STATIC void
xfs_rtrmapbt_init_rec_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec)
{
	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
	rec->rmap.rm_offset = cpu_to_be64(
			xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
}

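/*
 * Initialize a pointer to the root block.  This btree is rooted in the inode
 * fork, so there is no ondisk block address to return; signal that with zero.
 */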
STATIC void
xfs_rtrmapbt_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	ptr->l = 0;
}

/*
 * Mask the appropriate parts of the ondisk key field for a key comparison.
 * Fork and bmbt are significant parts of the rmap record key, but written
 * status is merely a record attribute.
 */
static inline uint64_t offset_keymask(uint64_t offset)
{
	return offset & ~XFS_RMAP_OFF_UNWRITTEN;
}

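/*
 * Compare the given search key against the cursor's incore record, returning
 * key - record.  Keys compare by startblock, then owner, then offset.
 */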
STATIC int64_t
xfs_rtrmapbt_key_diff(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key)
{
	struct xfs_rmap_irec		*rec = &cur->bc_rec.r;
	const struct xfs_rmap_key	*kp = &key->rmap;
	__u64				x, y;
	int64_t				d;

	d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
	if (d)
		return d;

	x = be64_to_cpu(kp->rm_owner);
	y = rec->rm_owner;
	if (x > y)
		return 1;
	else if (y > x)
		return -1;

	x = offset_keymask(be64_to_cpu(kp->rm_offset));
	y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
	if (x > y)
		return 1;
	else if (y > x)
		return -1;
	return 0;
}

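/*
 * Compare two keys, returning k1 - k2.  The mask, if supplied, selects which
 * key fields participate in the comparison.
 */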
STATIC int64_t
xfs_rtrmapbt_diff_two_keys(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2,
	const union xfs_btree_key	*mask)
{
	const struct xfs_rmap_key	*kp1 = &k1->rmap;
	const struct xfs_rmap_key	*kp2 = &k2->rmap;
	int64_t				d;
	__u64				x, y;

	/* Doesn't make sense to mask off the physical space part */
	ASSERT(!mask || mask->rmap.rm_startblock);

	d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
		     be32_to_cpu(kp2->rm_startblock);
	if (d)
		return d;

	if (!mask || mask->rmap.rm_owner) {
		x = be64_to_cpu(kp1->rm_owner);
		y = be64_to_cpu(kp2->rm_owner);
		if (x > y)
			return 1;
		else if (y > x)
			return -1;
	}

	if (!mask || mask->rmap.rm_offset) {
		/* Doesn't make sense to allow offset but not owner */
		ASSERT(!mask || mask->rmap.rm_owner);

		x = offset_keymask(be64_to_cpu(kp1->rm_offset));
		y = offset_keymask(be64_to_cpu(kp2->rm_offset));
		if (x > y)
			return 1;
		else if (y > x)
			return -1;
	}

	return 0;
}

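/* Check the ondisk contents of a realtime rmap btree block. */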
static xfs_failaddr_t
xfs_rtrmapbt_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_target->bt_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	xfs_failaddr_t		fa;
	int			level;

	if (!xfs_verify_magic(bp, block->bb_magic))
		return __this_address;

	if (!xfs_has_rmapbt(mp))
		return __this_address;
	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
	if (fa)
		return fa;
	level = be16_to_cpu(block->bb_level);
	if (level > mp->m_rtrmap_maxlevels)
		return __this_address;

	return xfs_btree_fsblock_verify(bp, mp->m_rtrmap_mxr[level != 0]);
}

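/* Verify the checksum and structure of a block that was just read in. */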
static void
xfs_rtrmapbt_read_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	if (!xfs_btree_fsblock_verify_crc(bp))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_rtrmapbt_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}

	if (bp->b_error)
		trace_xfs_btree_corrupt(bp, _RET_IP_);
}

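/* Verify a block's structure and stamp its checksum before it is written. */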
static void
xfs_rtrmapbt_write_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	fa = xfs_rtrmapbt_verify(bp);
	if (fa) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}
	xfs_btree_fsblock_calc_crc(bp);
}

const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = {
	.name			= "xfs_rtrmapbt",
	.magic			= { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) },
	.verify_read		= xfs_rtrmapbt_read_verify,
	.verify_write		= xfs_rtrmapbt_write_verify,
	.verify_struct		= xfs_rtrmapbt_verify,
};

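/* Return 1 if k1 sorts at or before k2, 0 otherwise. */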
STATIC int
xfs_rtrmapbt_keys_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2)
{
	uint32_t			x;
	uint32_t			y;
	uint64_t			a;
	uint64_t			b;

	x = be32_to_cpu(k1->rmap.rm_startblock);
	y = be32_to_cpu(k2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(k1->rmap.rm_owner);
	b = be64_to_cpu(k2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
	b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

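/* Return 1 if r1 sorts at or before r2, 0 otherwise. */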
STATIC int
xfs_rtrmapbt_recs_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*r1,
	const union xfs_btree_rec	*r2)
{
	uint32_t			x;
	uint32_t			y;
	uint64_t			a;
	uint64_t			b;

	x = be32_to_cpu(r1->rmap.rm_startblock);
	y = be32_to_cpu(r2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(r1->rmap.rm_owner);
	b = be64_to_cpu(r2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
	b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

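/*
 * Decide whether the physical block ranges described by a pair of keys are
 * contiguous with each other.
 */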
STATIC enum xbtree_key_contig
xfs_rtrmapbt_keys_contiguous(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key1,
	const union xfs_btree_key	*key2,
	const union xfs_btree_key	*mask)
{
	ASSERT(!mask || mask->rmap.rm_startblock);

	/*
	 * We only support checking contiguity of the physical space component.
	 * If any callers ever need more specificity than that, they'll have to
	 * implement it here.
	 */
	ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));

	return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
				 be32_to_cpu(key2->rmap.rm_startblock));
}

const struct xfs_btree_ops xfs_rtrmapbt_ops = {
	.name			= "rtrmap",
	.type			= XFS_BTREE_TYPE_INODE,
	.geom_flags		= XFS_BTGEO_OVERLAPPING |
				  XFS_BTGEO_IROOT_RECORDS,

	.rec_len		= sizeof(struct xfs_rmap_rec),
	/* Overlapping btree; 2 keys per pointer. */
	.key_len		= 2 * sizeof(struct xfs_rmap_key),
	.ptr_len		= XFS_BTREE_LONG_PTR_LEN,

	.lru_refs		= XFS_RMAP_BTREE_REF,
	.statoff		= XFS_STATS_CALC_INDEX(xs_rtrmap_2),

	.dup_cursor		= xfs_rtrmapbt_dup_cursor,
	.alloc_block		= xfs_btree_alloc_metafile_block,
	.free_block		= xfs_btree_free_metafile_block,
	.get_minrecs		= xfs_rtrmapbt_get_minrecs,
	.get_maxrecs		= xfs_rtrmapbt_get_maxrecs,
	.init_key_from_rec	= xfs_rtrmapbt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_rtrmapbt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_rtrmapbt_init_rec_from_cur,
	.init_ptr_from_cur	= xfs_rtrmapbt_init_ptr_from_cur,
	.key_diff		= xfs_rtrmapbt_key_diff,
	.buf_ops		= &xfs_rtrmapbt_buf_ops,
	.diff_two_keys		= xfs_rtrmapbt_diff_two_keys,
	.keys_inorder		= xfs_rtrmapbt_keys_inorder,
	.recs_inorder		= xfs_rtrmapbt_recs_inorder,
	.keys_contiguous	= xfs_rtrmapbt_keys_contiguous,
};

/* Allocate a new rt rmap btree cursor. */
struct xfs_btree_cur *
xfs_rtrmapbt_init_cursor(
	struct xfs_trans	*tp,
	struct xfs_rtgroup	*rtg)
{
	struct xfs_inode	*ip = rtg_rmap(rtg);
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_btree_cur	*cur;

	xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);

	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_ops,
			mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache);

	cur->bc_ino.ip = ip;
	cur->bc_group = xfs_group_hold(rtg_group(rtg));
	cur->bc_ino.whichfork = XFS_DATA_FORK;
	cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1;
	cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK);

	return cur;
}

/*
 * Install a new rt reverse mapping btree root.  Caller is responsible for
 * invalidating and freeing the old btree blocks.
 */
void
xfs_rtrmapbt_commit_staged_btree(
	struct xfs_btree_cur	*cur,
	struct xfs_trans	*tp)
{
	struct xbtree_ifakeroot	*ifake = cur->bc_ino.ifake;
	struct xfs_ifork	*ifp;
	int			flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE);

	/*
	 * Free any resources hanging off the real fork, then shallow-copy the
	 * staging fork's contents into the real fork to transfer everything
	 * we just built.
	 */
	ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK);
	xfs_idestroy_fork(ifp);
	memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));

	cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno;
	xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
	xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK);
}

/* Calculate number of records in a rt reverse mapping btree block. */
static inline unsigned int
xfs_rtrmapbt_block_maxrecs(
	unsigned int		blocklen,
	bool			leaf)
{
	if (leaf)
		return blocklen / sizeof(struct xfs_rmap_rec);
	return blocklen /
		(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rtrmap_ptr_t));
}

/*
 * Calculate the number of records in an rt reverse mapping btree block,
 * accounting for the space consumed by the block header.
 */
unsigned int
xfs_rtrmapbt_maxrecs(
	struct xfs_mount	*mp,
	unsigned int		blocklen,
	bool			leaf)
{
	blocklen -= XFS_RTRMAP_BLOCK_LEN;
	return xfs_rtrmapbt_block_maxrecs(blocklen, leaf);
}

/* Compute the max possible height for realtime reverse mapping btrees. */
unsigned int
xfs_rtrmapbt_maxlevels_ondisk(void)
{
	unsigned int		minrecs[2];
	unsigned int		blocklen;

	blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;

	minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2;
	minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2;

	/* We need at most one record for every block in an rt group. */
	return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS);
}

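/* Set up the incore btree cursor cache; called at module initialization. */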
int __init
xfs_rtrmapbt_init_cur_cache(void)
{
	xfs_rtrmapbt_cur_cache = kmem_cache_create("xfs_rtrmapbt_cur",
			xfs_btree_cur_sizeof(xfs_rtrmapbt_maxlevels_ondisk()),
			0, 0, NULL);

	if (!xfs_rtrmapbt_cur_cache)
		return -ENOMEM;
	return 0;
}

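/* Tear down the incore btree cursor cache. */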
void
xfs_rtrmapbt_destroy_cur_cache(void)
{
	kmem_cache_destroy(xfs_rtrmapbt_cur_cache);
	xfs_rtrmapbt_cur_cache = NULL;
}

/* Compute the maximum height of an rt reverse mapping btree. */
void
xfs_rtrmapbt_compute_maxlevels(
	struct xfs_mount	*mp)
{
	unsigned int		d_maxlevels, r_maxlevels;

	if (!xfs_has_rtrmapbt(mp)) {
		mp->m_rtrmap_maxlevels = 0;
		return;
	}

	/*
	 * The realtime rmapbt lives on the data device, which means that its
	 * maximum height is constrained by the size of the data device and
	 * the height required to store one rmap record for each block in an
	 * rt group.
	 */
	d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr,
				mp->m_sb.sb_dblocks);
	r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr,
				mp->m_groups[XG_TYPE_RTG].blocks);

	/* Add one level to handle the inode root level. */
	mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
}

/* Calculate the rtrmap btree size for some records. */
static unsigned long long
xfs_rtrmapbt_calc_size(
	struct xfs_mount	*mp,
	unsigned long long	len)
{
	return xfs_btree_calc_size(mp->m_rtrmap_mnr, len);
}

/*
 * Calculate the maximum realtime rmap btree size for a given number of rt
 * blocks.
 */
static unsigned long long
xfs_rtrmapbt_max_size(
	struct xfs_mount	*mp,
	xfs_rtblock_t		rtblocks)
{
	/* Bail out if we're uninitialized, which can happen in mkfs. */
	if (mp->m_rtrmap_mxr[0] == 0)
		return 0;

	return xfs_rtrmapbt_calc_size(mp, rtblocks);
}

/*
 * Figure out how many blocks to reserve and how many are used by this btree.
 */
xfs_filblks_t
xfs_rtrmapbt_calc_reserves(
	struct xfs_mount	*mp)
{
	uint32_t		blocks = mp->m_groups[XG_TYPE_RTG].blocks;

	if (!xfs_has_rtrmapbt(mp))
		return 0;

	/* Reserve 1% of the rtgroup or enough for 1 block per record. */
	return max_t(xfs_filblks_t, blocks / 100,
			xfs_rtrmapbt_max_size(mp, blocks));
}
584