// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_rmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_rtgroup.h"
#include "xfs_bmap.h"

static struct kmem_cache	*xfs_rtrmapbt_cur_cache;

/*
 * Realtime Reverse Map btree.
 *
 * This is a btree used to track the owner(s) of a given extent in the realtime
 * device.  See the comments in xfs_rmap_btree.c for more information.
 *
 * This tree is basically the same as the regular rmap btree except that it
 * is rooted in an inode and does not live in free space.
 */

static struct xfs_btree_cur *
xfs_rtrmapbt_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	return xfs_rtrmapbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group));
}

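/*
 * When the btree is rooted in the inode, the number of records that fit at
 * the root level depends on the size of the incore root (if_broot_bytes)
 * rather than on the filesystem block size, and the root's minrecs is half
 * of its maxrecs.  All other levels use the precomputed per-mount limits.
 */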
STATIC int
xfs_rtrmapbt_get_minrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	if (level == cur->bc_nlevels - 1) {
		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);

		return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
				level == 0) / 2;
	}

	return cur->bc_mp->m_rtrmap_mnr[level != 0];
}

STATIC int
xfs_rtrmapbt_get_maxrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	if (level == cur->bc_nlevels - 1) {
		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);

		return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
				level == 0);
	}

	return cur->bc_mp->m_rtrmap_mxr[level != 0];
}

/*
 * Convert the ondisk record's offset field into the ondisk key's offset field.
 * Fork and bmbt are significant parts of the rmap record key, but written
 * status is merely a record attribute.
 */
static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
{
	return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
}

STATIC void
xfs_rtrmapbt_init_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
}

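/*
 * The high key of a record points at the last block of the mapped extent:
 * both the startblock and, for inode owners, the file offset are advanced
 * by blockcount - 1.  Non-inode owners and bmbt blocks do not carry a
 * logical file offset, so only the startblock is adjusted for them.
 */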
STATIC void
xfs_rtrmapbt_init_high_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	uint64_t			off;
	int				adj;

	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;

	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	be32_add_cpu(&key->rmap.rm_startblock, adj);
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
		return;
	off = be64_to_cpu(key->rmap.rm_offset);
	off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
	key->rmap.rm_offset = cpu_to_be64(off);
}

STATIC void
xfs_rtrmapbt_init_rec_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec)
{
	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
	rec->rmap.rm_offset = cpu_to_be64(
			xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
}

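/*
 * The root of this btree lives in the inode core, not in an ondisk block,
 * so there is no root block address to hand back; zero out the pointer.
 */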
STATIC void
xfs_rtrmapbt_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	ptr->l = 0;
}

/*
 * Mask the appropriate parts of the ondisk key field for a key comparison.
 * Fork and bmbt are significant parts of the rmap record key, but written
 * status is merely a record attribute.
 */
static inline uint64_t offset_keymask(uint64_t offset)
{
	return offset & ~XFS_RMAP_OFF_UNWRITTEN;
}

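/*
 * Compare the search key against the cursor's current record.  Keys are
 * ordered by startblock, then owner, then offset; the unwritten bit is
 * masked off because written status is not part of the key.
 */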
STATIC int64_t
xfs_rtrmapbt_key_diff(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key)
{
	struct xfs_rmap_irec		*rec = &cur->bc_rec.r;
	const struct xfs_rmap_key	*kp = &key->rmap;
	__u64				x, y;
	int64_t				d;

	d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
	if (d)
		return d;

	x = be64_to_cpu(kp->rm_owner);
	y = rec->rm_owner;
	if (x > y)
		return 1;
	else if (y > x)
		return -1;

	x = offset_keymask(be64_to_cpu(kp->rm_offset));
	y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
	if (x > y)
		return 1;
	else if (y > x)
		return -1;
	return 0;
}

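/*
 * Compare two ondisk keys, honoring the caller's mask so that range queries
 * can match on only part of the key.  The physical space component must
 * always be significant.
 */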
STATIC int64_t
xfs_rtrmapbt_diff_two_keys(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2,
	const union xfs_btree_key	*mask)
{
	const struct xfs_rmap_key	*kp1 = &k1->rmap;
	const struct xfs_rmap_key	*kp2 = &k2->rmap;
	int64_t				d;
	__u64				x, y;

	/* Doesn't make sense to mask off the physical space part */
	ASSERT(!mask || mask->rmap.rm_startblock);

	d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
		     be32_to_cpu(kp2->rm_startblock);
	if (d)
		return d;

	if (!mask || mask->rmap.rm_owner) {
		x = be64_to_cpu(kp1->rm_owner);
		y = be64_to_cpu(kp2->rm_owner);
		if (x > y)
			return 1;
		else if (y > x)
			return -1;
	}

	if (!mask || mask->rmap.rm_offset) {
		/* Doesn't make sense to allow offset but not owner */
		ASSERT(!mask || mask->rmap.rm_owner);

		x = offset_keymask(be64_to_cpu(kp1->rm_offset));
		y = offset_keymask(be64_to_cpu(kp2->rm_offset));
		if (x > y)
			return 1;
		else if (y > x)
			return -1;
	}

	return 0;
}

static xfs_failaddr_t
xfs_rtrmapbt_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_target->bt_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	xfs_failaddr_t		fa;
	int			level;

	if (!xfs_verify_magic(bp, block->bb_magic))
		return __this_address;

	if (!xfs_has_rmapbt(mp))
		return __this_address;
	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
	if (fa)
		return fa;
	level = be16_to_cpu(block->bb_level);
	if (level > mp->m_rtrmap_maxlevels)
		return __this_address;

	return xfs_btree_fsblock_verify(bp, mp->m_rtrmap_mxr[level != 0]);
}

static void
xfs_rtrmapbt_read_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	if (!xfs_btree_fsblock_verify_crc(bp))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_rtrmapbt_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}

	if (bp->b_error)
		trace_xfs_btree_corrupt(bp, _RET_IP_);
}

static void
xfs_rtrmapbt_write_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	fa = xfs_rtrmapbt_verify(bp);
	if (fa) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}
	xfs_btree_fsblock_calc_crc(bp);
}

const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = {
	.name			= "xfs_rtrmapbt",
	.magic			= { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) },
	.verify_read		= xfs_rtrmapbt_read_verify,
	.verify_write		= xfs_rtrmapbt_write_verify,
	.verify_struct		= xfs_rtrmapbt_verify,
};

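/*
 * Return 1 if k1 sorts no later than k2 in the usual rmap key order
 * (startblock, then owner, then offset).
 */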
STATIC int
xfs_rtrmapbt_keys_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2)
{
	uint32_t			x;
	uint32_t			y;
	uint64_t			a;
	uint64_t			b;

	x = be32_to_cpu(k1->rmap.rm_startblock);
	y = be32_to_cpu(k2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(k1->rmap.rm_owner);
	b = be64_to_cpu(k2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
	b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

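/* Return 1 if r1 sorts no later than r2, using the same ordering as above. */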
STATIC int
xfs_rtrmapbt_recs_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*r1,
	const union xfs_btree_rec	*r2)
{
	uint32_t			x;
	uint32_t			y;
	uint64_t			a;
	uint64_t			b;

	x = be32_to_cpu(r1->rmap.rm_startblock);
	y = be32_to_cpu(r2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(r1->rmap.rm_owner);
	b = be64_to_cpu(r2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
	b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

STATIC enum xbtree_key_contig
xfs_rtrmapbt_keys_contiguous(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key1,
	const union xfs_btree_key	*key2,
	const union xfs_btree_key	*mask)
{
	ASSERT(!mask || mask->rmap.rm_startblock);

	/*
	 * We only support checking contiguity of the physical space component.
	 * If any callers ever need more specificity than that, they'll have to
	 * implement it here.
	 */
	ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));

	return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
				 be32_to_cpu(key2->rmap.rm_startblock));
}

const struct xfs_btree_ops xfs_rtrmapbt_ops = {
	.name			= "rtrmap",
	.type			= XFS_BTREE_TYPE_INODE,
	.geom_flags		= XFS_BTGEO_OVERLAPPING |
				  XFS_BTGEO_IROOT_RECORDS,

	.rec_len		= sizeof(struct xfs_rmap_rec),
	/* Overlapping btree; 2 keys per pointer. */
	.key_len		= 2 * sizeof(struct xfs_rmap_key),
	.ptr_len		= XFS_BTREE_LONG_PTR_LEN,

	.lru_refs		= XFS_RMAP_BTREE_REF,
	.statoff		= XFS_STATS_CALC_INDEX(xs_rtrmap_2),

	.dup_cursor		= xfs_rtrmapbt_dup_cursor,
	.alloc_block		= xfs_btree_alloc_metafile_block,
	.free_block		= xfs_btree_free_metafile_block,
	.get_minrecs		= xfs_rtrmapbt_get_minrecs,
	.get_maxrecs		= xfs_rtrmapbt_get_maxrecs,
	.init_key_from_rec	= xfs_rtrmapbt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_rtrmapbt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_rtrmapbt_init_rec_from_cur,
	.init_ptr_from_cur	= xfs_rtrmapbt_init_ptr_from_cur,
	.key_diff		= xfs_rtrmapbt_key_diff,
	.buf_ops		= &xfs_rtrmapbt_buf_ops,
	.diff_two_keys		= xfs_rtrmapbt_diff_two_keys,
	.keys_inorder		= xfs_rtrmapbt_keys_inorder,
	.recs_inorder		= xfs_rtrmapbt_recs_inorder,
	.keys_contiguous	= xfs_rtrmapbt_keys_contiguous,
};

/* Allocate a new rt rmap btree cursor. */
struct xfs_btree_cur *
xfs_rtrmapbt_init_cursor(
	struct xfs_trans	*tp,
	struct xfs_rtgroup	*rtg)
{
	struct xfs_inode	*ip = NULL;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_btree_cur	*cur;

	return NULL; /* XXX */

	xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);

	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_ops,
			mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache);

	cur->bc_ino.ip = ip;
	cur->bc_group = xfs_group_hold(rtg_group(rtg));
	cur->bc_ino.whichfork = XFS_DATA_FORK;
	cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1;
	cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK);

	return cur;
}

/*
 * Install a new rt reverse mapping btree root.  Caller is responsible for
 * invalidating and freeing the old btree blocks.
 */
void
xfs_rtrmapbt_commit_staged_btree(
	struct xfs_btree_cur	*cur,
	struct xfs_trans	*tp)
{
	struct xbtree_ifakeroot	*ifake = cur->bc_ino.ifake;
	struct xfs_ifork	*ifp;
	int			flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	/*
	 * Free any resources hanging off the real fork, then shallow-copy the
	 * staging fork's contents into the real fork to transfer everything
	 * we just built.
	 */
	ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK);
	xfs_idestroy_fork(ifp);
	memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));

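	/* Stash the rt group number in i_projid, as rt group metadata inodes do. */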
	cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno;
	xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
	xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK);
}

/* Calculate number of records in a rt reverse mapping btree block. */
static inline unsigned int
xfs_rtrmapbt_block_maxrecs(
	unsigned int		blocklen,
	bool			leaf)
{
	if (leaf)
		return blocklen / sizeof(struct xfs_rmap_rec);
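	/* Each node entry carries a low key, a high key, and one pointer. */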
	return blocklen /
		(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rtrmap_ptr_t));
}

/*
 * Calculate the number of records that fit in an rt reverse mapping btree
 * block once the block header has been subtracted.
 */
unsigned int
xfs_rtrmapbt_maxrecs(
	struct xfs_mount	*mp,
	unsigned int		blocklen,
	bool			leaf)
{
	blocklen -= XFS_RTRMAP_BLOCK_LEN;
	return xfs_rtrmapbt_block_maxrecs(blocklen, leaf);
}

/* Compute the max possible height for realtime reverse mapping btrees. */
unsigned int
xfs_rtrmapbt_maxlevels_ondisk(void)
{
	unsigned int		minrecs[2];
	unsigned int		blocklen;

	blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;

	minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2;
	minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2;

	/* We need at most one record for every block in an rt group. */
	return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS);
}

int __init
xfs_rtrmapbt_init_cur_cache(void)
{
	xfs_rtrmapbt_cur_cache = kmem_cache_create("xfs_rtrmapbt_cur",
			xfs_btree_cur_sizeof(xfs_rtrmapbt_maxlevels_ondisk()),
			0, 0, NULL);

	if (!xfs_rtrmapbt_cur_cache)
		return -ENOMEM;
	return 0;
}

void
xfs_rtrmapbt_destroy_cur_cache(void)
{
	kmem_cache_destroy(xfs_rtrmapbt_cur_cache);
	xfs_rtrmapbt_cur_cache = NULL;
}

/* Compute the maximum height of an rt reverse mapping btree. */
void
xfs_rtrmapbt_compute_maxlevels(
	struct xfs_mount	*mp)
{
	unsigned int		d_maxlevels, r_maxlevels;

	if (!xfs_has_rtrmapbt(mp)) {
		mp->m_rtrmap_maxlevels = 0;
		return;
	}

	/*
	 * The realtime rmapbt lives on the data device, which means that its
	 * maximum height is constrained by the size of the data device and
	 * the height required to store one rmap record for each block in an
	 * rt group.
	 */
	d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr,
				mp->m_sb.sb_dblocks);
	r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr,
				mp->m_groups[XG_TYPE_RTG].blocks);

	/* Add one level to handle the inode root level. */
	mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
}
543