// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_metafile.h"
#include "xfs_rmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_rtgroup.h"
#include "xfs_bmap.h"

static struct kmem_cache	*xfs_rtrmapbt_cur_cache;

/*
 * Realtime Reverse Map btree.
 *
 * This is a btree used to track the owner(s) of a given extent in the realtime
 * device. See the comments in xfs_rmap_btree.c for more information.
 *
 * This tree is basically the same as the regular rmap btree except that it
 * is rooted in an inode and does not live in free space.
 */

static struct xfs_btree_cur *
xfs_rtrmapbt_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	return xfs_rtrmapbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group));
}

STATIC int
xfs_rtrmapbt_get_minrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	if (level == cur->bc_nlevels - 1) {
		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);

		return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
				level == 0) / 2;
	}

	return cur->bc_mp->m_rtrmap_mnr[level != 0];
}

STATIC int
xfs_rtrmapbt_get_maxrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	if (level == cur->bc_nlevels - 1) {
		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);

		return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
				level == 0);
	}

	return cur->bc_mp->m_rtrmap_mxr[level != 0];
}
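/*
 * Worked example for the two helpers above (illustrative only; the sizes are
 * assumptions, not an ondisk contract): because this btree is rooted in an
 * inode fork, the root level's capacity follows if_broot_bytes rather than
 * the mount-time m_rtrmap_mxr/m_rtrmap_mnr tables.  Assuming 24-byte rmap
 * records and a hypothetical root with
 * if_broot_bytes == XFS_RTRMAP_BLOCK_LEN + 120, a one-level tree would see
 * get_maxrecs() == 120 / 24 == 5 and get_minrecs() == 5 / 2 == 2, while
 * every non-root level uses the fixed per-level limits computed at mount.
 */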
/*
 * Convert the ondisk record's offset field into the ondisk key's offset field.
 * Fork and bmbt are significant parts of the rmap record key, but written
 * status is merely a record attribute.
 */
static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
{
	return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
}

STATIC void
xfs_rtrmapbt_init_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
}

STATIC void
xfs_rtrmapbt_init_high_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	uint64_t			off;
	int				adj;

	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;

	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	be32_add_cpu(&key->rmap.rm_startblock, adj);
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
		return;
	off = be64_to_cpu(key->rmap.rm_offset);
	off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
	key->rmap.rm_offset = cpu_to_be64(off);
}

STATIC void
xfs_rtrmapbt_init_rec_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec)
{
	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
	rec->rmap.rm_offset = cpu_to_be64(
			xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
}

STATIC void
xfs_rtrmapbt_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	ptr->l = 0;
}
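/*
 * Worked example for the high key above (values are made up): an inode-owned
 * record with rm_startblock == 10, rm_blockcount == 5 and file offset 100
 * maps blocks [10, 14], so the high key becomes
 * { rm_startblock = 14, rm_owner = <inode>, rm_offset = 104 }.  For
 * non-inode owners and bmbt blocks the offset field is not a file offset,
 * so only the startblock is adjusted by blockcount - 1.
 */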
/*
 * Mask the appropriate parts of the ondisk key field for a key comparison.
 * Fork and bmbt are significant parts of the rmap record key, but written
 * status is merely a record attribute.
 */
static inline uint64_t offset_keymask(uint64_t offset)
{
	return offset & ~XFS_RMAP_OFF_UNWRITTEN;
}

STATIC int64_t
xfs_rtrmapbt_key_diff(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key)
{
	struct xfs_rmap_irec		*rec = &cur->bc_rec.r;
	const struct xfs_rmap_key	*kp = &key->rmap;
	__u64				x, y;
	int64_t				d;

	d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
	if (d)
		return d;

	x = be64_to_cpu(kp->rm_owner);
	y = rec->rm_owner;
	if (x > y)
		return 1;
	else if (y > x)
		return -1;

	x = offset_keymask(be64_to_cpu(kp->rm_offset));
	y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
	if (x > y)
		return 1;
	else if (y > x)
		return -1;
	return 0;
}

STATIC int64_t
xfs_rtrmapbt_diff_two_keys(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2,
	const union xfs_btree_key	*mask)
{
	const struct xfs_rmap_key	*kp1 = &k1->rmap;
	const struct xfs_rmap_key	*kp2 = &k2->rmap;
	int64_t				d;
	__u64				x, y;

	/* Doesn't make sense to mask off the physical space part */
	ASSERT(!mask || mask->rmap.rm_startblock);

	d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
	    be32_to_cpu(kp2->rm_startblock);
	if (d)
		return d;

	if (!mask || mask->rmap.rm_owner) {
		x = be64_to_cpu(kp1->rm_owner);
		y = be64_to_cpu(kp2->rm_owner);
		if (x > y)
			return 1;
		else if (y > x)
			return -1;
	}

	if (!mask || mask->rmap.rm_offset) {
		/* Doesn't make sense to allow offset but not owner */
		ASSERT(!mask || mask->rmap.rm_owner);

		x = offset_keymask(be64_to_cpu(kp1->rm_offset));
		y = offset_keymask(be64_to_cpu(kp2->rm_offset));
		if (x > y)
			return 1;
		else if (y > x)
			return -1;
	}

	return 0;
}

static xfs_failaddr_t
xfs_rtrmapbt_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_target->bt_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	xfs_failaddr_t		fa;
	int			level;

	if (!xfs_verify_magic(bp, block->bb_magic))
		return __this_address;

	if (!xfs_has_rmapbt(mp))
		return __this_address;
	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
	if (fa)
		return fa;
	level = be16_to_cpu(block->bb_level);
	if (level > mp->m_rtrmap_maxlevels)
		return __this_address;

	return xfs_btree_fsblock_verify(bp, mp->m_rtrmap_mxr[level != 0]);
}

static void
xfs_rtrmapbt_read_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	if (!xfs_btree_fsblock_verify_crc(bp))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_rtrmapbt_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}

	if (bp->b_error)
		trace_xfs_btree_corrupt(bp, _RET_IP_);
}

static void
xfs_rtrmapbt_write_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	fa = xfs_rtrmapbt_verify(bp);
	if (fa) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}
	xfs_btree_fsblock_calc_crc(bp);
}

const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = {
	.name			= "xfs_rtrmapbt",
	.magic			= { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) },
	.verify_read		= xfs_rtrmapbt_read_verify,
	.verify_write		= xfs_rtrmapbt_write_verify,
	.verify_struct		= xfs_rtrmapbt_verify,
};
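/*
 * Sketch of the comparison rules above (example values are assumptions, not
 * ondisk data): keys sort by startblock, then owner, then masked offset, and
 * the unwritten bit never participates.  Two keys built from records that
 * differ only in XFS_RMAP_OFF_UNWRITTEN therefore compare equal:
 *
 *	k1.rmap.rm_offset = cpu_to_be64(100 | XFS_RMAP_OFF_UNWRITTEN);
 *	k2.rmap.rm_offset = cpu_to_be64(100);
 *	// same startblock/owner => xfs_rtrmapbt_diff_two_keys() returns 0
 */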
STATIC int
xfs_rtrmapbt_keys_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2)
{
	uint32_t			x;
	uint32_t			y;
	uint64_t			a;
	uint64_t			b;

	x = be32_to_cpu(k1->rmap.rm_startblock);
	y = be32_to_cpu(k2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(k1->rmap.rm_owner);
	b = be64_to_cpu(k2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
	b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

STATIC int
xfs_rtrmapbt_recs_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*r1,
	const union xfs_btree_rec	*r2)
{
	uint32_t			x;
	uint32_t			y;
	uint64_t			a;
	uint64_t			b;

	x = be32_to_cpu(r1->rmap.rm_startblock);
	y = be32_to_cpu(r2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(r1->rmap.rm_owner);
	b = be64_to_cpu(r2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
	b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

STATIC enum xbtree_key_contig
xfs_rtrmapbt_keys_contiguous(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key1,
	const union xfs_btree_key	*key2,
	const union xfs_btree_key	*mask)
{
	ASSERT(!mask || mask->rmap.rm_startblock);

	/*
	 * We only support checking contiguity of the physical space component.
	 * If any callers ever need more specificity than that, they'll have to
	 * implement it here.
	 */
	ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));

	return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
				 be32_to_cpu(key2->rmap.rm_startblock));
}

const struct xfs_btree_ops xfs_rtrmapbt_ops = {
	.name			= "rtrmap",
	.type			= XFS_BTREE_TYPE_INODE,
	.geom_flags		= XFS_BTGEO_OVERLAPPING |
				  XFS_BTGEO_IROOT_RECORDS,

	.rec_len		= sizeof(struct xfs_rmap_rec),
	/* Overlapping btree; 2 keys per pointer. */
	.key_len		= 2 * sizeof(struct xfs_rmap_key),
	.ptr_len		= XFS_BTREE_LONG_PTR_LEN,

	.lru_refs		= XFS_RMAP_BTREE_REF,
	.statoff		= XFS_STATS_CALC_INDEX(xs_rtrmap_2),

	.dup_cursor		= xfs_rtrmapbt_dup_cursor,
	.alloc_block		= xfs_btree_alloc_metafile_block,
	.free_block		= xfs_btree_free_metafile_block,
	.get_minrecs		= xfs_rtrmapbt_get_minrecs,
	.get_maxrecs		= xfs_rtrmapbt_get_maxrecs,
	.init_key_from_rec	= xfs_rtrmapbt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_rtrmapbt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_rtrmapbt_init_rec_from_cur,
	.init_ptr_from_cur	= xfs_rtrmapbt_init_ptr_from_cur,
	.key_diff		= xfs_rtrmapbt_key_diff,
	.buf_ops		= &xfs_rtrmapbt_buf_ops,
	.diff_two_keys		= xfs_rtrmapbt_diff_two_keys,
	.keys_inorder		= xfs_rtrmapbt_keys_inorder,
	.recs_inorder		= xfs_rtrmapbt_recs_inorder,
	.keys_contiguous	= xfs_rtrmapbt_keys_contiguous,
};
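/*
 * Note on XFS_BTGEO_OVERLAPPING (sketch; the interval values below are made
 * up): rmap records for the same physical space may overlap, so each node
 * entry carries both a low and a high key, which is why key_len above is
 * twice sizeof(struct xfs_rmap_key).  A node entry whose subtree holds
 * records covering blocks [10, 14] and [12, 20] would carry low key 10 and
 * high key 20, letting range queries see the full reach of that subtree.
 */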
/* Allocate a new rt rmap btree cursor. */
struct xfs_btree_cur *
xfs_rtrmapbt_init_cursor(
	struct xfs_trans	*tp,
	struct xfs_rtgroup	*rtg)
{
	struct xfs_inode	*ip = rtg_rmap(rtg);
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_btree_cur	*cur;

	xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);

	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_ops,
			mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache);

	cur->bc_ino.ip = ip;
	cur->bc_group = xfs_group_hold(rtg_group(rtg));
	cur->bc_ino.whichfork = XFS_DATA_FORK;
	cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1;
	cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK);

	return cur;
}

/*
 * Install a new rt reverse mapping btree root.  Caller is responsible for
 * invalidating and freeing the old btree blocks.
 */
void
xfs_rtrmapbt_commit_staged_btree(
	struct xfs_btree_cur	*cur,
	struct xfs_trans	*tp)
{
	struct xbtree_ifakeroot	*ifake = cur->bc_ino.ifake;
	struct xfs_ifork	*ifp;
	int			flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE);

	/*
	 * Free any resources hanging off the real fork, then shallow-copy the
	 * staging fork's contents into the real fork to transfer everything
	 * we just built.
	 */
	ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK);
	xfs_idestroy_fork(ifp);
	memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));

	cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno;
	xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
	xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK);
}

/* Calculate number of records in a rt reverse mapping btree block. */
static inline unsigned int
xfs_rtrmapbt_block_maxrecs(
	unsigned int		blocklen,
	bool			leaf)
{
	if (leaf)
		return blocklen / sizeof(struct xfs_rmap_rec);
	return blocklen /
		(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rtrmap_ptr_t));
}

/*
 * Calculate number of records in an rt reverse mapping btree block.
 */
unsigned int
xfs_rtrmapbt_maxrecs(
	struct xfs_mount	*mp,
	unsigned int		blocklen,
	bool			leaf)
{
	blocklen -= XFS_RTRMAP_BLOCK_LEN;
	return xfs_rtrmapbt_block_maxrecs(blocklen, leaf);
}

/* Compute the max possible height for realtime reverse mapping btrees. */
unsigned int
xfs_rtrmapbt_maxlevels_ondisk(void)
{
	unsigned int		minrecs[2];
	unsigned int		blocklen;

	blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;

	minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2;
	minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2;

	/* We need at most one record for every block in an rt group. */
	return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS);
}

int __init
xfs_rtrmapbt_init_cur_cache(void)
{
	xfs_rtrmapbt_cur_cache = kmem_cache_create("xfs_rtrmapbt_cur",
			xfs_btree_cur_sizeof(xfs_rtrmapbt_maxlevels_ondisk()),
			0, 0, NULL);

	if (!xfs_rtrmapbt_cur_cache)
		return -ENOMEM;
	return 0;
}

void
xfs_rtrmapbt_destroy_cur_cache(void)
{
	kmem_cache_destroy(xfs_rtrmapbt_cur_cache);
	xfs_rtrmapbt_cur_cache = NULL;
}
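/*
 * Usage sketch for the cursor lifecycle above (hedged: the locking flags and
 * the xfs_rmap_query_all() consumer are assumptions about one possible
 * caller, and my_callback/my_priv are hypothetical).  Cursors come from
 * xfs_rtrmapbt_cur_cache, created at module init, and are returned to it by
 * xfs_btree_del_cursor():
 *
 *	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
 *	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
 *	error = xfs_rmap_query_all(cur, my_callback, my_priv);
 *	xfs_btree_del_cursor(cur, error);
 *	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
 */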
/* Compute the maximum height of an rt reverse mapping btree. */
void
xfs_rtrmapbt_compute_maxlevels(
	struct xfs_mount	*mp)
{
	unsigned int		d_maxlevels, r_maxlevels;

	if (!xfs_has_rtrmapbt(mp)) {
		mp->m_rtrmap_maxlevels = 0;
		return;
	}

	/*
	 * The realtime rmapbt lives on the data device, which means that its
	 * maximum height is constrained by the size of the data device and
	 * the height required to store one rmap record for each block in an
	 * rt group.
	 */
	d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr,
			mp->m_sb.sb_dblocks);
	r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr,
			mp->m_groups[XG_TYPE_RTG].blocks);

	/* Add one level to handle the inode root level. */
	mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1;
}
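/*
 * Worked example for the computation above (all numbers hypothetical): if
 * the minrecs-based fanout could address the whole data device in 5 levels
 * (d_maxlevels == 5) but an rt group holds few enough blocks that 4 levels
 * suffice for one record per block (r_maxlevels == 4), then
 * m_rtrmap_maxlevels = min(5, 4) + 1 = 5, where the extra level accounts
 * for the root that lives in the inode fork rather than in a disk block.
 */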