// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2014 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"

static struct kmem_cache	*xfs_rmapbt_cur_cache;

/*
 * Reverse map btree.
 *
 * This is a per-ag tree used to track the owner(s) of a given extent. With
 * reflink it is possible for there to be multiple owners, which is a departure
 * from classic XFS. Owner records for data extents are inserted when the
 * extent is mapped and removed when an extent is unmapped. Owner records for
 * all other block types (i.e. metadata) are inserted when an extent is
 * allocated and removed when an extent is freed. There can only be one owner
 * of a metadata extent, usually an inode or some other metadata structure like
 * an AG btree.
 *
 * The rmap btree is part of the free space management, so blocks for the tree
 * are sourced from the agfl. Hence we need transaction reservation support for
 * this tree so that the freelist is always large enough. This also impacts on
 * the minimum space we need to leave free in the AG.
 *
 * The tree is ordered by [ag block, owner, offset]. This is a large key size,
 * but it is the only way to enforce unique keys when a block can be owned by
 * multiple files at any offset. There's no need to order/search by extent
 * size for online updating/management of the tree. It is intended that most
 * reverse lookups will be to find the owner(s) of a particular block, or to
 * try to recover tree and file data from corrupt primary metadata.
 */
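
/*
 * Illustration (hypothetical numbers, not taken from the on-disk format
 * specification): a data block at agbno 50 that is shared via reflink by
 * inode 129 at file offset 0 and by inode 130 at file offset 8 is described
 * by two records keyed [50, 129, 0] and [50, 130, 8], which sort adjacently
 * in the tree because the key orders on agbno first.
 */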

static struct xfs_btree_cur *
xfs_rmapbt_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
			cur->bc_ag.agbp, cur->bc_ag.pag);
}

STATIC void
xfs_rmapbt_set_root(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				inc)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;

	ASSERT(ptr->s != 0);

	agf->agf_rmap_root = ptr->s;
	be32_add_cpu(&agf->agf_rmap_level, inc);
	cur->bc_ag.pag->pagf_rmap_level += inc;

	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}

STATIC int
xfs_rmapbt_alloc_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*start,
	union xfs_btree_ptr		*new,
	int				*stat)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;
	struct xfs_perag	*pag = cur->bc_ag.pag;
	struct xfs_alloc_arg	args = { .len = 1 };
	int			error;
	xfs_agblock_t		bno;

	/* Allocate the new block from the freelist. If we can't, give up. */
	error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp,
				       &bno, 1);
	if (error)
		return error;
	if (bno == NULLAGBLOCK) {
		*stat = 0;
		return 0;
	}

	xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false);

	new->s = cpu_to_be32(bno);
	be32_add_cpu(&agf->agf_rmap_blocks, 1);
	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);

	/*
	 * Since rmapbt blocks are sourced from the AGFL, they are allocated
	 * one at a time and the reservation updates don't require a
	 * transaction.
	 */
	xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args);

	*stat = 1;
	return 0;
}

STATIC int
xfs_rmapbt_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;
	struct xfs_perag	*pag = cur->bc_ag.pag;
	xfs_agblock_t		bno;
	int			error;

	bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
	be32_add_cpu(&agf->agf_rmap_blocks, -1);
	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
	error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
	if (error)
		return error;

	xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1,
			XFS_EXTENT_BUSY_SKIP_DISCARD);

	xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
	return 0;
}

STATIC int
xfs_rmapbt_get_minrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return cur->bc_mp->m_rmap_mnr[level != 0];
}

STATIC int
xfs_rmapbt_get_maxrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return cur->bc_mp->m_rmap_mxr[level != 0];
}

/*
 * Convert the ondisk record's offset field into the ondisk key's offset field.
 * Fork and bmbt are significant parts of the rmap record key, but written
 * status is merely a record attribute.
 */
static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
{
	return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
}

STATIC void
xfs_rmapbt_init_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
}

/*
 * The high key for a reverse mapping record can be computed by shifting
 * the startblock and offset to the highest value that would still map
 * to that record. In practice this means that we add blockcount-1 to
 * the startblock for all records, and if the record is for a data/attr
 * fork mapping, we add blockcount-1 to the offset too.
 */
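/*
 * For example (hypothetical values): a data fork mapping with
 * rm_startblock == 100, rm_blockcount == 8 and rm_offset == 16 has a low key
 * of [100, owner, 16] and a high key of [107, owner, 23].  A bmbt block or a
 * non-inode (metadata) owner keeps its offset unchanged in the high key.
 */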
STATIC void
xfs_rmapbt_init_high_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	uint64_t			off;
	int				adj;

	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;

	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	be32_add_cpu(&key->rmap.rm_startblock, adj);
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
		return;
	off = be64_to_cpu(key->rmap.rm_offset);
	off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
	key->rmap.rm_offset = cpu_to_be64(off);
}

STATIC void
xfs_rmapbt_init_rec_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec)
{
	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
	rec->rmap.rm_offset = cpu_to_be64(
			xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
}

STATIC void
xfs_rmapbt_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	struct xfs_agf		*agf = cur->bc_ag.agbp->b_addr;

	ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));

	ptr->s = agf->agf_rmap_root;
}

/*
 * Mask the appropriate parts of the ondisk key field for a key comparison.
 * Fork and bmbt are significant parts of the rmap record key, but written
 * status is merely a record attribute.
 */
static inline uint64_t offset_keymask(uint64_t offset)
{
	return offset & ~XFS_RMAP_OFF_UNWRITTEN;
}

STATIC int64_t
xfs_rmapbt_key_diff(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key)
{
	struct xfs_rmap_irec		*rec = &cur->bc_rec.r;
	const struct xfs_rmap_key	*kp = &key->rmap;
	__u64				x, y;
	int64_t				d;

	d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
	if (d)
		return d;

	x = be64_to_cpu(kp->rm_owner);
	y = rec->rm_owner;
	if (x > y)
		return 1;
	else if (y > x)
		return -1;

	x = offset_keymask(be64_to_cpu(kp->rm_offset));
	y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
	if (x > y)
		return 1;
	else if (y > x)
		return -1;
	return 0;
}

STATIC int64_t
xfs_rmapbt_diff_two_keys(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2,
	const union xfs_btree_key	*mask)
{
	const struct xfs_rmap_key	*kp1 = &k1->rmap;
	const struct xfs_rmap_key	*kp2 = &k2->rmap;
	int64_t				d;
	__u64				x, y;

	/* Doesn't make sense to mask off the physical space part */
	ASSERT(!mask || mask->rmap.rm_startblock);

	d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
		     be32_to_cpu(kp2->rm_startblock);
	if (d)
		return d;

	if (!mask || mask->rmap.rm_owner) {
		x = be64_to_cpu(kp1->rm_owner);
		y = be64_to_cpu(kp2->rm_owner);
		if (x > y)
			return 1;
		else if (y > x)
			return -1;
	}

	if (!mask || mask->rmap.rm_offset) {
		/* Doesn't make sense to allow offset but not owner */
		ASSERT(!mask || mask->rmap.rm_owner);

		x = offset_keymask(be64_to_cpu(kp1->rm_offset));
		y = offset_keymask(be64_to_cpu(kp2->rm_offset));
		if (x > y)
			return 1;
		else if (y > x)
			return -1;
	}

	return 0;
}
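
/*
 * Note: both comparison helpers above order keys by startblock, then owner,
 * then offset, and offset_keymask() strips XFS_RMAP_OFF_UNWRITTEN first, so
 * two keys that differ only in written/unwritten state compare as equal.
 */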

static xfs_failaddr_t
xfs_rmapbt_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_perag	*pag = bp->b_pag;
	xfs_failaddr_t		fa;
	unsigned int		level;

	/*
	 * magic number and level verification
	 *
	 * During growfs operations, we can't verify the exact level or owner
	 * as the perag is not fully initialised and hence not attached to the
	 * buffer. In this case, check against the maximum tree depth.
	 *
	 * Similarly, during log recovery we will have a perag structure
	 * attached, but the agf information will not yet have been initialised
	 * from the on disk AGF. Again, we can only check against maximum
	 * limits in this case.
	 */
	if (!xfs_verify_magic(bp, block->bb_magic))
		return __this_address;

	if (!xfs_has_rmapbt(mp))
		return __this_address;
	fa = xfs_btree_agblock_v5hdr_verify(bp);
	if (fa)
		return fa;

	level = be16_to_cpu(block->bb_level);
	if (pag && xfs_perag_initialised_agf(pag)) {
		unsigned int	maxlevel = pag->pagf_rmap_level;

#ifdef CONFIG_XFS_ONLINE_REPAIR
		/*
		 * Online repair could be rewriting the free space btrees, so
		 * we'll validate against the larger of either tree while this
		 * is going on.
		 */
		maxlevel = max_t(unsigned int, maxlevel,
				 pag->pagf_repair_rmap_level);
#endif
		if (level >= maxlevel)
			return __this_address;
	} else if (level >= mp->m_rmap_maxlevels)
		return __this_address;

	return xfs_btree_agblock_verify(bp, mp->m_rmap_mxr[level != 0]);
}

static void
xfs_rmapbt_read_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	if (!xfs_btree_agblock_verify_crc(bp))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_rmapbt_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}

	if (bp->b_error)
		trace_xfs_btree_corrupt(bp, _RET_IP_);
}

static void
xfs_rmapbt_write_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	fa = xfs_rmapbt_verify(bp);
	if (fa) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}
	xfs_btree_agblock_calc_crc(bp);
}

const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
	.name			= "xfs_rmapbt",
	.magic			= { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
	.verify_read		= xfs_rmapbt_read_verify,
	.verify_write		= xfs_rmapbt_write_verify,
	.verify_struct		= xfs_rmapbt_verify,
};

STATIC int
xfs_rmapbt_keys_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2)
{
	uint32_t		x;
	uint32_t		y;
	uint64_t		a;
	uint64_t		b;

	x = be32_to_cpu(k1->rmap.rm_startblock);
	y = be32_to_cpu(k2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(k1->rmap.rm_owner);
	b = be64_to_cpu(k2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
	b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

STATIC int
xfs_rmapbt_recs_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*r1,
	const union xfs_btree_rec	*r2)
{
	uint32_t		x;
	uint32_t		y;
	uint64_t		a;
	uint64_t		b;

	x = be32_to_cpu(r1->rmap.rm_startblock);
	y = be32_to_cpu(r2->rmap.rm_startblock);
be32_to_cpu(r2->rmap.rm_startblock); 456 if (x < y) 457 return 1; 458 else if (x > y) 459 return 0; 460 a = be64_to_cpu(r1->rmap.rm_owner); 461 b = be64_to_cpu(r2->rmap.rm_owner); 462 if (a < b) 463 return 1; 464 else if (a > b) 465 return 0; 466 a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset)); 467 b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset)); 468 if (a <= b) 469 return 1; 470 return 0; 471 } 472 473 STATIC enum xbtree_key_contig 474 xfs_rmapbt_keys_contiguous( 475 struct xfs_btree_cur *cur, 476 const union xfs_btree_key *key1, 477 const union xfs_btree_key *key2, 478 const union xfs_btree_key *mask) 479 { 480 ASSERT(!mask || mask->rmap.rm_startblock); 481 482 /* 483 * We only support checking contiguity of the physical space component. 484 * If any callers ever need more specificity than that, they'll have to 485 * implement it here. 486 */ 487 ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset)); 488 489 return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock), 490 be32_to_cpu(key2->rmap.rm_startblock)); 491 } 492 493 const struct xfs_btree_ops xfs_rmapbt_ops = { 494 .name = "rmap", 495 .type = XFS_BTREE_TYPE_AG, 496 .geom_flags = XFS_BTGEO_OVERLAPPING, 497 498 .rec_len = sizeof(struct xfs_rmap_rec), 499 /* Overlapping btree; 2 keys per pointer. */ 500 .key_len = 2 * sizeof(struct xfs_rmap_key), 501 .ptr_len = XFS_BTREE_SHORT_PTR_LEN, 502 503 .lru_refs = XFS_RMAP_BTREE_REF, 504 .statoff = XFS_STATS_CALC_INDEX(xs_rmap_2), 505 .sick_mask = XFS_SICK_AG_RMAPBT, 506 507 .dup_cursor = xfs_rmapbt_dup_cursor, 508 .set_root = xfs_rmapbt_set_root, 509 .alloc_block = xfs_rmapbt_alloc_block, 510 .free_block = xfs_rmapbt_free_block, 511 .get_minrecs = xfs_rmapbt_get_minrecs, 512 .get_maxrecs = xfs_rmapbt_get_maxrecs, 513 .init_key_from_rec = xfs_rmapbt_init_key_from_rec, 514 .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec, 515 .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur, 516 .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur, 517 .key_diff = xfs_rmapbt_key_diff, 518 .buf_ops = &xfs_rmapbt_buf_ops, 519 .diff_two_keys = xfs_rmapbt_diff_two_keys, 520 .keys_inorder = xfs_rmapbt_keys_inorder, 521 .recs_inorder = xfs_rmapbt_recs_inorder, 522 .keys_contiguous = xfs_rmapbt_keys_contiguous, 523 }; 524 525 /* 526 * Create a new reverse mapping btree cursor. 527 * 528 * For staging cursors tp and agbp are NULL. 529 */ 530 struct xfs_btree_cur * 531 xfs_rmapbt_init_cursor( 532 struct xfs_mount *mp, 533 struct xfs_trans *tp, 534 struct xfs_buf *agbp, 535 struct xfs_perag *pag) 536 { 537 struct xfs_btree_cur *cur; 538 539 cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops, 540 mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache); 541 cur->bc_ag.pag = xfs_perag_hold(pag); 542 cur->bc_ag.agbp = agbp; 543 if (agbp) { 544 struct xfs_agf *agf = agbp->b_addr; 545 546 cur->bc_nlevels = be32_to_cpu(agf->agf_rmap_level); 547 } 548 return cur; 549 } 550 551 #ifdef CONFIG_XFS_BTREE_IN_MEM 552 static inline unsigned int 553 xfs_rmapbt_mem_block_maxrecs( 554 unsigned int blocklen, 555 bool leaf) 556 { 557 if (leaf) 558 return blocklen / sizeof(struct xfs_rmap_rec); 559 return blocklen / 560 (2 * sizeof(struct xfs_rmap_key) + sizeof(__be64)); 561 } 562 563 /* 564 * Validate an in-memory rmap btree block. Callers are allowed to generate an 565 * in-memory btree even if the ondisk feature is not enabled. 

#ifdef CONFIG_XFS_BTREE_IN_MEM
static inline unsigned int
xfs_rmapbt_mem_block_maxrecs(
	unsigned int		blocklen,
	bool			leaf)
{
	if (leaf)
		return blocklen / sizeof(struct xfs_rmap_rec);
	return blocklen /
		(2 * sizeof(struct xfs_rmap_key) + sizeof(__be64));
}
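
/*
 * Note: unlike the ondisk AG btree, which uses 32-bit block pointers, the
 * in-memory btree uses 64-bit pointers (XFS_BTREE_LONG_PTR_LEN below), hence
 * the sizeof(__be64) in the node record count above.
 */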

/*
 * Validate an in-memory rmap btree block. Callers are allowed to generate an
 * in-memory btree even if the ondisk feature is not enabled.
 */
static xfs_failaddr_t
xfs_rmapbt_mem_verify(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	xfs_failaddr_t		fa;
	unsigned int		level;
	unsigned int		maxrecs;

	if (!xfs_verify_magic(bp, block->bb_magic))
		return __this_address;

	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
	if (fa)
		return fa;

	level = be16_to_cpu(block->bb_level);
	if (level >= xfs_rmapbt_maxlevels_ondisk())
		return __this_address;

	maxrecs = xfs_rmapbt_mem_block_maxrecs(
			XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN, level == 0);
	return xfs_btree_memblock_verify(bp, maxrecs);
}

static void
xfs_rmapbt_mem_rw_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa = xfs_rmapbt_mem_verify(bp);

	if (fa)
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}

/* skip crc checks on in-memory btrees to save time */
static const struct xfs_buf_ops xfs_rmapbt_mem_buf_ops = {
	.name			= "xfs_rmapbt_mem",
	.magic			= { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
	.verify_read		= xfs_rmapbt_mem_rw_verify,
	.verify_write		= xfs_rmapbt_mem_rw_verify,
	.verify_struct		= xfs_rmapbt_mem_verify,
};

const struct xfs_btree_ops xfs_rmapbt_mem_ops = {
	.name			= "mem_rmap",
	.type			= XFS_BTREE_TYPE_MEM,
	.geom_flags		= XFS_BTGEO_OVERLAPPING,

	.rec_len		= sizeof(struct xfs_rmap_rec),
	/* Overlapping btree; 2 keys per pointer. */
	.key_len		= 2 * sizeof(struct xfs_rmap_key),
	.ptr_len		= XFS_BTREE_LONG_PTR_LEN,

	.lru_refs		= XFS_RMAP_BTREE_REF,
	.statoff		= XFS_STATS_CALC_INDEX(xs_rmap_mem_2),

	.dup_cursor		= xfbtree_dup_cursor,
	.set_root		= xfbtree_set_root,
	.alloc_block		= xfbtree_alloc_block,
	.free_block		= xfbtree_free_block,
	.get_minrecs		= xfbtree_get_minrecs,
	.get_maxrecs		= xfbtree_get_maxrecs,
	.init_key_from_rec	= xfs_rmapbt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_rmapbt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_rmapbt_init_rec_from_cur,
	.init_ptr_from_cur	= xfbtree_init_ptr_from_cur,
	.key_diff		= xfs_rmapbt_key_diff,
	.buf_ops		= &xfs_rmapbt_mem_buf_ops,
	.diff_two_keys		= xfs_rmapbt_diff_two_keys,
	.keys_inorder		= xfs_rmapbt_keys_inorder,
	.recs_inorder		= xfs_rmapbt_recs_inorder,
	.keys_contiguous	= xfs_rmapbt_keys_contiguous,
};

/* Create a cursor for an in-memory btree. */
struct xfs_btree_cur *
xfs_rmapbt_mem_cursor(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfbtree		*xfbt)
{
	struct xfs_btree_cur	*cur;
	struct xfs_mount	*mp = pag->pag_mount;

	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_mem_ops,
			xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache);
	cur->bc_mem.xfbtree = xfbt;
	cur->bc_nlevels = xfbt->nlevels;

	cur->bc_mem.pag = xfs_perag_hold(pag);
	return cur;
}

/* Create an in-memory rmap btree. */
int
xfs_rmapbt_mem_init(
	struct xfs_mount	*mp,
	struct xfbtree		*xfbt,
	struct xfs_buftarg	*btp,
	xfs_agnumber_t		agno)
{
	xfbt->owner = agno;
	return xfbtree_init(mp, xfbt, btp, &xfs_rmapbt_mem_ops);
}

/* Compute the max possible height for reverse mapping btrees in memory. */
static unsigned int
xfs_rmapbt_mem_maxlevels(void)
{
	unsigned int		minrecs[2];
	unsigned int		blocklen;

	blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;

	minrecs[0] = xfs_rmapbt_mem_block_maxrecs(blocklen, true) / 2;
	minrecs[1] = xfs_rmapbt_mem_block_maxrecs(blocklen, false) / 2;

	/*
	 * How tall can an in-memory rmap btree become if we filled the entire
	 * AG with rmap records?
	 */
	return xfs_btree_compute_maxlevels(minrecs,
			XFS_MAX_AG_BYTES / sizeof(struct xfs_rmap_rec));
}
#else
# define xfs_rmapbt_mem_maxlevels()	(0)
#endif /* CONFIG_XFS_BTREE_IN_MEM */

/*
 * Install a new reverse mapping btree root.  Caller is responsible for
 * invalidating and freeing the old btree blocks.
 */
void
xfs_rmapbt_commit_staged_btree(
	struct xfs_btree_cur	*cur,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp)
{
	struct xfs_agf		*agf = agbp->b_addr;
	struct xbtree_afakeroot	*afake = cur->bc_ag.afake;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	agf->agf_rmap_root = cpu_to_be32(afake->af_root);
	agf->agf_rmap_level = cpu_to_be32(afake->af_levels);
	agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
	xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
				    XFS_AGF_RMAP_BLOCKS);
	xfs_btree_commit_afakeroot(cur, tp, agbp);
}

/* Calculate number of records in a reverse mapping btree block. */
static inline unsigned int
xfs_rmapbt_block_maxrecs(
	unsigned int		blocklen,
	bool			leaf)
{
	if (leaf)
		return blocklen / sizeof(struct xfs_rmap_rec);
	return blocklen /
		(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
}

/*
 * Calculate number of records in an rmap btree block.
 */
int
xfs_rmapbt_maxrecs(
	int			blocklen,
	int			leaf)
{
	blocklen -= XFS_RMAP_BLOCK_LEN;
	return xfs_rmapbt_block_maxrecs(blocklen, leaf);
}

/* Compute the max possible height for reverse mapping btrees. */
unsigned int
xfs_rmapbt_maxlevels_ondisk(void)
{
	unsigned int		minrecs[2];
	unsigned int		blocklen;

	blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;

	minrecs[0] = xfs_rmapbt_block_maxrecs(blocklen, true) / 2;
	minrecs[1] = xfs_rmapbt_block_maxrecs(blocklen, false) / 2;

	/*
	 * Compute the asymptotic maxlevels for an rmapbt on any reflink fs.
	 *
	 * On a reflink filesystem, each AG block can have up to 2^32 (per the
	 * refcount record format) owners, which means that theoretically we
	 * could face up to 2^64 rmap records.  However, we're likely to run
	 * out of blocks in the AG long before that happens, which means that
	 * we must compute the max height based on what the btree will look
	 * like if it consumes almost all the blocks in the AG due to maximal
	 * sharing factor.
	 */
	return max(xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS),
		   xfs_rmapbt_mem_maxlevels());
}

/* Compute the maximum height of an rmap btree. */
void
xfs_rmapbt_compute_maxlevels(
	struct xfs_mount	*mp)
{
	if (!xfs_has_rmapbt(mp)) {
		mp->m_rmap_maxlevels = 0;
		return;
	}

	if (xfs_has_reflink(mp)) {
		/*
		 * Compute the asymptotic maxlevels for an rmap btree on a
		 * filesystem that supports reflink.
		 *
		 * On a reflink filesystem, each AG block can have up to 2^32
		 * (per the refcount record format) owners, which means that
		 * theoretically we could face up to 2^64 rmap records.
		 * However, we're likely to run out of blocks in the AG long
		 * before that happens, which means that we must compute the
		 * max height based on what the btree will look like if it
		 * consumes almost all the blocks in the AG due to maximal
		 * sharing factor.
		 */
		mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr,
				mp->m_sb.sb_agblocks);
	} else {
		/*
		 * If there's no block sharing, compute the maximum rmapbt
		 * height assuming one rmap record per AG block.
		 */
		mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
				mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
	}
	ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk());
}

/* Calculate the rmap btree size for some records. */
xfs_extlen_t
xfs_rmapbt_calc_size(
	struct xfs_mount	*mp,
	unsigned long long	len)
{
	return xfs_btree_calc_size(mp->m_rmap_mnr, len);
}

/*
 * Calculate the maximum rmap btree size.
 */
xfs_extlen_t
xfs_rmapbt_max_size(
	struct xfs_mount	*mp,
	xfs_agblock_t		agblocks)
{
	/* Bail out if we're uninitialized, which can happen in mkfs. */
	if (mp->m_rmap_mxr[0] == 0)
		return 0;

	return xfs_rmapbt_calc_size(mp, agblocks);
}

/*
 * Figure out how many blocks to reserve and how many are used by this btree.
 */
int
xfs_rmapbt_calc_reserves(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	xfs_extlen_t		*ask,
	xfs_extlen_t		*used)
{
	struct xfs_buf		*agbp;
	struct xfs_agf		*agf;
	xfs_agblock_t		agblocks;
	xfs_extlen_t		tree_len;
	int			error;

	if (!xfs_has_rmapbt(mp))
		return 0;

	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
	if (error)
		return error;

	agf = agbp->b_addr;
	agblocks = be32_to_cpu(agf->agf_length);
	tree_len = be32_to_cpu(agf->agf_rmap_blocks);
	xfs_trans_brelse(tp, agbp);

	/*
	 * The log is permanently allocated, so the space it occupies will
	 * never be available for the kinds of things that would require btree
	 * expansion.  We therefore can pretend the space isn't there.
	 */
	if (xfs_ag_contains_log(mp, pag->pag_agno))
		agblocks -= mp->m_sb.sb_logblocks;

	/* Reserve 1% of the AG or enough for 1 block per record. */
	*ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
	*used += tree_len;

	return error;
}

int __init
xfs_rmapbt_init_cur_cache(void)
{
	xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur",
			xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()),
			0, 0, NULL);

	if (!xfs_rmapbt_cur_cache)
		return -ENOMEM;
	return 0;
}

void
xfs_rmapbt_destroy_cur_cache(void)
{
	kmem_cache_destroy(xfs_rmapbt_cur_cache);
	xfs_rmapbt_cur_cache = NULL;
}