// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2014 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"

static struct kmem_cache	*xfs_rmapbt_cur_cache;

/*
 * Reverse map btree.
 *
 * This is a per-ag tree used to track the owner(s) of a given extent. With
 * reflink it is possible for there to be multiple owners, which is a departure
 * from classic XFS. Owner records for data extents are inserted when the
 * extent is mapped and removed when an extent is unmapped. Owner records for
 * all other block types (i.e. metadata) are inserted when an extent is
 * allocated and removed when an extent is freed. There can only be one owner
 * of a metadata extent, usually an inode or some other metadata structure like
 * an AG btree.
 *
 * The rmap btree is part of the free space management, so blocks for the tree
 * are sourced from the agfl. Hence we need transaction reservation support for
 * this tree so that the freelist is always large enough. This also impacts on
 * the minimum space we need to leave free in the AG.
 *
 * The tree is ordered by [ag block, owner, offset]. This is a large key size,
 * but it is the only way to enforce unique keys when a block can be owned by
 * multiple files at any offset. There's no need to order/search by extent
 * size for online updating/management of the tree. It is intended that most
 * reverse lookups will be to find the owner(s) of a particular block, or to
 * try to recover tree and file data from corrupt primary metadata.
 */
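
/*
 * For example (illustrative values): with reflink, two files can map the
 * same physical extent, so the tree can hold two records starting at the
 * same AG block:
 *
 *	[agbno 50, owner = inode 129, offset 0]
 *	[agbno 50, owner = inode 131, offset 8]
 *
 * The owner and then the offset break the tie, which is why all three key
 * fields are needed to keep the keys unique.
 */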

static struct xfs_btree_cur *
xfs_rmapbt_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
			cur->bc_ag.agbp, cur->bc_ag.pag);
}

STATIC void
xfs_rmapbt_set_root(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				inc)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;

	ASSERT(ptr->s != 0);

	agf->agf_rmap_root = ptr->s;
	be32_add_cpu(&agf->agf_rmap_level, inc);
	cur->bc_ag.pag->pagf_rmap_level += inc;

	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}

STATIC int
xfs_rmapbt_alloc_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*start,
	union xfs_btree_ptr		*new,
	int				*stat)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;
	struct xfs_perag	*pag = cur->bc_ag.pag;
	int			error;
	xfs_agblock_t		bno;

	/* Allocate the new block from the freelist. If we can't, give up. */
	error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp,
				       &bno, 1);
	if (error)
		return error;
	if (bno == NULLAGBLOCK) {
		*stat = 0;
		return 0;
	}

	xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false);

	new->s = cpu_to_be32(bno);
	be32_add_cpu(&agf->agf_rmap_blocks, 1);
	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);

	xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno);

	*stat = 1;
	return 0;
}

STATIC int
xfs_rmapbt_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;
	struct xfs_perag	*pag = cur->bc_ag.pag;
	xfs_agblock_t		bno;
	int			error;

	bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
	be32_add_cpu(&agf->agf_rmap_blocks, -1);
	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
	error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
	if (error)
		return error;

	xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1,
			       XFS_EXTENT_BUSY_SKIP_DISCARD);

	xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
	return 0;
}

STATIC int
xfs_rmapbt_get_minrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return cur->bc_mp->m_rmap_mnr[level != 0];
}

STATIC int
xfs_rmapbt_get_maxrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return cur->bc_mp->m_rmap_mxr[level != 0];
}

/*
 * Convert the ondisk record's offset field into the ondisk key's offset field.
 * Fork and bmbt are significant parts of the rmap record key, but written
 * status is merely a record attribute.
 */
static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
{
	return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
}

STATIC void
xfs_rmapbt_init_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
}

/*
 * The high key for a reverse mapping record can be computed by shifting
 * the startblock and offset to the highest value that would still map
 * to that record. In practice this means that we add blockcount-1 to
 * the startblock for all records, and if the record is for a data/attr
 * fork mapping, we add blockcount-1 to the offset too.
 */
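
/*
 * Worked example (illustrative values): a data fork record with startblock
 * 50, blockcount 4, owner inode 129, offset 12 has low key [50, 129, 12]
 * and high key [53, 129, 15]. A bmbt or non-inode (metadata) owner has no
 * meaningful file offset, so only the startblock is adjusted.
 */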
STATIC void
xfs_rmapbt_init_high_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	uint64_t			off;
	int				adj;

	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;

	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	be32_add_cpu(&key->rmap.rm_startblock, adj);
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
		return;
	off = be64_to_cpu(key->rmap.rm_offset);
	off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
	key->rmap.rm_offset = cpu_to_be64(off);
}

STATIC void
xfs_rmapbt_init_rec_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec)
{
	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
	rec->rmap.rm_offset = cpu_to_be64(
			xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
}

STATIC void
xfs_rmapbt_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	struct xfs_agf		*agf = cur->bc_ag.agbp->b_addr;

	ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));

	ptr->s = agf->agf_rmap_root;
}

/*
 * Mask the appropriate parts of the ondisk key field for a key comparison.
 * Fork and bmbt are significant parts of the rmap record key, but written
 * status is merely a record attribute.
 */
static inline uint64_t offset_keymask(uint64_t offset)
{
	return offset & ~XFS_RMAP_OFF_UNWRITTEN;
}

STATIC int64_t
xfs_rmapbt_key_diff(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key)
{
	struct xfs_rmap_irec		*rec = &cur->bc_rec.r;
	const struct xfs_rmap_key	*kp = &key->rmap;
	__u64				x, y;
	int64_t				d;

	d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
	if (d)
		return d;

	x = be64_to_cpu(kp->rm_owner);
	y = rec->rm_owner;
	if (x > y)
		return 1;
	else if (y > x)
		return -1;

	x = offset_keymask(be64_to_cpu(kp->rm_offset));
	y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
	if (x > y)
		return 1;
	else if (y > x)
		return -1;
	return 0;
}
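
/*
 * Compare two keys, optionally restricting the comparison to a subset of
 * the key fields. A NULL mask compares all fields; when a mask is given,
 * only the fields with a nonzero mask value participate. The physical
 * startblock must always be included, and the offset can only be compared
 * if the owner is too (see the assertions below).
 */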
STATIC int64_t
xfs_rmapbt_diff_two_keys(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2,
	const union xfs_btree_key	*mask)
{
	const struct xfs_rmap_key	*kp1 = &k1->rmap;
	const struct xfs_rmap_key	*kp2 = &k2->rmap;
	int64_t				d;
	__u64				x, y;

	/* Doesn't make sense to mask off the physical space part */
	ASSERT(!mask || mask->rmap.rm_startblock);

	d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
		     be32_to_cpu(kp2->rm_startblock);
	if (d)
		return d;

	if (!mask || mask->rmap.rm_owner) {
		x = be64_to_cpu(kp1->rm_owner);
		y = be64_to_cpu(kp2->rm_owner);
		if (x > y)
			return 1;
		else if (y > x)
			return -1;
	}

	if (!mask || mask->rmap.rm_offset) {
		/* Doesn't make sense to allow offset but not owner */
		ASSERT(!mask || mask->rmap.rm_owner);

		x = offset_keymask(be64_to_cpu(kp1->rm_offset));
		y = offset_keymask(be64_to_cpu(kp2->rm_offset));
		if (x > y)
			return 1;
		else if (y > x)
			return -1;
	}

	return 0;
}

static xfs_failaddr_t
xfs_rmapbt_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_perag	*pag = bp->b_pag;
	xfs_failaddr_t		fa;
	unsigned int		level;

	/*
	 * magic number and level verification
	 *
	 * During growfs operations, we can't verify the exact level or owner
	 * as the perag is not fully initialised and hence not attached to
	 * the buffer. In this case, check against the maximum tree depth.
	 *
	 * Similarly, during log recovery we will have a perag structure
	 * attached, but the agf information will not yet have been
	 * initialised from the on disk AGF. Again, we can only check against
	 * maximum limits in this case.
	 */
	if (!xfs_verify_magic(bp, block->bb_magic))
		return __this_address;

	if (!xfs_has_rmapbt(mp))
		return __this_address;
	fa = xfs_btree_agblock_v5hdr_verify(bp);
	if (fa)
		return fa;

	level = be16_to_cpu(block->bb_level);
	if (pag && xfs_perag_initialised_agf(pag)) {
		unsigned int	maxlevel = pag->pagf_rmap_level;

#ifdef CONFIG_XFS_ONLINE_REPAIR
		/*
		 * Online repair could be rewriting the free space btrees, so
		 * we'll validate against the larger of either tree while this
		 * is going on.
		 */
		maxlevel = max_t(unsigned int, maxlevel,
				 pag->pagf_repair_rmap_level);
#endif
		if (level >= maxlevel)
			return __this_address;
	} else if (level >= mp->m_rmap_maxlevels)
		return __this_address;

	return xfs_btree_agblock_verify(bp, mp->m_rmap_mxr[level != 0]);
}

static void
xfs_rmapbt_read_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	if (!xfs_btree_agblock_verify_crc(bp))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_rmapbt_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}

	if (bp->b_error)
		trace_xfs_btree_corrupt(bp, _RET_IP_);
}

static void
xfs_rmapbt_write_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	fa = xfs_rmapbt_verify(bp);
	if (fa) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}
	xfs_btree_agblock_calc_crc(bp);
}
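
/*
 * The magic[] table below is indexed by whether the superblock uses the v5
 * (CRC-enabled) format. The non-CRC slot is zero because the rmap btree
 * only exists on v5 filesystems, so there is no non-CRC magic to match.
 */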
const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
	.name			= "xfs_rmapbt",
	.magic			= { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
	.verify_read		= xfs_rmapbt_read_verify,
	.verify_write		= xfs_rmapbt_write_verify,
	.verify_struct		= xfs_rmapbt_verify,
};

STATIC int
xfs_rmapbt_keys_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2)
{
	uint32_t		x;
	uint32_t		y;
	uint64_t		a;
	uint64_t		b;

	x = be32_to_cpu(k1->rmap.rm_startblock);
	y = be32_to_cpu(k2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(k1->rmap.rm_owner);
	b = be64_to_cpu(k2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
	b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

STATIC int
xfs_rmapbt_recs_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*r1,
	const union xfs_btree_rec	*r2)
{
	uint32_t		x;
	uint32_t		y;
	uint64_t		a;
	uint64_t		b;

	x = be32_to_cpu(r1->rmap.rm_startblock);
	y = be32_to_cpu(r2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(r1->rmap.rm_owner);
	b = be64_to_cpu(r2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
	b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

STATIC enum xbtree_key_contig
xfs_rmapbt_keys_contiguous(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key1,
	const union xfs_btree_key	*key2,
	const union xfs_btree_key	*mask)
{
	ASSERT(!mask || mask->rmap.rm_startblock);

	/*
	 * We only support checking contiguity of the physical space component.
	 * If any callers ever need more specificity than that, they'll have to
	 * implement it here.
	 */
	ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));

	return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
				 be32_to_cpu(key2->rmap.rm_startblock));
}

const struct xfs_btree_ops xfs_rmapbt_ops = {
	.name			= "rmap",
	.type			= XFS_BTREE_TYPE_AG,
	.geom_flags		= XFS_BTGEO_OVERLAPPING,

	.rec_len		= sizeof(struct xfs_rmap_rec),
	/* Overlapping btree; 2 keys per pointer. */
	.key_len		= 2 * sizeof(struct xfs_rmap_key),
	.ptr_len		= XFS_BTREE_SHORT_PTR_LEN,

	.lru_refs		= XFS_RMAP_BTREE_REF,
	.statoff		= XFS_STATS_CALC_INDEX(xs_rmap_2),
	.sick_mask		= XFS_SICK_AG_RMAPBT,

	.dup_cursor		= xfs_rmapbt_dup_cursor,
	.set_root		= xfs_rmapbt_set_root,
	.alloc_block		= xfs_rmapbt_alloc_block,
	.free_block		= xfs_rmapbt_free_block,
	.get_minrecs		= xfs_rmapbt_get_minrecs,
	.get_maxrecs		= xfs_rmapbt_get_maxrecs,
	.init_key_from_rec	= xfs_rmapbt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_rmapbt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_rmapbt_init_rec_from_cur,
	.init_ptr_from_cur	= xfs_rmapbt_init_ptr_from_cur,
	.key_diff		= xfs_rmapbt_key_diff,
	.buf_ops		= &xfs_rmapbt_buf_ops,
	.diff_two_keys		= xfs_rmapbt_diff_two_keys,
	.keys_inorder		= xfs_rmapbt_keys_inorder,
	.recs_inorder		= xfs_rmapbt_recs_inorder,
	.keys_contiguous	= xfs_rmapbt_keys_contiguous,
};

/*
 * Create a new reverse mapping btree cursor.
 *
 * For staging cursors tp and agbp are NULL.
 */
struct xfs_btree_cur *
xfs_rmapbt_init_cursor(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	struct xfs_perag	*pag)
{
	struct xfs_btree_cur	*cur;

	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops,
			mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
	cur->bc_ag.pag = xfs_perag_hold(pag);
	cur->bc_ag.agbp = agbp;
	if (agbp) {
		struct xfs_agf		*agf = agbp->b_addr;

		cur->bc_nlevels = be32_to_cpu(agf->agf_rmap_level);
	}
	return cur;
}

#ifdef CONFIG_XFS_BTREE_IN_MEM
static inline unsigned int
xfs_rmapbt_mem_block_maxrecs(
	unsigned int		blocklen,
	bool			leaf)
{
	if (leaf)
		return blocklen / sizeof(struct xfs_rmap_rec);
	return blocklen /
		(2 * sizeof(struct xfs_rmap_key) + sizeof(__be64));
}
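
/*
 * Worked example (illustrative, assuming 4096-byte in-memory blocks and a
 * 72-byte long-format CRC block header for XFS_BTREE_LBLOCK_CRC_LEN): a
 * leaf holds (4096 - 72) / 24 = 167 records and a node holds
 * (4096 - 72) / (2 * 20 + 8) = 83 key/pointer entries, since each rmap
 * record is 24 bytes, each key is 20 bytes, and in-memory btrees use
 * 64-bit block pointers.
 */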

/*
 * Validate an in-memory rmap btree block. Callers are allowed to generate an
 * in-memory btree even if the ondisk feature is not enabled.
 */
static xfs_failaddr_t
xfs_rmapbt_mem_verify(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	xfs_failaddr_t		fa;
	unsigned int		level;
	unsigned int		maxrecs;

	if (!xfs_verify_magic(bp, block->bb_magic))
		return __this_address;

	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
	if (fa)
		return fa;

	level = be16_to_cpu(block->bb_level);
	if (level >= xfs_rmapbt_maxlevels_ondisk())
		return __this_address;

	maxrecs = xfs_rmapbt_mem_block_maxrecs(
			XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN, level == 0);
	return xfs_btree_memblock_verify(bp, maxrecs);
}

static void
xfs_rmapbt_mem_rw_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa = xfs_rmapbt_mem_verify(bp);

	if (fa)
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}

/* skip crc checks on in-memory btrees to save time */
static const struct xfs_buf_ops xfs_rmapbt_mem_buf_ops = {
	.name			= "xfs_rmapbt_mem",
	.magic			= { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
	.verify_read		= xfs_rmapbt_mem_rw_verify,
	.verify_write		= xfs_rmapbt_mem_rw_verify,
	.verify_struct		= xfs_rmapbt_mem_verify,
};

const struct xfs_btree_ops xfs_rmapbt_mem_ops = {
	.name			= "mem_rmap",
	.type			= XFS_BTREE_TYPE_MEM,
	.geom_flags		= XFS_BTGEO_OVERLAPPING,

	.rec_len		= sizeof(struct xfs_rmap_rec),
	/* Overlapping btree; 2 keys per pointer. */
	.key_len		= 2 * sizeof(struct xfs_rmap_key),
	.ptr_len		= XFS_BTREE_LONG_PTR_LEN,

	.lru_refs		= XFS_RMAP_BTREE_REF,
	.statoff		= XFS_STATS_CALC_INDEX(xs_rmap_mem_2),

	.dup_cursor		= xfbtree_dup_cursor,
	.set_root		= xfbtree_set_root,
	.alloc_block		= xfbtree_alloc_block,
	.free_block		= xfbtree_free_block,
	.get_minrecs		= xfbtree_get_minrecs,
	.get_maxrecs		= xfbtree_get_maxrecs,
	.init_key_from_rec	= xfs_rmapbt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_rmapbt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_rmapbt_init_rec_from_cur,
	.init_ptr_from_cur	= xfbtree_init_ptr_from_cur,
	.key_diff		= xfs_rmapbt_key_diff,
	.buf_ops		= &xfs_rmapbt_mem_buf_ops,
	.diff_two_keys		= xfs_rmapbt_diff_two_keys,
	.keys_inorder		= xfs_rmapbt_keys_inorder,
	.recs_inorder		= xfs_rmapbt_recs_inorder,
	.keys_contiguous	= xfs_rmapbt_keys_contiguous,
};

/* Create a cursor for an in-memory btree. */
struct xfs_btree_cur *
xfs_rmapbt_mem_cursor(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfbtree		*xfbt)
{
	struct xfs_btree_cur	*cur;
	struct xfs_mount	*mp = pag->pag_mount;

	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_mem_ops,
			xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache);
	cur->bc_mem.xfbtree = xfbt;
	cur->bc_nlevels = xfbt->nlevels;

	cur->bc_mem.pag = xfs_perag_hold(pag);
	return cur;
}

/* Create an in-memory rmap btree. */
int
xfs_rmapbt_mem_init(
	struct xfs_mount	*mp,
	struct xfbtree		*xfbt,
	struct xfs_buftarg	*btp,
	xfs_agnumber_t		agno)
{
	xfbt->owner = agno;
	return xfbtree_init(mp, xfbt, btp, &xfs_rmapbt_mem_ops);
}
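
/*
 * Usage sketch (illustrative): the in-memory rmap btree is intended for
 * staging records (for example, by online rmap repair). Create the tree
 * against an in-memory buffer target with xfs_rmapbt_mem_init(), then take
 * a cursor with xfs_rmapbt_mem_cursor() for each transaction that needs to
 * insert or query the staged records.
 */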

/* Compute the max possible height for reverse mapping btrees in memory. */
static unsigned int
xfs_rmapbt_mem_maxlevels(void)
{
	unsigned int		minrecs[2];
	unsigned int		blocklen;

	blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;

	minrecs[0] = xfs_rmapbt_mem_block_maxrecs(blocklen, true) / 2;
	minrecs[1] = xfs_rmapbt_mem_block_maxrecs(blocklen, false) / 2;

	/*
	 * How tall can an in-memory rmap btree become if we filled the
	 * entire AG with rmap records?
	 */
	return xfs_btree_compute_maxlevels(minrecs,
			XFS_MAX_AG_BYTES / sizeof(struct xfs_rmap_rec));
}
#else
# define xfs_rmapbt_mem_maxlevels()	(0)
#endif /* CONFIG_XFS_BTREE_IN_MEM */

/*
 * Install a new reverse mapping btree root. Caller is responsible for
 * invalidating and freeing the old btree blocks.
 */
void
xfs_rmapbt_commit_staged_btree(
	struct xfs_btree_cur	*cur,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp)
{
	struct xfs_agf		*agf = agbp->b_addr;
	struct xbtree_afakeroot	*afake = cur->bc_ag.afake;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	agf->agf_rmap_root = cpu_to_be32(afake->af_root);
	agf->agf_rmap_level = cpu_to_be32(afake->af_levels);
	agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
	xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
			  XFS_AGF_RMAP_BLOCKS);
	xfs_btree_commit_afakeroot(cur, tp, agbp);
}

/* Calculate number of records in a reverse mapping btree block. */
static inline unsigned int
xfs_rmapbt_block_maxrecs(
	unsigned int		blocklen,
	bool			leaf)
{
	if (leaf)
		return blocklen / sizeof(struct xfs_rmap_rec);
	return blocklen /
		(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
}

/*
 * Calculate number of records in an rmap btree block.
 */
int
xfs_rmapbt_maxrecs(
	int			blocklen,
	int			leaf)
{
	blocklen -= XFS_RMAP_BLOCK_LEN;
	return xfs_rmapbt_block_maxrecs(blocklen, leaf);
}
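
/*
 * Worked example (illustrative, assuming 4096-byte filesystem blocks):
 * XFS_RMAP_BLOCK_LEN is the 56-byte short-form CRC block header, so a leaf
 * holds (4096 - 56) / 24 = 168 records and a node holds
 * (4096 - 56) / (2 * 20 + 4) = 91 key/pointer entries, using the 4-byte
 * short-form block pointers of per-AG btrees.
 */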
779 * 780 * On a reflink filesystem, each AG block can have up to 2^32 781 * (per the refcount record format) owners, which means that 782 * theoretically we could face up to 2^64 rmap records. 783 * However, we're likely to run out of blocks in the AG long 784 * before that happens, which means that we must compute the 785 * max height based on what the btree will look like if it 786 * consumes almost all the blocks in the AG due to maximal 787 * sharing factor. 788 */ 789 mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr, 790 mp->m_sb.sb_agblocks); 791 } else { 792 /* 793 * If there's no block sharing, compute the maximum rmapbt 794 * height assuming one rmap record per AG block. 795 */ 796 mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels( 797 mp->m_rmap_mnr, mp->m_sb.sb_agblocks); 798 } 799 ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk()); 800 } 801 802 /* Calculate the refcount btree size for some records. */ 803 xfs_extlen_t 804 xfs_rmapbt_calc_size( 805 struct xfs_mount *mp, 806 unsigned long long len) 807 { 808 return xfs_btree_calc_size(mp->m_rmap_mnr, len); 809 } 810 811 /* 812 * Calculate the maximum refcount btree size. 813 */ 814 xfs_extlen_t 815 xfs_rmapbt_max_size( 816 struct xfs_mount *mp, 817 xfs_agblock_t agblocks) 818 { 819 /* Bail out if we're uninitialized, which can happen in mkfs. */ 820 if (mp->m_rmap_mxr[0] == 0) 821 return 0; 822 823 return xfs_rmapbt_calc_size(mp, agblocks); 824 } 825 826 /* 827 * Figure out how many blocks to reserve and how many are used by this btree. 828 */ 829 int 830 xfs_rmapbt_calc_reserves( 831 struct xfs_mount *mp, 832 struct xfs_trans *tp, 833 struct xfs_perag *pag, 834 xfs_extlen_t *ask, 835 xfs_extlen_t *used) 836 { 837 struct xfs_buf *agbp; 838 struct xfs_agf *agf; 839 xfs_agblock_t agblocks; 840 xfs_extlen_t tree_len; 841 int error; 842 843 if (!xfs_has_rmapbt(mp)) 844 return 0; 845 846 error = xfs_alloc_read_agf(pag, tp, 0, &agbp); 847 if (error) 848 return error; 849 850 agf = agbp->b_addr; 851 agblocks = be32_to_cpu(agf->agf_length); 852 tree_len = be32_to_cpu(agf->agf_rmap_blocks); 853 xfs_trans_brelse(tp, agbp); 854 855 /* 856 * The log is permanently allocated, so the space it occupies will 857 * never be available for the kinds of things that would require btree 858 * expansion. We therefore can pretend the space isn't there. 859 */ 860 if (xfs_ag_contains_log(mp, pag->pag_agno)) 861 agblocks -= mp->m_sb.sb_logblocks; 862 863 /* Reserve 1% of the AG or enough for 1 block per record. */ 864 *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks)); 865 *used += tree_len; 866 867 return error; 868 } 869 870 int __init 871 xfs_rmapbt_init_cur_cache(void) 872 { 873 xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur", 874 xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()), 875 0, 0, NULL); 876 877 if (!xfs_rmapbt_cur_cache) 878 return -ENOMEM; 879 return 0; 880 } 881 882 void 883 xfs_rmapbt_destroy_cur_cache(void) 884 { 885 kmem_cache_destroy(xfs_rmapbt_cur_cache); 886 xfs_rmapbt_cur_cache = NULL; 887 } 888