// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2014 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"

static struct kmem_cache	*xfs_rmapbt_cur_cache;

/*
 * Reverse map btree.
 *
 * This is a per-ag tree used to track the owner(s) of a given extent. With
 * reflink it is possible for there to be multiple owners, which is a departure
 * from classic XFS. Owner records for data extents are inserted when the
 * extent is mapped and removed when an extent is unmapped. Owner records for
 * all other block types (i.e. metadata) are inserted when an extent is
 * allocated and removed when an extent is freed. There can only be one owner
 * of a metadata extent, usually an inode or some other metadata structure like
 * an AG btree.
 *
 * The rmap btree is part of the free space management, so blocks for the tree
 * are sourced from the agfl. Hence we need transaction reservation support for
 * this tree so that the freelist is always large enough. This also impacts on
 * the minimum space we need to leave free in the AG.
 *
 * The tree is ordered by [ag block, owner, offset]. This is a large key size,
 * but it is the only way to enforce unique keys when a block can be owned by
 * multiple files at any offset. There's no need to order/search by extent
 * size for online updating/management of the tree. It is intended that most
 * reverse lookups will be to find the owner(s) of a particular block, or to
 * try to recover tree and file data from corrupt primary metadata.
 */
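
/*
 * Illustrative example of the [ag block, owner, offset] ordering (not from
 * the original source, owners are hypothetical): if AG block 100 is shared
 * via reflink by inode 129 (mapped at file block 0) and inode 130 (mapped at
 * file block 8), the two records sort as
 *
 *	(startblock 100, owner 129, offset 0)
 *	(startblock 100, owner 130, offset 8)
 *
 * i.e. records for the same physical block are disambiguated first by owner
 * and then by file offset, which is what makes the composite key unique.
 */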

static struct xfs_btree_cur *
xfs_rmapbt_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
			cur->bc_ag.agbp, to_perag(cur->bc_group));
}

STATIC void
xfs_rmapbt_set_root(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				inc)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;
	struct xfs_perag	*pag = to_perag(cur->bc_group);

	ASSERT(ptr->s != 0);

	agf->agf_rmap_root = ptr->s;
	be32_add_cpu(&agf->agf_rmap_level, inc);
	pag->pagf_rmap_level += inc;

	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}

STATIC int
xfs_rmapbt_alloc_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*start,
	union xfs_btree_ptr		*new,
	int				*stat)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;
	struct xfs_perag	*pag = to_perag(cur->bc_group);
	struct xfs_alloc_arg	args = { .len = 1 };
	int			error;
	xfs_agblock_t		bno;

	/* Allocate the new block from the freelist. If we can't, give up. */
	error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp,
			&bno, 1);
	if (error)
		return error;
	if (bno == NULLAGBLOCK) {
		*stat = 0;
		return 0;
	}

	xfs_extent_busy_reuse(pag_group(pag), bno, 1, false);

	new->s = cpu_to_be32(bno);
	be32_add_cpu(&agf->agf_rmap_blocks, 1);
	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);

	/*
	 * Since rmapbt blocks are sourced from the AGFL, they are allocated
	 * one at a time and the reservation updates don't require a
	 * transaction.
	 */
	xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args);

	*stat = 1;
	return 0;
}

STATIC int
xfs_rmapbt_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;
	struct xfs_perag	*pag = to_perag(cur->bc_group);
	xfs_agblock_t		bno;
	int			error;

	bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
	be32_add_cpu(&agf->agf_rmap_blocks, -1);
	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
	error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
	if (error)
		return error;

	xfs_extent_busy_insert(cur->bc_tp, pag_group(pag), bno, 1,
			XFS_EXTENT_BUSY_SKIP_DISCARD);

	xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
	return 0;
}

STATIC int
xfs_rmapbt_get_minrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return cur->bc_mp->m_rmap_mnr[level != 0];
}

STATIC int
xfs_rmapbt_get_maxrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return cur->bc_mp->m_rmap_mxr[level != 0];
}

/*
 * Convert the ondisk record's offset field into the ondisk key's offset field.
 * Fork and bmbt are significant parts of the rmap record key, but written
 * status is merely a record attribute.
 */
static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
{
	return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
}

STATIC void
xfs_rmapbt_init_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
}

/*
 * The high key for a reverse mapping record can be computed by shifting
 * the startblock and offset to the highest value that would still map
 * to that record. In practice this means that we add blockcount-1 to
 * the startblock for all records, and if the record is for a data/attr
 * fork mapping, we add blockcount-1 to the offset too.
 */
STATIC void
xfs_rmapbt_init_high_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	uint64_t			off;
	int				adj;

	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;

	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	be32_add_cpu(&key->rmap.rm_startblock, adj);
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
		return;
	off = be64_to_cpu(key->rmap.rm_offset);
	off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
	key->rmap.rm_offset = cpu_to_be64(off);
}
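
/*
 * Worked example (illustrative, not from the original source): for a data
 * fork record { startblock 100, blockcount 8, owner 129, offset 16 } the
 * high key becomes { startblock 107, owner 129, offset 23 }, i.e. both the
 * physical and logical ranges are extended by blockcount - 1 = 7. For a
 * non-inode owner (say XFS_RMAP_OWN_AG) or a bmbt block mapping, only the
 * startblock is adjusted and the offset is left alone.
 */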

STATIC void
xfs_rmapbt_init_rec_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec)
{
	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
	rec->rmap.rm_offset = cpu_to_be64(
			xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
}

STATIC void
xfs_rmapbt_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	struct xfs_agf		*agf = cur->bc_ag.agbp->b_addr;

	ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno));

	ptr->s = agf->agf_rmap_root;
}

/*
 * Mask the appropriate parts of the ondisk key field for a key comparison.
 * Fork and bmbt are significant parts of the rmap record key, but written
 * status is merely a record attribute.
 */
static inline uint64_t offset_keymask(uint64_t offset)
{
	return offset & ~XFS_RMAP_OFF_UNWRITTEN;
}
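
/*
 * Illustrative note (not from the original source): only the unwritten flag
 * is stripped here, so a written and an unwritten mapping with the same
 * [startblock, owner, offset] compare as equal keys, while the attr-fork and
 * bmbt flags remain part of the comparison and keep those mappings distinct.
 */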

STATIC int
xfs_rmapbt_cmp_key_with_cur(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key)
{
	struct xfs_rmap_irec		*rec = &cur->bc_rec.r;
	const struct xfs_rmap_key	*kp = &key->rmap;

	return cmp_int(be32_to_cpu(kp->rm_startblock), rec->rm_startblock) ?:
	       cmp_int(be64_to_cpu(kp->rm_owner), rec->rm_owner) ?:
	       cmp_int(offset_keymask(be64_to_cpu(kp->rm_offset)),
		       offset_keymask(xfs_rmap_irec_offset_pack(rec)));
}

STATIC int
xfs_rmapbt_cmp_two_keys(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2,
	const union xfs_btree_key	*mask)
{
	const struct xfs_rmap_key	*kp1 = &k1->rmap;
	const struct xfs_rmap_key	*kp2 = &k2->rmap;
	int				d;

	/* Doesn't make sense to mask off the physical space part */
	ASSERT(!mask || mask->rmap.rm_startblock);

	d = cmp_int(be32_to_cpu(kp1->rm_startblock),
		    be32_to_cpu(kp2->rm_startblock));
	if (d)
		return d;

	if (!mask || mask->rmap.rm_owner) {
		d = cmp_int(be64_to_cpu(kp1->rm_owner),
			    be64_to_cpu(kp2->rm_owner));
		if (d)
			return d;
	}

	if (!mask || mask->rmap.rm_offset) {
		/* Doesn't make sense to allow offset but not owner */
		ASSERT(!mask || mask->rmap.rm_owner);

		d = cmp_int(offset_keymask(be64_to_cpu(kp1->rm_offset)),
			    offset_keymask(be64_to_cpu(kp2->rm_offset)));
		if (d)
			return d;
	}

	return 0;
}

static xfs_failaddr_t
xfs_rmapbt_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_perag	*pag = bp->b_pag;
	xfs_failaddr_t		fa;
	unsigned int		level;

	/*
	 * magic number and level verification
	 *
	 * During growfs operations, we can't verify the exact level or owner
	 * as the perag is not fully initialised and hence not attached to the
	 * buffer. In this case, check against the maximum tree depth.
	 *
	 * Similarly, during log recovery we will have a perag structure
	 * attached, but the agf information will not yet have been
	 * initialised from the on disk AGF. Again, we can only check against
	 * maximum limits in this case.
	 */
	if (!xfs_verify_magic(bp, block->bb_magic))
		return __this_address;

	if (!xfs_has_rmapbt(mp))
		return __this_address;
	fa = xfs_btree_agblock_v5hdr_verify(bp);
	if (fa)
		return fa;

	level = be16_to_cpu(block->bb_level);
	if (pag && xfs_perag_initialised_agf(pag)) {
		unsigned int	maxlevel = pag->pagf_rmap_level;

#ifdef CONFIG_XFS_ONLINE_REPAIR
		/*
		 * Online repair could be rewriting the free space btrees, so
		 * we'll validate against the larger of either tree while this
		 * is going on.
		 */
		maxlevel = max_t(unsigned int, maxlevel,
				pag->pagf_repair_rmap_level);
#endif
		if (level >= maxlevel)
			return __this_address;
	} else if (level >= mp->m_rmap_maxlevels)
		return __this_address;

	return xfs_btree_agblock_verify(bp, mp->m_rmap_mxr[level != 0]);
}
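
/*
 * Illustrative note (not from the original source): agf_rmap_level records
 * the height of the tree and block levels are numbered from 0 at the leaves,
 * so a three-level tree only contains blocks with bb_level 0, 1 or 2; a
 * block claiming bb_level >= 3 cannot belong to that tree and is rejected
 * by the level check above.
 */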

static void
xfs_rmapbt_read_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	if (!xfs_btree_agblock_verify_crc(bp))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_rmapbt_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}

	if (bp->b_error)
		trace_xfs_btree_corrupt(bp, _RET_IP_);
}

static void
xfs_rmapbt_write_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	fa = xfs_rmapbt_verify(bp);
	if (fa) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}
	xfs_btree_agblock_calc_crc(bp);
}

const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
	.name			= "xfs_rmapbt",
	.magic			= { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
	.verify_read		= xfs_rmapbt_read_verify,
	.verify_write		= xfs_rmapbt_write_verify,
	.verify_struct		= xfs_rmapbt_verify,
};

STATIC int
xfs_rmapbt_keys_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2)
{
	uint32_t		x;
	uint32_t		y;
	uint64_t		a;
	uint64_t		b;

	x = be32_to_cpu(k1->rmap.rm_startblock);
	y = be32_to_cpu(k2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(k1->rmap.rm_owner);
	b = be64_to_cpu(k2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
	b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

STATIC int
xfs_rmapbt_recs_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*r1,
	const union xfs_btree_rec	*r2)
{
	uint32_t		x;
	uint32_t		y;
	uint64_t		a;
	uint64_t		b;

	x = be32_to_cpu(r1->rmap.rm_startblock);
	y = be32_to_cpu(r2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(r1->rmap.rm_owner);
	b = be64_to_cpu(r2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
	b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

STATIC enum xbtree_key_contig
xfs_rmapbt_keys_contiguous(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key1,
	const union xfs_btree_key	*key2,
	const union xfs_btree_key	*mask)
{
	ASSERT(!mask || mask->rmap.rm_startblock);

	/*
	 * We only support checking contiguity of the physical space component.
	 * If any callers ever need more specificity than that, they'll have to
	 * implement it here.
	 */
	ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));

	return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
				 be32_to_cpu(key2->rmap.rm_startblock));
}

const struct xfs_btree_ops xfs_rmapbt_ops = {
	.name			= "rmap",
	.type			= XFS_BTREE_TYPE_AG,
	.geom_flags		= XFS_BTGEO_OVERLAPPING,

	.rec_len		= sizeof(struct xfs_rmap_rec),
	/* Overlapping btree; 2 keys per pointer. */
	.key_len		= 2 * sizeof(struct xfs_rmap_key),
	.ptr_len		= XFS_BTREE_SHORT_PTR_LEN,

	.lru_refs		= XFS_RMAP_BTREE_REF,
	.statoff		= XFS_STATS_CALC_INDEX(xs_rmap_2),
	.sick_mask		= XFS_SICK_AG_RMAPBT,

	.dup_cursor		= xfs_rmapbt_dup_cursor,
	.set_root		= xfs_rmapbt_set_root,
	.alloc_block		= xfs_rmapbt_alloc_block,
	.free_block		= xfs_rmapbt_free_block,
	.get_minrecs		= xfs_rmapbt_get_minrecs,
	.get_maxrecs		= xfs_rmapbt_get_maxrecs,
	.init_key_from_rec	= xfs_rmapbt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_rmapbt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_rmapbt_init_rec_from_cur,
	.init_ptr_from_cur	= xfs_rmapbt_init_ptr_from_cur,
	.cmp_key_with_cur	= xfs_rmapbt_cmp_key_with_cur,
	.buf_ops		= &xfs_rmapbt_buf_ops,
	.cmp_two_keys		= xfs_rmapbt_cmp_two_keys,
	.keys_inorder		= xfs_rmapbt_keys_inorder,
	.recs_inorder		= xfs_rmapbt_recs_inorder,
	.keys_contiguous	= xfs_rmapbt_keys_contiguous,
};

/*
 * Create a new reverse mapping btree cursor.
 *
 * For staging cursors tp and agbp are NULL.
 */
struct xfs_btree_cur *
xfs_rmapbt_init_cursor(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	struct xfs_perag	*pag)
{
	struct xfs_btree_cur	*cur;

	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops,
			mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
	cur->bc_group = xfs_group_hold(pag_group(pag));
	cur->bc_ag.agbp = agbp;
	if (agbp) {
		struct xfs_agf	*agf = agbp->b_addr;

		cur->bc_nlevels = be32_to_cpu(agf->agf_rmap_level);
	}
	return cur;
}

#ifdef CONFIG_XFS_BTREE_IN_MEM
static inline unsigned int
xfs_rmapbt_mem_block_maxrecs(
	unsigned int		blocklen,
	bool			leaf)
{
	if (leaf)
		return blocklen / sizeof(struct xfs_rmap_rec);
	return blocklen /
		(2 * sizeof(struct xfs_rmap_key) + sizeof(__be64));
}

/*
 * Validate an in-memory rmap btree block. Callers are allowed to generate an
 * in-memory btree even if the ondisk feature is not enabled.
 */
static xfs_failaddr_t
xfs_rmapbt_mem_verify(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	xfs_failaddr_t		fa;
	unsigned int		level;
	unsigned int		maxrecs;

	if (!xfs_verify_magic(bp, block->bb_magic))
		return __this_address;

	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
	if (fa)
		return fa;

	level = be16_to_cpu(block->bb_level);
	if (level >= xfs_rmapbt_maxlevels_ondisk())
		return __this_address;

	maxrecs = xfs_rmapbt_mem_block_maxrecs(
			XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN, level == 0);
	return xfs_btree_memblock_verify(bp, maxrecs);
}

static void
xfs_rmapbt_mem_rw_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa = xfs_rmapbt_mem_verify(bp);

	if (fa)
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}

/* skip crc checks on in-memory btrees to save time */
static const struct xfs_buf_ops xfs_rmapbt_mem_buf_ops = {
	.name			= "xfs_rmapbt_mem",
	.magic			= { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
	.verify_read		= xfs_rmapbt_mem_rw_verify,
	.verify_write		= xfs_rmapbt_mem_rw_verify,
	.verify_struct		= xfs_rmapbt_mem_verify,
};

const struct xfs_btree_ops xfs_rmapbt_mem_ops = {
	.name			= "mem_rmap",
	.type			= XFS_BTREE_TYPE_MEM,
	.geom_flags		= XFS_BTGEO_OVERLAPPING,

	.rec_len		= sizeof(struct xfs_rmap_rec),
	/* Overlapping btree; 2 keys per pointer. */
	.key_len		= 2 * sizeof(struct xfs_rmap_key),
	.ptr_len		= XFS_BTREE_LONG_PTR_LEN,

	.lru_refs		= XFS_RMAP_BTREE_REF,
	.statoff		= XFS_STATS_CALC_INDEX(xs_rmap_mem_2),

	.dup_cursor		= xfbtree_dup_cursor,
	.set_root		= xfbtree_set_root,
	.alloc_block		= xfbtree_alloc_block,
	.free_block		= xfbtree_free_block,
	.get_minrecs		= xfbtree_get_minrecs,
	.get_maxrecs		= xfbtree_get_maxrecs,
	.init_key_from_rec	= xfs_rmapbt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_rmapbt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_rmapbt_init_rec_from_cur,
	.init_ptr_from_cur	= xfbtree_init_ptr_from_cur,
	.cmp_key_with_cur	= xfs_rmapbt_cmp_key_with_cur,
	.buf_ops		= &xfs_rmapbt_mem_buf_ops,
	.cmp_two_keys		= xfs_rmapbt_cmp_two_keys,
	.keys_inorder		= xfs_rmapbt_keys_inorder,
	.recs_inorder		= xfs_rmapbt_recs_inorder,
	.keys_contiguous	= xfs_rmapbt_keys_contiguous,
};

/* Create a cursor for an in-memory btree. */
struct xfs_btree_cur *
xfs_rmapbt_mem_cursor(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfbtree		*xfbt)
{
	struct xfs_btree_cur	*cur;

	cur = xfs_btree_alloc_cursor(pag_mount(pag), tp, &xfs_rmapbt_mem_ops,
			xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache);
	cur->bc_mem.xfbtree = xfbt;
	cur->bc_nlevels = xfbt->nlevels;

	cur->bc_group = xfs_group_hold(pag_group(pag));
	return cur;
}

/* Create an in-memory rmap btree. */
int
xfs_rmapbt_mem_init(
	struct xfs_mount	*mp,
	struct xfbtree		*xfbt,
	struct xfs_buftarg	*btp,
	xfs_agnumber_t		agno)
{
	xfbt->owner = agno;
	return xfbtree_init(mp, xfbt, btp, &xfs_rmapbt_mem_ops);
}

/* Compute the max possible height for reverse mapping btrees in memory. */
static unsigned int
xfs_rmapbt_mem_maxlevels(void)
{
	unsigned int		minrecs[2];
	unsigned int		blocklen;

	blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;

	minrecs[0] = xfs_rmapbt_mem_block_maxrecs(blocklen, true) / 2;
	minrecs[1] = xfs_rmapbt_mem_block_maxrecs(blocklen, false) / 2;

	/*
	 * How tall can an in-memory rmap btree become if we filled the entire
	 * AG with rmap records?
	 */
	return xfs_btree_compute_maxlevels(minrecs,
			XFS_MAX_AG_BYTES / sizeof(struct xfs_rmap_rec));
}
#else
# define xfs_rmapbt_mem_maxlevels()	(0)
#endif /* CONFIG_XFS_BTREE_IN_MEM */
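
/*
 * Usage sketch (illustrative, not taken from this file; error handling
 * omitted). A caller such as online repair builds an in-memory rmap btree
 * roughly as follows, assuming it already holds a perag reference and an
 * xfile-backed buftarg "btp":
 *
 *	error = xfs_rmapbt_mem_init(mp, &xfbt, btp, pag_agno(pag));
 *	cur = xfs_rmapbt_mem_cursor(pag, tp, &xfbt);
 *	(insert and query records through the usual btree cursor interfaces)
 *	xfs_btree_del_cursor(cur, error);
 *	xfbtree_destroy(&xfbt);
 */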

/*
 * Install a new reverse mapping btree root. Caller is responsible for
 * invalidating and freeing the old btree blocks.
 */
void
xfs_rmapbt_commit_staged_btree(
	struct xfs_btree_cur	*cur,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp)
{
	struct xfs_agf		*agf = agbp->b_addr;
	struct xbtree_afakeroot	*afake = cur->bc_ag.afake;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	agf->agf_rmap_root = cpu_to_be32(afake->af_root);
	agf->agf_rmap_level = cpu_to_be32(afake->af_levels);
	agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
	xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
			XFS_AGF_RMAP_BLOCKS);
	xfs_btree_commit_afakeroot(cur, tp, agbp);
}

/* Calculate number of records in a reverse mapping btree block. */
static inline unsigned int
xfs_rmapbt_block_maxrecs(
	unsigned int		blocklen,
	bool			leaf)
{
	if (leaf)
		return blocklen / sizeof(struct xfs_rmap_rec);
	return blocklen /
		(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
}

/*
 * Calculate the number of records in an rmap btree block, deducting the
 * block header from the given block length.
 */
unsigned int
xfs_rmapbt_maxrecs(
	struct xfs_mount	*mp,
	unsigned int		blocklen,
	bool			leaf)
{
	blocklen -= XFS_RMAP_BLOCK_LEN;
	return xfs_rmapbt_block_maxrecs(blocklen, leaf);
}
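
/*
 * Worked example (illustrative, assuming a 4096-byte filesystem block and
 * the 56-byte short-form CRC btree block header): blocklen becomes
 * 4096 - 56 = 4040 bytes, so a leaf holds 4040 / 24 = 168 records and a node
 * holds 4040 / (2 * 20 + 4) = 91 key/pointer sets, where 24, 20 and 4 are
 * sizeof(struct xfs_rmap_rec), sizeof(struct xfs_rmap_key) and
 * sizeof(xfs_rmap_ptr_t) respectively.
 */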

/* Compute the max possible height for reverse mapping btrees. */
unsigned int
xfs_rmapbt_maxlevels_ondisk(void)
{
	unsigned int		minrecs[2];
	unsigned int		blocklen;

	blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;

	minrecs[0] = xfs_rmapbt_block_maxrecs(blocklen, true) / 2;
	minrecs[1] = xfs_rmapbt_block_maxrecs(blocklen, false) / 2;

	/*
	 * Compute the asymptotic maxlevels for an rmapbt on any reflink fs.
	 *
	 * On a reflink filesystem, each AG block can have up to 2^32 (per the
	 * refcount record format) owners, which means that theoretically we
	 * could face up to 2^64 rmap records. However, we're likely to run
	 * out of blocks in the AG long before that happens, which means that
	 * we must compute the max height based on what the btree will look
	 * like if it consumes almost all the blocks in the AG due to maximal
	 * sharing factor.
	 */
	return max(xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS),
		   xfs_rmapbt_mem_maxlevels());
}

/* Compute the maximum height of an rmap btree. */
void
xfs_rmapbt_compute_maxlevels(
	struct xfs_mount	*mp)
{
	if (!xfs_has_rmapbt(mp)) {
		mp->m_rmap_maxlevels = 0;
		return;
	}

	if (xfs_has_reflink(mp)) {
		/*
		 * Compute the asymptotic maxlevels for an rmap btree on a
		 * filesystem that supports reflink.
		 *
		 * On a reflink filesystem, each AG block can have up to 2^32
		 * (per the refcount record format) owners, which means that
		 * theoretically we could face up to 2^64 rmap records.
		 * However, we're likely to run out of blocks in the AG long
		 * before that happens, which means that we must compute the
		 * max height based on what the btree will look like if it
		 * consumes almost all the blocks in the AG due to maximal
		 * sharing factor.
		 */
		mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr,
				mp->m_sb.sb_agblocks);
	} else {
		/*
		 * If there's no block sharing, compute the maximum rmapbt
		 * height assuming one rmap record per AG block.
		 */
		mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
				mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
	}
	ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk());
}

/* Calculate the rmap btree size for some records. */
xfs_extlen_t
xfs_rmapbt_calc_size(
	struct xfs_mount	*mp,
	unsigned long long	len)
{
	return xfs_btree_calc_size(mp->m_rmap_mnr, len);
}

/*
 * Calculate the maximum rmap btree size.
 */
xfs_extlen_t
xfs_rmapbt_max_size(
	struct xfs_mount	*mp,
	xfs_agblock_t		agblocks)
{
	/* Bail out if we're uninitialized, which can happen in mkfs. */
	if (mp->m_rmap_mxr[0] == 0)
		return 0;

	return xfs_rmapbt_calc_size(mp, agblocks);
}

/*
 * Figure out how many blocks to reserve and how many are used by this btree.
 */
int
xfs_rmapbt_calc_reserves(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	xfs_extlen_t		*ask,
	xfs_extlen_t		*used)
{
	struct xfs_buf		*agbp;
	struct xfs_agf		*agf;
	xfs_agblock_t		agblocks;
	xfs_extlen_t		tree_len;
	int			error;

	if (!xfs_has_rmapbt(mp))
		return 0;

	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
	if (error)
		return error;

	agf = agbp->b_addr;
	agblocks = be32_to_cpu(agf->agf_length);
	tree_len = be32_to_cpu(agf->agf_rmap_blocks);
	xfs_trans_brelse(tp, agbp);

	/*
	 * The log is permanently allocated, so the space it occupies will
	 * never be available for the kinds of things that would require btree
	 * expansion. We therefore can pretend the space isn't there.
	 */
	if (xfs_ag_contains_log(mp, pag_agno(pag)))
		agblocks -= mp->m_sb.sb_logblocks;

	/* Reserve 1% of the AG or enough for 1 block per record. */
	*ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
	*used += tree_len;

	return error;
}
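
/*
 * Worked example (illustrative): for an AG of 1,048,576 blocks with no
 * internal log, the 1% figure is 10,485 blocks; if xfs_rmapbt_max_size()
 * reports a larger worst-case tree for that many blocks, the larger value
 * is reserved instead, since max() picks whichever bound is bigger.
 */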
764 * 765 * On a reflink filesystem, each AG block can have up to 2^32 766 * (per the refcount record format) owners, which means that 767 * theoretically we could face up to 2^64 rmap records. 768 * However, we're likely to run out of blocks in the AG long 769 * before that happens, which means that we must compute the 770 * max height based on what the btree will look like if it 771 * consumes almost all the blocks in the AG due to maximal 772 * sharing factor. 773 */ 774 mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr, 775 mp->m_sb.sb_agblocks); 776 } else { 777 /* 778 * If there's no block sharing, compute the maximum rmapbt 779 * height assuming one rmap record per AG block. 780 */ 781 mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels( 782 mp->m_rmap_mnr, mp->m_sb.sb_agblocks); 783 } 784 ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk()); 785 } 786 787 /* Calculate the refcount btree size for some records. */ 788 xfs_extlen_t 789 xfs_rmapbt_calc_size( 790 struct xfs_mount *mp, 791 unsigned long long len) 792 { 793 return xfs_btree_calc_size(mp->m_rmap_mnr, len); 794 } 795 796 /* 797 * Calculate the maximum refcount btree size. 798 */ 799 xfs_extlen_t 800 xfs_rmapbt_max_size( 801 struct xfs_mount *mp, 802 xfs_agblock_t agblocks) 803 { 804 /* Bail out if we're uninitialized, which can happen in mkfs. */ 805 if (mp->m_rmap_mxr[0] == 0) 806 return 0; 807 808 return xfs_rmapbt_calc_size(mp, agblocks); 809 } 810 811 /* 812 * Figure out how many blocks to reserve and how many are used by this btree. 813 */ 814 int 815 xfs_rmapbt_calc_reserves( 816 struct xfs_mount *mp, 817 struct xfs_trans *tp, 818 struct xfs_perag *pag, 819 xfs_extlen_t *ask, 820 xfs_extlen_t *used) 821 { 822 struct xfs_buf *agbp; 823 struct xfs_agf *agf; 824 xfs_agblock_t agblocks; 825 xfs_extlen_t tree_len; 826 int error; 827 828 if (!xfs_has_rmapbt(mp)) 829 return 0; 830 831 error = xfs_alloc_read_agf(pag, tp, 0, &agbp); 832 if (error) 833 return error; 834 835 agf = agbp->b_addr; 836 agblocks = be32_to_cpu(agf->agf_length); 837 tree_len = be32_to_cpu(agf->agf_rmap_blocks); 838 xfs_trans_brelse(tp, agbp); 839 840 /* 841 * The log is permanently allocated, so the space it occupies will 842 * never be available for the kinds of things that would require btree 843 * expansion. We therefore can pretend the space isn't there. 844 */ 845 if (xfs_ag_contains_log(mp, pag_agno(pag))) 846 agblocks -= mp->m_sb.sb_logblocks; 847 848 /* Reserve 1% of the AG or enough for 1 block per record. */ 849 *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks)); 850 *used += tree_len; 851 852 return error; 853 } 854 855 int __init 856 xfs_rmapbt_init_cur_cache(void) 857 { 858 xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur", 859 xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()), 860 0, 0, NULL); 861 862 if (!xfs_rmapbt_cur_cache) 863 return -ENOMEM; 864 return 0; 865 } 866 867 void 868 xfs_rmapbt_destroy_cur_cache(void) 869 { 870 kmem_cache_destroy(xfs_rmapbt_cur_cache); 871 xfs_rmapbt_cur_cache = NULL; 872 } 873