// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2014 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"

static struct kmem_cache	*xfs_rmapbt_cur_cache;

/*
 * Reverse map btree.
 *
 * This is a per-ag tree used to track the owner(s) of a given extent. With
 * reflink it is possible for there to be multiple owners, which is a departure
 * from classic XFS. Owner records for data extents are inserted when the
 * extent is mapped and removed when an extent is unmapped. Owner records for
 * all other block types (i.e. metadata) are inserted when an extent is
 * allocated and removed when an extent is freed. There can only be one owner
 * of a metadata extent, usually an inode or some other metadata structure like
 * an AG btree.
 *
 * The rmap btree is part of the free space management, so blocks for the tree
 * are sourced from the agfl. Hence we need transaction reservation support for
 * this tree so that the freelist is always large enough. This also affects
 * the minimum space we need to leave free in the AG.
 *
 * The tree is ordered by [ag block, owner, offset]. This is a large key size,
 * but it is the only way to enforce unique keys when a block can be owned by
 * multiple files at any offset. There's no need to order/search by extent
 * size for online updating/management of the tree. It is intended that most
 * reverse lookups will be to find the owner(s) of a particular block, or to
 * try to recover tree and file data from corrupt primary metadata.
 */

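/*
 * Illustration only (hypothetical values): because keys sort by
 * [ag block, owner, offset], multiple owners of the same shared block sort
 * adjacently, with the owner breaking the startblock tie:
 *
 *	{ rm_startblock 100, rm_owner inode 129, rm_offset 0 }
 *	{ rm_startblock 100, rm_owner inode 130, rm_offset 512 }
 *	{ rm_startblock 101, rm_owner inode 129, rm_offset 1 }
 */
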
static struct xfs_btree_cur *
xfs_rmapbt_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
			cur->bc_ag.agbp, to_perag(cur->bc_group));
}

STATIC void
xfs_rmapbt_set_root(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				inc)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;
	struct xfs_perag	*pag = to_perag(cur->bc_group);

	ASSERT(ptr->s != 0);

	agf->agf_rmap_root = ptr->s;
	be32_add_cpu(&agf->agf_rmap_level, inc);
	pag->pagf_rmap_level += inc;

	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}

STATIC int
xfs_rmapbt_alloc_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*start,
	union xfs_btree_ptr		*new,
	int				*stat)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;
	struct xfs_perag	*pag = to_perag(cur->bc_group);
	struct xfs_alloc_arg	args = { .len = 1 };
	int			error;
	xfs_agblock_t		bno;

	/* Allocate the new block from the freelist. If we can't, give up. */
	error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp,
			&bno, 1);
	if (error)
		return error;
	if (bno == NULLAGBLOCK) {
		*stat = 0;
		return 0;
	}

	xfs_extent_busy_reuse(pag_group(pag), bno, 1, false);

	new->s = cpu_to_be32(bno);
	be32_add_cpu(&agf->agf_rmap_blocks, 1);
	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);

	/*
	 * Since rmapbt blocks are sourced from the AGFL, they are allocated
	 * one at a time and the reservation updates don't require a
	 * transaction.
	 */
	xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args);

	*stat = 1;
	return 0;
}

STATIC int
xfs_rmapbt_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;
	struct xfs_perag	*pag = to_perag(cur->bc_group);
	xfs_agblock_t		bno;
	int			error;

	bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
	be32_add_cpu(&agf->agf_rmap_blocks, -1);
	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
	error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
	if (error)
		return error;

	xfs_extent_busy_insert(cur->bc_tp, pag_group(pag), bno, 1,
			XFS_EXTENT_BUSY_SKIP_DISCARD);

	xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
	return 0;
}

STATIC int
xfs_rmapbt_get_minrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return cur->bc_mp->m_rmap_mnr[level != 0];
}

STATIC int
xfs_rmapbt_get_maxrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return cur->bc_mp->m_rmap_mxr[level != 0];
}

/*
 * Convert the ondisk record's offset field into the ondisk key's offset field.
 * Fork and bmbt are significant parts of the rmap record key, but written
 * status is merely a record attribute.
 */
static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
{
	return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
}

STATIC void
xfs_rmapbt_init_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
}

/*
 * The high key for a reverse mapping record can be computed by shifting
 * the startblock and offset to the highest value that would still map
 * to that record. In practice this means that we add blockcount-1 to
 * the startblock for all records, and if the record is for a data/attr
 * fork mapping, we add blockcount-1 to the offset too.
 */
STATIC void
xfs_rmapbt_init_high_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	uint64_t		off;
	int			adj;

	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;

	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	be32_add_cpu(&key->rmap.rm_startblock, adj);
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
		return;
	off = be64_to_cpu(key->rmap.rm_offset);
	off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
	key->rmap.rm_offset = cpu_to_be64(off);
}

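/*
 * Worked example (illustrative values only): for a data fork record
 * { rm_startblock 100, rm_blockcount 8, rm_owner inode 129, rm_offset 0 },
 * adj = 7, so the high key becomes { rm_startblock 107, rm_owner inode 129,
 * rm_offset 7 } -- the last block and file offset the record still maps.
 * For non-inode (metadata) owners or bmbt blocks the offset adjustment is
 * skipped, because the offset field carries no file offset in those records.
 */
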
STATIC void
xfs_rmapbt_init_rec_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec)
{
	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
	rec->rmap.rm_offset = cpu_to_be64(
			xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
}

STATIC void
xfs_rmapbt_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	struct xfs_agf		*agf = cur->bc_ag.agbp->b_addr;

	ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno));

	ptr->s = agf->agf_rmap_root;
}

/*
 * Mask the appropriate parts of the ondisk key field for a key comparison.
 * Fork and bmbt are significant parts of the rmap record key, but written
 * status is merely a record attribute.
 */
static inline uint64_t offset_keymask(uint64_t offset)
{
	return offset & ~XFS_RMAP_OFF_UNWRITTEN;
}

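/*
 * For example (sketch): a written and an unwritten mapping at file offset 8
 * carry rm_offset values of 8 and (8 | XFS_RMAP_OFF_UNWRITTEN) respectively;
 * offset_keymask() strips the unwritten bit so both compare equal for key
 * ordering purposes, while the fork and bmbt bits still participate.
 */
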
STATIC int
xfs_rmapbt_cmp_key_with_cur(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key)
{
	struct xfs_rmap_irec		*rec = &cur->bc_rec.r;
	const struct xfs_rmap_key	*kp = &key->rmap;

	return cmp_int(be32_to_cpu(kp->rm_startblock), rec->rm_startblock) ?:
	       cmp_int(be64_to_cpu(kp->rm_owner), rec->rm_owner) ?:
	       cmp_int(offset_keymask(be64_to_cpu(kp->rm_offset)),
		       offset_keymask(xfs_rmap_irec_offset_pack(rec)));
}

STATIC int
xfs_rmapbt_cmp_two_keys(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2,
	const union xfs_btree_key	*mask)
{
	const struct xfs_rmap_key	*kp1 = &k1->rmap;
	const struct xfs_rmap_key	*kp2 = &k2->rmap;
	int				d;

	/* Doesn't make sense to mask off the physical space part */
	ASSERT(!mask || mask->rmap.rm_startblock);

	d = cmp_int(be32_to_cpu(kp1->rm_startblock),
		    be32_to_cpu(kp2->rm_startblock));
	if (d)
		return d;

	if (!mask || mask->rmap.rm_owner) {
		d = cmp_int(be64_to_cpu(kp1->rm_owner),
			    be64_to_cpu(kp2->rm_owner));
		if (d)
			return d;
	}

	if (!mask || mask->rmap.rm_offset) {
		/* Doesn't make sense to allow offset but not owner */
		ASSERT(!mask || mask->rmap.rm_owner);

		d = cmp_int(offset_keymask(be64_to_cpu(kp1->rm_offset)),
			    offset_keymask(be64_to_cpu(kp2->rm_offset)));
		if (d)
			return d;
	}

	return 0;
}

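/*
 * Sketch of the mask semantics (hypothetical caller, not taken from any
 * in-tree user): a mask key whose rm_startblock is nonzero but whose
 * rm_owner and rm_offset are both zero restricts the comparison to the
 * physical space component only:
 *
 *	union xfs_btree_key	mask = { };
 *
 *	mask.rmap.rm_startblock = cpu_to_be32(-1U);
 *	d = xfs_rmapbt_cmp_two_keys(cur, k1, k2, &mask);
 */
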
static xfs_failaddr_t
xfs_rmapbt_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_perag	*pag = bp->b_pag;
	xfs_failaddr_t		fa;
	unsigned int		level;

	/*
	 * magic number and level verification
	 *
	 * During growfs operations, we can't verify the exact level or owner
	 * as the perag is not fully initialised and hence not attached to the
	 * buffer. In this case, check against the maximum tree depth.
	 *
	 * Similarly, during log recovery we will have a perag structure
	 * attached, but the agf information will not yet have been initialised
	 * from the on disk AGF. Again, we can only check against maximum
	 * limits in this case.
	 */
	if (!xfs_verify_magic(bp, block->bb_magic))
		return __this_address;

	if (!xfs_has_rmapbt(mp))
		return __this_address;
	fa = xfs_btree_agblock_v5hdr_verify(bp);
	if (fa)
		return fa;

	level = be16_to_cpu(block->bb_level);
	if (pag && xfs_perag_initialised_agf(pag)) {
		unsigned int	maxlevel = pag->pagf_rmap_level;

#ifdef CONFIG_XFS_ONLINE_REPAIR
		/*
		 * Online repair could be rewriting the free space btrees, so
		 * we'll validate against the larger of either tree while this
		 * is going on.
		 */
		maxlevel = max_t(unsigned int, maxlevel,
				pag->pagf_repair_rmap_level);
#endif
		if (level >= maxlevel)
			return __this_address;
	} else if (level >= mp->m_rmap_maxlevels)
		return __this_address;

	return xfs_btree_agblock_verify(bp, mp->m_rmap_mxr[level != 0]);
}

static void
xfs_rmapbt_read_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	if (!xfs_btree_agblock_verify_crc(bp))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_rmapbt_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}

	if (bp->b_error)
		trace_xfs_btree_corrupt(bp, _RET_IP_);
}

static void
xfs_rmapbt_write_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	fa = xfs_rmapbt_verify(bp);
	if (fa) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}
	xfs_btree_agblock_calc_crc(bp);
}

const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
	.name			= "xfs_rmapbt",
	.magic			= { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
	.verify_read		= xfs_rmapbt_read_verify,
	.verify_write		= xfs_rmapbt_write_verify,
	.verify_struct		= xfs_rmapbt_verify,
};

STATIC int
xfs_rmapbt_keys_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2)
{
	uint32_t		x;
	uint32_t		y;
	uint64_t		a;
	uint64_t		b;

	x = be32_to_cpu(k1->rmap.rm_startblock);
	y = be32_to_cpu(k2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(k1->rmap.rm_owner);
	b = be64_to_cpu(k2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
	b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

STATIC int
xfs_rmapbt_recs_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*r1,
	const union xfs_btree_rec	*r2)
{
	uint32_t		x;
	uint32_t		y;
	uint64_t		a;
	uint64_t		b;

	x = be32_to_cpu(r1->rmap.rm_startblock);
	y = be32_to_cpu(r2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(r1->rmap.rm_owner);
	b = be64_to_cpu(r2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
	b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

STATIC enum xbtree_key_contig
xfs_rmapbt_keys_contiguous(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key1,
	const union xfs_btree_key	*key2,
	const union xfs_btree_key	*mask)
{
	ASSERT(!mask || mask->rmap.rm_startblock);

	/*
	 * We only support checking contiguity of the physical space component.
	 * If any callers ever need more specificity than that, they'll have to
	 * implement it here.
	 */
	ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));

	return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
				 be32_to_cpu(key2->rmap.rm_startblock));
}

const struct xfs_btree_ops xfs_rmapbt_ops = {
	.name			= "rmap",
	.type			= XFS_BTREE_TYPE_AG,
	.geom_flags		= XFS_BTGEO_OVERLAPPING,

	.rec_len		= sizeof(struct xfs_rmap_rec),
	/* Overlapping btree; 2 keys per pointer. */
	.key_len		= 2 * sizeof(struct xfs_rmap_key),
	.ptr_len		= XFS_BTREE_SHORT_PTR_LEN,

	.lru_refs		= XFS_RMAP_BTREE_REF,
	.statoff		= XFS_STATS_CALC_INDEX(xs_rmap_2),
	.sick_mask		= XFS_SICK_AG_RMAPBT,

	.dup_cursor		= xfs_rmapbt_dup_cursor,
	.set_root		= xfs_rmapbt_set_root,
	.alloc_block		= xfs_rmapbt_alloc_block,
	.free_block		= xfs_rmapbt_free_block,
	.get_minrecs		= xfs_rmapbt_get_minrecs,
	.get_maxrecs		= xfs_rmapbt_get_maxrecs,
	.init_key_from_rec	= xfs_rmapbt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_rmapbt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_rmapbt_init_rec_from_cur,
	.init_ptr_from_cur	= xfs_rmapbt_init_ptr_from_cur,
	.cmp_key_with_cur	= xfs_rmapbt_cmp_key_with_cur,
	.buf_ops		= &xfs_rmapbt_buf_ops,
	.cmp_two_keys		= xfs_rmapbt_cmp_two_keys,
	.keys_inorder		= xfs_rmapbt_keys_inorder,
	.recs_inorder		= xfs_rmapbt_recs_inorder,
	.keys_contiguous	= xfs_rmapbt_keys_contiguous,
};

/*
 * Create a new reverse mapping btree cursor.
 *
 * For staging cursors tp and agbp are NULL.
 */
struct xfs_btree_cur *
xfs_rmapbt_init_cursor(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	struct xfs_perag	*pag)
{
	struct xfs_btree_cur	*cur;

	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops,
			mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
	cur->bc_group = xfs_group_hold(pag_group(pag));
	cur->bc_ag.agbp = agbp;
	if (agbp) {
		struct xfs_agf		*agf = agbp->b_addr;

		cur->bc_nlevels = be32_to_cpu(agf->agf_rmap_level);
	}
	return cur;
}

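/*
 * Example usage (a sketch only; error handling, locking and the actual
 * btree operation are elided):
 *
 *	struct xfs_buf		*agbp;
 *	struct xfs_btree_cur	*cur;
 *	int			error;
 *
 *	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
 *	if (error)
 *		return error;
 *	cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
 *	... queries/updates via the generic xfs_btree code ...
 *	xfs_btree_del_cursor(cur, error);
 */
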
#ifdef CONFIG_XFS_BTREE_IN_MEM
static inline unsigned int
xfs_rmapbt_mem_block_maxrecs(
	unsigned int		blocklen,
	bool			leaf)
{
	if (leaf)
		return blocklen / sizeof(struct xfs_rmap_rec);
	return blocklen /
		(2 * sizeof(struct xfs_rmap_key) + sizeof(__be64));
}

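/*
 * For instance (a sketch, assuming 4096-byte in-memory btree blocks for
 * XFBNO_BLOCKSIZE and the 72-byte long-format CRC block header): a leaf
 * holds (4096 - 72) / 24 = 167 records and a node holds
 * (4096 - 72) / (2 * 20 + 8) = 83 key/pointer sets, using integer
 * division. The real numbers depend on the configured block size.
 */
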
/*
 * Validate an in-memory rmap btree block. Callers are allowed to generate an
 * in-memory btree even if the ondisk feature is not enabled.
 */
static xfs_failaddr_t
xfs_rmapbt_mem_verify(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	xfs_failaddr_t		fa;
	unsigned int		level;
	unsigned int		maxrecs;

	if (!xfs_verify_magic(bp, block->bb_magic))
		return __this_address;

	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
	if (fa)
		return fa;

	level = be16_to_cpu(block->bb_level);
	if (level >= xfs_rmapbt_maxlevels_ondisk())
		return __this_address;

	maxrecs = xfs_rmapbt_mem_block_maxrecs(
			XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN, level == 0);
	return xfs_btree_memblock_verify(bp, maxrecs);
}

static void
xfs_rmapbt_mem_rw_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa = xfs_rmapbt_mem_verify(bp);

	if (fa)
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}

/* skip crc checks on in-memory btrees to save time */
static const struct xfs_buf_ops xfs_rmapbt_mem_buf_ops = {
	.name			= "xfs_rmapbt_mem",
	.magic			= { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
	.verify_read		= xfs_rmapbt_mem_rw_verify,
	.verify_write		= xfs_rmapbt_mem_rw_verify,
	.verify_struct		= xfs_rmapbt_mem_verify,
};

const struct xfs_btree_ops xfs_rmapbt_mem_ops = {
	.name			= "mem_rmap",
	.type			= XFS_BTREE_TYPE_MEM,
	.geom_flags		= XFS_BTGEO_OVERLAPPING,

	.rec_len		= sizeof(struct xfs_rmap_rec),
	/* Overlapping btree; 2 keys per pointer. */
	.key_len		= 2 * sizeof(struct xfs_rmap_key),
	.ptr_len		= XFS_BTREE_LONG_PTR_LEN,

	.lru_refs		= XFS_RMAP_BTREE_REF,
	.statoff		= XFS_STATS_CALC_INDEX(xs_rmap_mem_2),

	.dup_cursor		= xfbtree_dup_cursor,
	.set_root		= xfbtree_set_root,
	.alloc_block		= xfbtree_alloc_block,
	.free_block		= xfbtree_free_block,
	.get_minrecs		= xfbtree_get_minrecs,
	.get_maxrecs		= xfbtree_get_maxrecs,
	.init_key_from_rec	= xfs_rmapbt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_rmapbt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_rmapbt_init_rec_from_cur,
	.init_ptr_from_cur	= xfbtree_init_ptr_from_cur,
	.cmp_key_with_cur	= xfs_rmapbt_cmp_key_with_cur,
	.buf_ops		= &xfs_rmapbt_mem_buf_ops,
	.cmp_two_keys		= xfs_rmapbt_cmp_two_keys,
	.keys_inorder		= xfs_rmapbt_keys_inorder,
	.recs_inorder		= xfs_rmapbt_recs_inorder,
	.keys_contiguous	= xfs_rmapbt_keys_contiguous,
};

/* Create a cursor for an in-memory btree. */
struct xfs_btree_cur *
xfs_rmapbt_mem_cursor(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfbtree		*xfbt)
{
	struct xfs_btree_cur	*cur;

	cur = xfs_btree_alloc_cursor(pag_mount(pag), tp, &xfs_rmapbt_mem_ops,
			xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache);
	cur->bc_mem.xfbtree = xfbt;
	cur->bc_nlevels = xfbt->nlevels;

	cur->bc_group = xfs_group_hold(pag_group(pag));
	return cur;
}

/* Create an in-memory rmap btree. */
int
xfs_rmapbt_mem_init(
	struct xfs_mount	*mp,
	struct xfbtree		*xfbt,
	struct xfs_buftarg	*btp,
	xfs_agnumber_t		agno)
{
	xfbt->owner = agno;
	return xfbtree_init(mp, xfbt, btp, &xfs_rmapbt_mem_ops);
}

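/*
 * Example usage (sketch only; error handling elided, and assuming the
 * xfbtree teardown helper xfbtree_destroy() for cleanup):
 *
 *	struct xfbtree		xfbt;
 *	struct xfs_btree_cur	*cur;
 *
 *	error = xfs_rmapbt_mem_init(mp, &xfbt, btp, pag_agno(pag));
 *	...
 *	cur = xfs_rmapbt_mem_cursor(pag, tp, &xfbt);
 *	... stage rmap records into the in-memory tree ...
 *	xfs_btree_del_cursor(cur, error);
 *	xfbtree_destroy(&xfbt);
 */
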
/* Compute the max possible height for reverse mapping btrees in memory. */
static unsigned int
xfs_rmapbt_mem_maxlevels(void)
{
	unsigned int		minrecs[2];
	unsigned int		blocklen;

	blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;

	minrecs[0] = xfs_rmapbt_mem_block_maxrecs(blocklen, true) / 2;
	minrecs[1] = xfs_rmapbt_mem_block_maxrecs(blocklen, false) / 2;

	/*
	 * How tall can an in-memory rmap btree become if we filled the entire
	 * AG with rmap records?
	 */
	return xfs_btree_compute_maxlevels(minrecs,
			XFS_MAX_AG_BYTES / sizeof(struct xfs_rmap_rec));
}
#else
# define xfs_rmapbt_mem_maxlevels()	(0)
#endif /* CONFIG_XFS_BTREE_IN_MEM */

/*
 * Install a new reverse mapping btree root. Caller is responsible for
 * invalidating and freeing the old btree blocks.
 */
void
xfs_rmapbt_commit_staged_btree(
	struct xfs_btree_cur	*cur,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp)
{
	struct xfs_agf		*agf = agbp->b_addr;
	struct xbtree_afakeroot	*afake = cur->bc_ag.afake;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	agf->agf_rmap_root = cpu_to_be32(afake->af_root);
	agf->agf_rmap_level = cpu_to_be32(afake->af_levels);
	agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
	xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
			  XFS_AGF_RMAP_BLOCKS);
	xfs_btree_commit_afakeroot(cur, tp, agbp);
}

/* Calculate number of records in a reverse mapping btree block. */
static inline unsigned int
xfs_rmapbt_block_maxrecs(
	unsigned int		blocklen,
	bool			leaf)
{
	if (leaf)
		return blocklen / sizeof(struct xfs_rmap_rec);
	return blocklen /
		(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
}

/*
 * Calculate number of records in an rmap btree block.
 */
unsigned int
xfs_rmapbt_maxrecs(
	struct xfs_mount	*mp,
	unsigned int		blocklen,
	bool			leaf)
{
	blocklen -= XFS_RMAP_BLOCK_LEN;
	return xfs_rmapbt_block_maxrecs(blocklen, leaf);
}

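/*
 * For example, with 4096-byte ondisk blocks and the 56-byte short-format
 * CRC header (XFS_RMAP_BLOCK_LEN), a leaf block holds
 * (4096 - 56) / 24 = 168 records and a node block holds
 * (4096 - 56) / (2 * 20 + 4) = 91 key/pointer sets, using integer
 * division. Illustrative only; actual values follow the fs block size.
 */
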
/* Compute the max possible height for reverse mapping btrees. */
unsigned int
xfs_rmapbt_maxlevels_ondisk(void)
{
	unsigned int		minrecs[2];
	unsigned int		blocklen;

	blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;

	minrecs[0] = xfs_rmapbt_block_maxrecs(blocklen, true) / 2;
	minrecs[1] = xfs_rmapbt_block_maxrecs(blocklen, false) / 2;

	/*
	 * Compute the asymptotic maxlevels for an rmapbt on any reflink fs.
	 *
	 * On a reflink filesystem, each AG block can have up to 2^32 (per the
	 * refcount record format) owners, which means that theoretically we
	 * could face up to 2^64 rmap records. However, we're likely to run
	 * out of blocks in the AG long before that happens, which means that
	 * we must compute the max height based on what the btree will look
	 * like if it consumes almost all the blocks in the AG due to maximal
	 * sharing factor.
	 */
	return max(xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS),
		   xfs_rmapbt_mem_maxlevels());
}

/* Compute the maximum height of an rmap btree. */
void
xfs_rmapbt_compute_maxlevels(
	struct xfs_mount	*mp)
{
	if (!xfs_has_rmapbt(mp)) {
		mp->m_rmap_maxlevels = 0;
		return;
	}

	if (xfs_has_reflink(mp)) {
		/*
		 * Compute the asymptotic maxlevels for an rmap btree on a
		 * filesystem that supports reflink.
		 *
		 * On a reflink filesystem, each AG block can have up to 2^32
		 * (per the refcount record format) owners, which means that
		 * theoretically we could face up to 2^64 rmap records.
		 * However, we're likely to run out of blocks in the AG long
		 * before that happens, which means that we must compute the
		 * max height based on what the btree will look like if it
		 * consumes almost all the blocks in the AG due to maximal
		 * sharing factor.
		 */
		mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr,
				mp->m_sb.sb_agblocks);
	} else {
		/*
		 * If there's no block sharing, compute the maximum rmapbt
		 * height assuming one rmap record per AG block.
		 */
		mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
				mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
	}
	ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk());
}

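/*
 * Worked example (a sketch, assuming 4096-byte blocks where m_rmap_mnr is
 * { 84, 45 }): on a non-reflink filesystem with sb_agblocks = 1048576,
 * xfs_btree_compute_maxlevels() finds the smallest height whose minimally
 * full tree still holds that many records: 84, then 84 * 45 = 3780, then
 * 170100, then 7654500 >= 1048576, giving a maximum height of 4 levels.
 */
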
/* Calculate the rmap btree size for some records. */
xfs_extlen_t
xfs_rmapbt_calc_size(
	struct xfs_mount	*mp,
	unsigned long long	len)
{
	return xfs_btree_calc_size(mp->m_rmap_mnr, len);
}

/*
 * Calculate the maximum rmap btree size.
 */
xfs_extlen_t
xfs_rmapbt_max_size(
	struct xfs_mount	*mp,
	xfs_agblock_t		agblocks)
{
	/* Bail out if we're uninitialized, which can happen in mkfs. */
	if (mp->m_rmap_mxr[0] == 0)
		return 0;

	return xfs_rmapbt_calc_size(mp, agblocks);
}

/*
 * Figure out how many blocks to reserve and how many are used by this btree.
 */
int
xfs_rmapbt_calc_reserves(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	xfs_extlen_t		*ask,
	xfs_extlen_t		*used)
{
	struct xfs_buf		*agbp;
	struct xfs_agf		*agf;
	xfs_agblock_t		agblocks;
	xfs_extlen_t		tree_len;
	int			error;

	if (!xfs_has_rmapbt(mp))
		return 0;

	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
	if (error)
		return error;

	agf = agbp->b_addr;
	agblocks = be32_to_cpu(agf->agf_length);
	tree_len = be32_to_cpu(agf->agf_rmap_blocks);
	xfs_trans_brelse(tp, agbp);

	/*
	 * The log is permanently allocated, so the space it occupies will
	 * never be available for the kinds of things that would require btree
	 * expansion. We therefore can pretend the space isn't there.
	 */
	if (xfs_ag_contains_log(mp, pag_agno(pag)))
		agblocks -= mp->m_sb.sb_logblocks;

	/* Reserve 1% of the AG or enough for 1 block per record. */
	*ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
	*used += tree_len;

	return error;
}

int __init
xfs_rmapbt_init_cur_cache(void)
{
	xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur",
			xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()),
			0, 0, NULL);

	if (!xfs_rmapbt_cur_cache)
		return -ENOMEM;
	return 0;
}

void
xfs_rmapbt_destroy_cur_cache(void)
{
	kmem_cache_destroy(xfs_rmapbt_cur_cache);
	xfs_rmapbt_cur_cache = NULL;
}