1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_btree.h"
13 #include "xfs_btree_staging.h"
14 #include "xfs_log_format.h"
15 #include "xfs_trans.h"
16 #include "xfs_sb.h"
17 #include "xfs_inode.h"
18 #include "xfs_alloc.h"
19 #include "xfs_rmap.h"
20 #include "xfs_ag.h"
21 #include "xfs_defer.h"
22 #include "xfs_metafile.h"
23 #include "xfs_quota.h"
24 #include "scrub/scrub.h"
25 #include "scrub/common.h"
26 #include "scrub/trace.h"
27 #include "scrub/repair.h"
28 #include "scrub/newbt.h"
29
/*
 * This is the maximum number of deferred extent freeing item extents (EFIs)
 * that we'll attach to a transaction without rolling the transaction to avoid
 * overrunning a tr_itruncate reservation.  The newbt code should reserve
 * exactly the correct number of blocks to rebuild the btree, so there should
 * not be any excess blocks to free when committing a new btree.
 *
 * xrep_newbt_free() counts the EFIs it has queued and finishes the deferred
 * work each time that count reaches this limit.
 */
#define XREP_MAX_ITRUNCATE_EFIS	(128)
38
39 /*
40 * Estimate proper slack values for a btree that's being reloaded.
41 *
42 * Under most circumstances, we'll take whatever default loading value the
43 * btree bulk loading code calculates for us. However, there are some
44 * exceptions to this rule:
45 *
46 * (0) If someone turned one of the debug knobs.
47 * (1) If this is a per-AG btree and the AG has less than 10% space free.
48 * (2) If this is an inode btree and the FS has less than 10% space free.
49
50 * In either case, format the new btree blocks almost completely full to
51 * minimize space usage.
52 */
53 static void
xrep_newbt_estimate_slack(struct xrep_newbt * xnr)54 xrep_newbt_estimate_slack(
55 struct xrep_newbt *xnr)
56 {
57 struct xfs_scrub *sc = xnr->sc;
58 struct xfs_btree_bload *bload = &xnr->bload;
59 uint64_t free;
60 uint64_t sz;
61
62 /*
63 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
64 * unless someone has set them otherwise, so we just pull the values
65 * here.
66 */
67 bload->leaf_slack = xfs_globals.bload_leaf_slack;
68 bload->node_slack = xfs_globals.bload_node_slack;
69
70 if (sc->ops->type == ST_PERAG) {
71 free = sc->sa.pag->pagf_freeblks;
72 sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
73 } else {
74 free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS);
75 sz = sc->mp->m_sb.sb_dblocks;
76 }
77
78 /* No further changes if there's more than 10% free space left. */
79 if (free >= div_u64(sz, 10))
80 return;
81
82 /*
83 * We're low on space; load the btrees as tightly as possible. Leave
84 * a couple of open slots in each btree block so that we don't end up
85 * splitting the btrees like crazy after a mount.
86 */
87 if (bload->leaf_slack < 0)
88 bload->leaf_slack = 2;
89 if (bload->node_slack < 0)
90 bload->node_slack = 2;
91 }
92
93 /* Initialize accounting resources for staging a new AG btree. */
94 void
xrep_newbt_init_ag(struct xrep_newbt * xnr,struct xfs_scrub * sc,const struct xfs_owner_info * oinfo,xfs_fsblock_t alloc_hint,enum xfs_ag_resv_type resv)95 xrep_newbt_init_ag(
96 struct xrep_newbt *xnr,
97 struct xfs_scrub *sc,
98 const struct xfs_owner_info *oinfo,
99 xfs_fsblock_t alloc_hint,
100 enum xfs_ag_resv_type resv)
101 {
102 memset(xnr, 0, sizeof(struct xrep_newbt));
103 xnr->sc = sc;
104 xnr->oinfo = *oinfo; /* structure copy */
105 xnr->alloc_hint = alloc_hint;
106 xnr->resv = resv;
107 INIT_LIST_HEAD(&xnr->resv_list);
108 xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
109 xrep_newbt_estimate_slack(xnr);
110 }
111
112 /* Initialize accounting resources for staging a new inode fork btree. */
113 int
xrep_newbt_init_inode(struct xrep_newbt * xnr,struct xfs_scrub * sc,int whichfork,const struct xfs_owner_info * oinfo)114 xrep_newbt_init_inode(
115 struct xrep_newbt *xnr,
116 struct xfs_scrub *sc,
117 int whichfork,
118 const struct xfs_owner_info *oinfo)
119 {
120 struct xfs_ifork *ifp;
121
122 ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
123 if (!ifp)
124 return -ENOMEM;
125
126 xrep_newbt_init_ag(xnr, sc, oinfo,
127 XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
128 XFS_AG_RESV_NONE);
129 xnr->ifake.if_fork = ifp;
130 xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
131 return 0;
132 }
133
134 /*
135 * Initialize accounting resources for staging a new metadata inode btree.
136 * If the metadata file has a space reservation, the caller must adjust that
137 * reservation when committing the new ondisk btree.
138 */
139 int
xrep_newbt_init_metadir_inode(struct xrep_newbt * xnr,struct xfs_scrub * sc)140 xrep_newbt_init_metadir_inode(
141 struct xrep_newbt *xnr,
142 struct xfs_scrub *sc)
143 {
144 struct xfs_owner_info oinfo;
145 struct xfs_ifork *ifp;
146
147 ASSERT(xfs_is_metadir_inode(sc->ip));
148
149 xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
150
151 ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
152 if (!ifp)
153 return -ENOMEM;
154
155 /*
156 * Allocate new metadir btree blocks with XFS_AG_RESV_NONE because the
157 * inode metadata space reservations can only account allocated space
158 * to the i_nblocks. We do not want to change the inode core fields
159 * until we're ready to commit the new tree, so we allocate the blocks
160 * as if they were regular file blocks. This exposes us to a higher
161 * risk of the repair being cancelled due to ENOSPC.
162 */
163 xrep_newbt_init_ag(xnr, sc, &oinfo,
164 XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
165 XFS_AG_RESV_NONE);
166 xnr->ifake.if_fork = ifp;
167 xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, XFS_DATA_FORK);
168 return 0;
169 }
170
171 /*
172 * Initialize accounting resources for staging a new btree. Callers are
173 * expected to add their own reservations (and clean them up) manually.
174 */
175 void
xrep_newbt_init_bare(struct xrep_newbt * xnr,struct xfs_scrub * sc)176 xrep_newbt_init_bare(
177 struct xrep_newbt *xnr,
178 struct xfs_scrub *sc)
179 {
180 xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
181 XFS_AG_RESV_NONE);
182 }
183
184 /*
185 * Designate specific blocks to be used to build our new btree. @pag must be
186 * a passive reference.
187 */
188 STATIC int
xrep_newbt_add_blocks(struct xrep_newbt * xnr,struct xfs_perag * pag,const struct xfs_alloc_arg * args)189 xrep_newbt_add_blocks(
190 struct xrep_newbt *xnr,
191 struct xfs_perag *pag,
192 const struct xfs_alloc_arg *args)
193 {
194 struct xfs_mount *mp = xnr->sc->mp;
195 struct xrep_newbt_resv *resv;
196 int error;
197
198 resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
199 if (!resv)
200 return -ENOMEM;
201
202 INIT_LIST_HEAD(&resv->list);
203 resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
204 resv->len = args->len;
205 resv->used = 0;
206 resv->pag = xfs_perag_hold(pag);
207
208 if (args->tp) {
209 ASSERT(xnr->oinfo.oi_offset == 0);
210
211 error = xfs_alloc_schedule_autoreap(args,
212 XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap);
213 if (error)
214 goto out_pag;
215 }
216
217 list_add_tail(&resv->list, &xnr->resv_list);
218 return 0;
219 out_pag:
220 xfs_perag_put(resv->pag);
221 kfree(resv);
222 return error;
223 }
224
225 /*
226 * Add an extent to the new btree reservation pool. Callers are required to
227 * reap this reservation manually if the repair is cancelled. @pag must be a
228 * passive reference.
229 */
230 int
xrep_newbt_add_extent(struct xrep_newbt * xnr,struct xfs_perag * pag,xfs_agblock_t agbno,xfs_extlen_t len)231 xrep_newbt_add_extent(
232 struct xrep_newbt *xnr,
233 struct xfs_perag *pag,
234 xfs_agblock_t agbno,
235 xfs_extlen_t len)
236 {
237 struct xfs_alloc_arg args = {
238 .tp = NULL, /* no autoreap */
239 .oinfo = xnr->oinfo,
240 .fsbno = xfs_agbno_to_fsb(pag, agbno),
241 .len = len,
242 .resv = xnr->resv,
243 };
244
245 return xrep_newbt_add_blocks(xnr, pag, &args);
246 }
247
248 /* Don't let our allocation hint take us beyond this AG */
249 static inline void
xrep_newbt_validate_ag_alloc_hint(struct xrep_newbt * xnr)250 xrep_newbt_validate_ag_alloc_hint(
251 struct xrep_newbt *xnr)
252 {
253 struct xfs_scrub *sc = xnr->sc;
254 xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);
255
256 if (agno == pag_agno(sc->sa.pag) &&
257 xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
258 return;
259
260 xnr->alloc_hint =
261 xfs_agbno_to_fsb(sc->sa.pag, XFS_AGFL_BLOCK(sc->mp) + 1);
262 }
263
264 /* Allocate disk space for a new per-AG btree. */
265 STATIC int
xrep_newbt_alloc_ag_blocks(struct xrep_newbt * xnr,uint64_t nr_blocks)266 xrep_newbt_alloc_ag_blocks(
267 struct xrep_newbt *xnr,
268 uint64_t nr_blocks)
269 {
270 struct xfs_scrub *sc = xnr->sc;
271 struct xfs_mount *mp = sc->mp;
272 int error = 0;
273
274 ASSERT(sc->sa.pag != NULL);
275 ASSERT(xnr->resv != XFS_AG_RESV_METAFILE);
276
277 while (nr_blocks > 0) {
278 struct xfs_alloc_arg args = {
279 .tp = sc->tp,
280 .mp = mp,
281 .oinfo = xnr->oinfo,
282 .minlen = 1,
283 .maxlen = nr_blocks,
284 .prod = 1,
285 .resv = xnr->resv,
286 };
287 xfs_agnumber_t agno;
288
289 xrep_newbt_validate_ag_alloc_hint(xnr);
290
291 if (xnr->alloc_vextent)
292 error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
293 else
294 error = xfs_alloc_vextent_near_bno(&args,
295 xnr->alloc_hint);
296 if (error)
297 return error;
298 if (args.fsbno == NULLFSBLOCK)
299 return -ENOSPC;
300
301 agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
302 if (agno != pag_agno(sc->sa.pag)) {
303 ASSERT(agno == pag_agno(sc->sa.pag));
304 return -EFSCORRUPTED;
305 }
306
307 trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag,
308 XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
309 xnr->oinfo.oi_owner);
310
311 error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
312 if (error)
313 return error;
314
315 nr_blocks -= args.len;
316 xnr->alloc_hint = args.fsbno + args.len;
317
318 error = xrep_defer_finish(sc);
319 if (error)
320 return error;
321 }
322
323 return 0;
324 }
325
326 /* Don't let our allocation hint take us beyond EOFS */
327 static inline void
xrep_newbt_validate_file_alloc_hint(struct xrep_newbt * xnr)328 xrep_newbt_validate_file_alloc_hint(
329 struct xrep_newbt *xnr)
330 {
331 struct xfs_scrub *sc = xnr->sc;
332
333 if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
334 return;
335
336 xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
337 }
338
339 /* Allocate disk space for our new file-based btree. */
340 STATIC int
xrep_newbt_alloc_file_blocks(struct xrep_newbt * xnr,uint64_t nr_blocks)341 xrep_newbt_alloc_file_blocks(
342 struct xrep_newbt *xnr,
343 uint64_t nr_blocks)
344 {
345 struct xfs_scrub *sc = xnr->sc;
346 struct xfs_mount *mp = sc->mp;
347 int error = 0;
348
349 ASSERT(xnr->resv != XFS_AG_RESV_METAFILE);
350
351 while (nr_blocks > 0) {
352 struct xfs_alloc_arg args = {
353 .tp = sc->tp,
354 .mp = mp,
355 .oinfo = xnr->oinfo,
356 .minlen = 1,
357 .maxlen = nr_blocks,
358 .prod = 1,
359 .resv = xnr->resv,
360 };
361 struct xfs_perag *pag;
362 xfs_agnumber_t agno;
363
364 xrep_newbt_validate_file_alloc_hint(xnr);
365
366 if (xnr->alloc_vextent)
367 error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
368 else
369 error = xfs_alloc_vextent_start_ag(&args,
370 xnr->alloc_hint);
371 if (error)
372 return error;
373 if (args.fsbno == NULLFSBLOCK)
374 return -ENOSPC;
375
376 agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
377
378 pag = xfs_perag_get(mp, agno);
379 if (!pag) {
380 ASSERT(0);
381 return -EFSCORRUPTED;
382 }
383
384 trace_xrep_newbt_alloc_file_blocks(pag,
385 XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
386 xnr->oinfo.oi_owner);
387
388 error = xrep_newbt_add_blocks(xnr, pag, &args);
389 xfs_perag_put(pag);
390 if (error)
391 return error;
392
393 nr_blocks -= args.len;
394 xnr->alloc_hint = args.fsbno + args.len;
395
396 error = xrep_defer_finish(sc);
397 if (error)
398 return error;
399 }
400
401 return 0;
402 }
403
404 /* Allocate disk space for our new btree. */
405 int
xrep_newbt_alloc_blocks(struct xrep_newbt * xnr,uint64_t nr_blocks)406 xrep_newbt_alloc_blocks(
407 struct xrep_newbt *xnr,
408 uint64_t nr_blocks)
409 {
410 if (xnr->sc->ip)
411 return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
412 return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
413 }
414
415 /*
416 * Free the unused part of a space extent that was reserved for a new ondisk
417 * structure. Returns the number of EFIs logged or a negative errno.
418 */
419 STATIC int
xrep_newbt_free_extent(struct xrep_newbt * xnr,struct xrep_newbt_resv * resv,bool btree_committed)420 xrep_newbt_free_extent(
421 struct xrep_newbt *xnr,
422 struct xrep_newbt_resv *resv,
423 bool btree_committed)
424 {
425 struct xfs_scrub *sc = xnr->sc;
426 xfs_agblock_t free_agbno = resv->agbno;
427 xfs_extlen_t free_aglen = resv->len;
428 int error;
429
430 if (!btree_committed || resv->used == 0) {
431 /*
432 * If we're not committing a new btree or we didn't use the
433 * space reservation, let the existing EFI free the entire
434 * space extent.
435 */
436 trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
437 xnr->oinfo.oi_owner);
438 xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
439 return 1;
440 }
441
442 /*
443 * We used space and committed the btree. Cancel the autoreap, remove
444 * the written blocks from the reservation, and possibly log a new EFI
445 * to free any unused reservation space.
446 */
447 xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
448 free_agbno += resv->used;
449 free_aglen -= resv->used;
450
451 if (free_aglen == 0)
452 return 0;
453
454 trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
455 xnr->oinfo.oi_owner);
456
457 ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
458 ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);
459
460 /*
461 * Use EFIs to free the reservations. This reduces the chance
462 * that we leak blocks if the system goes down.
463 */
464 error = xfs_free_extent_later(sc->tp,
465 xfs_agbno_to_fsb(resv->pag, free_agbno), free_aglen,
466 &xnr->oinfo, xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
467 if (error)
468 return error;
469
470 return 1;
471 }
472
473 /* Free all the accounting info and disk space we reserved for a new btree. */
474 STATIC int
xrep_newbt_free(struct xrep_newbt * xnr,bool btree_committed)475 xrep_newbt_free(
476 struct xrep_newbt *xnr,
477 bool btree_committed)
478 {
479 struct xfs_scrub *sc = xnr->sc;
480 struct xrep_newbt_resv *resv, *n;
481 unsigned int freed = 0;
482 int error = 0;
483
484 /*
485 * If the filesystem already went down, we can't free the blocks. Skip
486 * ahead to freeing the incore metadata because we can't fix anything.
487 */
488 if (xfs_is_shutdown(sc->mp))
489 goto junkit;
490
491 list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
492 int ret;
493
494 ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
495 list_del(&resv->list);
496 xfs_perag_put(resv->pag);
497 kfree(resv);
498 if (ret < 0) {
499 error = ret;
500 goto junkit;
501 }
502
503 freed += ret;
504 if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
505 error = xrep_defer_finish(sc);
506 if (error)
507 goto junkit;
508 freed = 0;
509 }
510 }
511
512 if (freed)
513 error = xrep_defer_finish(sc);
514
515 junkit:
516 /*
517 * If we still have reservations attached to @newbt, cleanup must have
518 * failed and the filesystem is about to go down. Clean up the incore
519 * reservations and try to commit to freeing the space we used.
520 */
521 list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
522 xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
523 list_del(&resv->list);
524 xfs_perag_put(resv->pag);
525 kfree(resv);
526 }
527
528 if (sc->ip) {
529 kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
530 xnr->ifake.if_fork = NULL;
531 }
532
533 return error;
534 }
535
536 /*
537 * Free all the accounting info and unused disk space allocations after
538 * committing a new btree.
539 */
540 int
xrep_newbt_commit(struct xrep_newbt * xnr)541 xrep_newbt_commit(
542 struct xrep_newbt *xnr)
543 {
544 return xrep_newbt_free(xnr, true);
545 }
546
547 /*
548 * Free all the accounting info and all of the disk space we reserved for a new
549 * btree that we're not going to commit. We want to try to roll things back
550 * cleanly for things like ENOSPC midway through allocation.
551 */
552 void
xrep_newbt_cancel(struct xrep_newbt * xnr)553 xrep_newbt_cancel(
554 struct xrep_newbt *xnr)
555 {
556 xrep_newbt_free(xnr, false);
557 }
558
559 /* Feed one of the reserved btree blocks to the bulk loader. */
560 int
xrep_newbt_claim_block(struct xfs_btree_cur * cur,struct xrep_newbt * xnr,union xfs_btree_ptr * ptr)561 xrep_newbt_claim_block(
562 struct xfs_btree_cur *cur,
563 struct xrep_newbt *xnr,
564 union xfs_btree_ptr *ptr)
565 {
566 struct xrep_newbt_resv *resv;
567 xfs_agblock_t agbno;
568
569 /*
570 * The first item in the list should always have a free block unless
571 * we're completely out.
572 */
573 resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
574 if (resv->used == resv->len)
575 return -ENOSPC;
576
577 /*
578 * Peel off a block from the start of the reservation. We allocate
579 * blocks in order to place blocks on disk in increasing record or key
580 * order. The block reservations tend to end up on the list in
581 * decreasing order, which hopefully results in leaf blocks ending up
582 * together.
583 */
584 agbno = resv->agbno + resv->used;
585 resv->used++;
586
587 /* If we used all the blocks in this reservation, move it to the end. */
588 if (resv->used == resv->len)
589 list_move_tail(&resv->list, &xnr->resv_list);
590
591 trace_xrep_newbt_claim_block(resv->pag, agbno, 1, xnr->oinfo.oi_owner);
592
593 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
594 ptr->l = cpu_to_be64(xfs_agbno_to_fsb(resv->pag, agbno));
595 else
596 ptr->s = cpu_to_be32(agbno);
597
598 /* Relog all the EFIs. */
599 return xrep_defer_finish(xnr->sc);
600 }
601
602 /* How many reserved blocks are unused? */
603 unsigned int
xrep_newbt_unused_blocks(struct xrep_newbt * xnr)604 xrep_newbt_unused_blocks(
605 struct xrep_newbt *xnr)
606 {
607 struct xrep_newbt_resv *resv;
608 unsigned int unused = 0;
609
610 list_for_each_entry(resv, &xnr->resv_list, list)
611 unused += resv->len - resv->used;
612 return unused;
613 }
614