// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_defer.h"
#include "xfs_metafile.h"
#include "xfs_quota.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/newbt.h"
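
/*
 * Staging space for a new btree.  The expected calling sequence, as
 * suggested by the helpers below, is roughly:
 *
 *	xrep_newbt_init_ag() (or _init_inode, _init_metadir_inode, _init_bare)
 *	xrep_newbt_alloc_blocks()
 *	bulk load, claiming each block via xrep_newbt_claim_block()
 *	xrep_newbt_commit() on success, or xrep_newbt_cancel() on failure
 */
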
/*
 * Estimate proper slack values for a btree that's being reloaded.
 *
 * Under most circumstances, we'll take whatever default loading value the
 * btree bulk loading code calculates for us.  However, there are some
 * exceptions to this rule:
 *
 * (0) If someone turned one of the debug knobs.
 * (1) If this is a per-AG btree and the AG has less than 10% space free.
 * (2) If this is an inode btree and the FS has less than 10% space free.
 *
 * In cases (1) and (2), format the new btree blocks almost completely full
 * to minimize space usage.
 */
static void
xrep_newbt_estimate_slack(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_btree_bload	*bload = &xnr->bload;
	uint64_t		free;
	uint64_t		sz;

	/*
	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
	 * unless someone has set them otherwise, so we just pull the values
	 * here.
	 */
	bload->leaf_slack = xfs_globals.bload_leaf_slack;
	bload->node_slack = xfs_globals.bload_node_slack;

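	/*
	 * Compute the free space and size of the allocation domain: the AG
	 * itself for a per-AG btree, or the entire filesystem otherwise.
	 */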
	if (sc->ops->type == ST_PERAG) {
		free = sc->sa.pag->pagf_freeblks;
		sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
	} else {
		free = percpu_counter_sum(&sc->mp->m_fdblocks);
		sz = sc->mp->m_sb.sb_dblocks;
	}

	/* No further changes if there's more than 10% free space left. */
	if (free >= div_u64(sz, 10))
		return;

	/*
	 * We're low on space; load the btrees as tightly as possible.  Leave
	 * a couple of open slots in each btree block so that we don't end up
	 * splitting the btrees like crazy after a mount.
	 */
	if (bload->leaf_slack < 0)
		bload->leaf_slack = 2;
	if (bload->node_slack < 0)
		bload->node_slack = 2;
}

/* Initialize accounting resources for staging a new AG btree. */
void
xrep_newbt_init_ag(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*oinfo,
	xfs_fsblock_t			alloc_hint,
	enum xfs_ag_resv_type		resv)
{
	memset(xnr, 0, sizeof(struct xrep_newbt));
	xnr->sc = sc;
	xnr->oinfo = *oinfo; /* structure copy */
	xnr->alloc_hint = alloc_hint;
	xnr->resv = resv;
	INIT_LIST_HEAD(&xnr->resv_list);
	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
	xrep_newbt_estimate_slack(xnr);
}

/* Initialize accounting resources for staging a new inode fork btree. */
int
xrep_newbt_init_inode(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	int				whichfork,
	const struct xfs_owner_info	*oinfo)
{
	struct xfs_ifork	*ifp;

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
		return -ENOMEM;

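	/*
	 * Aim the allocation hint at the inode's home AG so that the new
	 * btree blocks land near the file that owns them.
	 */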
	xrep_newbt_init_ag(xnr, sc, oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
	return 0;
}

/*
 * Initialize accounting resources for staging a new metadata inode btree.
 * If the metadata file has a space reservation, the caller must adjust that
 * reservation when committing the new ondisk btree.
 */
int
xrep_newbt_init_metadir_inode(
	struct xrep_newbt	*xnr,
	struct xfs_scrub	*sc)
{
	struct xfs_owner_info	oinfo;
	struct xfs_ifork	*ifp;

	ASSERT(xfs_is_metadir_inode(sc->ip));

	xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
		return -ENOMEM;

	/*
	 * Allocate new metadir btree blocks with XFS_AG_RESV_NONE because the
	 * inode metadata space reservations can only account allocated space
	 * to i_nblocks.  We do not want to change the inode core fields until
	 * we're ready to commit the new tree, so we allocate the blocks as if
	 * they were regular file blocks.  This exposes us to a higher risk of
	 * the repair being cancelled due to ENOSPC.
	 */
	xrep_newbt_init_ag(xnr, sc, &oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, XFS_DATA_FORK);
	return 0;
}

/*
 * Initialize accounting resources for staging a new btree.  Callers are
 * expected to add their own reservations (and clean them up) manually.
 */
void
xrep_newbt_init_bare(
	struct xrep_newbt	*xnr,
	struct xfs_scrub	*sc)
{
	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
			XFS_AG_RESV_NONE);
}

/*
 * Designate specific blocks to be used to build our new btree.  @pag must be
 * a passive reference.
 */
STATIC int
xrep_newbt_add_blocks(
	struct xrep_newbt		*xnr,
	struct xfs_perag		*pag,
	const struct xfs_alloc_arg	*args)
{
	struct xfs_mount		*mp = xnr->sc->mp;
	struct xrep_newbt_resv		*resv;
	int				error;

	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
	if (!resv)
		return -ENOMEM;

	INIT_LIST_HEAD(&resv->list);
	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
	resv->len = args->len;
	resv->used = 0;
	resv->pag = xfs_perag_hold(pag);

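	/*
	 * If the caller gave us a transaction, schedule automatic reaping so
	 * that the new blocks are freed if the repair doesn't commit them.
	 */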
	if (args->tp) {
		ASSERT(xnr->oinfo.oi_offset == 0);

		error = xfs_alloc_schedule_autoreap(args,
				XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap);
		if (error)
			goto out_pag;
	}

	list_add_tail(&resv->list, &xnr->resv_list);
	return 0;
out_pag:
	xfs_perag_put(resv->pag);
	kfree(resv);
	return error;
}

/*
 * Add an extent to the new btree reservation pool.  Callers are required to
 * reap this reservation manually if the repair is cancelled.  @pag must be a
 * passive reference.
 */
int
xrep_newbt_add_extent(
	struct xrep_newbt	*xnr,
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_alloc_arg	args = {
		.tp		= NULL, /* no autoreap */
		.oinfo		= xnr->oinfo,
		.fsbno		= xfs_agbno_to_fsb(pag, agbno),
		.len		= len,
		.resv		= xnr->resv,
	};

	return xrep_newbt_add_blocks(xnr, pag, &args);
}

/* Don't let our allocation hint take us beyond this AG */
static inline void
xrep_newbt_validate_ag_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);

	if (agno == pag_agno(sc->sa.pag) &&
	    xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

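	/* Otherwise, start at the first block past the static AG headers. */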
	xnr->alloc_hint =
		xfs_agbno_to_fsb(sc->sa.pag, XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for a new per-AG btree. */
STATIC int
xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(sc->sa.pag != NULL);
	ASSERT(xnr->resv != XFS_AG_RESV_METAFILE);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		xfs_agnumber_t		agno;

		xrep_newbt_validate_ag_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_near_bno(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
		if (agno != pag_agno(sc->sa.pag)) {
			ASSERT(agno == pag_agno(sc->sa.pag));
			return -EFSCORRUPTED;
		}

		trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

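		/* Finish the allocation's deferred work before looping. */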
		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Don't let our allocation hint take us beyond EOFS */
static inline void
xrep_newbt_validate_file_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;

	if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for our new file-based btree. */
STATIC int
xrep_newbt_alloc_file_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(xnr->resv != XFS_AG_RESV_METAFILE);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		struct xfs_perag	*pag;
		xfs_agnumber_t		agno;

		xrep_newbt_validate_file_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_start_ag(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

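		/* The perag for an AG that just gave us blocks must exist. */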
		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		trace_xrep_newbt_alloc_file_blocks(pag,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		error = xrep_newbt_add_blocks(xnr, pag, &args);
		xfs_perag_put(pag);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Allocate disk space for our new btree. */
int
xrep_newbt_alloc_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	if (xnr->sc->ip)
		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
	return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
}

/*
 * Free the unused part of a space extent that was reserved for a new ondisk
 * structure.  Returns the number of EFIs logged or a negative errno.
 */
STATIC int
xrep_newbt_free_extent(
	struct xrep_newbt	*xnr,
	struct xrep_newbt_resv	*resv,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agblock_t		free_agbno = resv->agbno;
	xfs_extlen_t		free_aglen = resv->len;
	int			error;

	if (!btree_committed || resv->used == 0) {
		/*
		 * If we're not committing a new btree or we didn't use the
		 * space reservation, let the existing EFI free the entire
		 * space extent.
		 */
		trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
				xnr->oinfo.oi_owner);
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		return 1;
	}

	/*
	 * We used space and committed the btree.  Cancel the autoreap, remove
	 * the written blocks from the reservation, and possibly log a new EFI
	 * to free any unused reservation space.
	 */
	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
	free_agbno += resv->used;
	free_aglen -= resv->used;

	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
			xnr->oinfo.oi_owner);

	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
	ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);

	/*
	 * Use EFIs to free the reservations.  This reduces the chance
	 * that we leak blocks if the system goes down.
	 */
	error = xfs_free_extent_later(sc->tp,
			xfs_agbno_to_fsb(resv->pag, free_agbno), free_aglen,
			&xnr->oinfo, xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
	if (error)
		return error;

	return 1;
}

/* Free all the accounting info and disk space we reserved for a new btree. */
STATIC int
xrep_newbt_free(
	struct xrep_newbt	*xnr,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	unsigned int		freed = 0;
	int			error = 0;

	/*
	 * If the filesystem already went down, we can't free the blocks.  Skip
	 * ahead to freeing the incore metadata because we can't fix anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		int ret;

		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
		if (ret < 0) {
			error = ret;
			goto junkit;
		}

		freed += ret;
		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
			error = xrep_defer_finish(sc);
			if (error)
				goto junkit;
			freed = 0;
		}
	}

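	/* Finish any deferred frees that didn't reach the batch limit above. */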
	if (freed)
		error = xrep_defer_finish(sc);

junkit:
	/*
	 * If we still have reservations attached to @newbt, cleanup must have
	 * failed and the filesystem is about to go down.  Clean up the incore
	 * reservations and try to commit to freeing the space we used.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
	}

	if (sc->ip) {
		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}

	return error;
}

/*
 * Free all the accounting info and unused disk space allocations after
 * committing a new btree.
 */
int
xrep_newbt_commit(
	struct xrep_newbt	*xnr)
{
	return xrep_newbt_free(xnr, true);
}

/*
 * Free all the accounting info and all of the disk space we reserved for a
 * new btree that we're not going to commit.  We want to roll things back
 * cleanly if the repair fails partway through, e.g. due to ENOSPC during
 * allocation.
 */
void
xrep_newbt_cancel(
	struct xrep_newbt	*xnr)
{
	xrep_newbt_free(xnr, false);
}

/* Feed one of the reserved btree blocks to the bulk loader. */
int
xrep_newbt_claim_block(
	struct xfs_btree_cur	*cur,
	struct xrep_newbt	*xnr,
	union xfs_btree_ptr	*ptr)
{
	struct xrep_newbt_resv	*resv;
	xfs_agblock_t		agbno;

	/*
	 * The first item in the list should always have a free block unless
	 * we're completely out.
	 */
	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
	if (resv->used == resv->len)
		return -ENOSPC;

	/*
	 * Peel off a block from the start of the reservation.  We allocate
	 * blocks in order to place blocks on disk in increasing record or key
	 * order.  The block reservations tend to end up on the list in
	 * decreasing order, which hopefully results in leaf blocks ending up
	 * together.
	 */
	agbno = resv->agbno + resv->used;
	resv->used++;

	/* If we used all the blocks in this reservation, move it to the end. */
	if (resv->used == resv->len)
		list_move_tail(&resv->list, &xnr->resv_list);

	trace_xrep_newbt_claim_block(resv->pag, agbno, 1, xnr->oinfo.oi_owner);

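	/*
	 * Long-pointer btrees (rooted in a file) take the 64-bit fsblock of
	 * the new block; short-pointer (per-AG) btrees take the 32-bit agbno.
	 */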
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		ptr->l = cpu_to_be64(xfs_agbno_to_fsb(resv->pag, agbno));
	else
		ptr->s = cpu_to_be32(agbno);

	/* Relog all the EFIs. */
	return xrep_defer_finish(xnr->sc);
}

/* How many reserved blocks are unused? */
unsigned int
xrep_newbt_unused_blocks(
	struct xrep_newbt	*xnr)
{
	struct xrep_newbt_resv	*resv;
	unsigned int		unused = 0;

	list_for_each_entry(resv, &xnr->resv_list, list)
		unused += resv->len - resv->used;
	return unused;
}
605