xref: /linux/fs/xfs/scrub/newbt.c (revision 186779c036468038b0d077ec5333a51512f867e5)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_btree.h"
13 #include "xfs_btree_staging.h"
14 #include "xfs_log_format.h"
15 #include "xfs_trans.h"
16 #include "xfs_sb.h"
17 #include "xfs_inode.h"
18 #include "xfs_alloc.h"
19 #include "xfs_rmap.h"
20 #include "xfs_ag.h"
21 #include "xfs_defer.h"
22 #include "xfs_metafile.h"
23 #include "xfs_quota.h"
24 #include "scrub/scrub.h"
25 #include "scrub/common.h"
26 #include "scrub/trace.h"
27 #include "scrub/repair.h"
28 #include "scrub/newbt.h"
29 
30 /*
31  * Estimate proper slack values for a btree that's being reloaded.
32  *
33  * Under most circumstances, we'll take whatever default loading value the
34  * btree bulk loading code calculates for us.  However, there are some
35  * exceptions to this rule:
36  *
37  * (0) If someone turned one of the debug knobs.
38  * (1) If this is a per-AG btree and the AG has less than 10% space free.
39  * (2) If this is an inode btree and the FS has less than 10% space free.
40 
41  * In either case, format the new btree blocks almost completely full to
42  * minimize space usage.
43  */
44 static void
45 xrep_newbt_estimate_slack(
46 	struct xrep_newbt	*xnr)
47 {
48 	struct xfs_scrub	*sc = xnr->sc;
49 	struct xfs_btree_bload	*bload = &xnr->bload;
50 	uint64_t		free;
51 	uint64_t		sz;
52 
53 	/*
54 	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
55 	 * unless someone has set them otherwise, so we just pull the values
56 	 * here.
57 	 */
58 	bload->leaf_slack = xfs_globals.bload_leaf_slack;
59 	bload->node_slack = xfs_globals.bload_node_slack;
60 
61 	if (sc->ops->type == ST_PERAG) {
62 		free = sc->sa.pag->pagf_freeblks;
63 		sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
64 	} else {
65 		free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS);
66 		sz = sc->mp->m_sb.sb_dblocks;
67 	}
68 
69 	/* No further changes if there's more than 10% free space left. */
70 	if (free >= div_u64(sz, 10))
71 		return;
72 
73 	/*
74 	 * We're low on space; load the btrees as tightly as possible.  Leave
75 	 * a couple of open slots in each btree block so that we don't end up
76 	 * splitting the btrees like crazy after a mount.
77 	 */
78 	if (bload->leaf_slack < 0)
79 		bload->leaf_slack = 2;
80 	if (bload->node_slack < 0)
81 		bload->node_slack = 2;
82 }
83 
84 /* Initialize accounting resources for staging a new AG btree. */
85 void
86 xrep_newbt_init_ag(
87 	struct xrep_newbt		*xnr,
88 	struct xfs_scrub		*sc,
89 	const struct xfs_owner_info	*oinfo,
90 	xfs_fsblock_t			alloc_hint,
91 	enum xfs_ag_resv_type		resv)
92 {
93 	memset(xnr, 0, sizeof(struct xrep_newbt));
94 	xnr->sc = sc;
95 	xnr->oinfo = *oinfo; /* structure copy */
96 	xnr->alloc_hint = alloc_hint;
97 	xnr->resv = resv;
98 	INIT_LIST_HEAD(&xnr->resv_list);
99 	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
100 	xrep_newbt_estimate_slack(xnr);
101 }
102 
103 /* Initialize accounting resources for staging a new inode fork btree. */
104 int
105 xrep_newbt_init_inode(
106 	struct xrep_newbt		*xnr,
107 	struct xfs_scrub		*sc,
108 	int				whichfork,
109 	const struct xfs_owner_info	*oinfo)
110 {
111 	struct xfs_ifork		*ifp;
112 
113 	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
114 	if (!ifp)
115 		return -ENOMEM;
116 
117 	xrep_newbt_init_ag(xnr, sc, oinfo,
118 			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
119 			XFS_AG_RESV_NONE);
120 	xnr->ifake.if_fork = ifp;
121 	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
122 	return 0;
123 }
124 
125 /*
126  * Initialize accounting resources for staging a new metadata inode btree.
127  * If the metadata file has a space reservation, the caller must adjust that
128  * reservation when committing the new ondisk btree.
129  */
130 int
131 xrep_newbt_init_metadir_inode(
132 	struct xrep_newbt		*xnr,
133 	struct xfs_scrub		*sc)
134 {
135 	struct xfs_owner_info		oinfo;
136 	struct xfs_ifork		*ifp;
137 
138 	ASSERT(xfs_is_metadir_inode(sc->ip));
139 
140 	xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
141 
142 	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
143 	if (!ifp)
144 		return -ENOMEM;
145 
146 	/*
147 	 * Allocate new metadir btree blocks with XFS_AG_RESV_NONE because the
148 	 * inode metadata space reservations can only account allocated space
149 	 * to the i_nblocks.  We do not want to change the inode core fields
150 	 * until we're ready to commit the new tree, so we allocate the blocks
151 	 * as if they were regular file blocks.  This exposes us to a higher
152 	 * risk of the repair being cancelled due to ENOSPC.
153 	 */
154 	xrep_newbt_init_ag(xnr, sc, &oinfo,
155 			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
156 			XFS_AG_RESV_NONE);
157 	xnr->ifake.if_fork = ifp;
158 	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, XFS_DATA_FORK);
159 	return 0;
160 }
161 
162 /*
163  * Initialize accounting resources for staging a new btree.  Callers are
164  * expected to add their own reservations (and clean them up) manually.
165  */
166 void
167 xrep_newbt_init_bare(
168 	struct xrep_newbt		*xnr,
169 	struct xfs_scrub		*sc)
170 {
171 	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
172 			XFS_AG_RESV_NONE);
173 }
174 
175 /*
176  * Designate specific blocks to be used to build our new btree.  @pag must be
177  * a passive reference.
178  */
179 STATIC int
180 xrep_newbt_add_blocks(
181 	struct xrep_newbt		*xnr,
182 	struct xfs_perag		*pag,
183 	const struct xfs_alloc_arg	*args)
184 {
185 	struct xfs_mount		*mp = xnr->sc->mp;
186 	struct xrep_newbt_resv		*resv;
187 	int				error;
188 
189 	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
190 	if (!resv)
191 		return -ENOMEM;
192 
193 	INIT_LIST_HEAD(&resv->list);
194 	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
195 	resv->len = args->len;
196 	resv->used = 0;
197 	resv->pag = xfs_perag_hold(pag);
198 
199 	if (args->tp) {
200 		ASSERT(xnr->oinfo.oi_offset == 0);
201 
202 		error = xfs_alloc_schedule_autoreap(args,
203 				XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap);
204 		if (error)
205 			goto out_pag;
206 	}
207 
208 	list_add_tail(&resv->list, &xnr->resv_list);
209 	return 0;
210 out_pag:
211 	xfs_perag_put(resv->pag);
212 	kfree(resv);
213 	return error;
214 }
215 
216 /*
217  * Add an extent to the new btree reservation pool.  Callers are required to
218  * reap this reservation manually if the repair is cancelled.  @pag must be a
219  * passive reference.
220  */
221 int
222 xrep_newbt_add_extent(
223 	struct xrep_newbt	*xnr,
224 	struct xfs_perag	*pag,
225 	xfs_agblock_t		agbno,
226 	xfs_extlen_t		len)
227 {
228 	struct xfs_alloc_arg	args = {
229 		.tp		= NULL, /* no autoreap */
230 		.oinfo		= xnr->oinfo,
231 		.fsbno		= xfs_agbno_to_fsb(pag, agbno),
232 		.len		= len,
233 		.resv		= xnr->resv,
234 	};
235 
236 	return xrep_newbt_add_blocks(xnr, pag, &args);
237 }
238 
239 /* Don't let our allocation hint take us beyond this AG */
240 static inline void
241 xrep_newbt_validate_ag_alloc_hint(
242 	struct xrep_newbt	*xnr)
243 {
244 	struct xfs_scrub	*sc = xnr->sc;
245 	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);
246 
247 	if (agno == pag_agno(sc->sa.pag) &&
248 	    xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
249 		return;
250 
251 	xnr->alloc_hint =
252 		xfs_agbno_to_fsb(sc->sa.pag, XFS_AGFL_BLOCK(sc->mp) + 1);
253 }
254 
255 /* Allocate disk space for a new per-AG btree. */
256 STATIC int
257 xrep_newbt_alloc_ag_blocks(
258 	struct xrep_newbt	*xnr,
259 	uint64_t		nr_blocks)
260 {
261 	struct xfs_scrub	*sc = xnr->sc;
262 	struct xfs_mount	*mp = sc->mp;
263 	int			error = 0;
264 
265 	ASSERT(sc->sa.pag != NULL);
266 	ASSERT(xnr->resv != XFS_AG_RESV_METAFILE);
267 
268 	while (nr_blocks > 0) {
269 		struct xfs_alloc_arg	args = {
270 			.tp		= sc->tp,
271 			.mp		= mp,
272 			.oinfo		= xnr->oinfo,
273 			.minlen		= 1,
274 			.maxlen		= nr_blocks,
275 			.prod		= 1,
276 			.resv		= xnr->resv,
277 		};
278 		xfs_agnumber_t		agno;
279 
280 		xrep_newbt_validate_ag_alloc_hint(xnr);
281 
282 		if (xnr->alloc_vextent)
283 			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
284 		else
285 			error = xfs_alloc_vextent_near_bno(&args,
286 					xnr->alloc_hint);
287 		if (error)
288 			return error;
289 		if (args.fsbno == NULLFSBLOCK)
290 			return -ENOSPC;
291 
292 		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
293 		if (agno != pag_agno(sc->sa.pag)) {
294 			ASSERT(agno == pag_agno(sc->sa.pag));
295 			return -EFSCORRUPTED;
296 		}
297 
298 		trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag,
299 				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
300 				xnr->oinfo.oi_owner);
301 
302 		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
303 		if (error)
304 			return error;
305 
306 		nr_blocks -= args.len;
307 		xnr->alloc_hint = args.fsbno + args.len;
308 
309 		error = xrep_defer_finish(sc);
310 		if (error)
311 			return error;
312 	}
313 
314 	return 0;
315 }
316 
317 /* Don't let our allocation hint take us beyond EOFS */
318 static inline void
319 xrep_newbt_validate_file_alloc_hint(
320 	struct xrep_newbt	*xnr)
321 {
322 	struct xfs_scrub	*sc = xnr->sc;
323 
324 	if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
325 		return;
326 
327 	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
328 }
329 
330 /* Allocate disk space for our new file-based btree. */
331 STATIC int
332 xrep_newbt_alloc_file_blocks(
333 	struct xrep_newbt	*xnr,
334 	uint64_t		nr_blocks)
335 {
336 	struct xfs_scrub	*sc = xnr->sc;
337 	struct xfs_mount	*mp = sc->mp;
338 	int			error = 0;
339 
340 	ASSERT(xnr->resv != XFS_AG_RESV_METAFILE);
341 
342 	while (nr_blocks > 0) {
343 		struct xfs_alloc_arg	args = {
344 			.tp		= sc->tp,
345 			.mp		= mp,
346 			.oinfo		= xnr->oinfo,
347 			.minlen		= 1,
348 			.maxlen		= nr_blocks,
349 			.prod		= 1,
350 			.resv		= xnr->resv,
351 		};
352 		struct xfs_perag	*pag;
353 		xfs_agnumber_t		agno;
354 
355 		xrep_newbt_validate_file_alloc_hint(xnr);
356 
357 		if (xnr->alloc_vextent)
358 			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
359 		else
360 			error = xfs_alloc_vextent_start_ag(&args,
361 					xnr->alloc_hint);
362 		if (error)
363 			return error;
364 		if (args.fsbno == NULLFSBLOCK)
365 			return -ENOSPC;
366 
367 		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
368 
369 		pag = xfs_perag_get(mp, agno);
370 		if (!pag) {
371 			ASSERT(0);
372 			return -EFSCORRUPTED;
373 		}
374 
375 		trace_xrep_newbt_alloc_file_blocks(pag,
376 				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
377 				xnr->oinfo.oi_owner);
378 
379 		error = xrep_newbt_add_blocks(xnr, pag, &args);
380 		xfs_perag_put(pag);
381 		if (error)
382 			return error;
383 
384 		nr_blocks -= args.len;
385 		xnr->alloc_hint = args.fsbno + args.len;
386 
387 		error = xrep_defer_finish(sc);
388 		if (error)
389 			return error;
390 	}
391 
392 	return 0;
393 }
394 
395 /* Allocate disk space for our new btree. */
396 int
397 xrep_newbt_alloc_blocks(
398 	struct xrep_newbt	*xnr,
399 	uint64_t		nr_blocks)
400 {
401 	if (xnr->sc->ip)
402 		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
403 	return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
404 }
405 
406 /*
407  * Free the unused part of a space extent that was reserved for a new ondisk
408  * structure.  Returns the number of EFIs logged or a negative errno.
409  */
410 STATIC int
411 xrep_newbt_free_extent(
412 	struct xrep_newbt	*xnr,
413 	struct xrep_newbt_resv	*resv,
414 	bool			btree_committed)
415 {
416 	struct xfs_scrub	*sc = xnr->sc;
417 	xfs_agblock_t		free_agbno = resv->agbno;
418 	xfs_extlen_t		free_aglen = resv->len;
419 	int			error;
420 
421 	if (!btree_committed || resv->used == 0) {
422 		/*
423 		 * If we're not committing a new btree or we didn't use the
424 		 * space reservation, let the existing EFI free the entire
425 		 * space extent.
426 		 */
427 		trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
428 				xnr->oinfo.oi_owner);
429 		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
430 		return 1;
431 	}
432 
433 	/*
434 	 * We used space and committed the btree.  Cancel the autoreap, remove
435 	 * the written blocks from the reservation, and possibly log a new EFI
436 	 * to free any unused reservation space.
437 	 */
438 	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
439 	free_agbno += resv->used;
440 	free_aglen -= resv->used;
441 
442 	if (free_aglen == 0)
443 		return 0;
444 
445 	trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
446 			xnr->oinfo.oi_owner);
447 
448 	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
449 	ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);
450 
451 	/*
452 	 * Use EFIs to free the reservations.  This reduces the chance
453 	 * that we leak blocks if the system goes down.
454 	 */
455 	error = xfs_free_extent_later(sc->tp,
456 			xfs_agbno_to_fsb(resv->pag, free_agbno), free_aglen,
457 			&xnr->oinfo, xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
458 	if (error)
459 		return error;
460 
461 	return 1;
462 }
463 
464 /* Free all the accounting info and disk space we reserved for a new btree. */
465 STATIC int
466 xrep_newbt_free(
467 	struct xrep_newbt	*xnr,
468 	bool			btree_committed)
469 {
470 	struct xfs_scrub	*sc = xnr->sc;
471 	struct xrep_newbt_resv	*resv, *n;
472 	unsigned int		freed = 0;
473 	int			error = 0;
474 
475 	/*
476 	 * If the filesystem already went down, we can't free the blocks.  Skip
477 	 * ahead to freeing the incore metadata because we can't fix anything.
478 	 */
479 	if (xfs_is_shutdown(sc->mp))
480 		goto junkit;
481 
482 	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
483 		int		ret;
484 
485 		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
486 		list_del(&resv->list);
487 		xfs_perag_put(resv->pag);
488 		kfree(resv);
489 		if (ret < 0) {
490 			error = ret;
491 			goto junkit;
492 		}
493 
494 		freed += ret;
495 		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
496 			error = xrep_defer_finish(sc);
497 			if (error)
498 				goto junkit;
499 			freed = 0;
500 		}
501 	}
502 
503 	if (freed)
504 		error = xrep_defer_finish(sc);
505 
506 junkit:
507 	/*
508 	 * If we still have reservations attached to @newbt, cleanup must have
509 	 * failed and the filesystem is about to go down.  Clean up the incore
510 	 * reservations and try to commit to freeing the space we used.
511 	 */
512 	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
513 		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
514 		list_del(&resv->list);
515 		xfs_perag_put(resv->pag);
516 		kfree(resv);
517 	}
518 
519 	if (sc->ip) {
520 		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
521 		xnr->ifake.if_fork = NULL;
522 	}
523 
524 	return error;
525 }
526 
527 /*
528  * Free all the accounting info and unused disk space allocations after
529  * committing a new btree.
530  */
531 int
532 xrep_newbt_commit(
533 	struct xrep_newbt	*xnr)
534 {
535 	return xrep_newbt_free(xnr, true);
536 }
537 
538 /*
539  * Free all the accounting info and all of the disk space we reserved for a new
540  * btree that we're not going to commit.  We want to try to roll things back
541  * cleanly for things like ENOSPC midway through allocation.
542  */
543 void
544 xrep_newbt_cancel(
545 	struct xrep_newbt	*xnr)
546 {
547 	xrep_newbt_free(xnr, false);
548 }
549 
550 /* Feed one of the reserved btree blocks to the bulk loader. */
551 int
552 xrep_newbt_claim_block(
553 	struct xfs_btree_cur	*cur,
554 	struct xrep_newbt	*xnr,
555 	union xfs_btree_ptr	*ptr)
556 {
557 	struct xrep_newbt_resv	*resv;
558 	xfs_agblock_t		agbno;
559 
560 	/*
561 	 * The first item in the list should always have a free block unless
562 	 * we're completely out.
563 	 */
564 	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
565 	if (resv->used == resv->len)
566 		return -ENOSPC;
567 
568 	/*
569 	 * Peel off a block from the start of the reservation.  We allocate
570 	 * blocks in order to place blocks on disk in increasing record or key
571 	 * order.  The block reservations tend to end up on the list in
572 	 * decreasing order, which hopefully results in leaf blocks ending up
573 	 * together.
574 	 */
575 	agbno = resv->agbno + resv->used;
576 	resv->used++;
577 
578 	/* If we used all the blocks in this reservation, move it to the end. */
579 	if (resv->used == resv->len)
580 		list_move_tail(&resv->list, &xnr->resv_list);
581 
582 	trace_xrep_newbt_claim_block(resv->pag, agbno, 1, xnr->oinfo.oi_owner);
583 
584 	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
585 		ptr->l = cpu_to_be64(xfs_agbno_to_fsb(resv->pag, agbno));
586 	else
587 		ptr->s = cpu_to_be32(agbno);
588 
589 	/* Relog all the EFIs. */
590 	return xrep_defer_finish(xnr->sc);
591 }
592 
593 /* How many reserved blocks are unused? */
594 unsigned int
595 xrep_newbt_unused_blocks(
596 	struct xrep_newbt	*xnr)
597 {
598 	struct xrep_newbt_resv	*resv;
599 	unsigned int		unused = 0;
600 
601 	list_for_each_entry(resv, &xnr->resv_list, list)
602 		unused += resv->len - resv->used;
603 	return unused;
604 }
605