xref: /linux/fs/xfs/scrub/newbt.c (revision ee237a900ceef00c8d1ca17a5cfadbd50fba67e9)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs_platform.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_btree.h"
13 #include "xfs_btree_staging.h"
14 #include "xfs_log_format.h"
15 #include "xfs_trans.h"
16 #include "xfs_sb.h"
17 #include "xfs_inode.h"
18 #include "xfs_alloc.h"
19 #include "xfs_rmap.h"
20 #include "xfs_ag.h"
21 #include "xfs_defer.h"
22 #include "xfs_metafile.h"
23 #include "xfs_quota.h"
24 #include "scrub/scrub.h"
25 #include "scrub/common.h"
26 #include "scrub/trace.h"
27 #include "scrub/repair.h"
28 #include "scrub/newbt.h"
29 
/*
 * Upper bound on the number of deferred extent-freeing items (EFIs) that we
 * are willing to attach to a single transaction before rolling it, so that
 * we do not overrun a tr_itruncate reservation.  The newbt code is supposed
 * to reserve exactly as many blocks as the rebuilt btree needs, so committing
 * a new btree should not normally leave any surplus blocks to free.
 */
#define XREP_MAX_ITRUNCATE_EFIS	128
38 
39 /*
40  * Estimate proper slack values for a btree that's being reloaded.
41  *
42  * Under most circumstances, we'll take whatever default loading value the
43  * btree bulk loading code calculates for us.  However, there are some
44  * exceptions to this rule:
45  *
46  * (0) If someone turned one of the debug knobs.
47  * (1) If this is a per-AG btree and the AG has less than 10% space free.
48  * (2) If this is an inode btree and the FS has less than 10% space free.
49 
50  * In either case, format the new btree blocks almost completely full to
51  * minimize space usage.
52  */
53 static void
54 xrep_newbt_estimate_slack(
55 	struct xrep_newbt	*xnr)
56 {
57 	struct xfs_scrub	*sc = xnr->sc;
58 	struct xfs_btree_bload	*bload = &xnr->bload;
59 	uint64_t		free;
60 	uint64_t		sz;
61 
62 	/*
63 	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
64 	 * unless someone has set them otherwise, so we just pull the values
65 	 * here.
66 	 */
67 	bload->leaf_slack = xfs_globals.bload_leaf_slack;
68 	bload->node_slack = xfs_globals.bload_node_slack;
69 
70 	if (sc->ops->type == ST_PERAG) {
71 		free = sc->sa.pag->pagf_freeblks;
72 		sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
73 	} else {
74 		free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS);
75 		sz = sc->mp->m_sb.sb_dblocks;
76 	}
77 
78 	/* No further changes if there's more than 10% free space left. */
79 	if (free >= div_u64(sz, 10))
80 		return;
81 
82 	/*
83 	 * We're low on space; load the btrees as tightly as possible.  Leave
84 	 * a couple of open slots in each btree block so that we don't end up
85 	 * splitting the btrees like crazy after a mount.
86 	 */
87 	if (bload->leaf_slack < 0)
88 		bload->leaf_slack = 2;
89 	if (bload->node_slack < 0)
90 		bload->node_slack = 2;
91 }
92 
93 /* Initialize accounting resources for staging a new AG btree. */
94 void
95 xrep_newbt_init_ag(
96 	struct xrep_newbt		*xnr,
97 	struct xfs_scrub		*sc,
98 	const struct xfs_owner_info	*oinfo,
99 	xfs_fsblock_t			alloc_hint,
100 	enum xfs_ag_resv_type		resv)
101 {
102 	memset(xnr, 0, sizeof(struct xrep_newbt));
103 	xnr->sc = sc;
104 	xnr->oinfo = *oinfo; /* structure copy */
105 	xnr->alloc_hint = alloc_hint;
106 	xnr->resv = resv;
107 	INIT_LIST_HEAD(&xnr->resv_list);
108 	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
109 	xrep_newbt_estimate_slack(xnr);
110 }
111 
112 /* Initialize accounting resources for staging a new inode fork btree. */
113 int
114 xrep_newbt_init_inode(
115 	struct xrep_newbt		*xnr,
116 	struct xfs_scrub		*sc,
117 	int				whichfork,
118 	const struct xfs_owner_info	*oinfo)
119 {
120 	struct xfs_ifork		*ifp;
121 
122 	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
123 	if (!ifp)
124 		return -ENOMEM;
125 
126 	xrep_newbt_init_ag(xnr, sc, oinfo, XFS_INODE_TO_FSB(sc->ip),
127 			XFS_AG_RESV_NONE);
128 	xnr->ifake.if_fork = ifp;
129 	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
130 	return 0;
131 }
132 
133 /*
134  * Initialize accounting resources for staging a new metadata inode btree.
135  * If the metadata file has a space reservation, the caller must adjust that
136  * reservation when committing the new ondisk btree.
137  */
138 int
139 xrep_newbt_init_metadir_inode(
140 	struct xrep_newbt		*xnr,
141 	struct xfs_scrub		*sc)
142 {
143 	struct xfs_owner_info		oinfo;
144 	struct xfs_ifork		*ifp;
145 
146 	ASSERT(xfs_is_metadir_inode(sc->ip));
147 
148 	xfs_rmap_inode_bmbt_owner(&oinfo, sc->ip, XFS_DATA_FORK);
149 
150 	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
151 	if (!ifp)
152 		return -ENOMEM;
153 
154 	/*
155 	 * Allocate new metadir btree blocks with XFS_AG_RESV_NONE because the
156 	 * inode metadata space reservations can only account allocated space
157 	 * to the i_nblocks.  We do not want to change the inode core fields
158 	 * until we're ready to commit the new tree, so we allocate the blocks
159 	 * as if they were regular file blocks.  This exposes us to a higher
160 	 * risk of the repair being cancelled due to ENOSPC.
161 	 */
162 	xrep_newbt_init_ag(xnr, sc, &oinfo, XFS_INODE_TO_FSB(sc->ip),
163 			XFS_AG_RESV_NONE);
164 	xnr->ifake.if_fork = ifp;
165 	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, XFS_DATA_FORK);
166 	return 0;
167 }
168 
169 /*
170  * Initialize accounting resources for staging a new btree.  Callers are
171  * expected to add their own reservations (and clean them up) manually.
172  */
173 void
174 xrep_newbt_init_bare(
175 	struct xrep_newbt		*xnr,
176 	struct xfs_scrub		*sc)
177 {
178 	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
179 			XFS_AG_RESV_NONE);
180 }
181 
182 /*
183  * Designate specific blocks to be used to build our new btree.  @pag must be
184  * a passive reference.
185  */
186 STATIC int
187 xrep_newbt_add_blocks(
188 	struct xrep_newbt		*xnr,
189 	struct xfs_perag		*pag,
190 	const struct xfs_alloc_arg	*args)
191 {
192 	struct xfs_mount		*mp = xnr->sc->mp;
193 	struct xrep_newbt_resv		*resv;
194 	int				error;
195 
196 	resv = kmalloc_obj(struct xrep_newbt_resv, XCHK_GFP_FLAGS);
197 	if (!resv)
198 		return -ENOMEM;
199 
200 	INIT_LIST_HEAD(&resv->list);
201 	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
202 	resv->len = args->len;
203 	resv->used = 0;
204 	resv->pag = xfs_perag_hold(pag);
205 
206 	if (args->tp) {
207 		ASSERT(xnr->oinfo.oi_offset == 0);
208 
209 		error = xfs_alloc_schedule_autoreap(args,
210 				XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap);
211 		if (error)
212 			goto out_pag;
213 	}
214 
215 	list_add_tail(&resv->list, &xnr->resv_list);
216 	return 0;
217 out_pag:
218 	xfs_perag_put(resv->pag);
219 	kfree(resv);
220 	return error;
221 }
222 
223 /*
224  * Add an extent to the new btree reservation pool.  Callers are required to
225  * reap this reservation manually if the repair is cancelled.  @pag must be a
226  * passive reference.
227  */
228 int
229 xrep_newbt_add_extent(
230 	struct xrep_newbt	*xnr,
231 	struct xfs_perag	*pag,
232 	xfs_agblock_t		agbno,
233 	xfs_extlen_t		len)
234 {
235 	struct xfs_alloc_arg	args = {
236 		.tp		= NULL, /* no autoreap */
237 		.oinfo		= xnr->oinfo,
238 		.fsbno		= xfs_agbno_to_fsb(pag, agbno),
239 		.len		= len,
240 		.resv		= xnr->resv,
241 	};
242 
243 	return xrep_newbt_add_blocks(xnr, pag, &args);
244 }
245 
246 /* Don't let our allocation hint take us beyond this AG */
247 static inline void
248 xrep_newbt_validate_ag_alloc_hint(
249 	struct xrep_newbt	*xnr)
250 {
251 	struct xfs_scrub	*sc = xnr->sc;
252 	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);
253 
254 	if (agno == pag_agno(sc->sa.pag) &&
255 	    xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
256 		return;
257 
258 	xnr->alloc_hint =
259 		xfs_agbno_to_fsb(sc->sa.pag, XFS_AGFL_BLOCK(sc->mp) + 1);
260 }
261 
262 /* Allocate disk space for a new per-AG btree. */
263 STATIC int
264 xrep_newbt_alloc_ag_blocks(
265 	struct xrep_newbt	*xnr,
266 	uint64_t		nr_blocks)
267 {
268 	struct xfs_scrub	*sc = xnr->sc;
269 	struct xfs_mount	*mp = sc->mp;
270 	int			error = 0;
271 
272 	ASSERT(sc->sa.pag != NULL);
273 	ASSERT(xnr->resv != XFS_AG_RESV_METAFILE);
274 
275 	while (nr_blocks > 0) {
276 		struct xfs_alloc_arg	args = {
277 			.tp		= sc->tp,
278 			.mp		= mp,
279 			.oinfo		= xnr->oinfo,
280 			.minlen		= 1,
281 			.maxlen		= nr_blocks,
282 			.prod		= 1,
283 			.resv		= xnr->resv,
284 		};
285 		xfs_agnumber_t		agno;
286 
287 		xrep_newbt_validate_ag_alloc_hint(xnr);
288 
289 		if (xnr->alloc_vextent)
290 			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
291 		else
292 			error = xfs_alloc_vextent_near_bno(&args,
293 					xnr->alloc_hint);
294 		if (error)
295 			return error;
296 		if (args.fsbno == NULLFSBLOCK)
297 			return -ENOSPC;
298 
299 		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
300 		if (agno != pag_agno(sc->sa.pag)) {
301 			ASSERT(agno == pag_agno(sc->sa.pag));
302 			return -EFSCORRUPTED;
303 		}
304 
305 		trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag,
306 				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
307 				xnr->oinfo.oi_owner);
308 
309 		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
310 		if (error)
311 			return error;
312 
313 		nr_blocks -= args.len;
314 		xnr->alloc_hint = args.fsbno + args.len;
315 
316 		error = xrep_defer_finish(sc);
317 		if (error)
318 			return error;
319 	}
320 
321 	return 0;
322 }
323 
324 /* Don't let our allocation hint take us beyond EOFS */
325 static inline void
326 xrep_newbt_validate_file_alloc_hint(
327 	struct xrep_newbt	*xnr)
328 {
329 	struct xfs_scrub	*sc = xnr->sc;
330 
331 	if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
332 		return;
333 
334 	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
335 }
336 
337 /* Allocate disk space for our new file-based btree. */
338 STATIC int
339 xrep_newbt_alloc_file_blocks(
340 	struct xrep_newbt	*xnr,
341 	uint64_t		nr_blocks)
342 {
343 	struct xfs_scrub	*sc = xnr->sc;
344 	struct xfs_mount	*mp = sc->mp;
345 	int			error = 0;
346 
347 	ASSERT(xnr->resv != XFS_AG_RESV_METAFILE);
348 
349 	while (nr_blocks > 0) {
350 		struct xfs_alloc_arg	args = {
351 			.tp		= sc->tp,
352 			.mp		= mp,
353 			.oinfo		= xnr->oinfo,
354 			.minlen		= 1,
355 			.maxlen		= nr_blocks,
356 			.prod		= 1,
357 			.resv		= xnr->resv,
358 		};
359 		struct xfs_perag	*pag;
360 		xfs_agnumber_t		agno;
361 
362 		xrep_newbt_validate_file_alloc_hint(xnr);
363 
364 		if (xnr->alloc_vextent)
365 			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
366 		else
367 			error = xfs_alloc_vextent_start_ag(&args,
368 					xnr->alloc_hint);
369 		if (error)
370 			return error;
371 		if (args.fsbno == NULLFSBLOCK)
372 			return -ENOSPC;
373 
374 		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
375 
376 		pag = xfs_perag_get(mp, agno);
377 		if (!pag) {
378 			ASSERT(0);
379 			return -EFSCORRUPTED;
380 		}
381 
382 		trace_xrep_newbt_alloc_file_blocks(pag,
383 				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
384 				xnr->oinfo.oi_owner);
385 
386 		error = xrep_newbt_add_blocks(xnr, pag, &args);
387 		xfs_perag_put(pag);
388 		if (error)
389 			return error;
390 
391 		nr_blocks -= args.len;
392 		xnr->alloc_hint = args.fsbno + args.len;
393 
394 		error = xrep_defer_finish(sc);
395 		if (error)
396 			return error;
397 	}
398 
399 	return 0;
400 }
401 
402 /* Allocate disk space for our new btree. */
403 int
404 xrep_newbt_alloc_blocks(
405 	struct xrep_newbt	*xnr,
406 	uint64_t		nr_blocks)
407 {
408 	if (xnr->sc->ip)
409 		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
410 	return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
411 }
412 
413 /*
414  * Free the unused part of a space extent that was reserved for a new ondisk
415  * structure.  Returns the number of EFIs logged or a negative errno.
416  */
417 STATIC int
418 xrep_newbt_free_extent(
419 	struct xrep_newbt	*xnr,
420 	struct xrep_newbt_resv	*resv,
421 	bool			btree_committed)
422 {
423 	struct xfs_scrub	*sc = xnr->sc;
424 	xfs_agblock_t		free_agbno = resv->agbno;
425 	xfs_extlen_t		free_aglen = resv->len;
426 	int			error;
427 
428 	if (!btree_committed || resv->used == 0) {
429 		/*
430 		 * If we're not committing a new btree or we didn't use the
431 		 * space reservation, let the existing EFI free the entire
432 		 * space extent.
433 		 */
434 		trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
435 				xnr->oinfo.oi_owner);
436 		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
437 		return 1;
438 	}
439 
440 	/*
441 	 * We used space and committed the btree.  Cancel the autoreap, remove
442 	 * the written blocks from the reservation, and possibly log a new EFI
443 	 * to free any unused reservation space.
444 	 */
445 	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
446 	free_agbno += resv->used;
447 	free_aglen -= resv->used;
448 
449 	if (free_aglen == 0)
450 		return 0;
451 
452 	trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
453 			xnr->oinfo.oi_owner);
454 
455 	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
456 	ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);
457 
458 	/*
459 	 * Use EFIs to free the reservations.  This reduces the chance
460 	 * that we leak blocks if the system goes down.
461 	 */
462 	error = xfs_free_extent_later(sc->tp,
463 			xfs_agbno_to_fsb(resv->pag, free_agbno), free_aglen,
464 			&xnr->oinfo, xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
465 	if (error)
466 		return error;
467 
468 	return 1;
469 }
470 
471 /* Free all the accounting info and disk space we reserved for a new btree. */
472 STATIC int
473 xrep_newbt_free(
474 	struct xrep_newbt	*xnr,
475 	bool			btree_committed)
476 {
477 	struct xfs_scrub	*sc = xnr->sc;
478 	struct xrep_newbt_resv	*resv, *n;
479 	unsigned int		freed = 0;
480 	int			error = 0;
481 
482 	/*
483 	 * If the filesystem already went down, we can't free the blocks.  Skip
484 	 * ahead to freeing the incore metadata because we can't fix anything.
485 	 */
486 	if (xfs_is_shutdown(sc->mp))
487 		goto junkit;
488 
489 	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
490 		int		ret;
491 
492 		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
493 		list_del(&resv->list);
494 		xfs_perag_put(resv->pag);
495 		kfree(resv);
496 		if (ret < 0) {
497 			error = ret;
498 			goto junkit;
499 		}
500 
501 		freed += ret;
502 		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
503 			error = xrep_defer_finish(sc);
504 			if (error)
505 				goto junkit;
506 			freed = 0;
507 		}
508 	}
509 
510 	if (freed)
511 		error = xrep_defer_finish(sc);
512 
513 junkit:
514 	/*
515 	 * If we still have reservations attached to @newbt, cleanup must have
516 	 * failed and the filesystem is about to go down.  Clean up the incore
517 	 * reservations and try to commit to freeing the space we used.
518 	 */
519 	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
520 		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
521 		list_del(&resv->list);
522 		xfs_perag_put(resv->pag);
523 		kfree(resv);
524 	}
525 
526 	if (sc->ip) {
527 		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
528 		xnr->ifake.if_fork = NULL;
529 	}
530 
531 	return error;
532 }
533 
/*
 * Free all the accounting info and unused disk space allocations after
 * committing a new btree.  Returns 0 or a negative errno from the cleanup.
 */
int
xrep_newbt_commit(
	struct xrep_newbt	*xnr)
{
	const bool		btree_committed = true;

	return xrep_newbt_free(xnr, btree_committed);
}
544 
/*
 * Free all the accounting info and all of the disk space we reserved for a new
 * btree that we're not going to commit.  We want to try to roll things back
 * cleanly for things like ENOSPC midway through allocation.  Any error from
 * the cleanup is not reported to the caller.
 */
void
xrep_newbt_cancel(
	struct xrep_newbt	*xnr)
{
	const bool		btree_committed = false;

	xrep_newbt_free(xnr, btree_committed);
}
556 
557 /* Feed one of the reserved btree blocks to the bulk loader. */
558 int
559 xrep_newbt_claim_block(
560 	struct xfs_btree_cur	*cur,
561 	struct xrep_newbt	*xnr,
562 	union xfs_btree_ptr	*ptr)
563 {
564 	struct xrep_newbt_resv	*resv;
565 	xfs_agblock_t		agbno;
566 
567 	/*
568 	 * The first item in the list should always have a free block unless
569 	 * we're completely out.
570 	 */
571 	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
572 	if (resv->used == resv->len)
573 		return -ENOSPC;
574 
575 	/*
576 	 * Peel off a block from the start of the reservation.  We allocate
577 	 * blocks in order to place blocks on disk in increasing record or key
578 	 * order.  The block reservations tend to end up on the list in
579 	 * decreasing order, which hopefully results in leaf blocks ending up
580 	 * together.
581 	 */
582 	agbno = resv->agbno + resv->used;
583 	resv->used++;
584 
585 	/* If we used all the blocks in this reservation, move it to the end. */
586 	if (resv->used == resv->len)
587 		list_move_tail(&resv->list, &xnr->resv_list);
588 
589 	trace_xrep_newbt_claim_block(resv->pag, agbno, 1, xnr->oinfo.oi_owner);
590 
591 	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
592 		ptr->l = cpu_to_be64(xfs_agbno_to_fsb(resv->pag, agbno));
593 	else
594 		ptr->s = cpu_to_be32(agbno);
595 
596 	/* Relog all the EFIs. */
597 	return xrep_defer_finish(xnr->sc);
598 }
599 
600 /* How many reserved blocks are unused? */
601 unsigned int
602 xrep_newbt_unused_blocks(
603 	struct xrep_newbt	*xnr)
604 {
605 	struct xrep_newbt_resv	*resv;
606 	unsigned int		unused = 0;
607 
608 	list_for_each_entry(resv, &xnr->resv_list, list)
609 		unused += resv->len - resv->used;
610 	return unused;
611 }
612