xref: /linux/fs/xfs/scrub/newbt.c (revision 2a52ca7c98960aafb0eca9ef96b2d0c932171357)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_btree.h"
13 #include "xfs_btree_staging.h"
14 #include "xfs_log_format.h"
15 #include "xfs_trans.h"
16 #include "xfs_sb.h"
17 #include "xfs_inode.h"
18 #include "xfs_alloc.h"
19 #include "xfs_rmap.h"
20 #include "xfs_ag.h"
21 #include "xfs_defer.h"
22 #include "scrub/scrub.h"
23 #include "scrub/common.h"
24 #include "scrub/trace.h"
25 #include "scrub/repair.h"
26 #include "scrub/newbt.h"
27 
28 /*
29  * Estimate proper slack values for a btree that's being reloaded.
30  *
31  * Under most circumstances, we'll take whatever default loading value the
32  * btree bulk loading code calculates for us.  However, there are some
33  * exceptions to this rule:
34  *
35  * (0) If someone turned one of the debug knobs.
36  * (1) If this is a per-AG btree and the AG has less than 10% space free.
37  * (2) If this is an inode btree and the FS has less than 10% space free.
38 
39  * In either case, format the new btree blocks almost completely full to
40  * minimize space usage.
41  */
42 static void
43 xrep_newbt_estimate_slack(
44 	struct xrep_newbt	*xnr)
45 {
46 	struct xfs_scrub	*sc = xnr->sc;
47 	struct xfs_btree_bload	*bload = &xnr->bload;
48 	uint64_t		free;
49 	uint64_t		sz;
50 
51 	/*
52 	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
53 	 * unless someone has set them otherwise, so we just pull the values
54 	 * here.
55 	 */
56 	bload->leaf_slack = xfs_globals.bload_leaf_slack;
57 	bload->node_slack = xfs_globals.bload_node_slack;
58 
59 	if (sc->ops->type == ST_PERAG) {
60 		free = sc->sa.pag->pagf_freeblks;
61 		sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
62 	} else {
63 		free = percpu_counter_sum(&sc->mp->m_fdblocks);
64 		sz = sc->mp->m_sb.sb_dblocks;
65 	}
66 
67 	/* No further changes if there's more than 10% free space left. */
68 	if (free >= div_u64(sz, 10))
69 		return;
70 
71 	/*
72 	 * We're low on space; load the btrees as tightly as possible.  Leave
73 	 * a couple of open slots in each btree block so that we don't end up
74 	 * splitting the btrees like crazy after a mount.
75 	 */
76 	if (bload->leaf_slack < 0)
77 		bload->leaf_slack = 2;
78 	if (bload->node_slack < 0)
79 		bload->node_slack = 2;
80 }
81 
82 /* Initialize accounting resources for staging a new AG btree. */
83 void
84 xrep_newbt_init_ag(
85 	struct xrep_newbt		*xnr,
86 	struct xfs_scrub		*sc,
87 	const struct xfs_owner_info	*oinfo,
88 	xfs_fsblock_t			alloc_hint,
89 	enum xfs_ag_resv_type		resv)
90 {
91 	memset(xnr, 0, sizeof(struct xrep_newbt));
92 	xnr->sc = sc;
93 	xnr->oinfo = *oinfo; /* structure copy */
94 	xnr->alloc_hint = alloc_hint;
95 	xnr->resv = resv;
96 	INIT_LIST_HEAD(&xnr->resv_list);
97 	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
98 	xrep_newbt_estimate_slack(xnr);
99 }
100 
101 /* Initialize accounting resources for staging a new inode fork btree. */
102 int
103 xrep_newbt_init_inode(
104 	struct xrep_newbt		*xnr,
105 	struct xfs_scrub		*sc,
106 	int				whichfork,
107 	const struct xfs_owner_info	*oinfo)
108 {
109 	struct xfs_ifork		*ifp;
110 
111 	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
112 	if (!ifp)
113 		return -ENOMEM;
114 
115 	xrep_newbt_init_ag(xnr, sc, oinfo,
116 			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
117 			XFS_AG_RESV_NONE);
118 	xnr->ifake.if_fork = ifp;
119 	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
120 	return 0;
121 }
122 
123 /*
124  * Initialize accounting resources for staging a new btree.  Callers are
125  * expected to add their own reservations (and clean them up) manually.
126  */
127 void
128 xrep_newbt_init_bare(
129 	struct xrep_newbt		*xnr,
130 	struct xfs_scrub		*sc)
131 {
132 	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
133 			XFS_AG_RESV_NONE);
134 }
135 
136 /*
137  * Designate specific blocks to be used to build our new btree.  @pag must be
138  * a passive reference.
139  */
140 STATIC int
141 xrep_newbt_add_blocks(
142 	struct xrep_newbt		*xnr,
143 	struct xfs_perag		*pag,
144 	const struct xfs_alloc_arg	*args)
145 {
146 	struct xfs_mount		*mp = xnr->sc->mp;
147 	struct xrep_newbt_resv		*resv;
148 	int				error;
149 
150 	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
151 	if (!resv)
152 		return -ENOMEM;
153 
154 	INIT_LIST_HEAD(&resv->list);
155 	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
156 	resv->len = args->len;
157 	resv->used = 0;
158 	resv->pag = xfs_perag_hold(pag);
159 
160 	if (args->tp) {
161 		ASSERT(xnr->oinfo.oi_offset == 0);
162 
163 		error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap);
164 		if (error)
165 			goto out_pag;
166 	}
167 
168 	list_add_tail(&resv->list, &xnr->resv_list);
169 	return 0;
170 out_pag:
171 	xfs_perag_put(resv->pag);
172 	kfree(resv);
173 	return error;
174 }
175 
176 /*
177  * Add an extent to the new btree reservation pool.  Callers are required to
178  * reap this reservation manually if the repair is cancelled.  @pag must be a
179  * passive reference.
180  */
181 int
182 xrep_newbt_add_extent(
183 	struct xrep_newbt	*xnr,
184 	struct xfs_perag	*pag,
185 	xfs_agblock_t		agbno,
186 	xfs_extlen_t		len)
187 {
188 	struct xfs_mount	*mp = xnr->sc->mp;
189 	struct xfs_alloc_arg	args = {
190 		.tp		= NULL, /* no autoreap */
191 		.oinfo		= xnr->oinfo,
192 		.fsbno		= XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno),
193 		.len		= len,
194 		.resv		= xnr->resv,
195 	};
196 
197 	return xrep_newbt_add_blocks(xnr, pag, &args);
198 }
199 
200 /* Don't let our allocation hint take us beyond this AG */
201 static inline void
202 xrep_newbt_validate_ag_alloc_hint(
203 	struct xrep_newbt	*xnr)
204 {
205 	struct xfs_scrub	*sc = xnr->sc;
206 	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);
207 
208 	if (agno == sc->sa.pag->pag_agno &&
209 	    xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
210 		return;
211 
212 	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
213 					 XFS_AGFL_BLOCK(sc->mp) + 1);
214 }
215 
216 /* Allocate disk space for a new per-AG btree. */
217 STATIC int
218 xrep_newbt_alloc_ag_blocks(
219 	struct xrep_newbt	*xnr,
220 	uint64_t		nr_blocks)
221 {
222 	struct xfs_scrub	*sc = xnr->sc;
223 	struct xfs_mount	*mp = sc->mp;
224 	int			error = 0;
225 
226 	ASSERT(sc->sa.pag != NULL);
227 
228 	while (nr_blocks > 0) {
229 		struct xfs_alloc_arg	args = {
230 			.tp		= sc->tp,
231 			.mp		= mp,
232 			.oinfo		= xnr->oinfo,
233 			.minlen		= 1,
234 			.maxlen		= nr_blocks,
235 			.prod		= 1,
236 			.resv		= xnr->resv,
237 		};
238 		xfs_agnumber_t		agno;
239 
240 		xrep_newbt_validate_ag_alloc_hint(xnr);
241 
242 		if (xnr->alloc_vextent)
243 			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
244 		else
245 			error = xfs_alloc_vextent_near_bno(&args,
246 					xnr->alloc_hint);
247 		if (error)
248 			return error;
249 		if (args.fsbno == NULLFSBLOCK)
250 			return -ENOSPC;
251 
252 		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
253 
254 		trace_xrep_newbt_alloc_ag_blocks(mp, agno,
255 				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
256 				xnr->oinfo.oi_owner);
257 
258 		if (agno != sc->sa.pag->pag_agno) {
259 			ASSERT(agno == sc->sa.pag->pag_agno);
260 			return -EFSCORRUPTED;
261 		}
262 
263 		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
264 		if (error)
265 			return error;
266 
267 		nr_blocks -= args.len;
268 		xnr->alloc_hint = args.fsbno + args.len;
269 
270 		error = xrep_defer_finish(sc);
271 		if (error)
272 			return error;
273 	}
274 
275 	return 0;
276 }
277 
278 /* Don't let our allocation hint take us beyond EOFS */
279 static inline void
280 xrep_newbt_validate_file_alloc_hint(
281 	struct xrep_newbt	*xnr)
282 {
283 	struct xfs_scrub	*sc = xnr->sc;
284 
285 	if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
286 		return;
287 
288 	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
289 }
290 
291 /* Allocate disk space for our new file-based btree. */
292 STATIC int
293 xrep_newbt_alloc_file_blocks(
294 	struct xrep_newbt	*xnr,
295 	uint64_t		nr_blocks)
296 {
297 	struct xfs_scrub	*sc = xnr->sc;
298 	struct xfs_mount	*mp = sc->mp;
299 	int			error = 0;
300 
301 	while (nr_blocks > 0) {
302 		struct xfs_alloc_arg	args = {
303 			.tp		= sc->tp,
304 			.mp		= mp,
305 			.oinfo		= xnr->oinfo,
306 			.minlen		= 1,
307 			.maxlen		= nr_blocks,
308 			.prod		= 1,
309 			.resv		= xnr->resv,
310 		};
311 		struct xfs_perag	*pag;
312 		xfs_agnumber_t		agno;
313 
314 		xrep_newbt_validate_file_alloc_hint(xnr);
315 
316 		if (xnr->alloc_vextent)
317 			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
318 		else
319 			error = xfs_alloc_vextent_start_ag(&args,
320 					xnr->alloc_hint);
321 		if (error)
322 			return error;
323 		if (args.fsbno == NULLFSBLOCK)
324 			return -ENOSPC;
325 
326 		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
327 
328 		trace_xrep_newbt_alloc_file_blocks(mp, agno,
329 				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
330 				xnr->oinfo.oi_owner);
331 
332 		pag = xfs_perag_get(mp, agno);
333 		if (!pag) {
334 			ASSERT(0);
335 			return -EFSCORRUPTED;
336 		}
337 
338 		error = xrep_newbt_add_blocks(xnr, pag, &args);
339 		xfs_perag_put(pag);
340 		if (error)
341 			return error;
342 
343 		nr_blocks -= args.len;
344 		xnr->alloc_hint = args.fsbno + args.len;
345 
346 		error = xrep_defer_finish(sc);
347 		if (error)
348 			return error;
349 	}
350 
351 	return 0;
352 }
353 
354 /* Allocate disk space for our new btree. */
355 int
356 xrep_newbt_alloc_blocks(
357 	struct xrep_newbt	*xnr,
358 	uint64_t		nr_blocks)
359 {
360 	if (xnr->sc->ip)
361 		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
362 	return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
363 }
364 
365 /*
366  * Free the unused part of a space extent that was reserved for a new ondisk
367  * structure.  Returns the number of EFIs logged or a negative errno.
368  */
369 STATIC int
370 xrep_newbt_free_extent(
371 	struct xrep_newbt	*xnr,
372 	struct xrep_newbt_resv	*resv,
373 	bool			btree_committed)
374 {
375 	struct xfs_scrub	*sc = xnr->sc;
376 	xfs_agblock_t		free_agbno = resv->agbno;
377 	xfs_extlen_t		free_aglen = resv->len;
378 	xfs_fsblock_t		fsbno;
379 	int			error;
380 
381 	if (!btree_committed || resv->used == 0) {
382 		/*
383 		 * If we're not committing a new btree or we didn't use the
384 		 * space reservation, let the existing EFI free the entire
385 		 * space extent.
386 		 */
387 		trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
388 				free_agbno, free_aglen, xnr->oinfo.oi_owner);
389 		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
390 		return 1;
391 	}
392 
393 	/*
394 	 * We used space and committed the btree.  Cancel the autoreap, remove
395 	 * the written blocks from the reservation, and possibly log a new EFI
396 	 * to free any unused reservation space.
397 	 */
398 	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
399 	free_agbno += resv->used;
400 	free_aglen -= resv->used;
401 
402 	if (free_aglen == 0)
403 		return 0;
404 
405 	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
406 			free_aglen, xnr->oinfo.oi_owner);
407 
408 	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
409 	ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);
410 
411 	/*
412 	 * Use EFIs to free the reservations.  This reduces the chance
413 	 * that we leak blocks if the system goes down.
414 	 */
415 	fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
416 	error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
417 			xnr->resv, true);
418 	if (error)
419 		return error;
420 
421 	return 1;
422 }
423 
424 /* Free all the accounting info and disk space we reserved for a new btree. */
425 STATIC int
426 xrep_newbt_free(
427 	struct xrep_newbt	*xnr,
428 	bool			btree_committed)
429 {
430 	struct xfs_scrub	*sc = xnr->sc;
431 	struct xrep_newbt_resv	*resv, *n;
432 	unsigned int		freed = 0;
433 	int			error = 0;
434 
435 	/*
436 	 * If the filesystem already went down, we can't free the blocks.  Skip
437 	 * ahead to freeing the incore metadata because we can't fix anything.
438 	 */
439 	if (xfs_is_shutdown(sc->mp))
440 		goto junkit;
441 
442 	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
443 		int		ret;
444 
445 		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
446 		list_del(&resv->list);
447 		xfs_perag_put(resv->pag);
448 		kfree(resv);
449 		if (ret < 0) {
450 			error = ret;
451 			goto junkit;
452 		}
453 
454 		freed += ret;
455 		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
456 			error = xrep_defer_finish(sc);
457 			if (error)
458 				goto junkit;
459 			freed = 0;
460 		}
461 	}
462 
463 	if (freed)
464 		error = xrep_defer_finish(sc);
465 
466 junkit:
467 	/*
468 	 * If we still have reservations attached to @newbt, cleanup must have
469 	 * failed and the filesystem is about to go down.  Clean up the incore
470 	 * reservations and try to commit to freeing the space we used.
471 	 */
472 	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
473 		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
474 		list_del(&resv->list);
475 		xfs_perag_put(resv->pag);
476 		kfree(resv);
477 	}
478 
479 	if (sc->ip) {
480 		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
481 		xnr->ifake.if_fork = NULL;
482 	}
483 
484 	return error;
485 }
486 
487 /*
488  * Free all the accounting info and unused disk space allocations after
489  * committing a new btree.
490  */
491 int
492 xrep_newbt_commit(
493 	struct xrep_newbt	*xnr)
494 {
495 	return xrep_newbt_free(xnr, true);
496 }
497 
498 /*
499  * Free all the accounting info and all of the disk space we reserved for a new
500  * btree that we're not going to commit.  We want to try to roll things back
501  * cleanly for things like ENOSPC midway through allocation.
502  */
503 void
504 xrep_newbt_cancel(
505 	struct xrep_newbt	*xnr)
506 {
507 	xrep_newbt_free(xnr, false);
508 }
509 
510 /* Feed one of the reserved btree blocks to the bulk loader. */
511 int
512 xrep_newbt_claim_block(
513 	struct xfs_btree_cur	*cur,
514 	struct xrep_newbt	*xnr,
515 	union xfs_btree_ptr	*ptr)
516 {
517 	struct xrep_newbt_resv	*resv;
518 	struct xfs_mount	*mp = cur->bc_mp;
519 	xfs_agblock_t		agbno;
520 
521 	/*
522 	 * The first item in the list should always have a free block unless
523 	 * we're completely out.
524 	 */
525 	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
526 	if (resv->used == resv->len)
527 		return -ENOSPC;
528 
529 	/*
530 	 * Peel off a block from the start of the reservation.  We allocate
531 	 * blocks in order to place blocks on disk in increasing record or key
532 	 * order.  The block reservations tend to end up on the list in
533 	 * decreasing order, which hopefully results in leaf blocks ending up
534 	 * together.
535 	 */
536 	agbno = resv->agbno + resv->used;
537 	resv->used++;
538 
539 	/* If we used all the blocks in this reservation, move it to the end. */
540 	if (resv->used == resv->len)
541 		list_move_tail(&resv->list, &xnr->resv_list);
542 
543 	trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
544 			xnr->oinfo.oi_owner);
545 
546 	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
547 		ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
548 								agbno));
549 	else
550 		ptr->s = cpu_to_be32(agbno);
551 
552 	/* Relog all the EFIs. */
553 	return xrep_defer_finish(xnr->sc);
554 }
555 
556 /* How many reserved blocks are unused? */
557 unsigned int
558 xrep_newbt_unused_blocks(
559 	struct xrep_newbt	*xnr)
560 {
561 	struct xrep_newbt_resv	*resv;
562 	unsigned int		unused = 0;
563 
564 	list_for_each_entry(resv, &xnr->resv_list, list)
565 		unused += resv->len - resv->used;
566 	return unused;
567 }
568