// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/newbt.h"

/*
 * Estimate proper slack values for a btree that's being reloaded.
 *
 * Under most circumstances, we'll take whatever default loading value the
 * btree bulk loading code calculates for us.  However, there are some
 * exceptions to this rule:
 *
 * (1) If this is a per-AG btree and the AG has less than 10% space free.
 * (2) If this is an inode btree and the FS has less than 10% space free.
 *
 * In either case, format the new btree blocks almost completely full to
 * minimize space usage.
 */
static void
xrep_newbt_estimate_slack(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_btree_bload	*bload = &xnr->bload;
	uint64_t		free;
	uint64_t		sz;

	/* Let the btree code compute the default slack values. */
	bload->leaf_slack = -1;
	bload->node_slack = -1;

	if (sc->ops->type == ST_PERAG) {
		free = sc->sa.pag->pagf_freeblks;
		sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
	} else {
		free = percpu_counter_sum(&sc->mp->m_fdblocks);
		sz = sc->mp->m_sb.sb_dblocks;
	}

	/* No further changes if there's more than 10% free space left. */
	if (free >= div_u64(sz, 10))
		return;

	/*
	 * We're low on space; load the btrees as tightly as possible.  Leave
	 * a couple of open slots in each btree block so that we don't end up
	 * splitting the btrees like crazy after a mount.
	 */
	if (bload->leaf_slack < 0)
		bload->leaf_slack = 2;
	if (bload->node_slack < 0)
		bload->node_slack = 2;
}
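
/*
 * Worked example (illustrative numbers only): for a per-AG btree in a
 * 1000-block AG with 80 blocks free, free (80) is less than sz/10 (100),
 * so both slack values are clamped from -1 to 2 and each new btree block
 * is formatted with only two open slots.
 */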

/* Initialize accounting resources for staging a new AG btree. */
void
xrep_newbt_init_ag(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*oinfo,
	xfs_fsblock_t			alloc_hint,
	enum xfs_ag_resv_type		resv)
{
	memset(xnr, 0, sizeof(struct xrep_newbt));
	xnr->sc = sc;
	xnr->oinfo = *oinfo; /* structure copy */
	xnr->alloc_hint = alloc_hint;
	xnr->resv = resv;
	INIT_LIST_HEAD(&xnr->resv_list);
	xrep_newbt_estimate_slack(xnr);
}
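
/*
 * Example (illustrative sketch; the xrep_abt structure and its new_bnobt
 * field are hypothetical stand-ins for a repair's private data): a per-AG
 * free space btree repair might stage its new btree like so:
 *
 *	xrep_newbt_init_ag(&ra->new_bnobt, sc, &XFS_RMAP_OINFO_AG,
 *			XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
 *				       XFS_BNO_BLOCK(sc->mp)),
 *			XFS_AG_RESV_NONE);
 */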

/* Initialize accounting resources for staging a new inode fork btree. */
int
xrep_newbt_init_inode(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	int				whichfork,
	const struct xfs_owner_info	*oinfo)
{
	struct xfs_ifork		*ifp;

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
		return -ENOMEM;

	xrep_newbt_init_ag(xnr, sc, oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
	return 0;
}
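
/*
 * Note: the fake ifork allocated above is released by xrep_newbt_free (via
 * xrep_newbt_commit or xrep_newbt_cancel), so callers that succeed here
 * must always finish the staging context with one of those two functions.
 */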

/*
 * Initialize accounting resources for staging a new btree.  Callers are
 * expected to add their own reservations (and clean them up) manually.
 */
void
xrep_newbt_init_bare(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc)
{
	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
			XFS_AG_RESV_NONE);
}
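
/*
 * Example (illustrative): repairs that stage btree blocks they already own,
 * rather than allocating fresh space through xrep_newbt_alloc_blocks, start
 * from a bare context and attach their own reservations.  That is why
 * XFS_RMAP_OINFO_ANY_OWNER is used above: no single owner can be assumed
 * for blocks supplied by the caller.
 */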

/*
 * Designate specific blocks to be used to build our new btree.  @pag must be
 * a passive reference.
 */
STATIC int
xrep_newbt_add_blocks(
	struct xrep_newbt		*xnr,
	struct xfs_perag		*pag,
	const struct xfs_alloc_arg	*args)
{
	struct xfs_mount		*mp = xnr->sc->mp;
	struct xrep_newbt_resv		*resv;
	int				error;

	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
	if (!resv)
		return -ENOMEM;

	INIT_LIST_HEAD(&resv->list);
	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
	resv->len = args->len;
	resv->used = 0;
	resv->pag = xfs_perag_hold(pag);

	ASSERT(xnr->oinfo.oi_offset == 0);

	error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap);
	if (error)
		goto out_pag;

	list_add_tail(&resv->list, &xnr->resv_list);
	return 0;
out_pag:
	xfs_perag_put(resv->pag);
	kfree(resv);
	return error;
}
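
/*
 * Lifecycle note: the autoreap EFI scheduled above is the safety net for
 * this reservation.  Later, xrep_newbt_free_extent either commits it (so
 * the whole extent goes back to the free space btrees) or cancels it and
 * logs a smaller EFI covering only the unused tail of the reservation.
 */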

/* Don't let our allocation hint take us beyond this AG */
static inline void
xrep_newbt_validate_ag_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);

	if (agno == sc->sa.pag->pag_agno &&
	    xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
					 XFS_AGFL_BLOCK(sc->mp) + 1);
}
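
/*
 * Worked example (illustrative): if the hint points into a different AG, or
 * fails xfs_verify_fsbno, it is reset to XFS_AGFL_BLOCK(mp) + 1 within this
 * AG, i.e. the first block past the AG's static header blocks (sb, AGF,
 * AGI, AGFL) and therefore the lowest block that could ever be allocated.
 */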

/* Allocate disk space for a new per-AG btree. */
STATIC int
xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(sc->sa.pag != NULL);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		xfs_agnumber_t		agno;

		xrep_newbt_validate_ag_alloc_hint(xnr);

		error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_ag_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		if (agno != sc->sa.pag->pag_agno) {
			ASSERT(agno == sc->sa.pag->pag_agno);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Don't let our allocation hint take us beyond EOFS */
static inline void
xrep_newbt_validate_file_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;

	if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for our new file-based btree. */
STATIC int
xrep_newbt_alloc_file_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		struct xfs_perag	*pag;
		xfs_agnumber_t		agno;

		xrep_newbt_validate_file_alloc_hint(xnr);

		error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_file_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, pag, &args);
		xfs_perag_put(pag);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Allocate disk space for our new btree. */
int
xrep_newbt_alloc_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	if (xnr->sc->ip)
		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
	return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
}
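
/*
 * Example (illustrative, following the usual bulk-load sequence): callers
 * typically size the request from the bulk loader's geometry computation
 * before reserving space:
 *
 *	error = xfs_btree_bload_compute_geometry(cur, &xnr->bload,
 *			nr_records);
 *	if (error)
 *		return error;
 *	error = xrep_newbt_alloc_blocks(xnr, xnr->bload.nr_blocks);
 */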

/*
 * Free the unused part of a space extent that was reserved for a new ondisk
 * structure.  Returns the number of EFIs logged or a negative errno.
 */
STATIC int
xrep_newbt_free_extent(
	struct xrep_newbt	*xnr,
	struct xrep_newbt_resv	*resv,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agblock_t		free_agbno = resv->agbno;
	xfs_extlen_t		free_aglen = resv->len;
	xfs_fsblock_t		fsbno;
	int			error;

	if (!btree_committed || resv->used == 0) {
		/*
		 * If we're not committing a new btree or we didn't use the
		 * space reservation, let the existing EFI free the entire
		 * space extent.
		 */
		trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
				free_agbno, free_aglen, xnr->oinfo.oi_owner);
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		return 1;
	}

	/*
	 * We used space and committed the btree.  Cancel the autoreap, remove
	 * the written blocks from the reservation, and possibly log a new EFI
	 * to free any unused reservation space.
	 */
	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
	free_agbno += resv->used;
	free_aglen -= resv->used;

	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
			free_aglen, xnr->oinfo.oi_owner);

	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);

	/*
	 * Use EFIs to free the reservations.  This reduces the chance
	 * that we leak blocks if the system goes down.
	 */
	fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
	error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
			xnr->resv, true);
	if (error)
		return error;

	return 1;
}
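
/*
 * Worked example (illustrative numbers): for a committed btree that used 6
 * blocks of a 10-block reservation starting at agbno 100, the autoreap EFI
 * is cancelled and a new EFI is logged for the unused tail, agbno 106-109
 * (free_agbno = 100 + 6, free_aglen = 10 - 6).
 */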

/* Free all the accounting info and disk space we reserved for a new btree. */
STATIC int
xrep_newbt_free(
	struct xrep_newbt	*xnr,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	unsigned int		freed = 0;
	int			error = 0;

	/*
	 * If the filesystem already went down, we can't free the blocks.  Skip
	 * ahead to freeing the incore metadata because we can't fix anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		int		ret;

		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
		if (ret < 0) {
			error = ret;
			goto junkit;
		}

		freed += ret;
		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
			error = xrep_defer_finish(sc);
			if (error)
				goto junkit;
			freed = 0;
		}
	}

	if (freed)
		error = xrep_defer_finish(sc);

junkit:
	/*
	 * If we still have reservations attached to @newbt, cleanup must have
	 * failed and the filesystem is about to go down.  Clean up the incore
	 * reservations and try to commit to freeing the space we used.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
	}

	if (sc->ip) {
		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}

	return error;
}

/*
 * Free all the accounting info and unused disk space allocations after
 * committing a new btree.
 */
int
xrep_newbt_commit(
	struct xrep_newbt	*xnr)
{
	return xrep_newbt_free(xnr, true);
}

/*
 * Free all the accounting info and all of the disk space we reserved for a new
 * btree that we're not going to commit.  We want to try to roll things back
 * cleanly for things like ENOSPC midway through allocation.
 */
void
xrep_newbt_cancel(
	struct xrep_newbt	*xnr)
{
	xrep_newbt_free(xnr, false);
}
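
/*
 * Example (illustrative): a repair typically pairs these two functions on
 * its exit paths around the bulk load:
 *
 *	error = xfs_btree_bload(cur, &xnr->bload, &priv);
 *	if (error)
 *		goto err_cancel;
 *	...
 *	return xrep_newbt_commit(xnr);
 * err_cancel:
 *	xrep_newbt_cancel(xnr);
 *	return error;
 */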

/* Feed one of the reserved btree blocks to the bulk loader. */
int
xrep_newbt_claim_block(
	struct xfs_btree_cur	*cur,
	struct xrep_newbt	*xnr,
	union xfs_btree_ptr	*ptr)
{
	struct xrep_newbt_resv	*resv;
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_agblock_t		agbno;

	/*
	 * The first item in the list should always have a free block unless
	 * we're completely out.
	 */
	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
	if (resv->used == resv->len)
		return -ENOSPC;

	/*
	 * Peel off a block from the start of the reservation.  We allocate
	 * blocks in order to place blocks on disk in increasing record or key
	 * order.  The block reservations tend to end up on the list in
	 * decreasing order, which hopefully results in leaf blocks ending up
	 * together.
	 */
	agbno = resv->agbno + resv->used;
	resv->used++;

	/* If we used all the blocks in this reservation, move it to the end. */
	if (resv->used == resv->len)
		list_move_tail(&resv->list, &xnr->resv_list);

	trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
			xnr->oinfo.oi_owner);

	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
		ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
								agbno));
	else
		ptr->s = cpu_to_be32(agbno);

	/* Relog all the EFIs. */
	return xrep_defer_finish(xnr->sc);
}
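
/*
 * Example (illustrative; the xrep_abt structure and new_bnobt field are
 * hypothetical stand-ins for a repair's private data): the bulk loader
 * reaches this function through the xfs_btree_bload claim_block callback,
 * e.g.:
 *
 * STATIC int
 * xrep_abt_claim_block(
 *	struct xfs_btree_cur	*cur,
 *	union xfs_btree_ptr	*ptr,
 *	void			*priv)
 * {
 *	struct xrep_abt		*ra = priv;
 *
 *	return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr);
 * }
 */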
514