xref: /linux/fs/xfs/xfs_verify_media.c (revision 6f7e6393d1ce636bb7ec77a7fe7b77458fddf701)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2026 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs_platform.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_bit.h"
13 #include "xfs_btree.h"
14 #include "xfs_inode.h"
15 #include "xfs_icache.h"
16 #include "xfs_trans.h"
17 #include "xfs_alloc.h"
18 #include "xfs_ag.h"
19 #include "xfs_rmap.h"
20 #include "xfs_rmap_btree.h"
21 #include "xfs_rtgroup.h"
22 #include "xfs_rtrmap_btree.h"
23 #include "xfs_health.h"
24 #include "xfs_healthmon.h"
25 #include "xfs_trace.h"
26 #include "xfs_verify_media.h"
27 
28 #include <linux/fserror.h>
29 
/*
 * Group-relative extent of storage that was lost to a media error.  This is
 * passed as the private data pointer to the rmap range query so that each
 * reverse-mapping record can be trimmed to the damaged region before the
 * loss is reported.
 */
struct xfs_group_data_lost {
	xfs_agblock_t		startblock;	/* group-relative start of lost range */
	xfs_extlen_t		blockcount;	/* length of lost range in fs blocks */
};
34 
35 /* Report lost file data from rmap records */
36 static int
37 xfs_verify_report_data_lost(
38 	struct xfs_btree_cur		*cur,
39 	const struct xfs_rmap_irec	*rec,
40 	void				*data)
41 {
42 	struct xfs_mount		*mp = cur->bc_mp;
43 	struct xfs_inode		*ip;
44 	struct xfs_group_data_lost	*lost = data;
45 	xfs_fileoff_t			fileoff = rec->rm_offset;
46 	xfs_extlen_t			blocks = rec->rm_blockcount;
47 	const bool			is_attr =
48 			(rec->rm_flags & XFS_RMAP_ATTR_FORK);
49 	const xfs_agblock_t		lost_end =
50 			lost->startblock + lost->blockcount;
51 	const xfs_agblock_t		rmap_end =
52 			rec->rm_startblock + rec->rm_blockcount;
53 	int				error = 0;
54 
55 	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
56 	       return 0;
57 
58 	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip);
59 	if (error)
60 		return 0;
61 
62 	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
63 		xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
64 		goto out_rele;
65 	}
66 
67 	if (is_attr) {
68 		xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
69 		goto out_rele;
70 	}
71 
72 	if (lost->startblock > rec->rm_startblock) {
73 		fileoff += lost->startblock - rec->rm_startblock;
74 		blocks -= lost->startblock - rec->rm_startblock;
75 	}
76 	if (rmap_end > lost_end)
77 		blocks -= rmap_end - lost_end;
78 
79 	fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
80 			XFS_FSB_TO_B(mp, blocks), GFP_NOFS);
81 
82 out_rele:
83 	xfs_irele(ip);
84 	return 0;
85 }
86 
/*
 * Walk reverse mappings in [daddr, daddr + bblen) to report all file data
 * loss in that range.  Reporting is best-effort: internal errors abort the
 * walk but are not returned to the caller.
 */
static int
xfs_verify_report_losses(
	struct xfs_mount	*mp,
	enum xfs_group_type	type,
	xfs_daddr_t		daddr,
	u64			bblen)
{
	struct xfs_group	*xg = NULL;
	struct xfs_trans	*tp;
	xfs_fsblock_t		start_bno, end_bno;
	uint32_t		start_gno, end_gno;
	int			error;

	/* Convert the damaged daddr range to fs block numbers. */
	if (type == XG_TYPE_RTG) {
		start_bno = xfs_daddr_to_rtb(mp, daddr);
		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
	} else {
		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
	}

	/* Empty transaction to pass to the AGF reads and rmap cursors. */
	tp = xfs_trans_alloc_empty(mp);
	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
	/* Walk every (rt)group touched by the damaged range. */
	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
		struct xfs_buf		*agf_bp = NULL;
		struct xfs_rtgroup	*rtg = NULL;
		struct xfs_btree_cur	*cur;
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;
		struct xfs_group_data_lost lost;

		/* Build an rmap btree cursor appropriate to the group type. */
		if (type == XG_TYPE_AG) {
			struct xfs_perag	*pag = to_perag(xg);

			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
			if (error) {
				xfs_perag_put(pag);
				break;
			}

			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
		} else {
			rtg = to_rtg(xg);
			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
		}

		/*
		 * Set the rmap query range from ri_low to ri_high, an
		 * inclusive [start, end] of group blocks in which to look
		 * for file data or metadata.  Only the first and last groups
		 * are partial; any group in between is queried in full.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		if (xg->xg_gno == start_gno)
			ri_low.rm_startblock =
				xfs_fsb_to_gbno(mp, start_bno, type);
		if (xg->xg_gno == end_gno)
			ri_high.rm_startblock =
				xfs_fsb_to_gbno(mp, end_bno, type);

		/*
		 * Group-relative extent of the damage; the query callback
		 * uses this to trim each rmap record before reporting.
		 */
		lost.startblock = ri_low.rm_startblock;
		lost.blockcount = min(xg->xg_block_count,
				      ri_high.rm_startblock + 1) -
							ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_verify_report_data_lost, &lost);
		/* Tear down the cursor before releasing what it references. */
		xfs_btree_del_cursor(cur, error);
		if (agf_bp)
			xfs_trans_brelse(tp, agf_bp);
		if (rtg)
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
		if (error) {
			/* Drop the group reference held by the iterator. */
			xfs_group_put(xg);
			break;
		}
	}

	xfs_trans_cancel(tp);
	return 0;
}
169 
170 /*
171  * Compute the desired verify IO size.
172  *
173  * To minimize command overhead, we'd like to create bios that are 1MB, though
174  * we allow the user to ask for a smaller size.
175  */
176 static unsigned int
177 xfs_verify_iosize(
178 	const struct xfs_verify_media	*me,
179 	struct xfs_buftarg		*btp,
180 	uint64_t			bbcount)
181 {
182 	unsigned int			iosize =
183 			min_not_zero(SZ_1M, me->me_max_io_size);
184 
185 	BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
186 	ASSERT(BBTOB(bbcount) >= bdev_logical_block_size(btp->bt_bdev));
187 
188 	return clamp(iosize, bdev_logical_block_size(btp->bt_bdev),
189 			BBTOB(bbcount));
190 }
191 
192 /* Allocate as much memory as we can get for verification buffer. */
193 static struct folio *
194 xfs_verify_alloc_folio(
195 	const unsigned int	iosize)
196 {
197 	unsigned int		order = get_order(iosize);
198 
199 	while (order > 0) {
200 		struct folio	*folio =
201 			folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);
202 
203 		if (folio)
204 			return folio;
205 		order--;
206 	}
207 
208 	return folio_alloc(GFP_KERNEL, 0);
209 }
210 
211 /* Report any kind of problem verifying media */
212 static void
213 xfs_verify_media_error(
214 	struct xfs_mount	*mp,
215 	struct xfs_verify_media	*me,
216 	struct xfs_buftarg	*btp,
217 	xfs_daddr_t		daddr,
218 	unsigned int		bio_bbcount,
219 	blk_status_t		bio_status)
220 {
221 	trace_xfs_verify_media_error(mp, me, btp->bt_bdev->bd_dev, daddr,
222 			bio_bbcount, bio_status);
223 
224 	/*
225 	 * Pass any error, I/O or otherwise, up to the caller if we didn't
226 	 * successfully verify any bytes at all.
227 	 */
228 	if (me->me_start_daddr == daddr)
229 		me->me_ioerror = -blk_status_to_errno(bio_status);
230 
231 	/*
232 	 * PI validation failures, medium errors, or general IO errors are
233 	 * treated as indicators of data loss.  Everything else are (hopefully)
234 	 * transient errors and are not reported to healthmon or fsnotify.
235 	 */
236 	switch (bio_status) {
237 	case BLK_STS_PROTECTION:
238 	case BLK_STS_IOERR:
239 	case BLK_STS_MEDIUM:
240 		break;
241 	default:
242 		return;
243 	}
244 
245 	if (!(me->me_flags & XFS_VERIFY_MEDIA_REPORT))
246 		return;
247 
248 	xfs_healthmon_report_media(mp, me->me_dev, daddr, bio_bbcount);
249 
250 	if (!xfs_has_rmapbt(mp))
251 		return;
252 
253 	switch (me->me_dev) {
254 	case XFS_DEV_DATA:
255 		xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount);
256 		break;
257 	case XFS_DEV_RT:
258 		xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount);
259 		break;
260 	}
261 }
262 
/* Verify the media of an xfs device by submitting read requests to the disk. */
static int
xfs_verify_media(
	struct xfs_mount	*mp,
	struct xfs_verify_media	*me)
{
	struct xfs_buftarg	*btp = NULL;
	struct bio		*bio;
	struct folio		*folio;
	xfs_daddr_t		daddr;
	uint64_t		bbcount;
	int			error = 0;

	me->me_ioerror = 0;

	/*
	 * Pick the buffer target for the requested device.  An external log
	 * only counts as a separate target if it is a different bdev from
	 * the data device.
	 */
	switch (me->me_dev) {
	case XFS_DEV_DATA:
		btp = mp->m_ddev_targp;
		break;
	case XFS_DEV_LOG:
		if (mp->m_logdev_targp->bt_bdev != mp->m_ddev_targp->bt_bdev)
			btp = mp->m_logdev_targp;
		break;
	case XFS_DEV_RT:
		btp = mp->m_rtdev_targp;
		break;
	}
	if (!btp)
		return -ENODEV;

	/*
	 * If the caller told us to verify beyond the end of the disk, tell the
	 * user exactly where that was.
	 */
	if (me->me_end_daddr > btp->bt_nr_sectors)
		me->me_end_daddr = btp->bt_nr_sectors;

	/* start and end have to be aligned to the lba size */
	if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr),
			bdev_logical_block_size(btp->bt_bdev)))
		return -EINVAL;

	/*
	 * end_daddr is the exclusive end of the range, so if start_daddr
	 * reaches there (or beyond), there's no work to be done.
	 */
	if (me->me_start_daddr >= me->me_end_daddr)
		return 0;

	/*
	 * There are three ranges involved here:
	 *
	 *  - [me->me_start_daddr, me->me_end_daddr) is the range that the
	 *    user wants to verify.  end_daddr can be beyond the end of the
	 *    disk; we'll constrain it to the end if necessary.
	 *
	 *  - [daddr, me->me_end_daddr) is the range that we have not yet
	 *    verified.  We update daddr after each successful read.
	 *    me->me_start_daddr is set to daddr before returning.
	 *
	 *  - [daddr, daddr + bio_bbcount) is the range that we're currently
	 *    verifying.
	 */
	daddr = me->me_start_daddr;
	bbcount = min_t(sector_t, me->me_end_daddr, btp->bt_nr_sectors) -
			  me->me_start_daddr;

	/* Grab the largest buffer we can to minimize per-command overhead. */
	folio = xfs_verify_alloc_folio(xfs_verify_iosize(me, btp, bbcount));
	if (!folio)
		return -ENOMEM;

	trace_xfs_verify_media(mp, me, btp->bt_bdev->bd_dev, daddr, bbcount,
			folio);

	/* One bio, reused (via bio_reset) for every read in the loop below. */
	bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL);
	if (!bio) {
		error = -ENOMEM;
		goto out_folio;
	}

	while (bbcount > 0) {
		unsigned int	bio_bbcount;
		blk_status_t	bio_status;

		/* Read as much of the remainder as fits in the folio. */
		bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
		bio->bi_iter.bi_sector = daddr;
		bio_add_folio_nofail(bio, folio,
				min(bbcount << SECTOR_SHIFT, folio_size(folio)),
				0);

		/*
		 * Save the length of the bio before we submit it, because we
		 * need the original daddr and length for reporting IO errors
		 * if the bio fails.
		 */
		bio_bbcount = bio->bi_iter.bi_size >> SECTOR_SHIFT;
		submit_bio_wait(bio);
		bio_status = bio->bi_status;
		if (bio_status != BLK_STS_OK) {
			xfs_verify_media_error(mp, me, btp, daddr, bio_bbcount,
					bio_status);
			/*
			 * The failure has been reported via me_ioerror and/or
			 * healthmon; the ioctl itself still succeeds so the
			 * caller sees how far we got.
			 */
			error = 0;
			break;
		}

		daddr += bio_bbcount;
		bbcount -= bio_bbcount;

		if (bbcount == 0)
			break;

		/* Optionally rest between reads to limit the IO load. */
		if (me->me_rest_us) {
			ktime_t	expires;

			/* me_rest_us is microseconds; convert to ns. */
			expires = ktime_add_ns(ktime_get(),
					me->me_rest_us * 1000);
			set_current_state(TASK_KILLABLE);
			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
		}

		/* A fatal signal aborts the scan with an operational error. */
		if (fatal_signal_pending(current)) {
			error = -EINTR;
			break;
		}

		cond_resched();
	}

	bio_put(bio);
out_folio:
	folio_put(folio);

	if (error)
		return error;

	/*
	 * Advance start_daddr to the end of what we verified if there wasn't
	 * an operational error.
	 */
	me->me_start_daddr = daddr;
	trace_xfs_verify_media_end(mp, me, btp->bt_bdev->bd_dev);
	return 0;
}
406 
407 int
408 xfs_ioc_verify_media(
409 	struct file			*file,
410 	struct xfs_verify_media __user	*arg)
411 {
412 	struct xfs_verify_media		me;
413 	struct xfs_inode		*ip = XFS_I(file_inode(file));
414 	struct xfs_mount		*mp = ip->i_mount;
415 	int				error;
416 
417 	if (!capable(CAP_SYS_ADMIN))
418 		return -EPERM;
419 
420 	if (copy_from_user(&me, arg, sizeof(me)))
421 		return -EFAULT;
422 
423 	if (me.me_pad)
424 		return -EINVAL;
425 	if (me.me_flags & ~XFS_VERIFY_MEDIA_FLAGS)
426 		return -EINVAL;
427 
428 	switch (me.me_dev) {
429 	case XFS_DEV_DATA:
430 	case XFS_DEV_LOG:
431 	case XFS_DEV_RT:
432 		break;
433 	default:
434 		return -EINVAL;
435 	}
436 
437 	error = xfs_verify_media(mp, &me);
438 	if (error)
439 		return error;
440 
441 	if (copy_to_user(arg, &me, sizeof(me)))
442 		return -EFAULT;
443 
444 	return 0;
445 }
446