xref: /linux/fs/xfs/xfs_verify_media.c (revision 3577cfd738e29b3d54cdb10c45a56730346dfe8b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2026 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs_platform.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_bit.h"
13 #include "xfs_btree.h"
14 #include "xfs_inode.h"
15 #include "xfs_icache.h"
16 #include "xfs_trans.h"
17 #include "xfs_alloc.h"
18 #include "xfs_ag.h"
19 #include "xfs_rmap.h"
20 #include "xfs_rmap_btree.h"
21 #include "xfs_rtgroup.h"
22 #include "xfs_rtrmap_btree.h"
23 #include "xfs_health.h"
24 #include "xfs_healthmon.h"
25 #include "xfs_trace.h"
26 #include "xfs_verify_media.h"
27 
28 #include <linux/fserror.h>
29 
/*
 * Portion of a media loss that falls within a single allocation group,
 * expressed in group-relative block units.  Passed as the private data
 * pointer to the rmap query callback so that each reverse mapping can be
 * trimmed to the damaged region before it is reported.
 */
struct xfs_group_data_lost {
	xfs_agblock_t		startblock;	/* group-relative start of loss */
	xfs_extlen_t		blockcount;	/* length of lost region in blocks */
};
34 
/*
 * Report lost file data from rmap records.  Called for each reverse mapping
 * that overlaps a damaged region; notifies the owning file (or marks its
 * metadata sick) of the loss.  Always returns 0 so the rmap range query keeps
 * walking -- this reporting is strictly best-effort.
 */
static int
xfs_verify_report_data_lost(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_group_data_lost	*lost = data;
	xfs_fileoff_t			fileoff = rec->rm_offset;
	xfs_extlen_t			blocks = rec->rm_blockcount;
	const bool			is_attr =
			(rec->rm_flags & XFS_RMAP_ATTR_FORK);
	/* End (exclusive) of the damaged region, group-relative blocks. */
	const xfs_agblock_t		lost_end =
			lost->startblock + lost->blockcount;
	/* End (exclusive) of this reverse mapping, group-relative blocks. */
	const xfs_agblock_t		rmap_end =
			rec->rm_startblock + rec->rm_blockcount;
	int				error = 0;

	/* Mappings not owned by an inode have no file to notify; skip them. */
	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
	       return 0;

	/*
	 * Best-effort reporting: if we can't grab the inode, skip this record
	 * rather than aborting the whole scan.
	 */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip);
	if (error)
		return 0;

	/* Losing bmbt blocks means the fork's block map itself is damaged. */
	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
		xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
		goto out_rele;
	}

	/* Xattr data has no file offset to report; mark the xattr fork sick. */
	if (is_attr) {
		xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
		goto out_rele;
	}

	/*
	 * Trim this mapping to its intersection with the lost region so that
	 * we report only the file bytes actually overlapping the bad media.
	 */
	if (lost->startblock > rec->rm_startblock) {
		fileoff += lost->startblock - rec->rm_startblock;
		blocks -= lost->startblock - rec->rm_startblock;
	}
	if (rmap_end > lost_end)
		blocks -= rmap_end - lost_end;

	fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
			XFS_FSB_TO_B(mp, blocks), GFP_NOFS);

out_rele:
	xfs_irele(ip);
	return 0;
}
86 
/*
 * Walk reverse mappings to look for all file data loss in the given daddr
 * range of the data or realtime device.  Iterates every allocation group
 * touched by the range and queries its rmap btree; each overlapping record
 * is reported via xfs_verify_report_data_lost.  Always returns 0 because
 * loss reporting is best-effort.
 */
static int
xfs_verify_report_losses(
	struct xfs_mount	*mp,
	enum xfs_group_type	type,
	xfs_daddr_t		daddr,
	u64			bblen)
{
	struct xfs_group	*xg = NULL;
	struct xfs_trans	*tp;
	xfs_fsblock_t		start_bno, end_bno;
	uint32_t		start_gno, end_gno;
	int			error;

	/* Convert the damaged daddr range to fs blocks on the right device. */
	if (type == XG_TYPE_RTG) {
		start_bno = xfs_daddr_to_rtb(mp, daddr);
		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
	} else {
		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
	}

	/* Empty transaction to satisfy the buffer and btree cursor APIs. */
	tp = xfs_trans_alloc_empty(mp);
	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
		struct xfs_buf		*agf_bp = NULL;
		struct xfs_rtgroup	*rtg = NULL;
		struct xfs_btree_cur	*cur;
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;
		struct xfs_group_data_lost lost;

		/* Set up an rmap cursor for this group's btree flavor. */
		if (type == XG_TYPE_AG) {
			struct xfs_perag	*pag = to_perag(xg);

			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
			if (error) {
				/* Drop the iterator's group reference. */
				xfs_perag_rele(pag);
				break;
			}

			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
		} else {
			rtg = to_rtg(xg);
			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
		}

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * the [start, end] range where we are looking for the files or
		 * metadata.  Only the first and last groups can be partial;
		 * every group in between is scanned in its entirety.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		if (xg->xg_gno == start_gno)
			ri_low.rm_startblock =
				xfs_fsb_to_gbno(mp, start_bno, type);
		if (xg->xg_gno == end_gno)
			ri_high.rm_startblock =
				xfs_fsb_to_gbno(mp, end_bno, type);

		/* Group-relative extent of the damage, for the callback. */
		lost.startblock = ri_low.rm_startblock;
		lost.blockcount = min(xg->xg_block_count,
				      ri_high.rm_startblock + 1) -
							ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_verify_report_data_lost, &lost);
		xfs_btree_del_cursor(cur, error);
		if (agf_bp)
			xfs_trans_brelse(tp, agf_bp);
		if (rtg)
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
		if (error) {
			xfs_group_rele(xg);
			break;
		}
	}

	/* Best-effort reporting; any iteration error is dropped here. */
	xfs_trans_cancel(tp);
	return 0;
}
169 
170 /*
171  * Compute the desired verify IO size.
172  *
173  * To minimize command overhead, we'd like to create bios that are 1MB, though
174  * we allow the user to ask for a smaller size.
175  */
176 static unsigned int
xfs_verify_iosize(const struct xfs_verify_media * me,struct xfs_buftarg * btp,uint64_t bbcount)177 xfs_verify_iosize(
178 	const struct xfs_verify_media	*me,
179 	struct xfs_buftarg		*btp,
180 	uint64_t			bbcount)
181 {
182 	unsigned int			iosize =
183 			min_not_zero(SZ_1M, me->me_max_io_size);
184 
185 	BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
186 	ASSERT(BBTOB(bbcount) >= btp->bt_logical_sectorsize);
187 
188 	return clamp(iosize, btp->bt_logical_sectorsize, BBTOB(bbcount));
189 }
190 
191 /* Allocate as much memory as we can get for verification buffer. */
192 static struct folio *
xfs_verify_alloc_folio(const unsigned int iosize)193 xfs_verify_alloc_folio(
194 	const unsigned int	iosize)
195 {
196 	unsigned int		order = get_order(iosize);
197 
198 	while (order > 0) {
199 		struct folio	*folio =
200 			folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);
201 
202 		if (folio)
203 			return folio;
204 		order--;
205 	}
206 
207 	return folio_alloc(GFP_KERNEL, 0);
208 }
209 
210 /* Report any kind of problem verifying media */
211 static void
xfs_verify_media_error(struct xfs_mount * mp,struct xfs_verify_media * me,struct xfs_buftarg * btp,xfs_daddr_t daddr,unsigned int bio_bbcount,blk_status_t bio_status)212 xfs_verify_media_error(
213 	struct xfs_mount	*mp,
214 	struct xfs_verify_media	*me,
215 	struct xfs_buftarg	*btp,
216 	xfs_daddr_t		daddr,
217 	unsigned int		bio_bbcount,
218 	blk_status_t		bio_status)
219 {
220 	trace_xfs_verify_media_error(mp, me, btp->bt_dev, daddr, bio_bbcount,
221 			bio_status);
222 
223 	/*
224 	 * Pass any error, I/O or otherwise, up to the caller if we didn't
225 	 * successfully verify any bytes at all.
226 	 */
227 	if (me->me_start_daddr == daddr)
228 		me->me_ioerror = -blk_status_to_errno(bio_status);
229 
230 	/*
231 	 * PI validation failures, medium errors, or general IO errors are
232 	 * treated as indicators of data loss.  Everything else are (hopefully)
233 	 * transient errors and are not reported to healthmon or fsnotify.
234 	 */
235 	switch (bio_status) {
236 	case BLK_STS_PROTECTION:
237 	case BLK_STS_IOERR:
238 	case BLK_STS_MEDIUM:
239 		break;
240 	default:
241 		return;
242 	}
243 
244 	if (!(me->me_flags & XFS_VERIFY_MEDIA_REPORT))
245 		return;
246 
247 	xfs_healthmon_report_media(mp, me->me_dev, daddr, bio_bbcount);
248 
249 	if (!xfs_has_rmapbt(mp))
250 		return;
251 
252 	switch (me->me_dev) {
253 	case XFS_DEV_DATA:
254 		xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount);
255 		break;
256 	case XFS_DEV_RT:
257 		xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount);
258 		break;
259 	}
260 }
261 
/*
 * Verify the media of an xfs device by submitting read requests to the disk.
 *
 * On return, me->me_start_daddr has been advanced past whatever was verified
 * successfully, so userspace can call again to resume.  Media failures are
 * reported via xfs_verify_media_error and recorded in me->me_ioerror; only
 * operational errors (bad arguments, OOM, fatal signal) are returned.
 */
static int
xfs_verify_media(
	struct xfs_mount	*mp,
	struct xfs_verify_media	*me)
{
	struct xfs_buftarg	*btp = NULL;
	struct bio		*bio;
	struct folio		*folio;
	xfs_daddr_t		daddr;
	uint64_t		bbcount;
	int			error = 0;

	me->me_ioerror = 0;

	/* Map the requested device to its buffer target, if one exists. */
	switch (me->me_dev) {
	case XFS_DEV_DATA:
		btp = mp->m_ddev_targp;
		break;
	case XFS_DEV_LOG:
		/* An internal log shares the data device; no separate scan. */
		if (mp->m_logdev_targp != mp->m_ddev_targp)
			btp = mp->m_logdev_targp;
		break;
	case XFS_DEV_RT:
		btp = mp->m_rtdev_targp;
		break;
	}
	if (!btp)
		return -ENODEV;

	/*
	 * If the caller told us to verify beyond the end of the disk, tell the
	 * user exactly where that was.
	 */
	if (me->me_end_daddr > btp->bt_nr_sectors)
		me->me_end_daddr = btp->bt_nr_sectors;

	/* start and end have to be aligned to the lba size */
	if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr),
			btp->bt_logical_sectorsize))
		return -EINVAL;

	/*
	 * end_daddr is the exclusive end of the range, so if start_daddr
	 * reaches there (or beyond), there's no work to be done.
	 */
	if (me->me_start_daddr >= me->me_end_daddr)
		return 0;

	/*
	 * There are three ranges involved here:
	 *
	 *  - [me->me_start_daddr, me->me_end_daddr) is the range that the
	 *    user wants to verify.  end_daddr can be beyond the end of the
	 *    disk; we'll constrain it to the end if necessary.
	 *
	 *  - [daddr, me->me_end_daddr) is the range that we have not yet
	 *    verified.  We update daddr after each successful read.
	 *    me->me_start_daddr is set to daddr before returning.
	 *
	 *  - [daddr, daddr + bio_bbcount) is the range that we're currently
	 *    verifying.
	 */
	daddr = me->me_start_daddr;
	bbcount = min_t(sector_t, me->me_end_daddr, btp->bt_nr_sectors) -
			  me->me_start_daddr;

	folio = xfs_verify_alloc_folio(xfs_verify_iosize(me, btp, bbcount));
	if (!folio)
		return -ENOMEM;

	trace_xfs_verify_media(mp, me, btp->bt_dev, daddr, bbcount, folio);

	bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL);
	if (!bio) {
		error = -ENOMEM;
		goto out_folio;
	}

	while (bbcount > 0) {
		unsigned int	bio_bbcount;
		blk_status_t	bio_status;

		/* Reuse one bio for every read; reset it each iteration. */
		bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
		bio->bi_iter.bi_sector = daddr;
		/* Read as much as fits in the folio, but no more than remains. */
		bio_add_folio_nofail(bio, folio,
				min(bbcount << SECTOR_SHIFT, folio_size(folio)),
				0);

		/*
		 * Save the length of the bio before we submit it, because we
		 * need the original daddr and length for reporting IO errors
		 * if the bio fails.
		 */
		bio_bbcount = bio->bi_iter.bi_size >> SECTOR_SHIFT;
		submit_bio_wait(bio);
		bio_status = bio->bi_status;
		if (bio_status != BLK_STS_OK) {
			xfs_verify_media_error(mp, me, btp, daddr, bio_bbcount,
					bio_status);
			/*
			 * A media error is not an operational error; it was
			 * recorded above, so stop the scan without failing.
			 */
			error = 0;
			break;
		}

		/* This chunk verified clean; advance the progress cursor. */
		daddr += bio_bbcount;
		bbcount -= bio_bbcount;

		if (bbcount == 0)
			break;

		/*
		 * Optionally rest between reads to throttle the scan's impact
		 * on the device.  TASK_KILLABLE lets a fatal signal end the
		 * nap early; we check for one right below.
		 */
		if (me->me_rest_us) {
			ktime_t	expires;

			expires = ktime_add_ns(ktime_get(),
					me->me_rest_us * 1000);
			set_current_state(TASK_KILLABLE);
			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
		}

		if (fatal_signal_pending(current)) {
			error = -EINTR;
			break;
		}

		cond_resched();
	}

	bio_put(bio);
out_folio:
	folio_put(folio);

	if (error)
		return error;

	/*
	 * Advance start_daddr to the end of what we verified if there wasn't
	 * an operational error.
	 */
	me->me_start_daddr = daddr;
	trace_xfs_verify_media_end(mp, me, btp->bt_dev);
	return 0;
}
404 
405 int
xfs_ioc_verify_media(struct file * file,struct xfs_verify_media __user * arg)406 xfs_ioc_verify_media(
407 	struct file			*file,
408 	struct xfs_verify_media __user	*arg)
409 {
410 	struct xfs_verify_media		me;
411 	struct xfs_inode		*ip = XFS_I(file_inode(file));
412 	struct xfs_mount		*mp = ip->i_mount;
413 	int				error;
414 
415 	if (!capable(CAP_SYS_ADMIN))
416 		return -EPERM;
417 
418 	if (copy_from_user(&me, arg, sizeof(me)))
419 		return -EFAULT;
420 
421 	if (me.me_pad)
422 		return -EINVAL;
423 	if (me.me_flags & ~XFS_VERIFY_MEDIA_FLAGS)
424 		return -EINVAL;
425 
426 	switch (me.me_dev) {
427 	case XFS_DEV_DATA:
428 	case XFS_DEV_LOG:
429 	case XFS_DEV_RT:
430 		break;
431 	default:
432 		return -EINVAL;
433 	}
434 
435 	error = xfs_verify_media(mp, &me);
436 	if (error)
437 		return error;
438 
439 	if (copy_to_user(arg, &me, sizeof(me)))
440 		return -EFAULT;
441 
442 	return 0;
443 }
444