1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (c) 2026 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs_platform.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_bit.h"
13 #include "xfs_btree.h"
14 #include "xfs_inode.h"
15 #include "xfs_icache.h"
16 #include "xfs_trans.h"
17 #include "xfs_alloc.h"
18 #include "xfs_ag.h"
19 #include "xfs_rmap.h"
20 #include "xfs_rmap_btree.h"
21 #include "xfs_rtgroup.h"
22 #include "xfs_rtrmap_btree.h"
23 #include "xfs_health.h"
24 #include "xfs_healthmon.h"
25 #include "xfs_trace.h"
26 #include "xfs_verify_media.h"
27
28 #include <linux/fserror.h>
29
/*
 * Damaged region within a single allocation group, expressed in
 * group-relative filesystem blocks.  Passed as the private data pointer to
 * the rmap range-query callback below.
 */
struct xfs_group_data_lost {
	/* first group block of the damaged region */
	xfs_agblock_t		startblock;
	/* number of damaged blocks */
	xfs_extlen_t		blockcount;
};
34
35 /* Report lost file data from rmap records */
/*
 * Report lost file data from rmap records.
 *
 * xfs_rmap_query_range callback, invoked once for each reverse mapping that
 * overlaps the damaged region described by @data (a struct
 * xfs_group_data_lost).  For file data mappings, the overlap between the
 * rmap record and the damaged region is translated into a byte range within
 * the owning file and reported via fserror_report_data_lost(); for metadata
 * mappings (bmbt blocks, xattr blocks) the relevant health state is marked
 * sick instead.
 *
 * Always returns 0 so that the range query keeps going even if one owner
 * cannot be processed.
 */
static int
xfs_verify_report_data_lost(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_group_data_lost	*lost = data;
	/* file offset/length of the mapping; trimmed to the overlap below */
	xfs_fileoff_t			fileoff = rec->rm_offset;
	xfs_extlen_t			blocks = rec->rm_blockcount;
	const bool			is_attr =
		(rec->rm_flags & XFS_RMAP_ATTR_FORK);
	/* exclusive ends of the damaged region and of this rmap record */
	const xfs_agblock_t		lost_end =
		lost->startblock + lost->blockcount;
	const xfs_agblock_t		rmap_end =
		rec->rm_startblock + rec->rm_blockcount;
	int				error = 0;

	/* Non-inode owners (AG metadata, free space, etc.) have no file. */
	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
		return 0;

	/*
	 * If we can't grab the inode (reclaimed, corrupt, ...), skip this
	 * record rather than aborting the whole walk.
	 */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip);
	if (error)
		return 0;

	/* Damaged bmbt block: mark the whole fork mapping structure sick. */
	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
		xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
		goto out_rele;
	}

	/* Damaged xattr block: mark the xattr structure sick. */
	if (is_attr) {
		xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
		goto out_rele;
	}

	/*
	 * Trim the mapping to the intersection with the damaged region:
	 * advance past any undamaged blocks at the front, then drop any
	 * undamaged blocks off the back.
	 */
	if (lost->startblock > rec->rm_startblock) {
		fileoff += lost->startblock - rec->rm_startblock;
		blocks -= lost->startblock - rec->rm_startblock;
	}
	if (rmap_end > lost_end)
		blocks -= rmap_end - lost_end;

	/* Tell fsnotify listeners which bytes of this file were lost. */
	fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
			XFS_FSB_TO_B(mp, blocks), GFP_NOFS);

out_rele:
	xfs_irele(ip);
	return 0;
}
86
87 /* Walk reverse mappings to look for all file data loss */
88 static int
xfs_verify_report_losses(struct xfs_mount * mp,enum xfs_group_type type,xfs_daddr_t daddr,u64 bblen)89 xfs_verify_report_losses(
90 struct xfs_mount *mp,
91 enum xfs_group_type type,
92 xfs_daddr_t daddr,
93 u64 bblen)
94 {
95 struct xfs_group *xg = NULL;
96 struct xfs_trans *tp;
97 xfs_fsblock_t start_bno, end_bno;
98 uint32_t start_gno, end_gno;
99 int error;
100
101 if (type == XG_TYPE_RTG) {
102 start_bno = xfs_daddr_to_rtb(mp, daddr);
103 end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
104 } else {
105 start_bno = XFS_DADDR_TO_FSB(mp, daddr);
106 end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
107 }
108
109 tp = xfs_trans_alloc_empty(mp);
110 start_gno = xfs_fsb_to_gno(mp, start_bno, type);
111 end_gno = xfs_fsb_to_gno(mp, end_bno, type);
112 while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
113 struct xfs_buf *agf_bp = NULL;
114 struct xfs_rtgroup *rtg = NULL;
115 struct xfs_btree_cur *cur;
116 struct xfs_rmap_irec ri_low = { };
117 struct xfs_rmap_irec ri_high;
118 struct xfs_group_data_lost lost;
119
120 if (type == XG_TYPE_AG) {
121 struct xfs_perag *pag = to_perag(xg);
122
123 error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
124 if (error) {
125 xfs_perag_rele(pag);
126 break;
127 }
128
129 cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
130 } else {
131 rtg = to_rtg(xg);
132 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
133 cur = xfs_rtrmapbt_init_cursor(tp, rtg);
134 }
135
136 /*
137 * Set the rmap range from ri_low to ri_high, which represents
138 * a [start, end] where we looking for the files or metadata.
139 */
140 memset(&ri_high, 0xFF, sizeof(ri_high));
141 if (xg->xg_gno == start_gno)
142 ri_low.rm_startblock =
143 xfs_fsb_to_gbno(mp, start_bno, type);
144 if (xg->xg_gno == end_gno)
145 ri_high.rm_startblock =
146 xfs_fsb_to_gbno(mp, end_bno, type);
147
148 lost.startblock = ri_low.rm_startblock;
149 lost.blockcount = min(xg->xg_block_count,
150 ri_high.rm_startblock + 1) -
151 ri_low.rm_startblock;
152
153 error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
154 xfs_verify_report_data_lost, &lost);
155 xfs_btree_del_cursor(cur, error);
156 if (agf_bp)
157 xfs_trans_brelse(tp, agf_bp);
158 if (rtg)
159 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
160 if (error) {
161 xfs_group_rele(xg);
162 break;
163 }
164 }
165
166 xfs_trans_cancel(tp);
167 return 0;
168 }
169
170 /*
171 * Compute the desired verify IO size.
172 *
173 * To minimize command overhead, we'd like to create bios that are 1MB, though
174 * we allow the user to ask for a smaller size.
175 */
176 static unsigned int
xfs_verify_iosize(const struct xfs_verify_media * me,struct xfs_buftarg * btp,uint64_t bbcount)177 xfs_verify_iosize(
178 const struct xfs_verify_media *me,
179 struct xfs_buftarg *btp,
180 uint64_t bbcount)
181 {
182 unsigned int iosize =
183 min_not_zero(SZ_1M, me->me_max_io_size);
184
185 BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
186 ASSERT(BBTOB(bbcount) >= bdev_logical_block_size(btp->bt_bdev));
187
188 return clamp(iosize, bdev_logical_block_size(btp->bt_bdev),
189 BBTOB(bbcount));
190 }
191
192 /* Allocate as much memory as we can get for verification buffer. */
193 static struct folio *
xfs_verify_alloc_folio(const unsigned int iosize)194 xfs_verify_alloc_folio(
195 const unsigned int iosize)
196 {
197 unsigned int order = get_order(iosize);
198
199 while (order > 0) {
200 struct folio *folio =
201 folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);
202
203 if (folio)
204 return folio;
205 order--;
206 }
207
208 return folio_alloc(GFP_KERNEL, 0);
209 }
210
211 /* Report any kind of problem verifying media */
212 static void
xfs_verify_media_error(struct xfs_mount * mp,struct xfs_verify_media * me,struct xfs_buftarg * btp,xfs_daddr_t daddr,unsigned int bio_bbcount,blk_status_t bio_status)213 xfs_verify_media_error(
214 struct xfs_mount *mp,
215 struct xfs_verify_media *me,
216 struct xfs_buftarg *btp,
217 xfs_daddr_t daddr,
218 unsigned int bio_bbcount,
219 blk_status_t bio_status)
220 {
221 trace_xfs_verify_media_error(mp, me, btp->bt_bdev->bd_dev, daddr,
222 bio_bbcount, bio_status);
223
224 /*
225 * Pass any error, I/O or otherwise, up to the caller if we didn't
226 * successfully verify any bytes at all.
227 */
228 if (me->me_start_daddr == daddr)
229 me->me_ioerror = -blk_status_to_errno(bio_status);
230
231 /*
232 * PI validation failures, medium errors, or general IO errors are
233 * treated as indicators of data loss. Everything else are (hopefully)
234 * transient errors and are not reported to healthmon or fsnotify.
235 */
236 switch (bio_status) {
237 case BLK_STS_PROTECTION:
238 case BLK_STS_IOERR:
239 case BLK_STS_MEDIUM:
240 break;
241 default:
242 return;
243 }
244
245 if (!(me->me_flags & XFS_VERIFY_MEDIA_REPORT))
246 return;
247
248 xfs_healthmon_report_media(mp, me->me_dev, daddr, bio_bbcount);
249
250 if (!xfs_has_rmapbt(mp))
251 return;
252
253 switch (me->me_dev) {
254 case XFS_DEV_DATA:
255 xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount);
256 break;
257 case XFS_DEV_RT:
258 xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount);
259 break;
260 }
261 }
262
263 /* Verify the media of an xfs device by submitting read requests to the disk. */
/*
 * Verify the media of an xfs device by submitting read requests to the disk.
 *
 * Reads the daddr range [me_start_daddr, me_end_daddr) from the chosen
 * device in chunks, pausing me_rest_us microseconds between chunks if
 * requested.  On return, me_start_daddr has been advanced past everything
 * verified successfully and me_ioerror holds the first IO error (only if
 * nothing at all was verified).  Returns a negative errno only for
 * operational failures (bad parameters, allocation failure, fatal signal);
 * media errors themselves are reported via xfs_verify_media_error() and do
 * not fail the call.
 */
static int
xfs_verify_media(
	struct xfs_mount	*mp,
	struct xfs_verify_media	*me)
{
	struct xfs_buftarg	*btp = NULL;
	struct bio		*bio;
	struct folio		*folio;
	xfs_daddr_t		daddr;
	uint64_t		bbcount;
	int			error = 0;

	me->me_ioerror = 0;

	/* Resolve the requested device to a buffer target. */
	switch (me->me_dev) {
	case XFS_DEV_DATA:
		btp = mp->m_ddev_targp;
		break;
	case XFS_DEV_LOG:
		/* An internal log lives on the data device; don't rescan it. */
		if (mp->m_logdev_targp->bt_bdev != mp->m_ddev_targp->bt_bdev)
			btp = mp->m_logdev_targp;
		break;
	case XFS_DEV_RT:
		btp = mp->m_rtdev_targp;
		break;
	}
	if (!btp)
		return -ENODEV;

	/*
	 * If the caller told us to verify beyond the end of the disk, tell the
	 * user exactly where that was.
	 */
	if (me->me_end_daddr > btp->bt_nr_sectors)
		me->me_end_daddr = btp->bt_nr_sectors;

	/* start and end have to be aligned to the lba size */
	if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr),
			bdev_logical_block_size(btp->bt_bdev)))
		return -EINVAL;

	/*
	 * end_daddr is the exclusive end of the range, so if start_daddr
	 * reaches there (or beyond), there's no work to be done.
	 */
	if (me->me_start_daddr >= me->me_end_daddr)
		return 0;

	/*
	 * There are three ranges involved here:
	 *
	 *  - [me->me_start_daddr, me->me_end_daddr) is the range that the
	 *    user wants to verify.  end_daddr can be beyond the end of the
	 *    disk; we'll constrain it to the end if necessary.
	 *
	 *  - [daddr, me->me_end_daddr) is the range that we have not yet
	 *    verified.  We update daddr after each successful read.
	 *    me->me_start_daddr is set to daddr before returning.
	 *
	 *  - [daddr, daddr + bio_bbcount) is the range that we're currently
	 *    verifying.
	 */
	daddr = me->me_start_daddr;
	bbcount = min_t(sector_t, me->me_end_daddr, btp->bt_nr_sectors) -
			me->me_start_daddr;

	folio = xfs_verify_alloc_folio(xfs_verify_iosize(me, btp, bbcount));
	if (!folio)
		return -ENOMEM;

	trace_xfs_verify_media(mp, me, btp->bt_bdev->bd_dev, daddr, bbcount,
			folio);

	/* One bio, reset and reused for every chunk of the scan. */
	bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL);
	if (!bio) {
		error = -ENOMEM;
		goto out_folio;
	}

	while (bbcount > 0) {
		unsigned int	bio_bbcount;
		blk_status_t	bio_status;

		bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
		bio->bi_iter.bi_sector = daddr;
		/* Read as much as fits in the folio or remains in the scan. */
		bio_add_folio_nofail(bio, folio,
				min(bbcount << SECTOR_SHIFT, folio_size(folio)),
				0);

		/*
		 * Save the length of the bio before we submit it, because we
		 * need the original daddr and length for reporting IO errors
		 * if the bio fails.
		 */
		bio_bbcount = bio->bi_iter.bi_size >> SECTOR_SHIFT;
		submit_bio_wait(bio);
		bio_status = bio->bi_status;
		if (bio_status != BLK_STS_OK) {
			/* Media error: report it and end the scan cleanly. */
			xfs_verify_media_error(mp, me, btp, daddr, bio_bbcount,
					bio_status);
			error = 0;
			break;
		}

		daddr += bio_bbcount;
		bbcount -= bio_bbcount;

		if (bbcount == 0)
			break;

		/* Optional throttle between chunks to limit device load. */
		if (me->me_rest_us) {
			ktime_t	expires;

			expires = ktime_add_ns(ktime_get(),
					me->me_rest_us * 1000);
			set_current_state(TASK_KILLABLE);
			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
		}

		if (fatal_signal_pending(current)) {
			error = -EINTR;
			break;
		}

		cond_resched();
	}

	bio_put(bio);
out_folio:
	folio_put(folio);

	if (error)
		return error;

	/*
	 * Advance start_daddr to the end of what we verified if there wasn't
	 * an operational error.
	 */
	me->me_start_daddr = daddr;
	trace_xfs_verify_media_end(mp, me, btp->bt_bdev->bd_dev);
	return 0;
}
406
407 int
xfs_ioc_verify_media(struct file * file,struct xfs_verify_media __user * arg)408 xfs_ioc_verify_media(
409 struct file *file,
410 struct xfs_verify_media __user *arg)
411 {
412 struct xfs_verify_media me;
413 struct xfs_inode *ip = XFS_I(file_inode(file));
414 struct xfs_mount *mp = ip->i_mount;
415 int error;
416
417 if (!capable(CAP_SYS_ADMIN))
418 return -EPERM;
419
420 if (copy_from_user(&me, arg, sizeof(me)))
421 return -EFAULT;
422
423 if (me.me_pad)
424 return -EINVAL;
425 if (me.me_flags & ~XFS_VERIFY_MEDIA_FLAGS)
426 return -EINVAL;
427
428 switch (me.me_dev) {
429 case XFS_DEV_DATA:
430 case XFS_DEV_LOG:
431 case XFS_DEV_RT:
432 break;
433 default:
434 return -EINVAL;
435 }
436
437 error = xfs_verify_media(mp, &me);
438 if (error)
439 return error;
440
441 if (copy_to_user(arg, &me, sizeof(me)))
442 return -EFAULT;
443
444 return 0;
445 }
446