1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (c) 2026 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs_platform.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_bit.h"
13 #include "xfs_btree.h"
14 #include "xfs_inode.h"
15 #include "xfs_icache.h"
16 #include "xfs_trans.h"
17 #include "xfs_alloc.h"
18 #include "xfs_ag.h"
19 #include "xfs_rmap.h"
20 #include "xfs_rmap_btree.h"
21 #include "xfs_rtgroup.h"
22 #include "xfs_rtrmap_btree.h"
23 #include "xfs_health.h"
24 #include "xfs_healthmon.h"
25 #include "xfs_trace.h"
26 #include "xfs_verify_media.h"
27
28 #include <linux/fserror.h>
29
/*
 * Describes the group-relative portion of a damaged device range for which
 * media verification must report lost data.
 */
struct xfs_group_data_lost {
	xfs_agblock_t		startblock;	/* group-relative start block */
	xfs_extlen_t		blockcount;	/* length of lost range, blocks */
};
34
/*
 * Report lost file data from a single reverse mapping record.
 *
 * Called for each rmap record overlapping the damaged range described by
 * @data (a struct xfs_group_data_lost).  Inode-owned mappings either get the
 * relevant metadata marked sick, or the lost file range reported via
 * fserror.  Always returns 0 so the rmap query keeps walking; this whole
 * path is best-effort reporting.
 */
static int
xfs_verify_report_data_lost(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_group_data_lost	*lost = data;
	xfs_fileoff_t			fileoff = rec->rm_offset;
	xfs_extlen_t			blocks = rec->rm_blockcount;
	const bool			is_attr =
		(rec->rm_flags & XFS_RMAP_ATTR_FORK);
	const xfs_agblock_t		lost_end =
		lost->startblock + lost->blockcount;
	const xfs_agblock_t		rmap_end =
		rec->rm_startblock + rec->rm_blockcount;
	int				error = 0;

	/* Space not owned by an inode (fs metadata) has no file to notify. */
	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
		return 0;

	/* If we can't grab the inode, skip this record; reporting is best-effort. */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip);
	if (error)
		return 0;

	/* Lost bmbt block: the whole fork mapping structure is suspect. */
	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
		xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
		goto out_rele;
	}

	/* Lost xattr blocks: mark the attr fork sick; no file range to report. */
	if (is_attr) {
		xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
		goto out_rele;
	}

	/*
	 * Trim the reported file range to the intersection of this rmap
	 * record with the lost device range.
	 */
	if (lost->startblock > rec->rm_startblock) {
		fileoff += lost->startblock - rec->rm_startblock;
		blocks -= lost->startblock - rec->rm_startblock;
	}
	if (rmap_end > lost_end)
		blocks -= rmap_end - lost_end;

	fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
			XFS_FSB_TO_B(mp, blocks), GFP_NOFS);

out_rele:
	xfs_irele(ip);
	return 0;
}
86
/*
 * Walk the reverse mappings of every group overlapping the damaged device
 * range [daddr, daddr + bblen) and report all file data losses.
 *
 * Errors encountered during the walk terminate it but are not propagated;
 * loss reporting is best-effort and this function always returns 0.
 */
static int
xfs_verify_report_losses(
	struct xfs_mount	*mp,
	enum xfs_group_type	type,
	xfs_daddr_t		daddr,
	u64			bblen)
{
	struct xfs_group	*xg = NULL;
	struct xfs_trans	*tp;
	xfs_fsblock_t		start_bno, end_bno;
	uint32_t		start_gno, end_gno;
	int			error;

	/* Convert the daddr range to fs blocks on the appropriate device. */
	if (type == XG_TYPE_RTG) {
		start_bno = xfs_daddr_to_rtb(mp, daddr);
		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
	} else {
		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
	}

	/* Empty transaction context for the rmap btree cursors. */
	tp = xfs_trans_alloc_empty(mp);
	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
		struct xfs_buf		*agf_bp = NULL;
		struct xfs_rtgroup	*rtg = NULL;
		struct xfs_btree_cur	*cur;
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;
		struct xfs_group_data_lost lost;

		/* Set up an rmap cursor for this AG or realtime group. */
		if (type == XG_TYPE_AG) {
			struct xfs_perag	*pag = to_perag(xg);

			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
			if (error) {
				/* Drop the loop's group reference on the way out. */
				xfs_perag_rele(pag);
				break;
			}

			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
		} else {
			rtg = to_rtg(xg);
			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
		}

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * a [start, end] where we're looking for the files or
		 * metadata.  Interior groups are damaged over their entire
		 * length; the first and last groups are constrained to the
		 * group-relative start/end of the damaged range.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		if (xg->xg_gno == start_gno)
			ri_low.rm_startblock =
					xfs_fsb_to_gbno(mp, start_bno, type);
		if (xg->xg_gno == end_gno)
			ri_high.rm_startblock =
					xfs_fsb_to_gbno(mp, end_bno, type);

		/* Clamp the lost range to the actual size of this group. */
		lost.startblock = ri_low.rm_startblock;
		lost.blockcount = min(xg->xg_block_count,
				      ri_high.rm_startblock + 1) -
				  ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_verify_report_data_lost, &lost);
		xfs_btree_del_cursor(cur, error);
		if (agf_bp)
			xfs_trans_brelse(tp, agf_bp);
		if (rtg)
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
		if (error) {
			/* Drop the loop's group reference on the way out. */
			xfs_group_rele(xg);
			break;
		}
	}

	xfs_trans_cancel(tp);
	return 0;
}
169
170 /*
171 * Compute the desired verify IO size.
172 *
173 * To minimize command overhead, we'd like to create bios that are 1MB, though
174 * we allow the user to ask for a smaller size.
175 */
176 static unsigned int
xfs_verify_iosize(const struct xfs_verify_media * me,struct xfs_buftarg * btp,uint64_t bbcount)177 xfs_verify_iosize(
178 const struct xfs_verify_media *me,
179 struct xfs_buftarg *btp,
180 uint64_t bbcount)
181 {
182 unsigned int iosize =
183 min_not_zero(SZ_1M, me->me_max_io_size);
184
185 BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
186 ASSERT(BBTOB(bbcount) >= btp->bt_logical_sectorsize);
187
188 return clamp(iosize, btp->bt_logical_sectorsize, BBTOB(bbcount));
189 }
190
191 /* Allocate as much memory as we can get for verification buffer. */
192 static struct folio *
xfs_verify_alloc_folio(const unsigned int iosize)193 xfs_verify_alloc_folio(
194 const unsigned int iosize)
195 {
196 unsigned int order = get_order(iosize);
197
198 while (order > 0) {
199 struct folio *folio =
200 folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);
201
202 if (folio)
203 return folio;
204 order--;
205 }
206
207 return folio_alloc(GFP_KERNEL, 0);
208 }
209
210 /* Report any kind of problem verifying media */
211 static void
xfs_verify_media_error(struct xfs_mount * mp,struct xfs_verify_media * me,struct xfs_buftarg * btp,xfs_daddr_t daddr,unsigned int bio_bbcount,blk_status_t bio_status)212 xfs_verify_media_error(
213 struct xfs_mount *mp,
214 struct xfs_verify_media *me,
215 struct xfs_buftarg *btp,
216 xfs_daddr_t daddr,
217 unsigned int bio_bbcount,
218 blk_status_t bio_status)
219 {
220 trace_xfs_verify_media_error(mp, me, btp->bt_dev, daddr, bio_bbcount,
221 bio_status);
222
223 /*
224 * Pass any error, I/O or otherwise, up to the caller if we didn't
225 * successfully verify any bytes at all.
226 */
227 if (me->me_start_daddr == daddr)
228 me->me_ioerror = -blk_status_to_errno(bio_status);
229
230 /*
231 * PI validation failures, medium errors, or general IO errors are
232 * treated as indicators of data loss. Everything else are (hopefully)
233 * transient errors and are not reported to healthmon or fsnotify.
234 */
235 switch (bio_status) {
236 case BLK_STS_PROTECTION:
237 case BLK_STS_IOERR:
238 case BLK_STS_MEDIUM:
239 break;
240 default:
241 return;
242 }
243
244 if (!(me->me_flags & XFS_VERIFY_MEDIA_REPORT))
245 return;
246
247 xfs_healthmon_report_media(mp, me->me_dev, daddr, bio_bbcount);
248
249 if (!xfs_has_rmapbt(mp))
250 return;
251
252 switch (me->me_dev) {
253 case XFS_DEV_DATA:
254 xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount);
255 break;
256 case XFS_DEV_RT:
257 xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount);
258 break;
259 }
260 }
261
/* Verify the media of an xfs device by submitting read requests to the disk. */
static int
xfs_verify_media(
	struct xfs_mount	*mp,
	struct xfs_verify_media	*me)
{
	struct xfs_buftarg	*btp = NULL;
	struct bio		*bio;
	struct folio		*folio;
	xfs_daddr_t		daddr;
	uint64_t		bbcount;
	int			error = 0;

	me->me_ioerror = 0;

	/* Resolve the requested device to its buffer target. */
	switch (me->me_dev) {
	case XFS_DEV_DATA:
		btp = mp->m_ddev_targp;
		break;
	case XFS_DEV_LOG:
		/*
		 * An internal log shares the data device; leave btp NULL so
		 * we return -ENODEV instead of scanning the data device.
		 */
		if (mp->m_logdev_targp != mp->m_ddev_targp)
			btp = mp->m_logdev_targp;
		break;
	case XFS_DEV_RT:
		btp = mp->m_rtdev_targp;
		break;
	}
	if (!btp)
		return -ENODEV;

	/*
	 * If the caller told us to verify beyond the end of the disk, tell the
	 * user exactly where that was.
	 */
	if (me->me_end_daddr > btp->bt_nr_sectors)
		me->me_end_daddr = btp->bt_nr_sectors;

	/* start and end have to be aligned to the lba size */
	if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr),
			btp->bt_logical_sectorsize))
		return -EINVAL;

	/*
	 * end_daddr is the exclusive end of the range, so if start_daddr
	 * reaches there (or beyond), there's no work to be done.
	 */
	if (me->me_start_daddr >= me->me_end_daddr)
		return 0;

	/*
	 * There are three ranges involved here:
	 *
	 *  - [me->me_start_daddr, me->me_end_daddr) is the range that the
	 *    user wants to verify.  end_daddr can be beyond the end of the
	 *    disk; we'll constrain it to the end if necessary.
	 *
	 *  - [daddr, me->me_end_daddr) is the range that we have not yet
	 *    verified.  We update daddr after each successful read.
	 *    me->me_start_daddr is set to daddr before returning.
	 *
	 *  - [daddr, daddr + bio_bbcount) is the range that we're currently
	 *    verifying.
	 */
	daddr = me->me_start_daddr;
	bbcount = min_t(sector_t, me->me_end_daddr, btp->bt_nr_sectors) -
			me->me_start_daddr;

	folio = xfs_verify_alloc_folio(xfs_verify_iosize(me, btp, bbcount));
	if (!folio)
		return -ENOMEM;

	trace_xfs_verify_media(mp, me, btp->bt_dev, daddr, bbcount, folio);

	bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL);
	if (!bio) {
		error = -ENOMEM;
		goto out_folio;
	}

	/* Read the whole range, one folio-sized bio at a time. */
	while (bbcount > 0) {
		unsigned int	bio_bbcount;
		blk_status_t	bio_status;

		/* Reuse the same bio and folio for every read. */
		bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
		bio->bi_iter.bi_sector = daddr;
		bio_add_folio_nofail(bio, folio,
				min(bbcount << SECTOR_SHIFT, folio_size(folio)),
				0);

		/*
		 * Save the length of the bio before we submit it, because we
		 * need the original daddr and length for reporting IO errors
		 * if the bio fails.
		 */
		bio_bbcount = bio->bi_iter.bi_size >> SECTOR_SHIFT;
		submit_bio_wait(bio);
		bio_status = bio->bi_status;
		if (bio_status != BLK_STS_OK) {
			/*
			 * Stop at the first bad range; the error handler
			 * records the caller-visible error and reports data
			 * losses for us.
			 */
			xfs_verify_media_error(mp, me, btp, daddr, bio_bbcount,
					bio_status);
			error = 0;
			break;
		}

		daddr += bio_bbcount;
		bbcount -= bio_bbcount;

		if (bbcount == 0)
			break;

		/* Caller asked us to rest between IOs to limit disk load. */
		if (me->me_rest_us) {
			ktime_t	expires;

			expires = ktime_add_ns(ktime_get(),
					me->me_rest_us * 1000);
			set_current_state(TASK_KILLABLE);
			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
		}

		/* A fatal signal (or kill during the rest) ends the scan. */
		if (fatal_signal_pending(current)) {
			error = -EINTR;
			break;
		}

		cond_resched();
	}

	bio_put(bio);
out_folio:
	folio_put(folio);

	if (error)
		return error;

	/*
	 * Advance start_daddr to the end of what we verified if there wasn't
	 * an operational error.
	 */
	me->me_start_daddr = daddr;
	trace_xfs_verify_media_end(mp, me, btp->bt_dev);
	return 0;
}
404
405 int
xfs_ioc_verify_media(struct file * file,struct xfs_verify_media __user * arg)406 xfs_ioc_verify_media(
407 struct file *file,
408 struct xfs_verify_media __user *arg)
409 {
410 struct xfs_verify_media me;
411 struct xfs_inode *ip = XFS_I(file_inode(file));
412 struct xfs_mount *mp = ip->i_mount;
413 int error;
414
415 if (!capable(CAP_SYS_ADMIN))
416 return -EPERM;
417
418 if (copy_from_user(&me, arg, sizeof(me)))
419 return -EFAULT;
420
421 if (me.me_pad)
422 return -EINVAL;
423 if (me.me_flags & ~XFS_VERIFY_MEDIA_FLAGS)
424 return -EINVAL;
425
426 switch (me.me_dev) {
427 case XFS_DEV_DATA:
428 case XFS_DEV_LOG:
429 case XFS_DEV_RT:
430 break;
431 default:
432 return -EINVAL;
433 }
434
435 error = xfs_verify_media(mp, &me);
436 if (error)
437 return error;
438
439 if (copy_to_user(arg, &me, sizeof(me)))
440 return -EFAULT;
441
442 return 0;
443 }
444