// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2026 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs_platform.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_bit.h"
#include "xfs_btree.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_ag.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_health.h"
#include "xfs_healthmon.h"
#include "xfs_trace.h"
#include "xfs_verify_media.h"

#include <linux/fserror.h>

/*
 * The portion of a media failure that landed within one allocation (or
 * realtime) group.  startblock is group-relative; the range has already been
 * trimmed to the group and to the caller's requested verify range before the
 * rmap query runs.
 */
struct xfs_group_data_lost {
	xfs_agblock_t		startblock;
	xfs_extlen_t		blockcount;
};

/*
 * Report lost file data from rmap records.
 *
 * Called back for each reverse mapping that overlaps a damaged range.
 * @data points at the xfs_group_data_lost describing the damage.  Always
 * returns 0 so that xfs_rmap_query_range keeps iterating; loss reporting
 * is strictly best effort.
 */
static int
xfs_verify_report_data_lost(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_group_data_lost	*lost = data;
	xfs_fileoff_t			fileoff = rec->rm_offset;
	xfs_extlen_t			blocks = rec->rm_blockcount;
	const bool			is_attr =
		(rec->rm_flags & XFS_RMAP_ATTR_FORK);
	const xfs_agblock_t		lost_end =
		lost->startblock + lost->blockcount;
	const xfs_agblock_t		rmap_end =
		rec->rm_startblock + rec->rm_blockcount;
	int				error = 0;

	/*
	 * Static fs metadata has a pseudo-owner, not an inode, so there is no
	 * file to notify here.  NOTE(review): presumably metadata damage is
	 * covered by the healthmon report in xfs_verify_media_error; confirm.
	 */
	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
		return 0;

	/*
	 * Best effort: if we cannot grab the inode, skip this record rather
	 * than aborting the whole range query.
	 */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip);
	if (error)
		return 0;

	/* Lost a block of the inode's bmap btree; mark that fork sick. */
	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
		xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
		goto out_rele;
	}

	/* Lost xattr blocks; there is no file range to report for those. */
	if (is_attr) {
		xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
		goto out_rele;
	}

	/*
	 * Trim the rmap record to its intersection with the lost range:
	 * advance the file offset past any leading blocks that were not
	 * lost, then clip off any trailing blocks beyond the damage.
	 */
	if (lost->startblock > rec->rm_startblock) {
		fileoff += lost->startblock - rec->rm_startblock;
		blocks -= lost->startblock - rec->rm_startblock;
	}
	if (rmap_end > lost_end)
		blocks -= rmap_end - lost_end;

	fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
			XFS_FSB_TO_B(mp, blocks), GFP_NOFS);

out_rele:
	xfs_irele(ip);
	return 0;
}

/*
 * Walk reverse mappings to look for all file data loss.
 *
 * @daddr/@bblen describe the damaged device range in 512b basic blocks;
 * @type selects the data device (AG) or realtime device (RTG) group space.
 * Reporting is best effort, so this always returns 0.
 */
static int
xfs_verify_report_losses(
	struct xfs_mount	*mp,
	enum xfs_group_type	type,
	xfs_daddr_t		daddr,
	u64			bblen)
{
	struct xfs_group	*xg = NULL;
	struct xfs_trans	*tp;
	xfs_fsblock_t		start_bno, end_bno;
	uint32_t		start_gno, end_gno;
	int			error;

	/* Convert the daddr range to (rt)filesystem block numbers. */
	if (type == XG_TYPE_RTG) {
		start_bno = xfs_daddr_to_rtb(mp, daddr);
		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
	} else {
		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
	}

	/* Empty transaction, used only to hold buffers during the query. */
	tp = xfs_trans_alloc_empty(mp);
	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
		struct xfs_buf		*agf_bp = NULL;
		struct xfs_rtgroup	*rtg = NULL;
		struct xfs_btree_cur	*cur;
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;
		struct xfs_group_data_lost lost;

		if (type == XG_TYPE_AG) {
			struct xfs_perag	*pag = to_perag(xg);

			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
			if (error) {
				/* Drop the loop's group reference. */
				xfs_perag_put(pag);
				break;
			}

			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
		} else {
			rtg = to_rtg(xg);
			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
		}

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * a [start, end] where we are looking for the files or
		 * metadata.  Only the first and last groups get trimmed to
		 * the damaged range; interior groups are scanned in full.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		if (xg->xg_gno == start_gno)
			ri_low.rm_startblock =
				xfs_fsb_to_gbno(mp, start_bno, type);
		if (xg->xg_gno == end_gno)
			ri_high.rm_startblock =
				xfs_fsb_to_gbno(mp, end_bno, type);

		/*
		 * Record the group-relative lost range so the callback can
		 * trim each rmap record; cap the end at the group size.
		 */
		lost.startblock = ri_low.rm_startblock;
		lost.blockcount = min(xg->xg_block_count,
					ri_high.rm_startblock + 1) -
				  ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_verify_report_data_lost, &lost);
		xfs_btree_del_cursor(cur, error);
		if (agf_bp)
			xfs_trans_brelse(tp, agf_bp);
		if (rtg)
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
		if (error) {
			/* Drop the loop's group reference before bailing. */
			xfs_group_put(xg);
			break;
		}
	}

	xfs_trans_cancel(tp);
	/* Best effort; any query error was already handled above. */
	return 0;
}

/*
 * Compute the desired verify IO size.
 *
 * To minimize command overhead, we'd like to create bios that are 1MB, though
 * we allow the user to ask for a smaller size.
 */
static unsigned int
xfs_verify_iosize(
	const struct xfs_verify_media	*me,
	struct xfs_buftarg		*btp,
	uint64_t			bbcount)
{
	unsigned int			iosize =
		min_not_zero(SZ_1M, me->me_max_io_size);

	/* The BBTOB conversions assume basic blocks and sectors coincide. */
	BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
	ASSERT(BBTOB(bbcount) >= bdev_logical_block_size(btp->bt_bdev));

	/* At least one logical block, at most the whole remaining range. */
	return clamp(iosize, bdev_logical_block_size(btp->bt_bdev),
			BBTOB(bbcount));
}

/* Allocate as much memory as we can get for verification buffer. */
static struct folio *
xfs_verify_alloc_folio(
	const unsigned int	iosize)
{
	unsigned int		order = get_order(iosize);

	/*
	 * Try progressively smaller high-order allocations, opting out of
	 * the allocator's retry loops; fall back to a single page that is
	 * allowed to block until it succeeds.
	 */
	while (order > 0) {
		struct folio	*folio =
			folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);

		if (folio)
			return folio;
		order--;
	}

	return folio_alloc(GFP_KERNEL, 0);
}

/* Report any kind of problem verifying media */
static void
xfs_verify_media_error(
	struct xfs_mount	*mp,
	struct xfs_verify_media	*me,
	struct xfs_buftarg	*btp,
	xfs_daddr_t		daddr,
	unsigned int		bio_bbcount,
	blk_status_t		bio_status)
{
	trace_xfs_verify_media_error(mp, me, btp->bt_bdev->bd_dev, daddr,
			bio_bbcount, bio_status);

	/*
	 * Pass any error, I/O or otherwise, up to the caller if we didn't
	 * successfully verify any bytes at all.
	 */
	if (me->me_start_daddr == daddr)
		me->me_ioerror = -blk_status_to_errno(bio_status);

	/*
	 * PI validation failures, medium errors, or general IO errors are
	 * treated as indicators of data loss.  Everything else are (hopefully)
	 * transient errors and are not reported to healthmon or fsnotify.
	 */
	switch (bio_status) {
	case BLK_STS_PROTECTION:
	case BLK_STS_IOERR:
	case BLK_STS_MEDIUM:
		break;
	default:
		return;
	}

	/* Only report losses if the caller asked for it. */
	if (!(me->me_flags & XFS_VERIFY_MEDIA_REPORT))
		return;

	xfs_healthmon_report_media(mp, me->me_dev, daddr, bio_bbcount);

	/* Mapping damage back to files requires reverse mappings. */
	if (!xfs_has_rmapbt(mp))
		return;

	switch (me->me_dev) {
	case XFS_DEV_DATA:
		xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount);
		break;
	case XFS_DEV_RT:
		xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount);
		break;
	}
}

/*
 * Verify the media of an xfs device by submitting read requests to the disk.
 *
 * Returns a negative errno only for operational failures (bad arguments,
 * no such device, allocation failure, fatal signal).  Media errors do not
 * fail the call; they are recorded in me->me_ioerror when nothing at all
 * could be verified, and me->me_start_daddr reports how far we got.
 */
static int
xfs_verify_media(
	struct xfs_mount	*mp,
	struct xfs_verify_media	*me)
{
	struct xfs_buftarg	*btp = NULL;
	struct bio		*bio;
	struct folio		*folio;
	xfs_daddr_t		daddr;
	uint64_t		bbcount;
	int			error = 0;

	me->me_ioerror = 0;

	/*
	 * Pick the buffer target.  An internal log (same bdev as the data
	 * device) leaves btp NULL so we return -ENODEV below.
	 */
	switch (me->me_dev) {
	case XFS_DEV_DATA:
		btp = mp->m_ddev_targp;
		break;
	case XFS_DEV_LOG:
		if (mp->m_logdev_targp->bt_bdev != mp->m_ddev_targp->bt_bdev)
			btp = mp->m_logdev_targp;
		break;
	case XFS_DEV_RT:
		btp = mp->m_rtdev_targp;
		break;
	}
	if (!btp)
		return -ENODEV;

	/*
	 * If the caller told us to verify beyond the end of the disk, tell the
	 * user exactly where that was.
	 */
	if (me->me_end_daddr > btp->bt_nr_sectors)
		me->me_end_daddr = btp->bt_nr_sectors;

	/* start and end have to be aligned to the lba size */
	if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr),
			bdev_logical_block_size(btp->bt_bdev)))
		return -EINVAL;

	/*
	 * end_daddr is the exclusive end of the range, so if start_daddr
	 * reaches there (or beyond), there's no work to be done.
	 */
	if (me->me_start_daddr >= me->me_end_daddr)
		return 0;

	/*
	 * There are three ranges involved here:
	 *
	 * - [me->me_start_daddr, me->me_end_daddr) is the range that the
	 *   user wants to verify.  end_daddr can be beyond the end of the
	 *   disk; we'll constrain it to the end if necessary.
	 *
	 * - [daddr, me->me_end_daddr) is the range that we have not yet
	 *   verified.  We update daddr after each successful read.
	 *   me->me_start_daddr is set to daddr before returning.
	 *
	 * - [daddr, daddr + bio_bbcount) is the range that we're currently
	 *   verifying.
	 */
	daddr = me->me_start_daddr;
	bbcount = min_t(sector_t, me->me_end_daddr, btp->bt_nr_sectors) -
			me->me_start_daddr;

	folio = xfs_verify_alloc_folio(xfs_verify_iosize(me, btp, bbcount));
	if (!folio)
		return -ENOMEM;

	trace_xfs_verify_media(mp, me, btp->bt_bdev->bd_dev, daddr, bbcount,
			folio);

	bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL);
	if (!bio) {
		error = -ENOMEM;
		goto out_folio;
	}

	while (bbcount > 0) {
		unsigned int	bio_bbcount;
		blk_status_t	bio_status;

		/* Reuse one bio for the whole walk; reset it per iteration. */
		bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
		bio->bi_iter.bi_sector = daddr;
		bio_add_folio_nofail(bio, folio,
				min(bbcount << SECTOR_SHIFT,
				    folio_size(folio)), 0);

		/*
		 * Save the length of the bio before we submit it, because we
		 * need the original daddr and length for reporting IO errors
		 * if the bio fails.
		 */
		bio_bbcount = bio->bi_iter.bi_size >> SECTOR_SHIFT;
		submit_bio_wait(bio);
		bio_status = bio->bi_status;
		if (bio_status != BLK_STS_OK) {
			/*
			 * Media errors stop the walk but do not fail the
			 * call; xfs_verify_media_error records them in
			 * me->me_ioerror as appropriate.
			 */
			xfs_verify_media_error(mp, me, btp, daddr, bio_bbcount,
					bio_status);
			error = 0;
			break;
		}

		daddr += bio_bbcount;
		bbcount -= bio_bbcount;

		if (bbcount == 0)
			break;

		/* Optional pause between IOs to limit device load. */
		if (me->me_rest_us) {
			ktime_t	expires;

			expires = ktime_add_ns(ktime_get(),
					me->me_rest_us * 1000);
			set_current_state(TASK_KILLABLE);
			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
		}

		if (fatal_signal_pending(current)) {
			error = -EINTR;
			break;
		}

		cond_resched();
	}

	bio_put(bio);
out_folio:
	folio_put(folio);

	if (error)
		return error;

	/*
	 * Advance start_daddr to the end of what we verified if there wasn't
	 * an operational error.
	 */
	me->me_start_daddr = daddr;
	trace_xfs_verify_media_end(mp, me, btp->bt_bdev->bd_dev);
	return 0;
}

/*
 * Ioctl entry point for media verification.  Validates the user's request,
 * runs the verification, and copies the (possibly updated) structure back
 * so the caller can see progress and any recorded IO error.
 */
int
xfs_ioc_verify_media(
	struct file			*file,
	struct xfs_verify_media __user	*arg)
{
	struct xfs_verify_media		me;
	struct xfs_inode		*ip = XFS_I(file_inode(file));
	struct xfs_mount		*mp = ip->i_mount;
	int				error;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(&me, arg, sizeof(me)))
		return -EFAULT;

	/* Reject garbage in the padding or unknown flag bits. */
	if (me.me_pad)
		return -EINVAL;
	if (me.me_flags & ~XFS_VERIFY_MEDIA_FLAGS)
		return -EINVAL;

	switch (me.me_dev) {
	case XFS_DEV_DATA:
	case XFS_DEV_LOG:
	case XFS_DEV_RT:
		break;
	default:
		return -EINVAL;
	}

	error = xfs_verify_media(mp, &me);
	if (error)
		return error;

	if (copy_to_user(arg, &me, sizeof(me)))
		return -EFAULT;

	return 0;
}