1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2022 Fujitsu. All Rights Reserved. 4 */ 5 6 #include "xfs_platform.h" 7 #include "xfs_shared.h" 8 #include "xfs_format.h" 9 #include "xfs_log_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_alloc.h" 13 #include "xfs_bit.h" 14 #include "xfs_btree.h" 15 #include "xfs_inode.h" 16 #include "xfs_icache.h" 17 #include "xfs_rmap.h" 18 #include "xfs_rmap_btree.h" 19 #include "xfs_rtalloc.h" 20 #include "xfs_trans.h" 21 #include "xfs_ag.h" 22 #include "xfs_notify_failure.h" 23 #include "xfs_rtgroup.h" 24 #include "xfs_rtrmap_btree.h" 25 #include "xfs_healthmon.h" 26 27 #include <linux/mm.h> 28 #include <linux/dax.h> 29 #include <linux/fs.h> 30 #include <linux/fserror.h> 31 32 struct xfs_failure_info { 33 xfs_agblock_t startblock; 34 xfs_extlen_t blockcount; 35 int mf_flags; 36 bool want_shutdown; 37 }; 38 39 static pgoff_t 40 xfs_failure_pgoff( 41 struct xfs_mount *mp, 42 const struct xfs_rmap_irec *rec, 43 const struct xfs_failure_info *notify) 44 { 45 loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset); 46 47 if (notify->startblock > rec->rm_startblock) 48 pos += XFS_FSB_TO_B(mp, 49 notify->startblock - rec->rm_startblock); 50 return pos >> PAGE_SHIFT; 51 } 52 53 static unsigned long 54 xfs_failure_pgcnt( 55 struct xfs_mount *mp, 56 const struct xfs_rmap_irec *rec, 57 const struct xfs_failure_info *notify) 58 { 59 xfs_agblock_t end_rec; 60 xfs_agblock_t end_notify; 61 xfs_agblock_t start_cross; 62 xfs_agblock_t end_cross; 63 64 start_cross = max(rec->rm_startblock, notify->startblock); 65 66 end_rec = rec->rm_startblock + rec->rm_blockcount; 67 end_notify = notify->startblock + notify->blockcount; 68 end_cross = min(end_rec, end_notify); 69 70 return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT; 71 } 72 73 static int 74 xfs_dax_failure_fn( 75 struct xfs_btree_cur *cur, 76 const struct xfs_rmap_irec *rec, 77 void *data) 78 { 79 struct xfs_mount *mp = cur->bc_mp; 80 struct xfs_inode *ip; 81 struct xfs_failure_info *notify = data; 82 struct address_space *mapping; 83 pgoff_t pgoff; 84 unsigned long pgcnt; 85 int error = 0; 86 87 if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || 88 (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { 89 /* Continue the query because this isn't a failure. */ 90 if (notify->mf_flags & MF_MEM_PRE_REMOVE) 91 return 0; 92 notify->want_shutdown = true; 93 return 0; 94 } 95 96 /* Get files that incore, filter out others that are not in use. */ 97 error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE, 98 0, &ip); 99 /* Continue the rmap query if the inode isn't incore */ 100 if (error == -ENODATA) 101 return 0; 102 if (error) { 103 notify->want_shutdown = true; 104 return 0; 105 } 106 107 mapping = VFS_I(ip)->i_mapping; 108 pgoff = xfs_failure_pgoff(mp, rec, notify); 109 pgcnt = xfs_failure_pgcnt(mp, rec, notify); 110 111 /* Continue the rmap query if the inode isn't a dax file. */ 112 if (dax_mapping(mapping)) 113 error = mf_dax_kill_procs(mapping, pgoff, pgcnt, 114 notify->mf_flags); 115 116 /* Invalidate the cache in dax pages. */ 117 if (notify->mf_flags & MF_MEM_PRE_REMOVE) 118 invalidate_inode_pages2_range(mapping, pgoff, 119 pgoff + pgcnt - 1); 120 121 fserror_report_data_lost(VFS_I(ip), (u64)pgoff << PAGE_SHIFT, 122 (u64)pgcnt << PAGE_SHIFT, GFP_NOFS); 123 124 xfs_irele(ip); 125 return error; 126 } 127 128 static int 129 xfs_dax_notify_failure_freeze( 130 struct xfs_mount *mp) 131 { 132 struct super_block *sb = mp->m_super; 133 int error; 134 135 error = freeze_super(sb, FREEZE_HOLDER_KERNEL, NULL); 136 if (error) 137 xfs_emerg(mp, "already frozen by kernel, err=%d", error); 138 139 return error; 140 } 141 142 static void 143 xfs_dax_notify_failure_thaw( 144 struct xfs_mount *mp, 145 bool kernel_frozen) 146 { 147 struct super_block *sb = mp->m_super; 148 int error; 149 150 if (kernel_frozen) { 151 error = thaw_super(sb, FREEZE_HOLDER_KERNEL, NULL); 152 if (error) 153 xfs_emerg(mp, "still frozen after notify failure, err=%d", 154 error); 155 } 156 157 /* 158 * Also thaw userspace call anyway because the device is about to be 159 * removed immediately. 160 */ 161 thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL); 162 } 163 164 static int 165 xfs_dax_translate_range( 166 struct xfs_buftarg *btp, 167 u64 offset, 168 u64 len, 169 xfs_daddr_t *daddr, 170 uint64_t *bblen) 171 { 172 u64 dev_start = btp->bt_dax_part_off; 173 u64 dev_len = BBTOB(btp->bt_nr_sectors); 174 u64 dev_end = dev_start + dev_len - 1; 175 176 /* Notify failure on the whole device. */ 177 if (offset == 0 && len == U64_MAX) { 178 offset = dev_start; 179 len = dev_len; 180 } 181 182 /* Ignore the range out of filesystem area */ 183 if (offset + len - 1 < dev_start) 184 return -ENXIO; 185 if (offset > dev_end) 186 return -ENXIO; 187 188 /* Calculate the real range when it touches the boundary */ 189 if (offset > dev_start) 190 offset -= dev_start; 191 else { 192 len -= dev_start - offset; 193 offset = 0; 194 } 195 if (offset + len - 1 > dev_end) 196 len = dev_end - offset + 1; 197 198 *daddr = BTOBB(offset); 199 *bblen = BTOBB(len); 200 return 0; 201 } 202 203 static int 204 xfs_dax_notify_logdev_failure( 205 struct xfs_mount *mp, 206 u64 offset, 207 u64 len, 208 int mf_flags) 209 { 210 xfs_daddr_t daddr; 211 uint64_t bblen; 212 int error; 213 214 /* 215 * Return ENXIO instead of shutting down the filesystem if the failed 216 * region is beyond the end of the log. 217 */ 218 error = xfs_dax_translate_range(mp->m_logdev_targp, 219 offset, len, &daddr, &bblen); 220 if (error) 221 return error; 222 223 xfs_healthmon_report_media(mp, XFS_DEV_LOG, daddr, bblen); 224 225 /* 226 * In the pre-remove case the failure notification is attempting to 227 * trigger a force unmount. The expectation is that the device is 228 * still present, but its removal is in progress and can not be 229 * cancelled, proceed with accessing the log device. 230 */ 231 if (mf_flags & MF_MEM_PRE_REMOVE) 232 return 0; 233 234 xfs_err(mp, "ondisk log corrupt, shutting down fs!"); 235 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); 236 return -EFSCORRUPTED; 237 } 238 239 static int 240 xfs_dax_notify_dev_failure( 241 struct xfs_mount *mp, 242 u64 offset, 243 u64 len, 244 int mf_flags, 245 enum xfs_group_type type) 246 { 247 struct xfs_failure_info notify = { .mf_flags = mf_flags }; 248 struct xfs_trans *tp = NULL; 249 struct xfs_btree_cur *cur = NULL; 250 int error = 0; 251 bool kernel_frozen = false; 252 uint32_t start_gno, end_gno; 253 xfs_fsblock_t start_bno, end_bno; 254 xfs_daddr_t daddr; 255 uint64_t bblen; 256 struct xfs_group *xg = NULL; 257 258 error = xfs_dax_translate_range(xfs_group_type_buftarg(mp, type), 259 offset, len, &daddr, &bblen); 260 if (error) 261 return error; 262 263 xfs_healthmon_report_media(mp, 264 type == XG_TYPE_RTG ? XFS_DEV_RT : XFS_DEV_DATA, 265 daddr, bblen); 266 267 if (!xfs_has_rmapbt(mp)) { 268 xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); 269 return -EOPNOTSUPP; 270 } 271 272 if (type == XG_TYPE_RTG) { 273 start_bno = xfs_daddr_to_rtb(mp, daddr); 274 end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1); 275 } else { 276 start_bno = XFS_DADDR_TO_FSB(mp, daddr); 277 end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1); 278 } 279 280 if (mf_flags & MF_MEM_PRE_REMOVE) { 281 xfs_info(mp, "Device is about to be removed!"); 282 /* 283 * Freeze fs to prevent new mappings from being created. 284 * - Keep going on if others already hold the kernel forzen. 285 * - Keep going on if other errors too because this device is 286 * starting to fail. 287 * - If kernel frozen state is hold successfully here, thaw it 288 * here as well at the end. 289 */ 290 kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0; 291 } 292 293 tp = xfs_trans_alloc_empty(mp); 294 start_gno = xfs_fsb_to_gno(mp, start_bno, type); 295 end_gno = xfs_fsb_to_gno(mp, end_bno, type); 296 while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) { 297 struct xfs_buf *agf_bp = NULL; 298 struct xfs_rtgroup *rtg = NULL; 299 struct xfs_rmap_irec ri_low = { }; 300 struct xfs_rmap_irec ri_high; 301 302 if (type == XG_TYPE_AG) { 303 struct xfs_perag *pag = to_perag(xg); 304 305 error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); 306 if (error) { 307 xfs_perag_put(pag); 308 break; 309 } 310 311 cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag); 312 } else { 313 rtg = to_rtg(xg); 314 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); 315 cur = xfs_rtrmapbt_init_cursor(tp, rtg); 316 } 317 318 /* 319 * Set the rmap range from ri_low to ri_high, which represents 320 * a [start, end] where we looking for the files or metadata. 321 */ 322 memset(&ri_high, 0xFF, sizeof(ri_high)); 323 if (xg->xg_gno == start_gno) 324 ri_low.rm_startblock = 325 xfs_fsb_to_gbno(mp, start_bno, type); 326 if (xg->xg_gno == end_gno) 327 ri_high.rm_startblock = 328 xfs_fsb_to_gbno(mp, end_bno, type); 329 330 notify.startblock = ri_low.rm_startblock; 331 notify.blockcount = min(xg->xg_block_count, 332 ri_high.rm_startblock + 1) - 333 ri_low.rm_startblock; 334 335 error = xfs_rmap_query_range(cur, &ri_low, &ri_high, 336 xfs_dax_failure_fn, ¬ify); 337 xfs_btree_del_cursor(cur, error); 338 if (agf_bp) 339 xfs_trans_brelse(tp, agf_bp); 340 if (rtg) 341 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); 342 if (error) { 343 xfs_group_put(xg); 344 break; 345 } 346 } 347 348 xfs_trans_cancel(tp); 349 350 /* 351 * Shutdown fs from a force umount in pre-remove case which won't fail, 352 * so errors can be ignored. Otherwise, shutdown the filesystem with 353 * CORRUPT flag if error occured or notify.want_shutdown was set during 354 * RMAP querying. 355 */ 356 if (mf_flags & MF_MEM_PRE_REMOVE) 357 xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); 358 else if (error || notify.want_shutdown) { 359 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); 360 if (!error) 361 error = -EFSCORRUPTED; 362 } 363 364 /* Thaw the fs if it has been frozen before. */ 365 if (mf_flags & MF_MEM_PRE_REMOVE) 366 xfs_dax_notify_failure_thaw(mp, kernel_frozen); 367 368 return error; 369 } 370 371 static int 372 xfs_dax_notify_failure( 373 struct dax_device *dax_dev, 374 u64 offset, 375 u64 len, 376 int mf_flags) 377 { 378 struct xfs_mount *mp = dax_holder(dax_dev); 379 380 if (!(mp->m_super->s_flags & SB_BORN)) { 381 xfs_warn(mp, "filesystem is not ready for notify_failure()!"); 382 return -EIO; 383 } 384 385 if (mp->m_logdev_targp != mp->m_ddev_targp && 386 mp->m_logdev_targp->bt_daxdev == dax_dev) { 387 return xfs_dax_notify_logdev_failure(mp, offset, len, mf_flags); 388 } 389 390 return xfs_dax_notify_dev_failure(mp, offset, len, mf_flags, 391 (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) ? 392 XG_TYPE_RTG : XG_TYPE_AG); 393 } 394 395 const struct dax_holder_operations xfs_dax_holder_operations = { 396 .notify_failure = xfs_dax_notify_failure, 397 }; 398