// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2022 Fujitsu.  All Rights Reserved.
 */

#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_bit.h"
#include "xfs_btree.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_trans.h"
#include "xfs_ag.h"
#include "xfs_notify_failure.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"

#include <linux/mm.h>
#include <linux/dax.h>
#include <linux/fs.h>

struct xfs_failure_info {
	xfs_agblock_t		startblock;
	xfs_extlen_t		blockcount;
	int			mf_flags;
	bool			want_shutdown;
};

static pgoff_t
xfs_failure_pgoff(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rec,
	const struct xfs_failure_info	*notify)
{
	loff_t				pos = XFS_FSB_TO_B(mp, rec->rm_offset);

	if (notify->startblock > rec->rm_startblock)
		pos += XFS_FSB_TO_B(mp,
				notify->startblock - rec->rm_startblock);
	return pos >> PAGE_SHIFT;
}

static unsigned long
xfs_failure_pgcnt(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rec,
	const struct xfs_failure_info	*notify)
{
	xfs_agblock_t			end_rec;
	xfs_agblock_t			end_notify;
	xfs_agblock_t			start_cross;
	xfs_agblock_t			end_cross;

	start_cross = max(rec->rm_startblock, notify->startblock);

	end_rec = rec->rm_startblock + rec->rm_blockcount;
	end_notify = notify->startblock + notify->blockcount;
	end_cross = min(end_rec, end_notify);

	return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
}
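/*
 * Worked example for the two helpers above, assuming 4KiB filesystem
 * blocks and 4KiB pages (one block per page) purely for illustration:
 * for a failure with startblock 12 and blockcount 8 intersecting an rmap
 * record with rm_startblock 8, rm_blockcount 16 and rm_offset 0,
 * xfs_failure_pgoff() returns page index 4 (the failure starts
 * 12 - 8 = 4 blocks into the record) and xfs_failure_pgcnt() returns
 * 8 pages (min(8 + 16, 12 + 8) - max(8, 12) = 8 overlapping blocks).
 */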
static int
xfs_dax_failure_fn(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_failure_info		*notify = data;
	struct address_space		*mapping;
	pgoff_t				pgoff;
	unsigned long			pgcnt;
	int				error = 0;

	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
		/* Continue the query because this isn't a failure. */
		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
			return 0;
		notify->want_shutdown = true;
		return 0;
	}

	/* Get the incore inode, filtering out files that are not in use. */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
			0, &ip);
	/* Continue the rmap query if the inode isn't incore. */
	if (error == -ENODATA)
		return 0;
	if (error) {
		notify->want_shutdown = true;
		return 0;
	}

	mapping = VFS_I(ip)->i_mapping;
	pgoff = xfs_failure_pgoff(mp, rec, notify);
	pgcnt = xfs_failure_pgcnt(mp, rec, notify);

	/* Continue the rmap query if the inode isn't a dax file. */
	if (dax_mapping(mapping))
		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
				notify->mf_flags);

	/* Invalidate the cache in dax pages. */
	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
		invalidate_inode_pages2_range(mapping, pgoff,
				pgoff + pgcnt - 1);

	xfs_irele(ip);
	return error;
}

static int
xfs_dax_notify_failure_freeze(
	struct xfs_mount	*mp)
{
	struct super_block	*sb = mp->m_super;
	int			error;

	error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
	if (error)
		xfs_emerg(mp, "already frozen by kernel, err=%d", error);

	return error;
}

static void
xfs_dax_notify_failure_thaw(
	struct xfs_mount	*mp,
	bool			kernel_frozen)
{
	struct super_block	*sb = mp->m_super;
	int			error;

	if (kernel_frozen) {
		error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
		if (error)
			xfs_emerg(mp, "still frozen after notify failure, err=%d",
					error);
	}

	/*
	 * Also thaw the userspace-initiated freeze, because the device is
	 * about to be removed immediately.
	 */
	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
}

static int
xfs_dax_translate_range(
	struct xfs_buftarg	*btp,
	u64			offset,
	u64			len,
	xfs_daddr_t		*daddr,
	uint64_t		*bblen)
{
	u64			dev_start = btp->bt_dax_part_off;
	u64			dev_len = bdev_nr_bytes(btp->bt_bdev);
	u64			dev_end = dev_start + dev_len - 1;

	/* Notify failure on the whole device. */
	if (offset == 0 && len == U64_MAX) {
		offset = dev_start;
		len = dev_len;
	}

	/* Ignore ranges outside the filesystem area. */
	if (offset + len - 1 < dev_start)
		return -ENXIO;
	if (offset > dev_end)
		return -ENXIO;

	/* Calculate the real range when it touches the boundary. */
	if (offset > dev_start)
		offset -= dev_start;
	else {
		len -= dev_start - offset;
		offset = 0;
	}
	if (offset + len - 1 > dev_end)
		len = dev_end - offset + 1;

	*daddr = BTOBB(offset);
	*bblen = BTOBB(len);
	return 0;
}

static int
xfs_dax_notify_logdev_failure(
	struct xfs_mount	*mp,
	u64			offset,
	u64			len,
	int			mf_flags)
{
	xfs_daddr_t		daddr;
	uint64_t		bblen;
	int			error;

	/*
	 * Return ENXIO instead of shutting down the filesystem if the failed
	 * region is beyond the end of the log.
	 */
	error = xfs_dax_translate_range(mp->m_logdev_targp,
			offset, len, &daddr, &bblen);
	if (error)
		return error;

	/*
	 * In the pre-remove case the failure notification is attempting to
	 * trigger a force unmount.  The expectation is that the device is
	 * still present, but its removal is in progress and cannot be
	 * cancelled, so proceed with accessing the log device.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		return 0;

	xfs_err(mp, "ondisk log corrupt, shutting down fs!");
	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
	return -EFSCORRUPTED;
}
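/*
 * Handle a failure on the data or realtime device.  Translate the failed
 * byte range into filesystem blocks, then walk every allocation group (or
 * realtime group) that the range touches and query its rmap btree to find
 * the owners of the failed blocks, notifying each affected mapping via
 * xfs_dax_failure_fn().
 */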
static int
xfs_dax_notify_dev_failure(
	struct xfs_mount	*mp,
	u64			offset,
	u64			len,
	int			mf_flags,
	enum xfs_group_type	type)
{
	struct xfs_failure_info	notify = { .mf_flags = mf_flags };
	struct xfs_trans	*tp = NULL;
	struct xfs_btree_cur	*cur = NULL;
	int			error = 0;
	bool			kernel_frozen = false;
	uint32_t		start_gno, end_gno;
	xfs_fsblock_t		start_bno, end_bno;
	xfs_daddr_t		daddr;
	uint64_t		bblen;
	struct xfs_group	*xg = NULL;

	if (!xfs_has_rmapbt(mp)) {
		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
		return -EOPNOTSUPP;
	}

	error = xfs_dax_translate_range(type == XG_TYPE_RTG ?
			mp->m_rtdev_targp : mp->m_ddev_targp,
			offset, len, &daddr, &bblen);
	if (error)
		return error;

	if (type == XG_TYPE_RTG) {
		start_bno = xfs_daddr_to_rtb(mp, daddr);
		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
	} else {
		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
	}

	if (mf_flags & MF_MEM_PRE_REMOVE) {
		xfs_info(mp, "Device is about to be removed!");
		/*
		 * Freeze the fs to prevent new mappings from being created.
		 * - Keep going if someone else already holds the kernel
		 *   freeze.
		 * - Keep going on other errors too, because this device is
		 *   starting to fail.
		 * - If the kernel freeze is taken successfully here, thaw it
		 *   here as well at the end.
		 */
		kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
	}

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		goto out;

	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
		struct xfs_buf		*agf_bp = NULL;
		struct xfs_rtgroup	*rtg = NULL;
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;

		if (type == XG_TYPE_AG) {
			struct xfs_perag	*pag = to_perag(xg);

			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
			if (error) {
				xfs_perag_put(pag);
				break;
			}

			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
		} else {
			rtg = to_rtg(xg);
			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
		}

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * the [start, end] range where we are looking for files or
		 * metadata.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		if (xg->xg_gno == start_gno)
			ri_low.rm_startblock =
				xfs_fsb_to_gbno(mp, start_bno, type);
		if (xg->xg_gno == end_gno)
			ri_high.rm_startblock =
				xfs_fsb_to_gbno(mp, end_bno, type);

		notify.startblock = ri_low.rm_startblock;
		notify.blockcount = min(xg->xg_block_count,
					ri_high.rm_startblock + 1) -
					ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_dax_failure_fn, &notify);
		xfs_btree_del_cursor(cur, error);
		if (agf_bp)
			xfs_trans_brelse(tp, agf_bp);
		if (rtg)
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
		if (error) {
			xfs_group_put(xg);
			break;
		}
	}

	xfs_trans_cancel(tp);

	/*
	 * Shut down the fs with a force umount in the pre-remove case; that
	 * won't fail, so errors can be ignored.  Otherwise, shut down the
	 * filesystem with the CORRUPT flag if an error occurred or
	 * notify.want_shutdown was set during the rmap query.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
	else if (error || notify.want_shutdown) {
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
		if (!error)
			error = -EFSCORRUPTED;
	}

out:
	/* Thaw the fs if it was frozen above. */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_dax_notify_failure_thaw(mp, kernel_frozen);

	return error;
}
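/*
 * Dispatch a failure notification to the handler for whichever device the
 * failed dax region belongs to: the external log device, the realtime
 * device, or the data device.
 */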
static int
xfs_dax_notify_failure(
	struct dax_device	*dax_dev,
	u64			offset,
	u64			len,
	int			mf_flags)
{
	struct xfs_mount	*mp = dax_holder(dax_dev);

	if (!(mp->m_super->s_flags & SB_BORN)) {
		xfs_warn(mp, "filesystem is not ready for notify_failure()!");
		return -EIO;
	}

	if (mp->m_logdev_targp != mp->m_ddev_targp &&
	    mp->m_logdev_targp->bt_daxdev == dax_dev) {
		return xfs_dax_notify_logdev_failure(mp, offset, len,
				mf_flags);
	}

	return xfs_dax_notify_dev_failure(mp, offset, len, mf_flags,
			(mp->m_rtdev_targp &&
			 mp->m_rtdev_targp->bt_daxdev == dax_dev) ?
				XG_TYPE_RTG : XG_TYPE_AG);
}

const struct dax_holder_operations xfs_dax_holder_operations = {
	.notify_failure		= xfs_dax_notify_failure,
};
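/*
 * Note: this ops table is registered as the dax holder when the buftargs
 * are set up (via fs_dax_get_by_bdev()), so that the dax core can invoke
 * ->notify_failure() when memory_failure() encounters a poisoned pfn
 * backed by one of this filesystem's devices.
 */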