// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2022 Fujitsu. All Rights Reserved.
 */

/*
 * Implementation of the DAX holder's ->notify_failure handler for XFS
 * (see xfs_dax_holder_operations at the bottom of this file).  When the
 * DAX layer reports failed media (or an impending device removal), we walk
 * the reverse-mapping btree of every affected AG to find the owners of the
 * failed range, kill/notify the processes mapping those files, and shut
 * the filesystem down when the damage cannot be attributed to a file.
 */

#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_bit.h"
#include "xfs_btree.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_trans.h"
#include "xfs_ag.h"

#include <linux/mm.h>
#include <linux/dax.h>
#include <linux/fs.h>

/*
 * Context carried through the rmap range query for one AG: the failed
 * extent (AG-relative), the memory-failure flags we were invoked with,
 * and the accumulated "shut the fs down afterwards" decision.
 */
struct xfs_failure_info {
	xfs_agblock_t		startblock;	/* first failed block in this AG */
	xfs_extlen_t		blockcount;	/* length of the failed range */
	int			mf_flags;	/* MF_* flags from the notifier */
	bool			want_shutdown;	/* shut down fs after the query? */
};

/*
 * Compute the file page offset (in PAGE_SIZE units) of the start of the
 * intersection between the rmap record @rec and the failed range @notify.
 */
static pgoff_t
xfs_failure_pgoff(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rec,
	const struct xfs_failure_info	*notify)
{
	/* File offset of the start of the mapped extent... */
	loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset);

	/* ...advanced if the failure begins inside the extent. */
	if (notify->startblock > rec->rm_startblock)
		pos += XFS_FSB_TO_B(mp,
				notify->startblock - rec->rm_startblock);
	return pos >> PAGE_SHIFT;
}

/*
 * Compute the number of pages covered by the intersection between the rmap
 * record @rec and the failed range @notify.
 */
static unsigned long
xfs_failure_pgcnt(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rec,
	const struct xfs_failure_info	*notify)
{
	xfs_agblock_t			end_rec;
	xfs_agblock_t			end_notify;
	xfs_agblock_t			start_cross;
	xfs_agblock_t			end_cross;

	/* Intersection start: the later of the two range starts. */
	start_cross = max(rec->rm_startblock, notify->startblock);

	/* Intersection end: the earlier of the two (exclusive) range ends. */
	end_rec = rec->rm_startblock + rec->rm_blockcount;
	end_notify = notify->startblock + notify->blockcount;
	end_cross = min(end_rec, end_notify);

	return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
}

/*
 * xfs_rmap_query_range() callback, invoked for each rmap record overlapping
 * the failed range.  For file-owned data extents, notify/kill the processes
 * mapping the affected pages; for metadata (or anything we cannot attribute
 * to a file's data fork), flag the filesystem for shutdown.  Returns 0 to
 * continue the query in all non-fatal cases.
 */
static int
xfs_dax_failure_fn(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_failure_info		*notify = data;
	struct address_space		*mapping;
	pgoff_t				pgoff;
	unsigned long			pgcnt;
	int				error = 0;

	/*
	 * Damage to fs metadata, an attr fork, or bmbt blocks cannot be
	 * mapped to a file page, so there is nobody to notify; in the
	 * pre-remove case this is not a failure at all.
	 */
	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
		/* Continue the query because this isn't a failure. */
		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
			return 0;
		notify->want_shutdown = true;
		return 0;
	}

	/* Get the file if it is incore; filter out files that are not in use. */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
			0, &ip);
	/* Continue the rmap query if the inode isn't incore */
	if (error == -ENODATA)
		return 0;
	if (error) {
		notify->want_shutdown = true;
		return 0;
	}

	mapping = VFS_I(ip)->i_mapping;
	pgoff = xfs_failure_pgoff(mp, rec, notify);
	pgcnt = xfs_failure_pgcnt(mp, rec, notify);

	/* Continue the rmap query if the inode isn't a dax file. */
	if (dax_mapping(mapping))
		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
					  notify->mf_flags);

	/* Invalidate the cache in dax pages. */
	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
		invalidate_inode_pages2_range(mapping, pgoff,
					      pgoff + pgcnt - 1);

	xfs_irele(ip);
	return error;
}

/*
 * Freeze the filesystem (kernel-level freeze) to prevent new DAX mappings
 * from being created while we process a pre-remove notification.
 * Returns 0 on success; a nonzero error (e.g. already frozen) is logged
 * and passed back so the caller knows not to thaw later.
 */
static int
xfs_dax_notify_failure_freeze(
	struct xfs_mount	*mp)
{
	struct super_block	*sb = mp->m_super;
	int			error;

	error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
	if (error)
		xfs_emerg(mp, "already frozen by kernel, err=%d", error);

	return error;
}

/*
 * Undo the freeze taken by xfs_dax_notify_failure_freeze().  Only thaws the
 * kernel freeze if we were the ones who took it (@kernel_frozen); always
 * attempts a userspace thaw because the device is about to disappear.
 */
static void
xfs_dax_notify_failure_thaw(
	struct xfs_mount	*mp,
	bool			kernel_frozen)
{
	struct super_block	*sb = mp->m_super;
	int			error;

	if (kernel_frozen) {
		error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
		if (error)
			xfs_emerg(mp, "still frozen after notify failure, err=%d",
				error);
	}

	/*
	 * Also thaw any userspace-held freeze, because the device is about
	 * to be removed immediately.
	 */
	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
}

/*
 * Handle a failure (or pre-remove notification) on a range of the data
 * device.  @daddr/@bblen describe the failed range in 512-byte basic
 * blocks, relative to the start of the filesystem.  Walks the rmapbt of
 * every AG touched by the range and dispatches xfs_dax_failure_fn() for
 * each overlapping record, then shuts the filesystem down as required.
 */
static int
xfs_dax_notify_ddev_failure(
	struct xfs_mount	*mp,
	xfs_daddr_t		daddr,
	xfs_daddr_t		bblen,
	int			mf_flags)
{
	struct xfs_failure_info	notify = { .mf_flags = mf_flags };
	struct xfs_trans	*tp = NULL;
	struct xfs_btree_cur	*cur = NULL;
	struct xfs_buf		*agf_bp = NULL;
	int			error = 0;
	bool			kernel_frozen = false;
	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, daddr);
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, fsbno);
	xfs_fsblock_t		end_fsbno = XFS_DADDR_TO_FSB(mp,
							     daddr + bblen - 1);
	xfs_agnumber_t		end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);

	if (mf_flags & MF_MEM_PRE_REMOVE) {
		xfs_info(mp, "Device is about to be removed!");
		/*
		 * Freeze fs to prevent new mappings from being created.
		 * - Keep going on if others already hold the kernel frozen.
		 * - Keep going on if other errors too because this device is
		 *   starting to fail.
		 * - If the kernel frozen state is held successfully here, thaw
		 *   it here as well at the end.
		 */
		kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
	}

	/* Empty transaction: used only to pass through the iget/AGF reads. */
	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		goto out;

	for (; agno <= end_agno; agno++) {
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;
		struct xfs_agf		*agf;
		struct xfs_perag	*pag;
		xfs_agblock_t		range_agend;

		pag = xfs_perag_get(mp, agno);
		error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
		if (error) {
			xfs_perag_put(pag);
			break;
		}

		cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * a [start, end] where we are looking for the files or
		 * metadata.  All fields of ri_high start maxed out (0xFF);
		 * only in the last AG do we clamp its startblock to the end
		 * of the failed range.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
		if (agno == end_agno)
			ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);

		/* Clamp the notify range to the actual AG size as well. */
		agf = agf_bp->b_addr;
		range_agend = min(be32_to_cpu(agf->agf_length) - 1,
				ri_high.rm_startblock);
		notify.startblock = ri_low.rm_startblock;
		notify.blockcount = range_agend + 1 - ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_dax_failure_fn, &notify);
		xfs_btree_del_cursor(cur, error);
		xfs_trans_brelse(tp, agf_bp);
		xfs_perag_put(pag);
		if (error)
			break;

		/* Subsequent AGs are scanned from their first block. */
		fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
	}

	xfs_trans_cancel(tp);

	/*
	 * Shutdown fs from a force umount in pre-remove case which won't fail,
	 * so errors can be ignored.  Otherwise, shutdown the filesystem with
	 * CORRUPT flag if error occurred or notify.want_shutdown was set during
	 * RMAP querying.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
	else if (error || notify.want_shutdown) {
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
		if (!error)
			error = -EFSCORRUPTED;
	}

out:
	/* Thaw the fs if it has been frozen before. */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_dax_notify_failure_thaw(mp, kernel_frozen);

	return error;
}

/*
 * ->notify_failure handler for the DAX holder operations.  @offset/@len are
 * byte ranges relative to the start of @dax_dev; translate them into the
 * data device's address space and hand off to xfs_dax_notify_ddev_failure().
 * Failures on the realtime device are unsupported, and failures on an
 * external log device shut the filesystem down directly (no rmap to query).
 */
static int
xfs_dax_notify_failure(
	struct dax_device	*dax_dev,
	u64			offset,
	u64			len,
	int			mf_flags)
{
	struct xfs_mount	*mp = dax_holder(dax_dev);
	u64			ddev_start;
	u64			ddev_end;

	if (!(mp->m_super->s_flags & SB_BORN)) {
		xfs_warn(mp, "filesystem is not ready for notify_failure()!");
		return -EIO;
	}

	if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
		xfs_debug(mp,
			 "notify_failure() not supported on realtime device!");
		return -EOPNOTSUPP;
	}

	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
	    mp->m_logdev_targp != mp->m_ddev_targp) {
		/*
		 * In the pre-remove case the failure notification is attempting
		 * to trigger a force unmount.  The expectation is that the
		 * device is still present, but its removal is in progress and
		 * can not be cancelled, proceed with accessing the log device.
		 */
		if (mf_flags & MF_MEM_PRE_REMOVE)
			return 0;
		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
		return -EFSCORRUPTED;
	}

	/* The rmapbt is required to map failed blocks back to their owners. */
	if (!xfs_has_rmapbt(mp)) {
		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
		return -EOPNOTSUPP;
	}

	/* Data device's byte range within the DAX device. */
	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;

	/* Notify failure on the whole device. */
	if (offset == 0 && len == U64_MAX) {
		offset = ddev_start;
		len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
	}

	/* Ignore the range out of filesystem area */
	if (offset + len - 1 < ddev_start)
		return -ENXIO;
	if (offset > ddev_end)
		return -ENXIO;

	/* Calculate the real range when it touches the boundary */
	if (offset > ddev_start)
		offset -= ddev_start;
	else {
		len -= ddev_start - offset;
		offset = 0;
	}
	if (offset + len - 1 > ddev_end)
		len = ddev_end - offset + 1;

	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
			mf_flags);
}

const struct dax_holder_operations xfs_dax_holder_operations = {
	.notify_failure		= xfs_dax_notify_failure,
};