xref: /linux/fs/xfs/xfs_notify_failure.c (revision be3382ecdf317f005e7d47356d0a9256cc36dd88)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2022 Fujitsu.  All Rights Reserved.
4  */
5 
6 #include "xfs.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_alloc.h"
13 #include "xfs_bit.h"
14 #include "xfs_btree.h"
15 #include "xfs_inode.h"
16 #include "xfs_icache.h"
17 #include "xfs_rmap.h"
18 #include "xfs_rmap_btree.h"
19 #include "xfs_rtalloc.h"
20 #include "xfs_trans.h"
21 #include "xfs_ag.h"
22 
23 #include <linux/mm.h>
24 #include <linux/dax.h>
25 #include <linux/fs.h>
26 
/*
 * Per-AG query context handed to xfs_dax_failure_fn() as the private data of
 * xfs_rmap_query_range().
 */
struct xfs_failure_info {
	/* First AG block of the failed range within the AG being queried. */
	xfs_agblock_t		startblock;
	/* Number of failed blocks within that AG, starting at startblock. */
	xfs_extlen_t		blockcount;
	/* MF_* flags supplied with the failure notification. */
	int			mf_flags;
	/* Set by the rmap callback to ask the caller to shut the fs down. */
	bool			want_shutdown;
};
33 
34 static pgoff_t
35 xfs_failure_pgoff(
36 	struct xfs_mount		*mp,
37 	const struct xfs_rmap_irec	*rec,
38 	const struct xfs_failure_info	*notify)
39 {
40 	loff_t				pos = XFS_FSB_TO_B(mp, rec->rm_offset);
41 
42 	if (notify->startblock > rec->rm_startblock)
43 		pos += XFS_FSB_TO_B(mp,
44 				notify->startblock - rec->rm_startblock);
45 	return pos >> PAGE_SHIFT;
46 }
47 
48 static unsigned long
49 xfs_failure_pgcnt(
50 	struct xfs_mount		*mp,
51 	const struct xfs_rmap_irec	*rec,
52 	const struct xfs_failure_info	*notify)
53 {
54 	xfs_agblock_t			end_rec;
55 	xfs_agblock_t			end_notify;
56 	xfs_agblock_t			start_cross;
57 	xfs_agblock_t			end_cross;
58 
59 	start_cross = max(rec->rm_startblock, notify->startblock);
60 
61 	end_rec = rec->rm_startblock + rec->rm_blockcount;
62 	end_notify = notify->startblock + notify->blockcount;
63 	end_cross = min(end_rec, end_notify);
64 
65 	return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
66 }
67 
68 static int
69 xfs_dax_failure_fn(
70 	struct xfs_btree_cur		*cur,
71 	const struct xfs_rmap_irec	*rec,
72 	void				*data)
73 {
74 	struct xfs_mount		*mp = cur->bc_mp;
75 	struct xfs_inode		*ip;
76 	struct xfs_failure_info		*notify = data;
77 	struct address_space		*mapping;
78 	pgoff_t				pgoff;
79 	unsigned long			pgcnt;
80 	int				error = 0;
81 
82 	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
83 	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
84 		/* Continue the query because this isn't a failure. */
85 		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
86 			return 0;
87 		notify->want_shutdown = true;
88 		return 0;
89 	}
90 
91 	/* Get files that incore, filter out others that are not in use. */
92 	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
93 			 0, &ip);
94 	/* Continue the rmap query if the inode isn't incore */
95 	if (error == -ENODATA)
96 		return 0;
97 	if (error) {
98 		notify->want_shutdown = true;
99 		return 0;
100 	}
101 
102 	mapping = VFS_I(ip)->i_mapping;
103 	pgoff = xfs_failure_pgoff(mp, rec, notify);
104 	pgcnt = xfs_failure_pgcnt(mp, rec, notify);
105 
106 	/* Continue the rmap query if the inode isn't a dax file. */
107 	if (dax_mapping(mapping))
108 		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
109 					  notify->mf_flags);
110 
111 	/* Invalidate the cache in dax pages. */
112 	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
113 		invalidate_inode_pages2_range(mapping, pgoff,
114 					      pgoff + pgcnt - 1);
115 
116 	xfs_irele(ip);
117 	return error;
118 }
119 
120 static int
121 xfs_dax_notify_failure_freeze(
122 	struct xfs_mount	*mp)
123 {
124 	struct super_block	*sb = mp->m_super;
125 	int			error;
126 
127 	error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
128 	if (error)
129 		xfs_emerg(mp, "already frozen by kernel, err=%d", error);
130 
131 	return error;
132 }
133 
134 static void
135 xfs_dax_notify_failure_thaw(
136 	struct xfs_mount	*mp,
137 	bool			kernel_frozen)
138 {
139 	struct super_block	*sb = mp->m_super;
140 	int			error;
141 
142 	if (kernel_frozen) {
143 		error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
144 		if (error)
145 			xfs_emerg(mp, "still frozen after notify failure, err=%d",
146 				error);
147 	}
148 
149 	/*
150 	 * Also thaw userspace call anyway because the device is about to be
151 	 * removed immediately.
152 	 */
153 	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
154 }
155 
/*
 * Handle a failure (or pre-removal notice) covering @bblen basic blocks
 * starting at @daddr on the data device.
 *
 * The rmap btree of every AG from the one containing @daddr through the one
 * containing the last failed block is queried, and xfs_dax_failure_fn() is
 * called for each record in the range.
 *
 * With MF_MEM_PRE_REMOVE set in @mf_flags the filesystem is frozen first
 * (best effort), force-shut-down with SHUTDOWN_FORCE_UMOUNT after the query,
 * and thawed before returning; an error from the query is still returned.
 * Without it, the filesystem is shut down with SHUTDOWN_CORRUPT_ONDISK only
 * if the query failed or the callback set notify.want_shutdown, in which
 * case -EFSCORRUPTED is returned when no other error is pending.
 *
 * Returns 0 on success or a negative errno.
 */
static int
xfs_dax_notify_ddev_failure(
	struct xfs_mount	*mp,
	xfs_daddr_t		daddr,
	xfs_daddr_t		bblen,
	int			mf_flags)
{
	struct xfs_failure_info	notify = { .mf_flags = mf_flags };
	struct xfs_trans	*tp = NULL;
	struct xfs_btree_cur	*cur = NULL;
	struct xfs_buf		*agf_bp = NULL;
	int			error = 0;
	bool			kernel_frozen = false;
	/* First and last (inclusive) failed fs blocks and their AGs. */
	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, daddr);
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, fsbno);
	xfs_fsblock_t		end_fsbno = XFS_DADDR_TO_FSB(mp,
							     daddr + bblen - 1);
	xfs_agnumber_t		end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);

	if (mf_flags & MF_MEM_PRE_REMOVE) {
		xfs_info(mp, "Device is about to be removed!");
		/*
		 * Freeze fs to prevent new mappings from being created.
		 * - Keep going if someone else already holds the kernel
		 *   freeze.
		 * - Keep going on other errors too, because this device is
		 *   starting to fail.
		 * - If the kernel freeze is taken successfully here, thaw it
		 *   here as well at the end.
		 */
		kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
	}

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		goto out;

	for (; agno <= end_agno; agno++) {
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;
		struct xfs_agf		*agf;
		struct xfs_perag	*pag;
		xfs_agblock_t		range_agend;

		/*
		 * NOTE(review): pag is used without a NULL check; confirm
		 * that end_agno can never lie beyond the last AG (e.g. when
		 * the device is larger than the filesystem).
		 */
		pag = xfs_perag_get(mp, agno);
		error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
		if (error) {
			xfs_perag_put(pag);
			break;
		}

		cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * a [start, end] where we are looking for the files or
		 * metadata.  ri_high starts out as all ones; its start block
		 * is only narrowed in the last AG of the range.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
		if (agno == end_agno)
			ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);

		/*
		 * Clamp the inclusive end of the failed range to the last
		 * block of this AG as recorded in the AGF, then describe the
		 * range to the callback as (startblock, blockcount).
		 */
		agf = agf_bp->b_addr;
		range_agend = min(be32_to_cpu(agf->agf_length) - 1,
				ri_high.rm_startblock);
		notify.startblock = ri_low.rm_startblock;
		notify.blockcount = range_agend + 1 - ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_dax_failure_fn, &notify);
		xfs_btree_del_cursor(cur, error);
		xfs_trans_brelse(tp, agf_bp);
		xfs_perag_put(pag);
		if (error)
			break;

		/* Every following AG is scanned from its first block. */
		fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
	}

	xfs_trans_cancel(tp);

	/*
	 * Shutdown fs from a force umount in pre-remove case which won't fail,
	 * so errors can be ignored.  Otherwise, shutdown the filesystem with
	 * CORRUPT flag if an error occurred or notify.want_shutdown was set
	 * during RMAP querying.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
	else if (error || notify.want_shutdown) {
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
		if (!error)
			error = -EFSCORRUPTED;
	}

out:
	/* Thaw the fs if it has been frozen before. */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_dax_notify_failure_thaw(mp, kernel_frozen);

	return error;
}
257 
258 static int
259 xfs_dax_notify_failure(
260 	struct dax_device	*dax_dev,
261 	u64			offset,
262 	u64			len,
263 	int			mf_flags)
264 {
265 	struct xfs_mount	*mp = dax_holder(dax_dev);
266 	u64			ddev_start;
267 	u64			ddev_end;
268 
269 	if (!(mp->m_super->s_flags & SB_BORN)) {
270 		xfs_warn(mp, "filesystem is not ready for notify_failure()!");
271 		return -EIO;
272 	}
273 
274 	if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
275 		xfs_debug(mp,
276 			 "notify_failure() not supported on realtime device!");
277 		return -EOPNOTSUPP;
278 	}
279 
280 	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
281 	    mp->m_logdev_targp != mp->m_ddev_targp) {
282 		/*
283 		 * In the pre-remove case the failure notification is attempting
284 		 * to trigger a force unmount.  The expectation is that the
285 		 * device is still present, but its removal is in progress and
286 		 * can not be cancelled, proceed with accessing the log device.
287 		 */
288 		if (mf_flags & MF_MEM_PRE_REMOVE)
289 			return 0;
290 		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
291 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
292 		return -EFSCORRUPTED;
293 	}
294 
295 	if (!xfs_has_rmapbt(mp)) {
296 		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
297 		return -EOPNOTSUPP;
298 	}
299 
300 	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
301 	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
302 
303 	/* Notify failure on the whole device. */
304 	if (offset == 0 && len == U64_MAX) {
305 		offset = ddev_start;
306 		len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
307 	}
308 
309 	/* Ignore the range out of filesystem area */
310 	if (offset + len - 1 < ddev_start)
311 		return -ENXIO;
312 	if (offset > ddev_end)
313 		return -ENXIO;
314 
315 	/* Calculate the real range when it touches the boundary */
316 	if (offset > ddev_start)
317 		offset -= ddev_start;
318 	else {
319 		len -= ddev_start - offset;
320 		offset = 0;
321 	}
322 	if (offset + len - 1 > ddev_end)
323 		len = ddev_end - offset + 1;
324 
325 	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
326 			mf_flags);
327 }
328 
/*
 * dax holder operations for XFS: failure notifications from the dax device
 * are routed to xfs_dax_notify_failure().
 */
const struct dax_holder_operations xfs_dax_holder_operations = {
	.notify_failure		= xfs_dax_notify_failure,
};
332