// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2022 Fujitsu.  All Rights Reserved.
 */

#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_bit.h"
#include "xfs_btree.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_trans.h"
#include "xfs_ag.h"
#include "xfs_notify_failure.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"

#include <linux/mm.h>
#include <linux/dax.h>
#include <linux/fs.h>

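/*
 * Context for the rmap query: the failed extent in group-relative blocks,
 * the memory-failure flags passed in by the dax layer, and a flag that the
 * query callback sets when an affected owner cannot be notified directly
 * and the filesystem must be shut down instead.
 */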
struct xfs_failure_info {
	xfs_agblock_t		startblock;
	xfs_extlen_t		blockcount;
	int			mf_flags;
	bool			want_shutdown;
};

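/*
 * Return the page offset in the file's mapping of the first page affected
 * by the failure, clamping the start of the rmap record to the start of
 * the notified range.
 */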
static pgoff_t
xfs_failure_pgoff(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rec,
	const struct xfs_failure_info	*notify)
{
	loff_t				pos = XFS_FSB_TO_B(mp, rec->rm_offset);

	if (notify->startblock > rec->rm_startblock)
		pos += XFS_FSB_TO_B(mp,
				notify->startblock - rec->rm_startblock);
	return pos >> PAGE_SHIFT;
}

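/*
 * Return the number of pages covered by the intersection of the rmap
 * record and the notified range.
 */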
static unsigned long
xfs_failure_pgcnt(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rec,
	const struct xfs_failure_info	*notify)
{
	xfs_agblock_t			end_rec;
	xfs_agblock_t			end_notify;
	xfs_agblock_t			start_cross;
	xfs_agblock_t			end_cross;

	start_cross = max(rec->rm_startblock, notify->startblock);

	end_rec = rec->rm_startblock + rec->rm_blockcount;
	end_notify = notify->startblock + notify->blockcount;
	end_cross = min(end_rec, end_notify);

	return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
}

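/*
 * Iterator for each rmap record overlapping the failed range.  For file
 * data owned by an incore inode, notify (and possibly kill) the processes
 * mapping the poisoned pages; for metadata and owners we cannot reach,
 * request a shutdown instead.
 */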
static int
xfs_dax_failure_fn(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_failure_info		*notify = data;
	struct address_space		*mapping;
	pgoff_t				pgoff;
	unsigned long			pgcnt;
	int				error = 0;

	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
		/* Continue the query because this isn't a failure. */
		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
			return 0;
		notify->want_shutdown = true;
		return 0;
	}

	/* Get files that are incore; filter out others that are not in use. */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
			 0, &ip);
	/* Continue the rmap query if the inode isn't incore */
	if (error == -ENODATA)
		return 0;
	if (error) {
		notify->want_shutdown = true;
		return 0;
	}

	mapping = VFS_I(ip)->i_mapping;
	pgoff = xfs_failure_pgoff(mp, rec, notify);
	pgcnt = xfs_failure_pgcnt(mp, rec, notify);

	/* Continue the rmap query if the inode isn't a dax file. */
	if (dax_mapping(mapping))
		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
					  notify->mf_flags);

	/* Invalidate the cached dax pages. */
	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
		invalidate_inode_pages2_range(mapping, pgoff,
					      pgoff + pgcnt - 1);

	xfs_irele(ip);
	return error;
}

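/*
 * Freeze the filesystem on behalf of the kernel so that no new dax
 * mappings can be established while the failure is being processed.
 */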
static int
xfs_dax_notify_failure_freeze(
	struct xfs_mount	*mp)
{
	struct super_block	*sb = mp->m_super;
	int			error;

	error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
	if (error)
		xfs_emerg(mp, "already frozen by kernel, err=%d", error);

	return error;
}

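/*
 * Undo a kernel-initiated freeze, and thaw any userspace-held freeze as
 * well, since the device is going away.
 */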
static void
xfs_dax_notify_failure_thaw(
	struct xfs_mount	*mp,
	bool			kernel_frozen)
{
	struct super_block	*sb = mp->m_super;
	int			error;

	if (kernel_frozen) {
		error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
		if (error)
			xfs_emerg(mp, "still frozen after notify failure, err=%d",
				error);
	}

	/*
	 * Also thaw the userspace freeze anyway, because the device is about
	 * to be removed immediately.
	 */
	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
}

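/*
 * Translate a byte range on the dax device into a disk address and length
 * in basic blocks, relative to the start of this buftarg's partition and
 * trimmed to the region that the filesystem actually occupies.
 */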
static int
xfs_dax_translate_range(
	struct xfs_buftarg	*btp,
	u64			offset,
	u64			len,
	xfs_daddr_t		*daddr,
	uint64_t		*bblen)
{
	u64			dev_start = btp->bt_dax_part_off;
	u64			dev_len = bdev_nr_bytes(btp->bt_bdev);
	u64			dev_end = dev_start + dev_len - 1;

	/* Notify failure on the whole device. */
	if (offset == 0 && len == U64_MAX) {
		offset = dev_start;
		len = dev_len;
	}

	/* Ignore ranges outside the filesystem area. */
	if (offset + len - 1 < dev_start)
		return -ENXIO;
	if (offset > dev_end)
		return -ENXIO;

	/* Calculate the real range when it touches the boundary */
	if (offset > dev_start)
		offset -= dev_start;
	else {
		len -= dev_start - offset;
		offset = 0;
	}
	if (offset + len - 1 > dev_end)
		len = dev_end - offset + 1;

	*daddr = BTOBB(offset);
	*bblen = BTOBB(len);
	return 0;
}

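/*
 * Handle a failure in the external log device.  Unless the device is about
 * to be removed, a media failure in the log is treated as fatal on-disk
 * corruption.
 */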
static int
xfs_dax_notify_logdev_failure(
	struct xfs_mount	*mp,
	u64			offset,
	u64			len,
	int			mf_flags)
{
	xfs_daddr_t		daddr;
	uint64_t		bblen;
	int			error;

	/*
	 * Return ENXIO instead of shutting down the filesystem if the failed
	 * region is beyond the end of the log.
	 */
	error = xfs_dax_translate_range(mp->m_logdev_targp,
			offset, len, &daddr, &bblen);
	if (error)
		return error;

	/*
	 * In the pre-remove case the failure notification is attempting to
	 * trigger a force unmount.  The expectation is that the device is
	 * still present, but its removal is in progress and cannot be
	 * cancelled, so proceed with accessing the log device.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		return 0;

	xfs_err(mp, "ondisk log corrupt, shutting down fs!");
	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
	return -EFSCORRUPTED;
}

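/*
 * Handle a failure in the data or realtime device.  Walk the rmap btree of
 * each allocation or realtime group overlapping the failed range, notify
 * the owners of all affected extents, and shut the filesystem down if
 * anything that cannot be handled that way was lost.
 */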
static int
xfs_dax_notify_dev_failure(
	struct xfs_mount	*mp,
	u64			offset,
	u64			len,
	int			mf_flags,
	enum xfs_group_type	type)
{
	struct xfs_failure_info	notify = { .mf_flags = mf_flags };
	struct xfs_trans	*tp = NULL;
	struct xfs_btree_cur	*cur = NULL;
	int			error = 0;
	bool			kernel_frozen = false;
	uint32_t		start_gno, end_gno;
	xfs_fsblock_t		start_bno, end_bno;
	xfs_daddr_t		daddr;
	uint64_t		bblen;
	struct xfs_group	*xg = NULL;

	if (!xfs_has_rmapbt(mp)) {
		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
		return -EOPNOTSUPP;
	}

	error = xfs_dax_translate_range(type == XG_TYPE_RTG ?
			mp->m_rtdev_targp : mp->m_ddev_targp,
			offset, len, &daddr, &bblen);
	if (error)
		return error;

	if (type == XG_TYPE_RTG) {
		start_bno = xfs_daddr_to_rtb(mp, daddr);
		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
	} else {
		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
	}

	if (mf_flags & MF_MEM_PRE_REMOVE) {
		xfs_info(mp, "Device is about to be removed!");
		/*
		 * Freeze the fs to prevent new mappings from being created.
		 * - Keep going if someone else already holds the kernel freeze.
		 * - Keep going on other errors too, because this device is
		 *   starting to fail.
		 * - If the kernel freeze is taken successfully here, thaw it
		 *   here as well at the end.
		 */
		kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
	}

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		goto out;

	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
		struct xfs_buf		*agf_bp = NULL;
		struct xfs_rtgroup	*rtg = NULL;
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;

		if (type == XG_TYPE_AG) {
			struct xfs_perag	*pag = to_perag(xg);

			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
			if (error) {
				xfs_perag_put(pag);
				break;
			}

			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
		} else {
			rtg = to_rtg(xg);
			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
		}

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * a [start, end] range where we are looking for the files or
		 * metadata.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		if (xg->xg_gno == start_gno)
			ri_low.rm_startblock =
				xfs_fsb_to_gbno(mp, start_bno, type);
		if (xg->xg_gno == end_gno)
			ri_high.rm_startblock =
				xfs_fsb_to_gbno(mp, end_bno, type);

		notify.startblock = ri_low.rm_startblock;
		notify.blockcount = min(xg->xg_block_count,
					ri_high.rm_startblock + 1) -
					ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_dax_failure_fn, &notify);
		xfs_btree_del_cursor(cur, error);
		if (agf_bp)
			xfs_trans_brelse(tp, agf_bp);
		if (rtg)
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
		if (error) {
			xfs_group_put(xg);
			break;
		}
	}

	xfs_trans_cancel(tp);

	/*
	 * Shut down the fs with a forced umount in the pre-remove case, which
	 * won't fail, so errors can be ignored.  Otherwise, shut down the
	 * filesystem with the CORRUPT flag if an error occurred or
	 * notify.want_shutdown was set during the rmap query.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
	else if (error || notify.want_shutdown) {
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
		if (!error)
			error = -EFSCORRUPTED;
	}

out:
	/* Thaw the fs if it has been frozen before. */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_dax_notify_failure_thaw(mp, kernel_frozen);

	return error;
}

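/*
 * Entry point for dax media failure notification.  Dispatch the failed
 * range to the log, realtime, or data device handler depending on which
 * device reported the failure.
 */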
static int
xfs_dax_notify_failure(
	struct dax_device	*dax_dev,
	u64			offset,
	u64			len,
	int			mf_flags)
{
	struct xfs_mount	*mp = dax_holder(dax_dev);

	if (!(mp->m_super->s_flags & SB_BORN)) {
		xfs_warn(mp, "filesystem is not ready for notify_failure()!");
		return -EIO;
	}

	if (mp->m_logdev_targp != mp->m_ddev_targp &&
	    mp->m_logdev_targp->bt_daxdev == dax_dev) {
		return xfs_dax_notify_logdev_failure(mp, offset, len, mf_flags);
	}

	return xfs_dax_notify_dev_failure(mp, offset, len, mf_flags,
		(mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) ?
				XG_TYPE_RTG : XG_TYPE_AG);
}

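/* Registered as the dax device holder so the dax core can report failures. */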
const struct dax_holder_operations xfs_dax_holder_operations = {
	.notify_failure		= xfs_dax_notify_failure,
};