xref: /linux/fs/xfs/xfs_notify_failure.c (revision 6f7e6393d1ce636bb7ec77a7fe7b77458fddf701)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2022 Fujitsu.  All Rights Reserved.
4  */
5 
6 #include "xfs_platform.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_alloc.h"
13 #include "xfs_bit.h"
14 #include "xfs_btree.h"
15 #include "xfs_inode.h"
16 #include "xfs_icache.h"
17 #include "xfs_rmap.h"
18 #include "xfs_rmap_btree.h"
19 #include "xfs_rtalloc.h"
20 #include "xfs_trans.h"
21 #include "xfs_ag.h"
22 #include "xfs_notify_failure.h"
23 #include "xfs_rtgroup.h"
24 #include "xfs_rtrmap_btree.h"
25 #include "xfs_healthmon.h"
26 
27 #include <linux/mm.h>
28 #include <linux/dax.h>
29 #include <linux/fs.h>
30 #include <linux/fserror.h>
31 
/*
 * Describes one failed (group-relative) block range while we walk the rmap
 * records that overlap it.  Passed as the private data pointer to
 * xfs_dax_failure_fn for each group touched by the failure.
 */
struct xfs_failure_info {
	xfs_agblock_t		startblock;	/* group-relative start of failed range */
	xfs_extlen_t		blockcount;	/* length of failed range in fsblocks */
	int			mf_flags;	/* MF_* flags from the memory-failure caller */
	bool			want_shutdown;	/* fs shutdown deferred until after the query */
};
38 
39 static pgoff_t
40 xfs_failure_pgoff(
41 	struct xfs_mount		*mp,
42 	const struct xfs_rmap_irec	*rec,
43 	const struct xfs_failure_info	*notify)
44 {
45 	loff_t				pos = XFS_FSB_TO_B(mp, rec->rm_offset);
46 
47 	if (notify->startblock > rec->rm_startblock)
48 		pos += XFS_FSB_TO_B(mp,
49 				notify->startblock - rec->rm_startblock);
50 	return pos >> PAGE_SHIFT;
51 }
52 
53 static unsigned long
54 xfs_failure_pgcnt(
55 	struct xfs_mount		*mp,
56 	const struct xfs_rmap_irec	*rec,
57 	const struct xfs_failure_info	*notify)
58 {
59 	xfs_agblock_t			end_rec;
60 	xfs_agblock_t			end_notify;
61 	xfs_agblock_t			start_cross;
62 	xfs_agblock_t			end_cross;
63 
64 	start_cross = max(rec->rm_startblock, notify->startblock);
65 
66 	end_rec = rec->rm_startblock + rec->rm_blockcount;
67 	end_notify = notify->startblock + notify->blockcount;
68 	end_cross = min(end_rec, end_notify);
69 
70 	return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
71 }
72 
/*
 * rmap query callback, invoked once for each reverse mapping that overlaps
 * the failed range described by @data (a struct xfs_failure_info).  For
 * file-data mappings owned by an incore inode, kill the processes that map
 * the poisoned pages and report the data loss; for everything else, flag
 * the filesystem for shutdown and keep the query going.
 */
static int
xfs_dax_failure_fn(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_failure_info		*notify = data;
	struct address_space		*mapping;
	pgoff_t				pgoff;
	unsigned long			pgcnt;
	int				error = 0;

	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
		/*
		 * Metadata (non-inode owners, attr fork or bmbt blocks) was
		 * hit.  In the pre-remove case this isn't a failure per se,
		 * so continue the query; otherwise remember to shut the fs
		 * down once the query finishes.
		 */
		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
			return 0;
		notify->want_shutdown = true;
		return 0;
	}

	/* Get files that are incore, filter out others that are not in use. */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
			 0, &ip);
	/* Continue the rmap query if the inode isn't incore */
	if (error == -ENODATA)
		return 0;
	if (error) {
		/* Can't reach the owner; fall back to shutting down the fs. */
		notify->want_shutdown = true;
		return 0;
	}

	mapping = VFS_I(ip)->i_mapping;
	pgoff = xfs_failure_pgoff(mp, rec, notify);
	pgcnt = xfs_failure_pgcnt(mp, rec, notify);

	/* Continue the rmap query if the inode isn't a dax file. */
	if (dax_mapping(mapping))
		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
					  notify->mf_flags);

	/* Invalidate the cache in dax pages. */
	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
		invalidate_inode_pages2_range(mapping, pgoff,
					      pgoff + pgcnt - 1);

	/* Tell the VFS error-reporting machinery which file bytes are gone. */
	fserror_report_data_lost(VFS_I(ip), (u64)pgoff << PAGE_SHIFT,
			(u64)pgcnt << PAGE_SHIFT, GFP_NOFS);

	xfs_irele(ip);
	return error;
}
127 
128 static int
129 xfs_dax_notify_failure_freeze(
130 	struct xfs_mount	*mp)
131 {
132 	struct super_block	*sb = mp->m_super;
133 	int			error;
134 
135 	error = freeze_super(sb, FREEZE_HOLDER_KERNEL, NULL);
136 	if (error)
137 		xfs_emerg(mp, "already frozen by kernel, err=%d", error);
138 
139 	return error;
140 }
141 
142 static void
143 xfs_dax_notify_failure_thaw(
144 	struct xfs_mount	*mp,
145 	bool			kernel_frozen)
146 {
147 	struct super_block	*sb = mp->m_super;
148 	int			error;
149 
150 	if (kernel_frozen) {
151 		error = thaw_super(sb, FREEZE_HOLDER_KERNEL, NULL);
152 		if (error)
153 			xfs_emerg(mp, "still frozen after notify failure, err=%d",
154 				error);
155 	}
156 
157 	/*
158 	 * Also thaw userspace call anyway because the device is about to be
159 	 * removed immediately.
160 	 */
161 	thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
162 }
163 
164 static int
165 xfs_dax_translate_range(
166 	struct xfs_buftarg	*btp,
167 	u64			offset,
168 	u64			len,
169 	xfs_daddr_t		*daddr,
170 	uint64_t		*bblen)
171 {
172 	u64			dev_start = btp->bt_dax_part_off;
173 	u64			dev_len = BBTOB(btp->bt_nr_sectors);
174 	u64			dev_end = dev_start + dev_len - 1;
175 
176 	/* Notify failure on the whole device. */
177 	if (offset == 0 && len == U64_MAX) {
178 		offset = dev_start;
179 		len = dev_len;
180 	}
181 
182 	/* Ignore the range out of filesystem area */
183 	if (offset + len - 1 < dev_start)
184 		return -ENXIO;
185 	if (offset > dev_end)
186 		return -ENXIO;
187 
188 	/* Calculate the real range when it touches the boundary */
189 	if (offset > dev_start)
190 		offset -= dev_start;
191 	else {
192 		len -= dev_start - offset;
193 		offset = 0;
194 	}
195 	if (offset + len - 1 > dev_end)
196 		len = dev_end - offset + 1;
197 
198 	*daddr = BTOBB(offset);
199 	*bblen = BTOBB(len);
200 	return 0;
201 }
202 
203 static int
204 xfs_dax_notify_logdev_failure(
205 	struct xfs_mount	*mp,
206 	u64			offset,
207 	u64			len,
208 	int			mf_flags)
209 {
210 	xfs_daddr_t		daddr;
211 	uint64_t		bblen;
212 	int			error;
213 
214 	/*
215 	 * Return ENXIO instead of shutting down the filesystem if the failed
216 	 * region is beyond the end of the log.
217 	 */
218 	error = xfs_dax_translate_range(mp->m_logdev_targp,
219 			offset, len, &daddr, &bblen);
220 	if (error)
221 		return error;
222 
223 	xfs_healthmon_report_media(mp, XFS_DEV_LOG, daddr, bblen);
224 
225 	/*
226 	 * In the pre-remove case the failure notification is attempting to
227 	 * trigger a force unmount.  The expectation is that the device is
228 	 * still present, but its removal is in progress and can not be
229 	 * cancelled, proceed with accessing the log device.
230 	 */
231 	if (mf_flags & MF_MEM_PRE_REMOVE)
232 		return 0;
233 
234 	xfs_err(mp, "ondisk log corrupt, shutting down fs!");
235 	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
236 	return -EFSCORRUPTED;
237 }
238 
/*
 * Handle a media failure on the data or realtime device: translate the
 * failed byte range to filesystem blocks, walk the rmap records of every
 * group the range touches, and notify each affected incore file.  For a
 * pre-remove notification the fs is frozen around the walk and then force
 * unmounted; otherwise any error or metadata hit shuts the fs down with
 * SHUTDOWN_CORRUPT_ONDISK.
 */
static int
xfs_dax_notify_dev_failure(
	struct xfs_mount	*mp,
	u64			offset,
	u64			len,
	int			mf_flags,
	enum xfs_group_type	type)
{
	struct xfs_failure_info	notify = { .mf_flags = mf_flags };
	struct xfs_trans	*tp = NULL;
	struct xfs_btree_cur	*cur = NULL;
	int			error = 0;
	bool			kernel_frozen = false;
	uint32_t		start_gno, end_gno;
	xfs_fsblock_t		start_bno, end_bno;
	xfs_daddr_t		daddr;
	uint64_t		bblen;
	struct xfs_group	*xg = NULL;

	error = xfs_dax_translate_range(xfs_group_type_buftarg(mp, type),
			offset, len, &daddr, &bblen);
	if (error)
		return error;

	/* Report the media event even if we can't act on it below. */
	xfs_healthmon_report_media(mp,
			type == XG_TYPE_RTG ?  XFS_DEV_RT : XFS_DEV_DATA,
			daddr, bblen);

	/* Without rmapbt we cannot map blocks back to their owners. */
	if (!xfs_has_rmapbt(mp)) {
		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
		return -EOPNOTSUPP;
	}

	if (type == XG_TYPE_RTG) {
		start_bno = xfs_daddr_to_rtb(mp, daddr);
		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
	} else {
		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
	}

	if (mf_flags & MF_MEM_PRE_REMOVE) {
		xfs_info(mp, "Device is about to be removed!");
		/*
		 * Freeze fs to prevent new mappings from being created.
		 * - Keep going on if others already hold the kernel frozen.
		 * - Keep going on if other errors too because this device is
		 *   starting to fail.
		 * - If kernel frozen state is held successfully here, thaw it
		 *   here as well at the end.
		 */
		kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
	}

	/* Empty transaction so the btree cursors have a tp to hang off. */
	tp = xfs_trans_alloc_empty(mp);
	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
		struct xfs_buf		*agf_bp = NULL;
		struct xfs_rtgroup	*rtg = NULL;
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;

		if (type == XG_TYPE_AG) {
			struct xfs_perag	*pag = to_perag(xg);

			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
			if (error) {
				xfs_perag_put(pag);
				break;
			}

			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
		} else {
			rtg = to_rtg(xg);
			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
		}

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * a [start, end] where we are looking for the files or
		 * metadata.  Interior groups keep the full [0, ~0] range;
		 * only the first and last group get trimmed.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		if (xg->xg_gno == start_gno)
			ri_low.rm_startblock =
				xfs_fsb_to_gbno(mp, start_bno, type);
		if (xg->xg_gno == end_gno)
			ri_high.rm_startblock =
				xfs_fsb_to_gbno(mp, end_bno, type);

		notify.startblock = ri_low.rm_startblock;
		notify.blockcount = min(xg->xg_block_count,
					ri_high.rm_startblock + 1) -
					ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_dax_failure_fn, &notify);
		xfs_btree_del_cursor(cur, error);
		if (agf_bp)
			xfs_trans_brelse(tp, agf_bp);
		if (rtg)
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
		if (error) {
			xfs_group_put(xg);
			break;
		}
	}

	xfs_trans_cancel(tp);

	/*
	 * Shutdown fs from a force umount in pre-remove case which won't fail,
	 * so errors can be ignored.  Otherwise, shutdown the filesystem with
	 * CORRUPT flag if error occurred or notify.want_shutdown was set
	 * during RMAP querying.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
	else if (error || notify.want_shutdown) {
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
		if (!error)
			error = -EFSCORRUPTED;
	}

	/* Thaw the fs if it has been frozen before. */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_dax_notify_failure_thaw(mp, kernel_frozen);

	return error;
}
370 
371 static int
372 xfs_dax_notify_failure(
373 	struct dax_device	*dax_dev,
374 	u64			offset,
375 	u64			len,
376 	int			mf_flags)
377 {
378 	struct xfs_mount	*mp = dax_holder(dax_dev);
379 
380 	if (!(mp->m_super->s_flags & SB_BORN)) {
381 		xfs_warn(mp, "filesystem is not ready for notify_failure()!");
382 		return -EIO;
383 	}
384 
385 	if (mp->m_logdev_targp != mp->m_ddev_targp &&
386 	    mp->m_logdev_targp->bt_daxdev == dax_dev) {
387 		return xfs_dax_notify_logdev_failure(mp, offset, len, mf_flags);
388 	}
389 
390 	return xfs_dax_notify_dev_failure(mp, offset, len, mf_flags,
391 		(mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) ?
392 				XG_TYPE_RTG : XG_TYPE_AG);
393 }
394 
/* Registered with the DAX device so it can notify us of media failures. */
const struct dax_holder_operations xfs_dax_holder_operations = {
	.notify_failure		= xfs_dax_notify_failure,
};
398