// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2022 Fujitsu.  All Rights Reserved.
 */

#include "xfs_platform.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_bit.h"
#include "xfs_btree.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_trans.h"
#include "xfs_ag.h"
#include "xfs_notify_failure.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_healthmon.h"

#include <linux/mm.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/fserror.h>

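/*
 * Describes a media failure range: the group-relative start block and block
 * count of the failed region, the memory_failure() flags passed in with the
 * notification, and whether the rmap walk decided the filesystem must be
 * shut down.
 */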
struct xfs_failure_info {
	xfs_agblock_t		startblock;
	xfs_extlen_t		blockcount;
	int			mf_flags;
	bool			want_shutdown;
};

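/*
 * Convert the file offset mapped by an rmap record into the page offset of
 * the first failed page, clamping to the start of the failed range when the
 * failure begins inside the mapped extent.
 */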
static pgoff_t
xfs_failure_pgoff(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rec,
	const struct xfs_failure_info	*notify)
{
	loff_t				pos = XFS_FSB_TO_B(mp, rec->rm_offset);

	if (notify->startblock > rec->rm_startblock)
		pos += XFS_FSB_TO_B(mp,
				notify->startblock - rec->rm_startblock);
	return pos >> PAGE_SHIFT;
}

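/*
 * Compute how many whole pages lie in the intersection of the failed range
 * and the range covered by this rmap record.
 */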
static unsigned long
xfs_failure_pgcnt(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rec,
	const struct xfs_failure_info	*notify)
{
	xfs_agblock_t			end_rec;
	xfs_agblock_t			end_notify;
	xfs_agblock_t			start_cross;
	xfs_agblock_t			end_cross;

	start_cross = max(rec->rm_startblock, notify->startblock);

	end_rec = rec->rm_startblock + rec->rm_blockcount;
	end_notify = notify->startblock + notify->blockcount;
	end_cross = min(end_rec, end_notify);

	return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
}

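/*
 * xfs_rmap_query_range callback: for each mapping that overlaps the failed
 * range, look up the owning inode if it is in core, kill any processes that
 * map the poisoned pages, and report the lost data range.  Failures in
 * metadata (non-inode owners, attr forks, bmbt blocks) can't be handled
 * here, so they mark the filesystem for shutdown instead.
 */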
static int
xfs_dax_failure_fn(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_failure_info		*notify = data;
	struct address_space		*mapping;
	pgoff_t				pgoff;
	unsigned long			pgcnt;
	int				error = 0;

	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
		/* Continue the query because this isn't a failure. */
		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
			return 0;
		notify->want_shutdown = true;
		return 0;
	}

	/* Get files that are incore; filter out others that are not in use. */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
			 0, &ip);
	/* Continue the rmap query if the inode isn't incore. */
	if (error == -ENODATA)
		return 0;
	if (error) {
		notify->want_shutdown = true;
		return 0;
	}

	mapping = VFS_I(ip)->i_mapping;
	pgoff = xfs_failure_pgoff(mp, rec, notify);
	pgcnt = xfs_failure_pgcnt(mp, rec, notify);

	/* Continue the rmap query if the inode isn't a dax file. */
	if (dax_mapping(mapping))
		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
				notify->mf_flags);

	/* Invalidate the cached dax pages. */
	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
		invalidate_inode_pages2_range(mapping, pgoff,
				pgoff + pgcnt - 1);

	fserror_report_data_lost(VFS_I(ip), (u64)pgoff << PAGE_SHIFT,
			(u64)pgcnt << PAGE_SHIFT, GFP_NOFS);

	xfs_irele(ip);
	return error;
}

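/*
 * Freeze the filesystem as the kernel freeze holder so that no new dax
 * mappings can be established while the failure is being processed.
 */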
static int
xfs_dax_notify_failure_freeze(
	struct xfs_mount	*mp)
{
	struct super_block	*sb = mp->m_super;
	int			error;

	error = freeze_super(sb, FREEZE_HOLDER_KERNEL, NULL);
	if (error)
		xfs_emerg(mp, "already frozen by kernel, err=%d", error);

	return error;
}

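/*
 * Undo the pre-remove freeze: drop the kernel freeze if we took it, and
 * drop any userspace freeze as well because the device is going away.
 */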
static void
xfs_dax_notify_failure_thaw(
	struct xfs_mount	*mp,
	bool			kernel_frozen)
{
	struct super_block	*sb = mp->m_super;
	int			error;

	if (kernel_frozen) {
		error = thaw_super(sb, FREEZE_HOLDER_KERNEL, NULL);
		if (error)
			xfs_emerg(mp, "still frozen after notify failure, err=%d",
				error);
	}

	/*
	 * Also thaw the userspace freeze unconditionally because the device
	 * is about to be removed immediately.
	 */
	thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
}

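/*
 * Translate a byte range on the dax device into a daddr/length pair (in
 * 512-byte basic blocks) relative to this buftarg, trimming the range to
 * the part that overlaps the filesystem's portion of the device.
 */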
static int
xfs_dax_translate_range(
	struct xfs_buftarg	*btp,
	u64			offset,
	u64			len,
	xfs_daddr_t		*daddr,
	uint64_t		*bblen)
{
	u64			dev_start = btp->bt_dax_part_off;
	u64			dev_len = BBTOB(btp->bt_nr_sectors);
	u64			dev_end = dev_start + dev_len - 1;

	/* Notify failure on the whole device. */
	if (offset == 0 && len == U64_MAX) {
		offset = dev_start;
		len = dev_len;
	}

	/* Ignore ranges entirely outside the filesystem area. */
	if (offset + len - 1 < dev_start)
		return -ENXIO;
	if (offset > dev_end)
		return -ENXIO;

	/* Calculate the real range when it touches the boundary. */
	if (offset > dev_start)
		offset -= dev_start;
	else {
		len -= dev_start - offset;
		offset = 0;
	}
	if (offset + len - 1 > dev_end)
		len = dev_end - offset + 1;

	*daddr = BTOBB(offset);
	*bblen = BTOBB(len);
	return 0;
}

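/*
 * Handle a media failure in the external log device: report it to the
 * health monitor, then shut the filesystem down, unless the notification
 * is only announcing an impending device removal.
 */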
static int
xfs_dax_notify_logdev_failure(
	struct xfs_mount	*mp,
	u64			offset,
	u64			len,
	int			mf_flags)
{
	xfs_daddr_t		daddr;
	uint64_t		bblen;
	int			error;

	/*
	 * Return ENXIO instead of shutting down the filesystem if the failed
	 * region is beyond the end of the log.
	 */
	error = xfs_dax_translate_range(mp->m_logdev_targp,
			offset, len, &daddr, &bblen);
	if (error)
		return error;

	xfs_healthmon_report_media(mp, XFS_DEV_LOG, daddr, bblen);

	/*
	 * In the pre-remove case the failure notification is attempting to
	 * trigger a force unmount.  The expectation is that the device is
	 * still present, but its removal is in progress and cannot be
	 * cancelled, so proceed with accessing the log device.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		return 0;

	xfs_err(mp, "ondisk log corrupt, shutting down fs!");
	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
	return -EFSCORRUPTED;
}

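/*
 * Handle a media failure in the data or realtime device: walk the rmap
 * btree of each AG or rtgroup touched by the failed range and notify the
 * owner of every affected mapping.  For a pre-remove notification, freeze
 * the filesystem first and force-unmount it when the walk is done.
 */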
static int
xfs_dax_notify_dev_failure(
	struct xfs_mount	*mp,
	u64			offset,
	u64			len,
	int			mf_flags,
	enum xfs_group_type	type)
{
	struct xfs_failure_info	notify = { .mf_flags = mf_flags };
	struct xfs_trans	*tp = NULL;
	struct xfs_btree_cur	*cur = NULL;
	int			error = 0;
	bool			kernel_frozen = false;
	uint32_t		start_gno, end_gno;
	xfs_fsblock_t		start_bno, end_bno;
	xfs_daddr_t		daddr;
	uint64_t		bblen;
	struct xfs_group	*xg = NULL;

	error = xfs_dax_translate_range(xfs_group_type_buftarg(mp, type),
			offset, len, &daddr, &bblen);
	if (error)
		return error;

	xfs_healthmon_report_media(mp,
			type == XG_TYPE_RTG ? XFS_DEV_RT : XFS_DEV_DATA,
			daddr, bblen);

	if (!xfs_has_rmapbt(mp)) {
		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
		return -EOPNOTSUPP;
	}

	if (type == XG_TYPE_RTG) {
		start_bno = xfs_daddr_to_rtb(mp, daddr);
		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
	} else {
		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
	}

	if (mf_flags & MF_MEM_PRE_REMOVE) {
		xfs_info(mp, "Device is about to be removed!");
		/*
		 * Freeze the fs to prevent new mappings from being created.
		 * - Keep going if someone else already holds the kernel
		 *   freeze.
		 * - Keep going on other errors too, because this device is
		 *   starting to fail.
		 * - If the kernel freeze is taken successfully here, thaw it
		 *   at the end as well.
		 */
		kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
	}

	tp = xfs_trans_alloc_empty(mp);
	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
		struct xfs_buf		*agf_bp = NULL;
		struct xfs_rtgroup	*rtg = NULL;
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;

		if (type == XG_TYPE_AG) {
			struct xfs_perag	*pag = to_perag(xg);

			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
			if (error) {
				xfs_perag_rele(pag);
				break;
			}

			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
		} else {
			rtg = to_rtg(xg);
			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
		}

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * a [start, end] interval in which we are looking for the
		 * files or metadata.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		if (xg->xg_gno == start_gno)
			ri_low.rm_startblock =
				xfs_fsb_to_gbno(mp, start_bno, type);
		if (xg->xg_gno == end_gno)
			ri_high.rm_startblock =
				xfs_fsb_to_gbno(mp, end_bno, type);

		notify.startblock = ri_low.rm_startblock;
		notify.blockcount = min(xg->xg_block_count,
					ri_high.rm_startblock + 1) -
					ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_dax_failure_fn, &notify);
		xfs_btree_del_cursor(cur, error);
		if (agf_bp)
			xfs_trans_brelse(tp, agf_bp);
		if (rtg)
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
		if (error) {
			xfs_group_rele(xg);
			break;
		}
	}

	xfs_trans_cancel(tp);

	/*
	 * Shut down the fs with a force umount in the pre-remove case, which
	 * won't fail, so errors can be ignored.  Otherwise, shut down the
	 * filesystem with the CORRUPT flag if an error occurred or
	 * notify.want_shutdown was set during the rmap query.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
	else if (error || notify.want_shutdown) {
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
		if (!error)
			error = -EFSCORRUPTED;
	}

	/* Thaw the fs if it has been frozen before. */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_dax_notify_failure_thaw(mp, kernel_frozen);

	return error;
}

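/*
 * ->notify_failure handler, called by the dax/pmem layer when a media
 * error is detected.  Route the failure to the log, realtime, or data
 * device handler depending on which dax device reported it.
 */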
static int
xfs_dax_notify_failure(
	struct dax_device	*dax_dev,
	u64			offset,
	u64			len,
	int			mf_flags)
{
	struct xfs_mount	*mp = dax_holder(dax_dev);

	if (!(mp->m_super->s_flags & SB_BORN)) {
		xfs_warn(mp, "filesystem is not ready for notify_failure()!");
		return -EIO;
	}

	if (mp->m_logdev_targp != mp->m_ddev_targp &&
	    mp->m_logdev_targp->bt_daxdev == dax_dev) {
		return xfs_dax_notify_logdev_failure(mp, offset, len, mf_flags);
	}

	return xfs_dax_notify_dev_failure(mp, offset, len, mf_flags,
			(mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) ?
			XG_TYPE_RTG : XG_TYPE_AG);
}

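/*
 * XFS is registered as the dax device's holder when the mount sets up its
 * buffer targets, which lets the pmem driver call back into the filesystem
 * when media failures occur.
 */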
const struct dax_holder_operations xfs_dax_holder_operations = {
	.notify_failure		= xfs_dax_notify_failure,
};