// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2022 Fujitsu. All Rights Reserved.
 */

#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_bit.h"
#include "xfs_btree.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_trans.h"
#include "xfs_ag.h"
#include "xfs_notify_failure.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"

#include <linux/mm.h>
#include <linux/dax.h>
#include <linux/fs.h>

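/*
 * Context passed to each rmap query callback while processing a failed
 * range: the failure extent translated into the current group's block
 * space (startblock/blockcount), the MF_* flags handed in by the
 * memory-failure core, and want_shutdown, which records that the failure
 * hit something we cannot recover by killing processes (metadata, or an
 * inode we failed to grab), so the filesystem must be shut down.
 */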
struct xfs_failure_info {
	xfs_agblock_t		startblock;
	xfs_extlen_t		blockcount;
	int			mf_flags;
	bool			want_shutdown;
};

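/*
 * Return the page offset within the file's mapping that corresponds to
 * the start of the failed range inside this rmap record.
 */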
static pgoff_t
xfs_failure_pgoff(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rec,
	const struct xfs_failure_info	*notify)
{
	loff_t				pos = XFS_FSB_TO_B(mp, rec->rm_offset);

	if (notify->startblock > rec->rm_startblock)
		pos += XFS_FSB_TO_B(mp,
				notify->startblock - rec->rm_startblock);
	return pos >> PAGE_SHIFT;
}

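/*
 * Return the number of pages in the intersection of the rmap record and
 * the notified failure range. For example, if the record covers group
 * blocks [8, 16) and the failure covers [12, 32), the overlap is the four
 * blocks [12, 16), which is then converted from bytes to pages.
 */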
static unsigned long
xfs_failure_pgcnt(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rec,
	const struct xfs_failure_info	*notify)
{
	xfs_agblock_t			end_rec;
	xfs_agblock_t			end_notify;
	xfs_agblock_t			start_cross;
	xfs_agblock_t			end_cross;

	start_cross = max(rec->rm_startblock, notify->startblock);

	end_rec = rec->rm_startblock + rec->rm_blockcount;
	end_notify = notify->startblock + notify->blockcount;
	end_cross = min(end_rec, end_notify);

	return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
}

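/*
 * Callback for xfs_rmap_query_range(). Called once for each reverse
 * mapping that overlaps the failed range: kill the processes mapping the
 * affected pages of the owning file, or flag the filesystem for shutdown
 * if the blocks belong to metadata.
 */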
static int
xfs_dax_failure_fn(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_failure_info		*notify = data;
	struct address_space		*mapping;
	pgoff_t				pgoff;
	unsigned long			pgcnt;
	int				error = 0;

	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
		/* Continue the query because this isn't a failure. */
		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
			return 0;
		notify->want_shutdown = true;
		return 0;
	}

	/* Get files that are incore; filter out those that are not in use. */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
			0, &ip);
	/* Continue the rmap query if the inode isn't incore */
	if (error == -ENODATA)
		return 0;
	if (error) {
		notify->want_shutdown = true;
		return 0;
	}

	mapping = VFS_I(ip)->i_mapping;
	pgoff = xfs_failure_pgoff(mp, rec, notify);
	pgcnt = xfs_failure_pgcnt(mp, rec, notify);

	/* Continue the rmap query if the inode isn't a dax file. */
	if (dax_mapping(mapping))
		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
				notify->mf_flags);

	/* Invalidate the cache in dax pages. */
	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
		invalidate_inode_pages2_range(mapping, pgoff,
				pgoff + pgcnt - 1);

	xfs_irele(ip);
	return error;
}

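/*
 * Freeze the filesystem on behalf of the kernel so that no new dax
 * mappings can be created while the failure is being processed.
 */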
static int
xfs_dax_notify_failure_freeze(
	struct xfs_mount	*mp)
{
	struct super_block	*sb = mp->m_super;
	int			error;

	error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
	if (error)
		xfs_emerg(mp, "already frozen by kernel, err=%d", error);

	return error;
}

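/*
 * Undo a kernel-held freeze taken by xfs_dax_notify_failure_freeze(),
 * and drop any userspace-held freeze too, since the device is going away.
 */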
static void
xfs_dax_notify_failure_thaw(
	struct xfs_mount	*mp,
	bool			kernel_frozen)
{
	struct super_block	*sb = mp->m_super;
	int			error;

	if (kernel_frozen) {
		error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
		if (error)
			xfs_emerg(mp, "still frozen after notify failure, err=%d",
				error);
	}

	/*
	 * Also thaw any userspace-held freeze, because the device is about
	 * to be removed immediately.
	 */
	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
}

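/*
 * Translate an absolute byte range on the dax device into a range of
 * 512-byte basic blocks relative to this buftarg. offset == 0 together
 * with len == U64_MAX means the whole device. Returns -ENXIO if the
 * range does not intersect the filesystem's portion of the device;
 * otherwise the range is clamped to the device boundaries.
 */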
static int
xfs_dax_translate_range(
	struct xfs_buftarg	*btp,
	u64			offset,
	u64			len,
	xfs_daddr_t		*daddr,
	uint64_t		*bblen)
{
	u64			dev_start = btp->bt_dax_part_off;
	u64			dev_len = bdev_nr_bytes(btp->bt_bdev);
	u64			dev_end = dev_start + dev_len - 1;

	/* Notify failure on the whole device. */
	if (offset == 0 && len == U64_MAX) {
		offset = dev_start;
		len = dev_len;
	}

	/* Ignore ranges outside the filesystem area. */
	if (offset + len - 1 < dev_start)
		return -ENXIO;
	if (offset > dev_end)
		return -ENXIO;

	/* Rebase the range to the filesystem area and clamp it when it
	 * crosses a boundary. */
	if (offset > dev_start)
		offset -= dev_start;
	else {
		len -= dev_start - offset;
		offset = 0;
	}
	if (offset + len - 1 > dev_end)
		len = dev_end - offset + 1;

	*daddr = BTOBB(offset);
	*bblen = BTOBB(len);
	return 0;
}

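/*
 * Handle a failure in the external log device. Unless the device is
 * about to be removed, a failure here means the ondisk log may be
 * corrupt, so shut down the filesystem.
 */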
static int
xfs_dax_notify_logdev_failure(
	struct xfs_mount	*mp,
	u64			offset,
	u64			len,
	int			mf_flags)
{
	xfs_daddr_t		daddr;
	uint64_t		bblen;
	int			error;

	/*
	 * Return ENXIO instead of shutting down the filesystem if the failed
	 * region is beyond the end of the log.
	 */
	error = xfs_dax_translate_range(mp->m_logdev_targp,
			offset, len, &daddr, &bblen);
	if (error)
		return error;

	/*
	 * In the pre-remove case the failure notification is attempting to
	 * trigger a force unmount. The expectation is that the device is
	 * still present, but its removal is in progress and cannot be
	 * cancelled, so proceed with accessing the log device.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		return 0;

	xfs_err(mp, "ondisk log corrupt, shutting down fs!");
	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
	return -EFSCORRUPTED;
}

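/*
 * Handle a failure in the data or realtime device. Walk every group that
 * overlaps the failed range, query its rmap btree for the overlapping
 * mappings, and notify the owners of those mappings via
 * xfs_dax_failure_fn().
 */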
static int
xfs_dax_notify_dev_failure(
	struct xfs_mount	*mp,
	u64			offset,
	u64			len,
	int			mf_flags,
	enum xfs_group_type	type)
{
	struct xfs_failure_info	notify = { .mf_flags = mf_flags };
	struct xfs_trans	*tp = NULL;
	struct xfs_btree_cur	*cur = NULL;
	int			error = 0;
	bool			kernel_frozen = false;
	uint32_t		start_gno, end_gno;
	xfs_fsblock_t		start_bno, end_bno;
	xfs_daddr_t		daddr;
	uint64_t		bblen;
	struct xfs_group	*xg = NULL;

	if (!xfs_has_rmapbt(mp)) {
		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
		return -EOPNOTSUPP;
	}

	error = xfs_dax_translate_range(type == XG_TYPE_RTG ?
			mp->m_rtdev_targp : mp->m_ddev_targp,
			offset, len, &daddr, &bblen);
	if (error)
		return error;

	if (type == XG_TYPE_RTG) {
		start_bno = xfs_daddr_to_rtb(mp, daddr);
		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
	} else {
		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
	}

	if (mf_flags & MF_MEM_PRE_REMOVE) {
		xfs_info(mp, "Device is about to be removed!");
		/*
		 * Freeze the fs to prevent new mappings from being created.
		 * - Keep going if someone else already holds the kernel
		 *   freeze.
		 * - Keep going on other errors too, because this device is
		 *   starting to fail.
		 * - If the kernel freeze is taken successfully here, thaw it
		 *   at the end as well.
		 */
		kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
	}

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		goto out;

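	/*
	 * Walk each group overlapping [start_bno, end_bno] and run the rmap
	 * query on the portion of the failed range that falls inside it.
	 */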
	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
		struct xfs_buf		*agf_bp = NULL;
		struct xfs_rtgroup	*rtg = NULL;
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;

		if (type == XG_TYPE_AG) {
			struct xfs_perag	*pag = to_perag(xg);

			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
			if (error) {
				xfs_perag_put(pag);
				break;
			}

			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
		} else {
			rtg = to_rtg(xg);
			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
		}

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * a [start, end] range where we are looking for the files or
		 * metadata.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		if (xg->xg_gno == start_gno)
			ri_low.rm_startblock =
				xfs_fsb_to_gbno(mp, start_bno, type);
		if (xg->xg_gno == end_gno)
			ri_high.rm_startblock =
				xfs_fsb_to_gbno(mp, end_bno, type);

		notify.startblock = ri_low.rm_startblock;
		notify.blockcount = min(xg->xg_block_count,
					ri_high.rm_startblock + 1) -
					ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_dax_failure_fn, &notify);
		xfs_btree_del_cursor(cur, error);
		if (agf_bp)
			xfs_trans_brelse(tp, agf_bp);
		if (rtg)
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
		if (error) {
			xfs_group_put(xg);
			break;
		}
	}

	xfs_trans_cancel(tp);

	/*
	 * In the pre-remove case, shut down the fs with a forced unmount,
	 * which cannot fail, so errors can be ignored. Otherwise, shut the
	 * filesystem down with the CORRUPT flag if an error occurred or if
	 * notify.want_shutdown was set during the rmap query.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
	else if (error || notify.want_shutdown) {
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
		if (!error)
			error = -EFSCORRUPTED;
	}

out:
	/* Thaw the fs if it has been frozen before. */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_dax_notify_failure_thaw(mp, kernel_frozen);

	return error;
}

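/*
 * The ->notify_failure handler for the dax holder. Dispatch the failed
 * range to the log, realtime, or data device, depending on which dax
 * device reported the failure.
 */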
static int
xfs_dax_notify_failure(
	struct dax_device	*dax_dev,
	u64			offset,
	u64			len,
	int			mf_flags)
{
	struct xfs_mount	*mp = dax_holder(dax_dev);

	if (!(mp->m_super->s_flags & SB_BORN)) {
		xfs_warn(mp, "filesystem is not ready for notify_failure()!");
		return -EIO;
	}

	if (mp->m_logdev_targp != mp->m_ddev_targp &&
	    mp->m_logdev_targp->bt_daxdev == dax_dev) {
		return xfs_dax_notify_logdev_failure(mp, offset, len, mf_flags);
	}

	return xfs_dax_notify_dev_failure(mp, offset, len, mf_flags,
		(mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) ?
		XG_TYPE_RTG : XG_TYPE_AG);
}

const struct dax_holder_operations xfs_dax_holder_operations = {
	.notify_failure		= xfs_dax_notify_failure,
};