1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (c) 2024-2026 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs_platform.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trace.h"
15 #include "xfs_ag.h"
16 #include "xfs_btree.h"
17 #include "xfs_da_format.h"
18 #include "xfs_da_btree.h"
19 #include "xfs_quota_defs.h"
20 #include "xfs_rtgroup.h"
21 #include "xfs_health.h"
22 #include "xfs_healthmon.h"
23 #include "xfs_fsops.h"
24 #include "xfs_notify_failure.h"
25 #include "xfs_file.h"
26 #include "xfs_ioctl.h"
27
28 #include <linux/anon_inodes.h>
29 #include <linux/eventpoll.h>
30 #include <linux/poll.h>
31 #include <linux/fserror.h>
32
33 /*
34 * Live Health Monitoring
35 * ======================
36 *
37 * Autonomous self-healing of XFS filesystems requires a means for the kernel
38 * to send filesystem health events to a monitoring daemon in userspace. To
39 * accomplish this, we establish a thread_with_file kthread object to handle
40 * translating internal events about filesystem health into a format that can
41 * be parsed easily by userspace. When those internal events occur, the core
42 * filesystem code calls this health monitor to convey the events to userspace.
43 * Userspace reads events from the file descriptor returned by the ioctl.
44 *
45 * The healthmon abstraction has a weak reference to the host filesystem mount
46 * so that the queueing and processing of the events do not pin the mount and
47 * cannot slow down the main filesystem. The healthmon object can exist past
48 * the end of the filesystem mount.
49 */
50
51 /* sign of a detached health monitor */
52 #define DETACHED_MOUNT_COOKIE ((uintptr_t)0)
53
54 /* Constrain the number of event objects that can build up in memory. */
55 #define XFS_HEALTHMON_MAX_EVENTS (SZ_32K / \
56 sizeof(struct xfs_healthmon_event))
57
58 /* Constrain the size of the output buffer for read_iter. */
59 #define XFS_HEALTHMON_MAX_OUTBUF SZ_64K
60
61 /* spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers */
62 static DEFINE_SPINLOCK(xfs_healthmon_lock);
63
64 /* Grab a reference to the healthmon object for a given mount, if any. */
65 static struct xfs_healthmon *
xfs_healthmon_get(struct xfs_mount * mp)66 xfs_healthmon_get(
67 struct xfs_mount *mp)
68 {
69 struct xfs_healthmon *hm;
70
71 rcu_read_lock();
72 hm = mp->m_healthmon;
73 if (hm && !refcount_inc_not_zero(&hm->ref))
74 hm = NULL;
75 rcu_read_unlock();
76
77 return hm;
78 }
79
80 /*
81 * Release the reference to a healthmon object. If there are no more holders,
82 * free the health monitor after an RCU grace period to eliminate possibility
83 * of races with xfs_healthmon_get.
84 */
85 static void
xfs_healthmon_put(struct xfs_healthmon * hm)86 xfs_healthmon_put(
87 struct xfs_healthmon *hm)
88 {
89 if (refcount_dec_and_test(&hm->ref)) {
90 struct xfs_healthmon_event *event;
91 struct xfs_healthmon_event *next = hm->first_event;
92
93 while ((event = next) != NULL) {
94 trace_xfs_healthmon_drop(hm, event);
95 next = event->next;
96 kfree(event);
97 }
98
99 kfree(hm->unmount_event);
100 kfree(hm->buffer);
101 mutex_destroy(&hm->lock);
102 kfree_rcu_mightsleep(hm);
103 }
104 }
105
106 /* Attach a health monitor to an xfs_mount. Only one allowed at a time. */
107 STATIC int
xfs_healthmon_attach(struct xfs_mount * mp,struct xfs_healthmon * hm)108 xfs_healthmon_attach(
109 struct xfs_mount *mp,
110 struct xfs_healthmon *hm)
111 {
112 spin_lock(&xfs_healthmon_lock);
113 if (mp->m_healthmon != NULL) {
114 spin_unlock(&xfs_healthmon_lock);
115 return -EEXIST;
116 }
117
118 refcount_inc(&hm->ref);
119 mp->m_healthmon = hm;
120 hm->mount_cookie = (uintptr_t)mp->m_super;
121 spin_unlock(&xfs_healthmon_lock);
122
123 return 0;
124 }
125
/*
 * Detach a xfs mount from a specific healthmon instance.
 *
 * Clears xfs_mount::m_healthmon and drops the reference that the attach
 * took on behalf of the mount.  Safe to call multiple times (e.g. both from
 * unmount and from file release); later calls see the detached cookie and
 * return early.
 */
STATIC void
xfs_healthmon_detach(
	struct xfs_healthmon	*hm)
{
	spin_lock(&xfs_healthmon_lock);
	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) {
		spin_unlock(&xfs_healthmon_lock);
		return;
	}

	/* While attached, the cookie is the superblock pointer. */
	XFS_M((struct super_block *)hm->mount_cookie)->m_healthmon = NULL;
	hm->mount_cookie = DETACHED_MOUNT_COOKIE;
	spin_unlock(&xfs_healthmon_lock);

	trace_xfs_healthmon_detach(hm);
	/* Drop the reference that xfs_healthmon_attach took. */
	xfs_healthmon_put(hm);
}
144
xfs_healthmon_bump_events(struct xfs_healthmon * hm)145 static inline void xfs_healthmon_bump_events(struct xfs_healthmon *hm)
146 {
147 hm->events++;
148 hm->total_events++;
149 }
150
xfs_healthmon_bump_lost(struct xfs_healthmon * hm)151 static inline void xfs_healthmon_bump_lost(struct xfs_healthmon *hm)
152 {
153 hm->lost_prev_event++;
154 hm->total_lost++;
155 }
156
/*
 * If possible, merge a new event into an existing event.  Returns whether or
 * not it merged anything.
 *
 * Events can merge only when both the type and the domain match; the
 * specific merge rule depends on the event type.  On success, @existing is
 * updated in place to absorb @new.
 */
static bool
xfs_healthmon_merge_events(
	struct xfs_healthmon_event	*existing,
	const struct xfs_healthmon_event *new)
{
	if (!existing)
		return false;

	/* type and domain must match to merge events */
	if (existing->type != new->type ||
	    existing->domain != new->domain)
		return false;

	switch (existing->type) {
	case XFS_HEALTHMON_RUNNING:
	case XFS_HEALTHMON_UNMOUNT:
		/* should only ever be one of these events anyway */
		return false;

	case XFS_HEALTHMON_LOST:
		/* lost-event records just accumulate their counts */
		existing->lostcount += new->lostcount;
		return true;

	case XFS_HEALTHMON_SICK:
	case XFS_HEALTHMON_CORRUPT:
	case XFS_HEALTHMON_HEALTHY:
		/* metadata health events merge by OR-ing sickness masks... */
		switch (existing->domain) {
		case XFS_HEALTHMON_FS:
			existing->fsmask |= new->fsmask;
			return true;
		case XFS_HEALTHMON_AG:
		case XFS_HEALTHMON_RTGROUP:
			/* ...but only within the same alloc/rt group */
			if (existing->group == new->group){
				existing->grpmask |= new->grpmask;
				return true;
			}
			return false;
		case XFS_HEALTHMON_INODE:
			/* ...and only for the same inode incarnation */
			if (existing->ino == new->ino &&
			    existing->gen == new->gen) {
				existing->imask |= new->imask;
				return true;
			}
			return false;
		default:
			ASSERT(0);
			return false;
		}
		return false;

	case XFS_HEALTHMON_SHUTDOWN:
		/* yes, we can race to shutdown */
		existing->flags |= new->flags;
		return true;

	case XFS_HEALTHMON_MEDIA_ERROR:
		/* physically adjacent errors can merge */
		if (existing->daddr + existing->bbcount == new->daddr) {
			existing->bbcount += new->bbcount;
			return true;
		}
		if (new->daddr + new->bbcount == existing->daddr) {
			existing->daddr = new->daddr;
			existing->bbcount += new->bbcount;
			return true;
		}
		return false;

	case XFS_HEALTHMON_BUFREAD:
	case XFS_HEALTHMON_BUFWRITE:
	case XFS_HEALTHMON_DIOREAD:
	case XFS_HEALTHMON_DIOWRITE:
	case XFS_HEALTHMON_DATALOST:
		/* logically adjacent file ranges can merge */
		if (existing->fino != new->fino || existing->fgen != new->fgen)
			return false;

		if (existing->fpos + existing->flen == new->fpos) {
			existing->flen += new->flen;
			return true;
		}

		if (new->fpos + new->flen == existing->fpos) {
			existing->fpos = new->fpos;
			existing->flen += new->flen;
			return true;
		}
		return false;
	}

	return false;
}
253
254 /* Insert an event onto the start of the queue. */
255 static inline void
__xfs_healthmon_insert(struct xfs_healthmon * hm,struct xfs_healthmon_event * event)256 __xfs_healthmon_insert(
257 struct xfs_healthmon *hm,
258 struct xfs_healthmon_event *event)
259 {
260 struct timespec64 now;
261
262 ktime_get_coarse_real_ts64(&now);
263 event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;
264
265 event->next = hm->first_event;
266 if (!hm->first_event)
267 hm->first_event = event;
268 if (!hm->last_event)
269 hm->last_event = event;
270 xfs_healthmon_bump_events(hm);
271 wake_up(&hm->wait);
272
273 trace_xfs_healthmon_insert(hm, event);
274 }
275
276 /* Push an event onto the end of the queue. */
277 static inline void
__xfs_healthmon_push(struct xfs_healthmon * hm,struct xfs_healthmon_event * event)278 __xfs_healthmon_push(
279 struct xfs_healthmon *hm,
280 struct xfs_healthmon_event *event)
281 {
282 struct timespec64 now;
283
284 ktime_get_coarse_real_ts64(&now);
285 event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;
286
287 if (!hm->first_event)
288 hm->first_event = event;
289 if (hm->last_event)
290 hm->last_event->next = event;
291 hm->last_event = event;
292 event->next = NULL;
293 xfs_healthmon_bump_events(hm);
294 wake_up(&hm->wait);
295
296 trace_xfs_healthmon_push(hm, event);
297 }
298
/*
 * Deal with any previously lost events.
 *
 * Queue a LOST event carrying the number of events dropped since the last
 * successful push.  Try to merge it into the newest queued event first;
 * otherwise allocate a fresh queue entry if the queue is under capacity.
 * Returns -ENOMEM (leaving the lost count intact) if nothing could be
 * queued.  Caller must hold hm->lock.
 */
static int
xfs_healthmon_clear_lost_prev(
	struct xfs_healthmon	*hm)
{
	struct xfs_healthmon_event lost_event = {
		.type		= XFS_HEALTHMON_LOST,
		.domain		= XFS_HEALTHMON_MOUNT,
		.lostcount	= hm->lost_prev_event,
	};
	struct xfs_healthmon_event *event = NULL;

	if (xfs_healthmon_merge_events(hm->last_event, &lost_event)) {
		trace_xfs_healthmon_merge(hm, hm->last_event);
		wake_up(&hm->wait);
		goto cleared;
	}

	/* Only allocate if we are not already at queue capacity. */
	if (hm->events < XFS_HEALTHMON_MAX_EVENTS)
		event = kmemdup(&lost_event, sizeof(struct xfs_healthmon_event),
				GFP_NOFS);
	if (!event)
		return -ENOMEM;

	__xfs_healthmon_push(hm, event);
cleared:
	hm->lost_prev_event = 0;
	return 0;
}
328
/*
 * Push an event onto the end of the list after dealing with lost events and
 * possibly full queues.
 *
 * Returns 0 if the event was queued or merged; -ESHUTDOWN if the monitor is
 * already detached; -ENOMEM if a queue entry could not be created, in which
 * case the event is accounted as lost and reported later.
 */
STATIC int
xfs_healthmon_push(
	struct xfs_healthmon		*hm,
	const struct xfs_healthmon_event *template)
{
	struct xfs_healthmon_event	*event = NULL;
	int				error = 0;

	/*
	 * Locklessly check if the health monitor has already detached from the
	 * mount.  If so, ignore the event.  If we race with deactivation,
	 * we'll queue the event but never send it.
	 */
	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
		return -ESHUTDOWN;

	mutex_lock(&hm->lock);

	/* Report previously lost events before we do anything else */
	if (hm->lost_prev_event) {
		error = xfs_healthmon_clear_lost_prev(hm);
		if (error)
			goto out_unlock;
	}

	/* Try to merge with the newest event */
	if (xfs_healthmon_merge_events(hm->last_event, template)) {
		trace_xfs_healthmon_merge(hm, hm->last_event);
		wake_up(&hm->wait);
		goto out_unlock;
	}

	/* Only create a heap event object if we're not already at capacity. */
	if (hm->events < XFS_HEALTHMON_MAX_EVENTS)
		event = kmemdup(template, sizeof(struct xfs_healthmon_event),
				GFP_NOFS);
	if (!event) {
		/* No memory (or full queue) means we lose the event */
		trace_xfs_healthmon_lost_event(hm);
		xfs_healthmon_bump_lost(hm);
		error = -ENOMEM;
		goto out_unlock;
	}

	__xfs_healthmon_push(hm, event);

out_unlock:
	mutex_unlock(&hm->lock);
	return error;
}
383
384 /*
385 * Report that the filesystem is being unmounted, then detach the xfs mount
386 * from this healthmon instance.
387 */
388 void
xfs_healthmon_unmount(struct xfs_mount * mp)389 xfs_healthmon_unmount(
390 struct xfs_mount *mp)
391 {
392 struct xfs_healthmon *hm = xfs_healthmon_get(mp);
393
394 if (!hm)
395 return;
396
397 trace_xfs_healthmon_report_unmount(hm);
398
399 /*
400 * Insert the unmount notification at the start of the event queue so
401 * that userspace knows the filesystem went away as soon as possible.
402 * There's nothing actionable for userspace after an unmount. Once
403 * we've inserted the unmount event, hm no longer owns that event.
404 */
405 __xfs_healthmon_insert(hm, hm->unmount_event);
406 hm->unmount_event = NULL;
407
408 xfs_healthmon_detach(hm);
409 xfs_healthmon_put(hm);
410 }
411
412 /* Compute the reporting mask for non-unmount metadata health events. */
413 static inline unsigned int
metadata_event_mask(struct xfs_healthmon * hm,enum xfs_healthmon_type type,unsigned int old_mask,unsigned int new_mask)414 metadata_event_mask(
415 struct xfs_healthmon *hm,
416 enum xfs_healthmon_type type,
417 unsigned int old_mask,
418 unsigned int new_mask)
419 {
420 /* If we want all events, return all events. */
421 if (hm->verbose)
422 return new_mask;
423
424 switch (type) {
425 case XFS_HEALTHMON_SICK:
426 /* Always report runtime corruptions */
427 return new_mask;
428 case XFS_HEALTHMON_CORRUPT:
429 /* Only report new fsck errors */
430 return new_mask & ~old_mask;
431 case XFS_HEALTHMON_HEALTHY:
432 /* Only report healthy metadata that got fixed */
433 return new_mask & old_mask;
434 default:
435 ASSERT(0);
436 break;
437 }
438
439 return 0;
440 }
441
/*
 * Report XFS_FS_SICK_* events to healthmon.
 *
 * Filters the mask through the monitor's reporting policy and strips
 * secondary-metadata bits before queueing; drops the event entirely if
 * nothing remains to report.
 */
void
xfs_healthmon_report_fs(
	struct xfs_mount	*mp,
	enum xfs_healthmon_type	type,
	unsigned int		old_mask,
	unsigned int		new_mask)
{
	struct xfs_healthmon_event event = {
		.type		= type,
		.domain		= XFS_HEALTHMON_FS,
	};
	struct xfs_healthmon	*hm = xfs_healthmon_get(mp);

	if (!hm)
		return;

	/* Secondary metadata state is derived, so don't report it. */
	event.fsmask = metadata_event_mask(hm, type, old_mask, new_mask) &
			~XFS_SICK_FS_SECONDARY;
	trace_xfs_healthmon_report_fs(hm, old_mask, new_mask, &event);

	if (event.fsmask)
		xfs_healthmon_push(hm, &event);

	xfs_healthmon_put(hm);
}
468
/*
 * Report XFS_SICK_(AG|RG)* flags to healthmon.
 *
 * Resolves the group type to the right healthmon domain and mask
 * translation, strips secondary-metadata bits, and queues the event if any
 * reportable bits remain.
 */
void
xfs_healthmon_report_group(
	struct xfs_group	*xg,
	enum xfs_healthmon_type	type,
	unsigned int		old_mask,
	unsigned int		new_mask)
{
	struct xfs_healthmon_event event = {
		.type		= type,
		.group		= xg->xg_gno,
	};
	struct xfs_healthmon	*hm = xfs_healthmon_get(xg->xg_mount);

	if (!hm)
		return;

	switch (xg->xg_type) {
	case XG_TYPE_RTG:
		event.domain = XFS_HEALTHMON_RTGROUP;
		event.grpmask = metadata_event_mask(hm, type, old_mask,
						    new_mask) &
				~XFS_SICK_RG_SECONDARY;
		break;
	case XG_TYPE_AG:
		event.domain = XFS_HEALTHMON_AG;
		event.grpmask = metadata_event_mask(hm, type, old_mask,
						    new_mask) &
				~XFS_SICK_AG_SECONDARY;
		break;
	default:
		/* Unknown group type: event.grpmask stays 0, nothing sent. */
		ASSERT(0);
		break;
	}

	trace_xfs_healthmon_report_group(hm, old_mask, new_mask, &event);

	if (event.grpmask)
		xfs_healthmon_push(hm, &event);

	xfs_healthmon_put(hm);
}
511
/*
 * Report XFS_SICK_INO_* flags to healthmon.
 *
 * Records the inode number and generation so userspace can tell apart
 * different incarnations of the same inode number.
 */
void
xfs_healthmon_report_inode(
	struct xfs_inode	*ip,
	enum xfs_healthmon_type	type,
	unsigned int		old_mask,
	unsigned int		new_mask)
{
	struct xfs_healthmon_event event = {
		.type		= type,
		.domain		= XFS_HEALTHMON_INODE,
		.ino		= ip->i_ino,
		.gen		= VFS_I(ip)->i_generation,
	};
	struct xfs_healthmon	*hm = xfs_healthmon_get(ip->i_mount);

	if (!hm)
		return;

	event.imask = metadata_event_mask(hm, type, old_mask, new_mask) &
			~XFS_SICK_INO_SECONDARY;
	/*
	 * NOTE(review): unlike the fs/group reporters, this traces the
	 * filtered event.imask rather than the raw new_mask -- confirm this
	 * asymmetry is intentional.
	 */
	trace_xfs_healthmon_report_inode(hm, old_mask, event.imask, &event);

	if (event.imask)
		xfs_healthmon_push(hm, &event);

	xfs_healthmon_put(hm);
}
540
541 /* Add a shutdown event to the reporting queue. */
542 void
xfs_healthmon_report_shutdown(struct xfs_mount * mp,uint32_t flags)543 xfs_healthmon_report_shutdown(
544 struct xfs_mount *mp,
545 uint32_t flags)
546 {
547 struct xfs_healthmon_event event = {
548 .type = XFS_HEALTHMON_SHUTDOWN,
549 .domain = XFS_HEALTHMON_MOUNT,
550 .flags = flags,
551 };
552 struct xfs_healthmon *hm = xfs_healthmon_get(mp);
553
554 if (!hm)
555 return;
556
557 trace_xfs_healthmon_report_shutdown(hm, flags);
558
559 xfs_healthmon_push(hm, &event);
560 xfs_healthmon_put(hm);
561 }
562
563 static inline enum xfs_healthmon_domain
media_error_domain(enum xfs_device fdev)564 media_error_domain(
565 enum xfs_device fdev)
566 {
567 switch (fdev) {
568 case XFS_DEV_DATA:
569 return XFS_HEALTHMON_DATADEV;
570 case XFS_DEV_LOG:
571 return XFS_HEALTHMON_LOGDEV;
572 case XFS_DEV_RT:
573 return XFS_HEALTHMON_RTDEV;
574 }
575
576 ASSERT(0);
577 return 0;
578 }
579
/*
 * Add a media error event to the reporting queue.
 *
 * @daddr/@bbcount describe the bad range in 512-byte basic blocks on the
 * given device.  Adjacent ranges may later be merged in the queue.
 */
void
xfs_healthmon_report_media(
	struct xfs_mount	*mp,
	enum xfs_device		fdev,
	xfs_daddr_t		daddr,
	uint64_t		bbcount)
{
	struct xfs_healthmon_event event = {
		.type		= XFS_HEALTHMON_MEDIA_ERROR,
		.domain		= media_error_domain(fdev),
		.daddr		= daddr,
		.bbcount	= bbcount,
	};
	struct xfs_healthmon	*hm = xfs_healthmon_get(mp);

	if (!hm)
		return;

	trace_xfs_healthmon_report_media(hm, fdev, &event);

	xfs_healthmon_push(hm, &event);
	xfs_healthmon_put(hm);
}
604
file_ioerr_type(enum fserror_type action)605 static inline enum xfs_healthmon_type file_ioerr_type(enum fserror_type action)
606 {
607 switch (action) {
608 case FSERR_BUFFERED_READ:
609 return XFS_HEALTHMON_BUFREAD;
610 case FSERR_BUFFERED_WRITE:
611 return XFS_HEALTHMON_BUFWRITE;
612 case FSERR_DIRECTIO_READ:
613 return XFS_HEALTHMON_DIOREAD;
614 case FSERR_DIRECTIO_WRITE:
615 return XFS_HEALTHMON_DIOWRITE;
616 case FSERR_DATA_LOST:
617 return XFS_HEALTHMON_DATALOST;
618 case FSERR_METADATA:
619 /* filtered out by xfs_fs_report_error */
620 break;
621 }
622
623 ASSERT(0);
624 return -1;
625 }
626
/*
 * Add a file io error event to the reporting queue.
 *
 * Captures the inode, generation, and the byte range that failed, along
 * with the (positive) errno value.
 */
void
xfs_healthmon_report_file_ioerror(
	struct xfs_inode	*ip,
	const struct fserror_event *p)
{
	struct xfs_healthmon_event event = {
		.type		= file_ioerr_type(p->type),
		.domain		= XFS_HEALTHMON_FILERANGE,
		.fino		= ip->i_ino,
		.fgen		= VFS_I(ip)->i_generation,
		.fpos		= p->pos,
		.flen		= p->len,
		/* send positive error number to userspace */
		.error		= -p->error,
	};
	struct xfs_healthmon	*hm = xfs_healthmon_get(ip->i_mount);

	if (!hm)
		return;

	trace_xfs_healthmon_report_file_ioerror(hm, p);

	xfs_healthmon_push(hm, &event);
	xfs_healthmon_put(hm);
}
653
654 static inline void
xfs_healthmon_reset_outbuf(struct xfs_healthmon * hm)655 xfs_healthmon_reset_outbuf(
656 struct xfs_healthmon *hm)
657 {
658 hm->buftail = 0;
659 hm->bufhead = 0;
660 }
661
/* One entry translating an in-kernel flag bit to its uapi equivalent. */
struct flags_map {
	unsigned int	in_mask;
	unsigned int	out_mask;
};
666
/* Map in-kernel SHUTDOWN_* reasons to XFS_HEALTH_SHUTDOWN_* uapi bits. */
static const struct flags_map shutdown_map[] = {
	{ SHUTDOWN_META_IO_ERROR,	XFS_HEALTH_SHUTDOWN_META_IO_ERROR },
	{ SHUTDOWN_LOG_IO_ERROR,	XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR },
	{ SHUTDOWN_FORCE_UMOUNT,	XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT },
	{ SHUTDOWN_CORRUPT_INCORE,	XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE },
	{ SHUTDOWN_CORRUPT_ONDISK,	XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK },
	{ SHUTDOWN_DEVICE_REMOVED,	XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED },
};
675
676 static inline unsigned int
__map_flags(const struct flags_map * map,size_t array_len,unsigned int flags)677 __map_flags(
678 const struct flags_map *map,
679 size_t array_len,
680 unsigned int flags)
681 {
682 const struct flags_map *m;
683 unsigned int ret = 0;
684
685 for (m = map; m < map + array_len; m++) {
686 if (flags & m->in_mask)
687 ret |= m->out_mask;
688 }
689
690 return ret;
691 }
692
693 #define map_flags(map, flags) __map_flags((map), ARRAY_SIZE(map), (flags))
694
/* Translate SHUTDOWN_* reason flags into the uapi shutdown-reason mask. */
static inline unsigned int shutdown_mask(unsigned int in)
{
	return map_flags(shutdown_map, in);
}
699
/* Translate enum xfs_healthmon_domain values to uapi domain codes. */
static const unsigned int domain_map[] = {
	[XFS_HEALTHMON_MOUNT]		= XFS_HEALTH_MONITOR_DOMAIN_MOUNT,
	[XFS_HEALTHMON_FS]		= XFS_HEALTH_MONITOR_DOMAIN_FS,
	[XFS_HEALTHMON_AG]		= XFS_HEALTH_MONITOR_DOMAIN_AG,
	[XFS_HEALTHMON_INODE]		= XFS_HEALTH_MONITOR_DOMAIN_INODE,
	[XFS_HEALTHMON_RTGROUP]		= XFS_HEALTH_MONITOR_DOMAIN_RTGROUP,
	[XFS_HEALTHMON_DATADEV]		= XFS_HEALTH_MONITOR_DOMAIN_DATADEV,
	[XFS_HEALTHMON_RTDEV]		= XFS_HEALTH_MONITOR_DOMAIN_RTDEV,
	[XFS_HEALTHMON_LOGDEV]		= XFS_HEALTH_MONITOR_DOMAIN_LOGDEV,
	[XFS_HEALTHMON_FILERANGE]	= XFS_HEALTH_MONITOR_DOMAIN_FILERANGE,
};
711
/* Translate enum xfs_healthmon_type values to uapi event-type codes. */
static const unsigned int type_map[] = {
	[XFS_HEALTHMON_RUNNING]		= XFS_HEALTH_MONITOR_TYPE_RUNNING,
	[XFS_HEALTHMON_LOST]		= XFS_HEALTH_MONITOR_TYPE_LOST,
	[XFS_HEALTHMON_SICK]		= XFS_HEALTH_MONITOR_TYPE_SICK,
	[XFS_HEALTHMON_CORRUPT]		= XFS_HEALTH_MONITOR_TYPE_CORRUPT,
	[XFS_HEALTHMON_HEALTHY]		= XFS_HEALTH_MONITOR_TYPE_HEALTHY,
	[XFS_HEALTHMON_UNMOUNT]		= XFS_HEALTH_MONITOR_TYPE_UNMOUNT,
	[XFS_HEALTHMON_SHUTDOWN]	= XFS_HEALTH_MONITOR_TYPE_SHUTDOWN,
	[XFS_HEALTHMON_MEDIA_ERROR]	= XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR,
	[XFS_HEALTHMON_BUFREAD]		= XFS_HEALTH_MONITOR_TYPE_BUFREAD,
	[XFS_HEALTHMON_BUFWRITE]	= XFS_HEALTH_MONITOR_TYPE_BUFWRITE,
	[XFS_HEALTHMON_DIOREAD]		= XFS_HEALTH_MONITOR_TYPE_DIOREAD,
	[XFS_HEALTHMON_DIOWRITE]	= XFS_HEALTH_MONITOR_TYPE_DIOWRITE,
	[XFS_HEALTHMON_DATALOST]	= XFS_HEALTH_MONITOR_TYPE_DATALOST,
};
727
/*
 * Render event as a V0 structure and append it to the output buffer.
 *
 * Returns -EFSCORRUPTED if the in-memory event carries a domain or type that
 * doesn't map to the uapi encoding.  Caller is expected to have verified
 * that there is room in the outbuf (see xfs_healthmon_format_pop).
 */
STATIC int
xfs_healthmon_format_v0(
	struct xfs_healthmon	*hm,
	const struct xfs_healthmon_event *event)
{
	struct xfs_health_monitor_event hme = {
		.time_ns	= event->time_ns,
	};

	trace_xfs_healthmon_format(hm, event);

	/* Reject events we don't know how to encode for userspace. */
	if (event->domain < 0 || event->domain >= ARRAY_SIZE(domain_map) ||
	    event->type < 0 || event->type >= ARRAY_SIZE(type_map))
		return -EFSCORRUPTED;

	hme.domain = domain_map[event->domain];
	hme.type = type_map[event->type];

	/* fill in the event-specific details */
	switch (event->domain) {
	case XFS_HEALTHMON_MOUNT:
		switch (event->type) {
		case XFS_HEALTHMON_LOST:
			hme.e.lost.count = event->lostcount;
			break;
		case XFS_HEALTHMON_SHUTDOWN:
			hme.e.shutdown.reasons = shutdown_mask(event->flags);
			break;
		default:
			break;
		}
		break;
	case XFS_HEALTHMON_FS:
		hme.e.fs.mask = xfs_healthmon_fs_mask(event->fsmask);
		break;
	case XFS_HEALTHMON_RTGROUP:
		hme.e.group.mask = xfs_healthmon_rtgroup_mask(event->grpmask);
		hme.e.group.gno = event->group;
		break;
	case XFS_HEALTHMON_AG:
		hme.e.group.mask = xfs_healthmon_perag_mask(event->grpmask);
		hme.e.group.gno = event->group;
		break;
	case XFS_HEALTHMON_INODE:
		hme.e.inode.mask = xfs_healthmon_inode_mask(event->imask);
		hme.e.inode.ino = event->ino;
		hme.e.inode.gen = event->gen;
		break;
	case XFS_HEALTHMON_DATADEV:
	case XFS_HEALTHMON_LOGDEV:
	case XFS_HEALTHMON_RTDEV:
		hme.e.media.daddr = event->daddr;
		hme.e.media.bbcount = event->bbcount;
		break;
	case XFS_HEALTHMON_FILERANGE:
		hme.e.filerange.ino = event->fino;
		hme.e.filerange.gen = event->fgen;
		hme.e.filerange.pos = event->fpos;
		hme.e.filerange.len = event->flen;
		hme.e.filerange.error = abs(event->error);
		break;
	default:
		break;
	}

	ASSERT(hm->bufhead + sizeof(hme) <= hm->bufsize);

	/* copy formatted object to the outbuf */
	if (hm->bufhead + sizeof(hme) <= hm->bufsize) {
		memcpy(hm->buffer + hm->bufhead, &hme, sizeof(hme));
		hm->bufhead += sizeof(hme);
	}

	return 0;
}
804
805 /* How many bytes are waiting in the outbuf to be copied? */
806 static inline size_t
xfs_healthmon_outbuf_bytes(struct xfs_healthmon * hm)807 xfs_healthmon_outbuf_bytes(
808 struct xfs_healthmon *hm)
809 {
810 if (hm->bufhead > hm->buftail)
811 return hm->bufhead - hm->buftail;
812 return 0;
813 }
814
815 /*
816 * Do we have something for userspace to read? This can mean unmount events,
817 * events pending in the queue, or pending bytes in the outbuf.
818 */
819 static inline bool
xfs_healthmon_has_eventdata(struct xfs_healthmon * hm)820 xfs_healthmon_has_eventdata(
821 struct xfs_healthmon *hm)
822 {
823 /*
824 * If the health monitor is already detached from the xfs_mount, we
825 * want reads to return 0 bytes even if there are no events, because
826 * userspace interprets that as EOF. If we race with deactivation,
827 * read_iter will take the necessary locks to discover that there are
828 * no events to send.
829 */
830 if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
831 return true;
832
833 /*
834 * Either there are events waiting to be formatted into the buffer, or
835 * there's unread bytes in the buffer.
836 */
837 return hm->events > 0 || xfs_healthmon_outbuf_bytes(hm) > 0;
838 }
839
/*
 * Try to copy the rest of the outbuf to the iov iter.
 *
 * Returns the number of bytes copied, 0 if the outbuf was empty, or
 * -EFAULT if nothing at all could be copied to userspace.  Partial copies
 * advance buftail so the remainder is retried on the next read.
 */
STATIC ssize_t
xfs_healthmon_copybuf(
	struct xfs_healthmon	*hm,
	struct iov_iter		*to)
{
	size_t			to_copy;
	size_t			w = 0;

	trace_xfs_healthmon_copybuf(hm, to);

	to_copy = xfs_healthmon_outbuf_bytes(hm);
	if (to_copy) {
		w = copy_to_iter(hm->buffer + hm->buftail, to_copy, to);
		if (!w)
			return -EFAULT;

		hm->buftail += w;
	}

	/*
	 * Nothing left to copy?  Reset the output buffer cursors to the start
	 * since there's no live data in the buffer.
	 */
	if (xfs_healthmon_outbuf_bytes(hm) == 0)
		xfs_healthmon_reset_outbuf(hm);
	return w;
}
868
/*
 * Return a health monitoring event for formatting into the output buffer if
 * there's enough space in the outbuf and an event waiting for us.  Caller
 * must hold i_rwsem on the healthmon file.
 *
 * Ownership of the returned event transfers to the caller, who must kfree
 * it after formatting.  Returns NULL when the outbuf is full or the queue
 * is empty.
 */
static inline struct xfs_healthmon_event *
xfs_healthmon_format_pop(
	struct xfs_healthmon	*hm)
{
	struct xfs_healthmon_event	*event;

	/* No room in the outbuf for another formatted event record. */
	if (hm->bufhead + sizeof(*event) > hm->bufsize)
		return NULL;

	mutex_lock(&hm->lock);
	event = hm->first_event;
	if (event) {
		if (hm->last_event == event)
			hm->last_event = NULL;
		hm->first_event = event->next;
		hm->events--;

		trace_xfs_healthmon_pop(hm, event);
	}
	mutex_unlock(&hm->lock);
	return event;
}
896
897 /* Allocate formatting buffer */
898 STATIC int
xfs_healthmon_alloc_outbuf(struct xfs_healthmon * hm,size_t user_bufsize)899 xfs_healthmon_alloc_outbuf(
900 struct xfs_healthmon *hm,
901 size_t user_bufsize)
902 {
903 void *outbuf;
904 size_t bufsize =
905 min(XFS_HEALTHMON_MAX_OUTBUF, max(PAGE_SIZE, user_bufsize));
906
907 outbuf = kzalloc(bufsize, GFP_KERNEL);
908 if (!outbuf) {
909 if (bufsize == PAGE_SIZE)
910 return -ENOMEM;
911
912 bufsize = PAGE_SIZE;
913 outbuf = kzalloc(bufsize, GFP_KERNEL);
914 if (!outbuf)
915 return -ENOMEM;
916 }
917
918 hm->buffer = outbuf;
919 hm->bufsize = bufsize;
920 hm->bufhead = 0;
921 hm->buftail = 0;
922
923 return 0;
924 }
925
/*
 * Convey queued event data to userspace.  First copy any remaining bytes in
 * the outbuf, then format the oldest event into the outbuf and copy that too.
 *
 * Readers are serialized by the file's i_rwsem; queue access within the loop
 * is serialized by hm->lock (taken in xfs_healthmon_format_pop).  Blocking
 * reads wait until there is event data or the monitor detaches (EOF).
 */
STATIC ssize_t
xfs_healthmon_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file_inode(file);
	struct xfs_healthmon	*hm = file->private_data;
	struct xfs_healthmon_event *event;
	size_t			copied = 0;
	ssize_t			ret = 0;

	if (file->f_flags & O_NONBLOCK) {
		/* Nonblocking: don't wait for data or for the inode lock. */
		if (!xfs_healthmon_has_eventdata(hm) || !inode_trylock(inode))
			return -EAGAIN;
	} else {
		ret = wait_event_interruptible(hm->wait,
				xfs_healthmon_has_eventdata(hm));
		if (ret)
			return ret;

		inode_lock(inode);
	}

	/* Lazily allocate the outbuf on first read, sized to this read. */
	if (hm->bufsize == 0) {
		ret = xfs_healthmon_alloc_outbuf(hm, iov_iter_count(to));
		if (ret)
			goto out_unlock;
	}

	trace_xfs_healthmon_read_start(hm);

	/*
	 * If there's anything left in the output buffer, copy that before
	 * formatting more events.
	 */
	ret = xfs_healthmon_copybuf(hm, to);
	if (ret < 0)
		goto out_unlock;
	copied += ret;

	while (iov_iter_count(to) > 0) {
		/* Format the next events into the outbuf until it's full. */
		while ((event = xfs_healthmon_format_pop(hm)) != NULL) {
			ret = xfs_healthmon_format_v0(hm, event);
			kfree(event);
			if (ret)
				goto out_unlock;
		}

		/* Copy anything formatted into outbuf to userspace */
		ret = xfs_healthmon_copybuf(hm, to);
		if (ret <= 0)
			break;

		copied += ret;
	}

out_unlock:
	trace_xfs_healthmon_read_finish(hm);
	inode_unlock(inode);
	/* Report bytes copied if any; otherwise propagate the error code. */
	return copied ?: ret;
}
993
994 /* Poll for available events. */
995 STATIC __poll_t
xfs_healthmon_poll(struct file * file,struct poll_table_struct * wait)996 xfs_healthmon_poll(
997 struct file *file,
998 struct poll_table_struct *wait)
999 {
1000 struct xfs_healthmon *hm = file->private_data;
1001 __poll_t mask = 0;
1002
1003 poll_wait(file, &hm->wait, wait);
1004
1005 if (xfs_healthmon_has_eventdata(hm))
1006 mask |= EPOLLIN;
1007 return mask;
1008 }
1009
/*
 * Free the health monitoring information.
 *
 * Called when the last reference to the healthmon fd is dropped; detaches
 * from the mount (if still attached) and drops the file's reference.
 */
STATIC int
xfs_healthmon_release(
	struct inode		*inode,
	struct file		*file)
{
	struct xfs_healthmon	*hm = file->private_data;

	trace_xfs_healthmon_release(hm);

	/*
	 * We might be closing the healthmon file before the filesystem
	 * unmounts, because userspace processes can terminate at any time and
	 * for any reason.  Null out xfs_mount::m_healthmon so that another
	 * process can create another health monitor file.
	 */
	xfs_healthmon_detach(hm);

	/*
	 * Wake up any readers that might be left.  There shouldn't be any
	 * because the only users of the waiter are read and poll.
	 */
	wake_up_all(&hm->wait);

	xfs_healthmon_put(hm);
	return 0;
}
1037
1038 /* Validate ioctl parameters. */
1039 static inline bool
xfs_healthmon_validate(const struct xfs_health_monitor * hmo)1040 xfs_healthmon_validate(
1041 const struct xfs_health_monitor *hmo)
1042 {
1043 if (hmo->flags & ~XFS_HEALTH_MONITOR_ALL)
1044 return false;
1045 if (hmo->format != XFS_HEALTH_MONITOR_FMT_V0)
1046 return false;
1047 if (memchr_inv(&hmo->pad, 0, sizeof(hmo->pad)))
1048 return false;
1049 return true;
1050 }
1051
/*
 * Emit some data about the health monitoring fd for /proc/pid/fdinfo:
 * attachment state, monitored device numbers, format version, and the
 * lifetime event/lost counters.
 */
static void
xfs_healthmon_show_fdinfo(
	struct seq_file		*m,
	struct file		*file)
{
	struct xfs_healthmon	*hm = file->private_data;

	/* hm->lock keeps the counters and cookie mutually consistent. */
	mutex_lock(&hm->lock);
	seq_printf(m, "state:\t%s\ndev:\t%d:%d\nformat:\tv0\nevents:\t%llu\nlost:\t%llu\n",
			hm->mount_cookie == DETACHED_MOUNT_COOKIE ?
				"dead" : "alive",
			MAJOR(hm->dev), MINOR(hm->dev),
			hm->total_events,
			hm->total_lost);
	mutex_unlock(&hm->lock);
}
1069
1070 /* Reconfigure the health monitor. */
1071 STATIC long
xfs_healthmon_reconfigure(struct file * file,unsigned int cmd,void __user * arg)1072 xfs_healthmon_reconfigure(
1073 struct file *file,
1074 unsigned int cmd,
1075 void __user *arg)
1076 {
1077 struct xfs_health_monitor hmo;
1078 struct xfs_healthmon *hm = file->private_data;
1079
1080 if (copy_from_user(&hmo, arg, sizeof(hmo)))
1081 return -EFAULT;
1082
1083 if (!xfs_healthmon_validate(&hmo))
1084 return -EINVAL;
1085
1086 mutex_lock(&hm->lock);
1087 hm->verbose = !!(hmo.flags & XFS_HEALTH_MONITOR_VERBOSE);
1088 mutex_unlock(&hm->lock);
1089
1090 return 0;
1091 }
1092
1093 /* Does the fd point to the same filesystem as the one we're monitoring? */
1094 STATIC long
xfs_healthmon_file_on_monitored_fs(struct file * file,unsigned int cmd,void __user * arg)1095 xfs_healthmon_file_on_monitored_fs(
1096 struct file *file,
1097 unsigned int cmd,
1098 void __user *arg)
1099 {
1100 struct xfs_health_file_on_monitored_fs hms;
1101 struct xfs_healthmon *hm = file->private_data;
1102 struct inode *hms_inode;
1103
1104 if (copy_from_user(&hms, arg, sizeof(hms)))
1105 return -EFAULT;
1106
1107 if (hms.flags)
1108 return -EINVAL;
1109
1110 CLASS(fd, hms_fd)(hms.fd);
1111 if (fd_empty(hms_fd))
1112 return -EBADF;
1113
1114 hms_inode = file_inode(fd_file(hms_fd));
1115 mutex_lock(&hm->lock);
1116 if (hm->mount_cookie != (uintptr_t)hms_inode->i_sb) {
1117 mutex_unlock(&hm->lock);
1118 return -ESTALE;
1119 }
1120
1121 mutex_unlock(&hm->lock);
1122 return 0;
1123 }
1124
1125 /* Handle ioctls for the health monitoring thread. */
1126 STATIC long
xfs_healthmon_ioctl(struct file * file,unsigned int cmd,unsigned long p)1127 xfs_healthmon_ioctl(
1128 struct file *file,
1129 unsigned int cmd,
1130 unsigned long p)
1131 {
1132 void __user *arg = (void __user *)p;
1133
1134 switch (cmd) {
1135 case XFS_IOC_HEALTH_MONITOR:
1136 return xfs_healthmon_reconfigure(file, cmd, arg);
1137 case XFS_IOC_HEALTH_FD_ON_MONITORED_FS:
1138 return xfs_healthmon_file_on_monitored_fs(file, cmd, arg);
1139 default:
1140 break;
1141 }
1142
1143 return -ENOTTY;
1144 }
1145
/* File operations for the anonymous health monitoring fd. */
static const struct file_operations xfs_healthmon_fops = {
	.owner			= THIS_MODULE,
	.show_fdinfo		= xfs_healthmon_show_fdinfo,
	.read_iter		= xfs_healthmon_read_iter,
	.poll			= xfs_healthmon_poll,
	.release		= xfs_healthmon_release,
	.unlocked_ioctl		= xfs_healthmon_ioctl,
};
1154
1155 /*
1156 * Create a health monitoring file. Returns an index to the fd table or a
1157 * negative errno.
1158 */
1159 long
xfs_ioc_health_monitor(struct file * file,struct xfs_health_monitor __user * arg)1160 xfs_ioc_health_monitor(
1161 struct file *file,
1162 struct xfs_health_monitor __user *arg)
1163 {
1164 struct xfs_health_monitor hmo;
1165 struct xfs_healthmon_event *running_event;
1166 struct xfs_healthmon *hm;
1167 struct xfs_inode *ip = XFS_I(file_inode(file));
1168 struct xfs_mount *mp = ip->i_mount;
1169 int ret;
1170
1171 /*
1172 * The only intended user of the health monitoring system should be the
1173 * xfs_healer daemon running on behalf of the whole filesystem in the
1174 * initial user namespace. IOWs, we don't allow unprivileged userspace
1175 * (they can use fsnotify) nor do we allow containers.
1176 */
1177 if (!capable(CAP_SYS_ADMIN))
1178 return -EPERM;
1179 if (ip->i_ino != mp->m_sb.sb_rootino)
1180 return -EPERM;
1181 if (current_user_ns() != &init_user_ns)
1182 return -EPERM;
1183
1184 if (copy_from_user(&hmo, arg, sizeof(hmo)))
1185 return -EFAULT;
1186
1187 if (!xfs_healthmon_validate(&hmo))
1188 return -EINVAL;
1189
1190 hm = kzalloc_obj(*hm);
1191 if (!hm)
1192 return -ENOMEM;
1193 hm->dev = mp->m_super->s_dev;
1194 refcount_set(&hm->ref, 1);
1195
1196 mutex_init(&hm->lock);
1197 init_waitqueue_head(&hm->wait);
1198
1199 if (hmo.flags & XFS_HEALTH_MONITOR_VERBOSE)
1200 hm->verbose = true;
1201
1202 /* Queue up the first event that lets the client know we're running. */
1203 running_event = kzalloc_obj(struct xfs_healthmon_event, GFP_NOFS);
1204 if (!running_event) {
1205 ret = -ENOMEM;
1206 goto out_hm;
1207 }
1208 running_event->type = XFS_HEALTHMON_RUNNING;
1209 running_event->domain = XFS_HEALTHMON_MOUNT;
1210 __xfs_healthmon_insert(hm, running_event);
1211
1212 /*
1213 * Preallocate the unmount event so that we can't fail to notify the
1214 * filesystem later. This is key for triggering fast exit of the
1215 * xfs_healer daemon.
1216 */
1217 hm->unmount_event = kzalloc_obj(struct xfs_healthmon_event, GFP_NOFS);
1218 if (!hm->unmount_event) {
1219 ret = -ENOMEM;
1220 goto out_hm;
1221 }
1222 hm->unmount_event->type = XFS_HEALTHMON_UNMOUNT;
1223 hm->unmount_event->domain = XFS_HEALTHMON_MOUNT;
1224
1225 /*
1226 * Try to attach this health monitor to the xfs_mount. The monitor is
1227 * considered live and will receive events if this succeeds.
1228 */
1229 ret = xfs_healthmon_attach(mp, hm);
1230 if (ret)
1231 goto out_hm;
1232
1233 /*
1234 * Create the anonymous file and install a fd for it. If it succeeds,
1235 * the file owns hm and can go away at any time, so we must not access
1236 * it again. This must go last because we can't undo a fd table
1237 * installation.
1238 */
1239 ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm,
1240 O_CLOEXEC | O_RDONLY);
1241 if (ret < 0)
1242 goto out_mp;
1243
1244 trace_xfs_healthmon_create(mp->m_super->s_dev, hmo.flags, hmo.format);
1245
1246 return ret;
1247
1248 out_mp:
1249 xfs_healthmon_detach(hm);
1250 out_hm:
1251 ASSERT(refcount_read(&hm->ref) == 1);
1252 xfs_healthmon_put(hm);
1253 return ret;
1254 }
1255