1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (c) 2024-2026 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs_platform.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trace.h"
15 #include "xfs_ag.h"
16 #include "xfs_btree.h"
17 #include "xfs_da_format.h"
18 #include "xfs_da_btree.h"
19 #include "xfs_quota_defs.h"
20 #include "xfs_rtgroup.h"
21 #include "xfs_health.h"
22 #include "xfs_healthmon.h"
23 #include "xfs_fsops.h"
24 #include "xfs_notify_failure.h"
25 #include "xfs_file.h"
26 #include "xfs_ioctl.h"
27
28 #include <linux/anon_inodes.h>
29 #include <linux/eventpoll.h>
30 #include <linux/poll.h>
31 #include <linux/fserror.h>
32
33 /*
34 * Live Health Monitoring
35 * ======================
36 *
37 * Autonomous self-healing of XFS filesystems requires a means for the kernel
38 * to send filesystem health events to a monitoring daemon in userspace. To
39 * accomplish this, we establish a thread_with_file kthread object to handle
40 * translating internal events about filesystem health into a format that can
41 * be parsed easily by userspace. When those internal events occur, the core
42 * filesystem code calls this health monitor to convey the events to userspace.
43 * Userspace reads events from the file descriptor returned by the ioctl.
44 *
45 * The healthmon abstraction has a weak reference to the host filesystem mount
46 * so that the queueing and processing of the events do not pin the mount and
47 * cannot slow down the main filesystem. The healthmon object can exist past
48 * the end of the filesystem mount.
49 */
50
/* cookie value marking a health monitor that is detached from its mount */
#define DETACHED_MOUNT_COOKIE ((uintptr_t)0)

/*
 * Constrain the number of event objects that can build up in memory.
 * Once the queue is at capacity, new events are counted as lost instead
 * of being queued.
 */
#define XFS_HEALTHMON_MAX_EVENTS (SZ_32K / \
				  sizeof(struct xfs_healthmon_event))

/* Constrain the size of the output buffer for read_iter. */
#define XFS_HEALTHMON_MAX_OUTBUF SZ_64K

/* spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers */
static DEFINE_SPINLOCK(xfs_healthmon_lock);
63
/*
 * Grab a reference to the healthmon object for a given mount, if any.
 *
 * The RCU read lock keeps the object itself alive (it is freed via
 * kfree_rcu_mightsleep in xfs_healthmon_put) while we try to take a real
 * reference.  refcount_inc_not_zero fails if the final reference is
 * already being dropped, in which case we pretend no monitor exists.
 *
 * Returns NULL if no monitor is attached; otherwise the caller owns a
 * reference that must be released with xfs_healthmon_put.
 */
static struct xfs_healthmon *
xfs_healthmon_get(
	struct xfs_mount	*mp)
{
	struct xfs_healthmon	*hm;

	rcu_read_lock();
	hm = rcu_dereference(mp->m_healthmon);
	if (hm && !refcount_inc_not_zero(&hm->ref))
		hm = NULL;	/* lost the race with the final put */
	rcu_read_unlock();

	return hm;
}
79
80 /*
81 * Release the reference to a healthmon object. If there are no more holders,
82 * free the health monitor after an RCU grace period to eliminate possibility
83 * of races with xfs_healthmon_get.
84 */
85 static void
xfs_healthmon_put(struct xfs_healthmon * hm)86 xfs_healthmon_put(
87 struct xfs_healthmon *hm)
88 {
89 if (refcount_dec_and_test(&hm->ref)) {
90 struct xfs_healthmon_event *event;
91 struct xfs_healthmon_event *next = hm->first_event;
92
93 while ((event = next) != NULL) {
94 trace_xfs_healthmon_drop(hm, event);
95 next = event->next;
96 kfree(event);
97 }
98
99 kfree(hm->unmount_event);
100 kfree(hm->buffer);
101 mutex_destroy(&hm->lock);
102 kfree_rcu_mightsleep(hm);
103 }
104 }
105
/*
 * Attach a health monitor to an xfs_mount.  Only one allowed at a time;
 * a second attach attempt returns -EEXIST.
 *
 * The mount's superblock pointer is recorded as an opaque cookie so that
 * detach can find the mount again without pinning it.  The global spinlock
 * serializes attach/detach against each other.
 */
STATIC int
xfs_healthmon_attach(
	struct xfs_mount	*mp,
	struct xfs_healthmon	*hm)
{
	spin_lock(&xfs_healthmon_lock);
	/* somebody else already attached a monitor to this mount */
	if (rcu_access_pointer(mp->m_healthmon) != NULL) {
		spin_unlock(&xfs_healthmon_lock);
		return -EEXIST;
	}

	/* the mount's m_healthmon pointer holds its own reference */
	refcount_inc(&hm->ref);
	rcu_assign_pointer(mp->m_healthmon, hm);
	hm->mount_cookie = (uintptr_t)mp->m_super;
	spin_unlock(&xfs_healthmon_lock);

	return 0;
}
125
/*
 * Detach a xfs mount from a specific healthmon instance.  Safe to call more
 * than once; later calls observe the detached cookie and return early.
 * Drops the reference that the mount's m_healthmon pointer held.
 */
STATIC void
xfs_healthmon_detach(
	struct xfs_healthmon	*hm)
{
	struct xfs_mount	*mp;

	spin_lock(&xfs_healthmon_lock);
	/* already detached by unmount or a previous ->release */
	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) {
		spin_unlock(&xfs_healthmon_lock);
		return;
	}

	/* while attached, the cookie is the superblock pointer */
	mp = XFS_M((struct super_block *)hm->mount_cookie);
	rcu_assign_pointer(mp->m_healthmon, NULL);
	hm->mount_cookie = DETACHED_MOUNT_COOKIE;
	spin_unlock(&xfs_healthmon_lock);

	/*
	 * Wake up any readers that might remain. This can happen if unmount
	 * races with the healthmon fd owner entering ->read_iter, having
	 * already emptied the event queue.
	 *
	 * In the ->release case there shouldn't be any readers because the
	 * only users of the waiter are read and poll.
	 */
	wake_up_all(&hm->wait);

	trace_xfs_healthmon_detach(hm);
	xfs_healthmon_put(hm);
}
157
xfs_healthmon_bump_events(struct xfs_healthmon * hm)158 static inline void xfs_healthmon_bump_events(struct xfs_healthmon *hm)
159 {
160 hm->events++;
161 hm->total_events++;
162 }
163
xfs_healthmon_bump_lost(struct xfs_healthmon * hm)164 static inline void xfs_healthmon_bump_lost(struct xfs_healthmon *hm)
165 {
166 hm->lost_prev_event++;
167 hm->total_lost++;
168 }
169
170 /*
171 * If possible, merge a new event into an existing event. Returns whether or
172 * not it merged anything.
173 */
174 static bool
xfs_healthmon_merge_events(struct xfs_healthmon_event * existing,const struct xfs_healthmon_event * new)175 xfs_healthmon_merge_events(
176 struct xfs_healthmon_event *existing,
177 const struct xfs_healthmon_event *new)
178 {
179 if (!existing)
180 return false;
181
182 /* type and domain must match to merge events */
183 if (existing->type != new->type ||
184 existing->domain != new->domain)
185 return false;
186
187 switch (existing->type) {
188 case XFS_HEALTHMON_RUNNING:
189 case XFS_HEALTHMON_UNMOUNT:
190 /* should only ever be one of these events anyway */
191 return false;
192
193 case XFS_HEALTHMON_LOST:
194 existing->lostcount += new->lostcount;
195 return true;
196
197 case XFS_HEALTHMON_SICK:
198 case XFS_HEALTHMON_CORRUPT:
199 case XFS_HEALTHMON_HEALTHY:
200 switch (existing->domain) {
201 case XFS_HEALTHMON_FS:
202 existing->fsmask |= new->fsmask;
203 return true;
204 case XFS_HEALTHMON_AG:
205 case XFS_HEALTHMON_RTGROUP:
206 if (existing->group == new->group){
207 existing->grpmask |= new->grpmask;
208 return true;
209 }
210 return false;
211 case XFS_HEALTHMON_INODE:
212 if (existing->ino == new->ino &&
213 existing->gen == new->gen) {
214 existing->imask |= new->imask;
215 return true;
216 }
217 return false;
218 default:
219 ASSERT(0);
220 return false;
221 }
222 return false;
223
224 case XFS_HEALTHMON_SHUTDOWN:
225 /* yes, we can race to shutdown */
226 existing->flags |= new->flags;
227 return true;
228
229 case XFS_HEALTHMON_MEDIA_ERROR:
230 /* physically adjacent errors can merge */
231 if (existing->daddr + existing->bbcount == new->daddr) {
232 existing->bbcount += new->bbcount;
233 return true;
234 }
235 if (new->daddr + new->bbcount == existing->daddr) {
236 existing->daddr = new->daddr;
237 existing->bbcount += new->bbcount;
238 return true;
239 }
240 return false;
241
242 case XFS_HEALTHMON_BUFREAD:
243 case XFS_HEALTHMON_BUFWRITE:
244 case XFS_HEALTHMON_DIOREAD:
245 case XFS_HEALTHMON_DIOWRITE:
246 case XFS_HEALTHMON_DATALOST:
247 /* logically adjacent file ranges can merge */
248 if (existing->fino != new->fino || existing->fgen != new->fgen)
249 return false;
250
251 if (existing->fpos + existing->flen == new->fpos) {
252 existing->flen += new->flen;
253 return true;
254 }
255
256 if (new->fpos + new->flen == existing->fpos) {
257 existing->fpos = new->fpos;
258 existing->flen += new->flen;
259 return true;
260 }
261 return false;
262 }
263
264 return false;
265 }
266
267 /* Insert an event onto the start of the queue. */
268 static inline void
__xfs_healthmon_insert(struct xfs_healthmon * hm,struct xfs_healthmon_event * event)269 __xfs_healthmon_insert(
270 struct xfs_healthmon *hm,
271 struct xfs_healthmon_event *event)
272 {
273 struct timespec64 now;
274
275 ktime_get_coarse_real_ts64(&now);
276 event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;
277
278 event->next = hm->first_event;
279 if (!hm->first_event)
280 hm->first_event = event;
281 if (!hm->last_event)
282 hm->last_event = event;
283 xfs_healthmon_bump_events(hm);
284 wake_up(&hm->wait);
285
286 trace_xfs_healthmon_insert(hm, event);
287 }
288
289 /* Push an event onto the end of the queue. */
290 static inline void
__xfs_healthmon_push(struct xfs_healthmon * hm,struct xfs_healthmon_event * event)291 __xfs_healthmon_push(
292 struct xfs_healthmon *hm,
293 struct xfs_healthmon_event *event)
294 {
295 struct timespec64 now;
296
297 ktime_get_coarse_real_ts64(&now);
298 event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;
299
300 if (!hm->first_event)
301 hm->first_event = event;
302 if (hm->last_event)
303 hm->last_event->next = event;
304 hm->last_event = event;
305 event->next = NULL;
306 xfs_healthmon_bump_events(hm);
307 wake_up(&hm->wait);
308
309 trace_xfs_healthmon_push(hm, event);
310 }
311
/*
 * Deal with any previously lost events by queueing an XFS_HEALTHMON_LOST
 * event so userspace learns about the loss.  Caller must hold hm->lock.
 * Returns -ENOMEM (leaving the lost count intact for a later retry) if the
 * queue is at capacity or the allocation fails.
 */
static int
xfs_healthmon_clear_lost_prev(
	struct xfs_healthmon	*hm)
{
	struct xfs_healthmon_event lost_event = {
		.type		= XFS_HEALTHMON_LOST,
		.domain		= XFS_HEALTHMON_MOUNT,
		.lostcount	= hm->lost_prev_event,
	};
	struct xfs_healthmon_event *event = NULL;

	/* if the newest queued event is also a lost event, merge into it */
	if (xfs_healthmon_merge_events(hm->last_event, &lost_event)) {
		trace_xfs_healthmon_merge(hm, hm->last_event);
		wake_up(&hm->wait);
		goto cleared;
	}

	/* only allocate a queue entry if we're not already at capacity */
	if (hm->events < XFS_HEALTHMON_MAX_EVENTS)
		event = kmemdup(&lost_event, sizeof(struct xfs_healthmon_event),
				GFP_NOFS);
	if (!event)
		return -ENOMEM;

	__xfs_healthmon_push(hm, event);
cleared:
	hm->lost_prev_event = 0;
	return 0;
}
341
342 /*
343 * Push an event onto the end of the list after dealing with lost events and
344 * possibly full queues.
345 */
346 STATIC int
xfs_healthmon_push(struct xfs_healthmon * hm,const struct xfs_healthmon_event * template)347 xfs_healthmon_push(
348 struct xfs_healthmon *hm,
349 const struct xfs_healthmon_event *template)
350 {
351 struct xfs_healthmon_event *event = NULL;
352 int error = 0;
353
354 /*
355 * Locklessly check if the health monitor has already detached from the
356 * mount. If so, ignore the event. If we race with deactivation,
357 * we'll queue the event but never send it.
358 */
359 if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
360 return -ESHUTDOWN;
361
362 mutex_lock(&hm->lock);
363
364 /* Report previously lost events before we do anything else */
365 if (hm->lost_prev_event) {
366 error = xfs_healthmon_clear_lost_prev(hm);
367 if (error)
368 goto out_unlock;
369 }
370
371 /* Try to merge with the newest event */
372 if (xfs_healthmon_merge_events(hm->last_event, template)) {
373 trace_xfs_healthmon_merge(hm, hm->last_event);
374 wake_up(&hm->wait);
375 goto out_unlock;
376 }
377
378 /* Only create a heap event object if we're not already at capacity. */
379 if (hm->events < XFS_HEALTHMON_MAX_EVENTS)
380 event = kmemdup(template, sizeof(struct xfs_healthmon_event),
381 GFP_NOFS);
382 if (!event) {
383 /* No memory means we lose the event */
384 trace_xfs_healthmon_lost_event(hm);
385 xfs_healthmon_bump_lost(hm);
386 error = -ENOMEM;
387 goto out_unlock;
388 }
389
390 __xfs_healthmon_push(hm, event);
391
392 out_unlock:
393 mutex_unlock(&hm->lock);
394 return error;
395 }
396
397 /*
398 * Report that the filesystem is being unmounted, then detach the xfs mount
399 * from this healthmon instance.
400 */
401 void
xfs_healthmon_unmount(struct xfs_mount * mp)402 xfs_healthmon_unmount(
403 struct xfs_mount *mp)
404 {
405 struct xfs_healthmon *hm = xfs_healthmon_get(mp);
406
407 if (!hm)
408 return;
409
410 trace_xfs_healthmon_report_unmount(hm);
411
412 /*
413 * Insert the unmount notification at the start of the event queue so
414 * that userspace knows the filesystem went away as soon as possible.
415 * There's nothing actionable for userspace after an unmount. Once
416 * we've inserted the unmount event, hm no longer owns that event.
417 */
418 __xfs_healthmon_insert(hm, hm->unmount_event);
419 hm->unmount_event = NULL;
420
421 xfs_healthmon_detach(hm);
422 xfs_healthmon_put(hm);
423 }
424
425 /* Compute the reporting mask for non-unmount metadata health events. */
426 static inline unsigned int
metadata_event_mask(struct xfs_healthmon * hm,enum xfs_healthmon_type type,unsigned int old_mask,unsigned int new_mask)427 metadata_event_mask(
428 struct xfs_healthmon *hm,
429 enum xfs_healthmon_type type,
430 unsigned int old_mask,
431 unsigned int new_mask)
432 {
433 /* If we want all events, return all events. */
434 if (hm->verbose)
435 return new_mask;
436
437 switch (type) {
438 case XFS_HEALTHMON_SICK:
439 /* Always report runtime corruptions */
440 return new_mask;
441 case XFS_HEALTHMON_CORRUPT:
442 /* Only report new fsck errors */
443 return new_mask & ~old_mask;
444 case XFS_HEALTHMON_HEALTHY:
445 /* Only report healthy metadata that got fixed */
446 return new_mask & old_mask;
447 default:
448 ASSERT(0);
449 break;
450 }
451
452 return 0;
453 }
454
/*
 * Report XFS_SICK_FS_* events to the health monitor, if one is attached.
 * @type distinguishes sick/corrupt/healthy transitions; @old_mask and
 * @new_mask are the fs-wide sick flags before and after the state change.
 * Secondary flags are filtered out before reporting.
 */
void
xfs_healthmon_report_fs(
	struct xfs_mount	*mp,
	enum xfs_healthmon_type	type,
	unsigned int		old_mask,
	unsigned int		new_mask)
{
	struct xfs_healthmon_event	event = {
		.type		= type,
		.domain		= XFS_HEALTHMON_FS,
	};
	struct xfs_healthmon	*hm = xfs_healthmon_get(mp);

	/* nobody is monitoring this filesystem */
	if (!hm)
		return;

	event.fsmask = metadata_event_mask(hm, type, old_mask, new_mask) &
			~XFS_SICK_FS_SECONDARY;
	trace_xfs_healthmon_report_fs(hm, old_mask, new_mask, &event);

	/* an empty mask means there's nothing worth telling userspace */
	if (event.fsmask)
		xfs_healthmon_push(hm, &event);

	xfs_healthmon_put(hm);
}
481
/*
 * Report XFS_SICK_(AG|RG)* flags to the health monitor attached to the
 * group's mount, if any.  @type distinguishes sick/corrupt/healthy;
 * @old_mask and @new_mask are the group's sick flags before and after the
 * state change.  Secondary flags are filtered out before reporting.
 */
void
xfs_healthmon_report_group(
	struct xfs_group	*xg,
	enum xfs_healthmon_type	type,
	unsigned int		old_mask,
	unsigned int		new_mask)
{
	struct xfs_healthmon_event	event = {
		.type		= type,
		.group		= xg->xg_gno,
	};
	struct xfs_healthmon	*hm = xfs_healthmon_get(xg->xg_mount);

	/* nobody is monitoring this filesystem */
	if (!hm)
		return;

	/* pick the domain and flag filter matching the group type */
	switch (xg->xg_type) {
	case XG_TYPE_RTG:
		event.domain = XFS_HEALTHMON_RTGROUP;
		event.grpmask = metadata_event_mask(hm, type, old_mask,
						    new_mask) &
				~XFS_SICK_RG_SECONDARY;
		break;
	case XG_TYPE_AG:
		event.domain = XFS_HEALTHMON_AG;
		event.grpmask = metadata_event_mask(hm, type, old_mask,
						    new_mask) &
				~XFS_SICK_AG_SECONDARY;
		break;
	default:
		ASSERT(0);
		break;
	}

	trace_xfs_healthmon_report_group(hm, old_mask, new_mask, &event);

	/* an empty mask means there's nothing worth telling userspace */
	if (event.grpmask)
		xfs_healthmon_push(hm, &event);

	xfs_healthmon_put(hm);
}
524
/*
 * Report XFS_SICK_INO_* flags to the health monitor, if one is attached.
 * The inode generation is captured so userspace can detect inode reuse.
 * Secondary flags are filtered out before reporting.
 */
void
xfs_healthmon_report_inode(
	struct xfs_inode	*ip,
	enum xfs_healthmon_type	type,
	unsigned int		old_mask,
	unsigned int		new_mask)
{
	struct xfs_healthmon_event	event = {
		.type		= type,
		.domain		= XFS_HEALTHMON_INODE,
		.ino		= ip->i_ino,
		.gen		= VFS_I(ip)->i_generation,
	};
	struct xfs_healthmon	*hm = xfs_healthmon_get(ip->i_mount);

	/* nobody is monitoring this filesystem */
	if (!hm)
		return;

	event.imask = metadata_event_mask(hm, type, old_mask, new_mask) &
			~XFS_SICK_INO_SECONDARY;
	/*
	 * NOTE(review): this tracepoint passes the filtered event.imask where
	 * the fs/group variants pass the raw new_mask -- confirm this
	 * asymmetry is intentional.
	 */
	trace_xfs_healthmon_report_inode(hm, old_mask, event.imask, &event);

	/* an empty mask means there's nothing worth telling userspace */
	if (event.imask)
		xfs_healthmon_push(hm, &event);

	xfs_healthmon_put(hm);
}
553
/*
 * Add a shutdown event to the reporting queue.  @flags are the in-kernel
 * SHUTDOWN_* reason flags; they are translated to the uapi representation
 * at formatting time.
 */
void
xfs_healthmon_report_shutdown(
	struct xfs_mount	*mp,
	uint32_t		flags)
{
	struct xfs_healthmon_event	event = {
		.type		= XFS_HEALTHMON_SHUTDOWN,
		.domain		= XFS_HEALTHMON_MOUNT,
		.flags		= flags,
	};
	struct xfs_healthmon	*hm = xfs_healthmon_get(mp);

	/* nobody is monitoring this filesystem */
	if (!hm)
		return;

	trace_xfs_healthmon_report_shutdown(hm, flags);

	xfs_healthmon_push(hm, &event);
	xfs_healthmon_put(hm);
}
575
576 static inline enum xfs_healthmon_domain
media_error_domain(enum xfs_device fdev)577 media_error_domain(
578 enum xfs_device fdev)
579 {
580 switch (fdev) {
581 case XFS_DEV_DATA:
582 return XFS_HEALTHMON_DATADEV;
583 case XFS_DEV_LOG:
584 return XFS_HEALTHMON_LOGDEV;
585 case XFS_DEV_RT:
586 return XFS_HEALTHMON_RTDEV;
587 }
588
589 ASSERT(0);
590 return 0;
591 }
592
/*
 * Add a media error event to the reporting queue.  @daddr/@bbcount describe
 * the affected range in 512-byte basic blocks on the device named by @fdev.
 */
void
xfs_healthmon_report_media(
	struct xfs_mount	*mp,
	enum xfs_device		fdev,
	xfs_daddr_t		daddr,
	uint64_t		bbcount)
{
	struct xfs_healthmon_event	event = {
		.type		= XFS_HEALTHMON_MEDIA_ERROR,
		.domain		= media_error_domain(fdev),
		.daddr		= daddr,
		.bbcount	= bbcount,
	};
	struct xfs_healthmon	*hm = xfs_healthmon_get(mp);

	/* nobody is monitoring this filesystem */
	if (!hm)
		return;

	trace_xfs_healthmon_report_media(hm, fdev, &event);

	xfs_healthmon_push(hm, &event);
	xfs_healthmon_put(hm);
}
617
file_ioerr_type(enum fserror_type action)618 static inline enum xfs_healthmon_type file_ioerr_type(enum fserror_type action)
619 {
620 switch (action) {
621 case FSERR_BUFFERED_READ:
622 return XFS_HEALTHMON_BUFREAD;
623 case FSERR_BUFFERED_WRITE:
624 return XFS_HEALTHMON_BUFWRITE;
625 case FSERR_DIRECTIO_READ:
626 return XFS_HEALTHMON_DIOREAD;
627 case FSERR_DIRECTIO_WRITE:
628 return XFS_HEALTHMON_DIOWRITE;
629 case FSERR_DATA_LOST:
630 return XFS_HEALTHMON_DATALOST;
631 case FSERR_METADATA:
632 /* filtered out by xfs_fs_report_error */
633 break;
634 }
635
636 ASSERT(0);
637 return -1;
638 }
639
/*
 * Add a file io error event to the reporting queue.  @p describes the failed
 * file io: the action, the affected byte range, and the (negative) errno.
 */
void
xfs_healthmon_report_file_ioerror(
	struct xfs_inode		*ip,
	const struct fserror_event	*p)
{
	struct xfs_healthmon_event	event = {
		.type		= file_ioerr_type(p->type),
		.domain		= XFS_HEALTHMON_FILERANGE,
		.fino		= ip->i_ino,
		.fgen		= VFS_I(ip)->i_generation,
		.fpos		= p->pos,
		.flen		= p->len,
		/* send positive error number to userspace */
		.error		= -p->error,
	};
	struct xfs_healthmon	*hm = xfs_healthmon_get(ip->i_mount);

	/* nobody is monitoring this filesystem */
	if (!hm)
		return;

	trace_xfs_healthmon_report_file_ioerror(hm, p);

	xfs_healthmon_push(hm, &event);
	xfs_healthmon_put(hm);
}
666
667 static inline void
xfs_healthmon_reset_outbuf(struct xfs_healthmon * hm)668 xfs_healthmon_reset_outbuf(
669 struct xfs_healthmon *hm)
670 {
671 hm->buftail = 0;
672 hm->bufhead = 0;
673 }
674
/* One in-kernel flag bit and its uapi counterpart. */
struct flags_map {
	unsigned int in_mask;	/* in-kernel flag */
	unsigned int out_mask;	/* uapi flag reported to userspace */
};

/* Translation table for in-kernel shutdown reasons to uapi reasons. */
static const struct flags_map shutdown_map[] = {
	{ SHUTDOWN_META_IO_ERROR,	XFS_HEALTH_SHUTDOWN_META_IO_ERROR },
	{ SHUTDOWN_LOG_IO_ERROR,	XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR },
	{ SHUTDOWN_FORCE_UMOUNT,	XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT },
	{ SHUTDOWN_CORRUPT_INCORE,	XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE },
	{ SHUTDOWN_CORRUPT_ONDISK,	XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK },
	{ SHUTDOWN_DEVICE_REMOVED,	XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED },
};
688
689 static inline unsigned int
__map_flags(const struct flags_map * map,size_t array_len,unsigned int flags)690 __map_flags(
691 const struct flags_map *map,
692 size_t array_len,
693 unsigned int flags)
694 {
695 const struct flags_map *m;
696 unsigned int ret = 0;
697
698 for (m = map; m < map + array_len; m++) {
699 if (flags & m->in_mask)
700 ret |= m->out_mask;
701 }
702
703 return ret;
704 }
705
706 #define map_flags(map, flags) __map_flags((map), ARRAY_SIZE(map), (flags))
707
shutdown_mask(unsigned int in)708 static inline unsigned int shutdown_mask(unsigned int in)
709 {
710 return map_flags(shutdown_map, in);
711 }
712
/* Translate enum xfs_healthmon_domain values to uapi domain codes. */
static const unsigned int domain_map[] = {
	[XFS_HEALTHMON_MOUNT]		= XFS_HEALTH_MONITOR_DOMAIN_MOUNT,
	[XFS_HEALTHMON_FS]		= XFS_HEALTH_MONITOR_DOMAIN_FS,
	[XFS_HEALTHMON_AG]		= XFS_HEALTH_MONITOR_DOMAIN_AG,
	[XFS_HEALTHMON_INODE]		= XFS_HEALTH_MONITOR_DOMAIN_INODE,
	[XFS_HEALTHMON_RTGROUP]		= XFS_HEALTH_MONITOR_DOMAIN_RTGROUP,
	[XFS_HEALTHMON_DATADEV]		= XFS_HEALTH_MONITOR_DOMAIN_DATADEV,
	[XFS_HEALTHMON_RTDEV]		= XFS_HEALTH_MONITOR_DOMAIN_RTDEV,
	[XFS_HEALTHMON_LOGDEV]		= XFS_HEALTH_MONITOR_DOMAIN_LOGDEV,
	[XFS_HEALTHMON_FILERANGE]	= XFS_HEALTH_MONITOR_DOMAIN_FILERANGE,
};

/* Translate enum xfs_healthmon_type values to uapi event type codes. */
static const unsigned int type_map[] = {
	[XFS_HEALTHMON_RUNNING]		= XFS_HEALTH_MONITOR_TYPE_RUNNING,
	[XFS_HEALTHMON_LOST]		= XFS_HEALTH_MONITOR_TYPE_LOST,
	[XFS_HEALTHMON_SICK]		= XFS_HEALTH_MONITOR_TYPE_SICK,
	[XFS_HEALTHMON_CORRUPT]		= XFS_HEALTH_MONITOR_TYPE_CORRUPT,
	[XFS_HEALTHMON_HEALTHY]		= XFS_HEALTH_MONITOR_TYPE_HEALTHY,
	[XFS_HEALTHMON_UNMOUNT]		= XFS_HEALTH_MONITOR_TYPE_UNMOUNT,
	[XFS_HEALTHMON_SHUTDOWN]	= XFS_HEALTH_MONITOR_TYPE_SHUTDOWN,
	[XFS_HEALTHMON_MEDIA_ERROR]	= XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR,
	[XFS_HEALTHMON_BUFREAD]		= XFS_HEALTH_MONITOR_TYPE_BUFREAD,
	[XFS_HEALTHMON_BUFWRITE]	= XFS_HEALTH_MONITOR_TYPE_BUFWRITE,
	[XFS_HEALTHMON_DIOREAD]		= XFS_HEALTH_MONITOR_TYPE_DIOREAD,
	[XFS_HEALTHMON_DIOWRITE]	= XFS_HEALTH_MONITOR_TYPE_DIOWRITE,
	[XFS_HEALTHMON_DATALOST]	= XFS_HEALTH_MONITOR_TYPE_DATALOST,
};
740
/*
 * Render an internal event as a V0 uapi structure and append it to the
 * output buffer.  Returns -EFSCORRUPTED if the event's type or domain is
 * outside the translation tables; zero otherwise.  Caller must hold
 * i_rwsem on the healthmon file to protect the outbuf cursors.
 */
STATIC int
xfs_healthmon_format_v0(
	struct xfs_healthmon		*hm,
	const struct xfs_healthmon_event *event)
{
	struct xfs_health_monitor_event	hme = {
		.time_ns	= event->time_ns,
	};

	trace_xfs_healthmon_format(hm, event);

	/* reject events we don't know how to translate */
	if (event->domain < 0 || event->domain >= ARRAY_SIZE(domain_map) ||
	    event->type < 0 || event->type >= ARRAY_SIZE(type_map))
		return -EFSCORRUPTED;

	hme.domain = domain_map[event->domain];
	hme.type = type_map[event->type];

	/* fill in the event-specific details */
	switch (event->domain) {
	case XFS_HEALTHMON_MOUNT:
		switch (event->type) {
		case XFS_HEALTHMON_LOST:
			hme.e.lost.count = event->lostcount;
			break;
		case XFS_HEALTHMON_SHUTDOWN:
			hme.e.shutdown.reasons = shutdown_mask(event->flags);
			break;
		default:
			break;
		}
		break;
	case XFS_HEALTHMON_FS:
		hme.e.fs.mask = xfs_healthmon_fs_mask(event->fsmask);
		break;
	case XFS_HEALTHMON_RTGROUP:
		hme.e.group.mask = xfs_healthmon_rtgroup_mask(event->grpmask);
		hme.e.group.gno = event->group;
		break;
	case XFS_HEALTHMON_AG:
		hme.e.group.mask = xfs_healthmon_perag_mask(event->grpmask);
		hme.e.group.gno = event->group;
		break;
	case XFS_HEALTHMON_INODE:
		hme.e.inode.mask = xfs_healthmon_inode_mask(event->imask);
		hme.e.inode.ino = event->ino;
		hme.e.inode.gen = event->gen;
		break;
	case XFS_HEALTHMON_DATADEV:
	case XFS_HEALTHMON_LOGDEV:
	case XFS_HEALTHMON_RTDEV:
		hme.e.media.daddr = event->daddr;
		hme.e.media.bbcount = event->bbcount;
		break;
	case XFS_HEALTHMON_FILERANGE:
		hme.e.filerange.ino = event->fino;
		hme.e.filerange.gen = event->fgen;
		hme.e.filerange.pos = event->fpos;
		hme.e.filerange.len = event->flen;
		/* error is stored positive; abs() guards against surprises */
		hme.e.filerange.error = abs(event->error);
		break;
	default:
		break;
	}

	/*
	 * xfs_healthmon_format_pop checked for free space before handing us
	 * this event, so this should never fire.  NOTE(review): format_pop
	 * reserves sizeof(struct xfs_healthmon_event) but we write
	 * sizeof(hme) -- confirm the uapi event is not the larger of the two.
	 */
	ASSERT(hm->bufhead + sizeof(hme) <= hm->bufsize);

	/* copy formatted object to the outbuf */
	if (hm->bufhead + sizeof(hme) <= hm->bufsize) {
		memcpy(hm->buffer + hm->bufhead, &hme, sizeof(hme));
		hm->bufhead += sizeof(hme);
	}

	return 0;
}
817
818 /* How many bytes are waiting in the outbuf to be copied? */
819 static inline size_t
xfs_healthmon_outbuf_bytes(struct xfs_healthmon * hm)820 xfs_healthmon_outbuf_bytes(
821 struct xfs_healthmon *hm)
822 {
823 if (hm->bufhead > hm->buftail)
824 return hm->bufhead - hm->buftail;
825 return 0;
826 }
827
828 /*
829 * Do we have something for userspace to read? This can mean unmount events,
830 * events pending in the queue, or pending bytes in the outbuf.
831 */
832 static inline bool
xfs_healthmon_has_eventdata(struct xfs_healthmon * hm)833 xfs_healthmon_has_eventdata(
834 struct xfs_healthmon *hm)
835 {
836 /*
837 * If the health monitor is already detached from the xfs_mount, we
838 * want reads to return 0 bytes even if there are no events, because
839 * userspace interprets that as EOF. If we race with deactivation,
840 * read_iter will take the necessary locks to discover that there are
841 * no events to send.
842 */
843 if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
844 return true;
845
846 /*
847 * Either there are events waiting to be formatted into the buffer, or
848 * there's unread bytes in the buffer.
849 */
850 return hm->events > 0 || xfs_healthmon_outbuf_bytes(hm) > 0;
851 }
852
/*
 * Try to copy the rest of the outbuf to the iov iter.  Returns the number of
 * bytes copied (possibly zero), or -EFAULT if nothing could be copied.
 * Caller must hold i_rwsem on the healthmon file to protect the cursors.
 */
STATIC ssize_t
xfs_healthmon_copybuf(
	struct xfs_healthmon	*hm,
	struct iov_iter		*to)
{
	size_t			to_copy;
	size_t			w = 0;

	trace_xfs_healthmon_copybuf(hm, to);

	to_copy = xfs_healthmon_outbuf_bytes(hm);
	if (to_copy) {
		/* partial copies are fine; the tail cursor tracks progress */
		w = copy_to_iter(hm->buffer + hm->buftail, to_copy, to);
		if (!w)
			return -EFAULT;

		hm->buftail += w;
	}

	/*
	 * Nothing left to copy?  Reset the output buffer cursors to the start
	 * since there's no live data in the buffer.
	 */
	if (xfs_healthmon_outbuf_bytes(hm) == 0)
		xfs_healthmon_reset_outbuf(hm);
	return w;
}
881
882 /*
883 * Return a health monitoring event for formatting into the output buffer if
884 * there's enough space in the outbuf and an event waiting for us. Caller
885 * must hold i_rwsem on the healthmon file.
886 */
887 static inline struct xfs_healthmon_event *
xfs_healthmon_format_pop(struct xfs_healthmon * hm)888 xfs_healthmon_format_pop(
889 struct xfs_healthmon *hm)
890 {
891 struct xfs_healthmon_event *event;
892
893 if (hm->bufhead + sizeof(*event) > hm->bufsize)
894 return NULL;
895
896 mutex_lock(&hm->lock);
897 event = hm->first_event;
898 if (event) {
899 if (hm->last_event == event)
900 hm->last_event = NULL;
901 hm->first_event = event->next;
902 hm->events--;
903
904 trace_xfs_healthmon_pop(hm, event);
905 }
906 mutex_unlock(&hm->lock);
907 return event;
908 }
909
/*
 * Allocate the formatting buffer.  Size it to the user's read buffer,
 * clamped between PAGE_SIZE and XFS_HEALTHMON_MAX_OUTBUF; if the large
 * allocation fails, fall back to a single page before giving up.
 */
STATIC int
xfs_healthmon_alloc_outbuf(
	struct xfs_healthmon	*hm,
	size_t			user_bufsize)
{
	void			*outbuf;
	size_t			bufsize =
		min(XFS_HEALTHMON_MAX_OUTBUF, max(PAGE_SIZE, user_bufsize));

	outbuf = kzalloc(bufsize, GFP_KERNEL);
	if (!outbuf) {
		/* already at the minimum size; nothing smaller to try */
		if (bufsize == PAGE_SIZE)
			return -ENOMEM;

		bufsize = PAGE_SIZE;
		outbuf = kzalloc(bufsize, GFP_KERNEL);
		if (!outbuf)
			return -ENOMEM;
	}

	hm->buffer = outbuf;
	hm->bufsize = bufsize;
	hm->bufhead = 0;
	hm->buftail = 0;

	return 0;
}
938
939 /*
940 * Convey queued event data to userspace. First copy any remaining bytes in
941 * the outbuf, then format the oldest event into the outbuf and copy that too.
942 */
943 STATIC ssize_t
xfs_healthmon_read_iter(struct kiocb * iocb,struct iov_iter * to)944 xfs_healthmon_read_iter(
945 struct kiocb *iocb,
946 struct iov_iter *to)
947 {
948 struct file *file = iocb->ki_filp;
949 struct inode *inode = file_inode(file);
950 struct xfs_healthmon *hm = file->private_data;
951 struct xfs_healthmon_event *event;
952 size_t copied = 0;
953 ssize_t ret = 0;
954
955 if (file->f_flags & O_NONBLOCK) {
956 if (!xfs_healthmon_has_eventdata(hm) || !inode_trylock(inode))
957 return -EAGAIN;
958 } else {
959 ret = wait_event_interruptible(hm->wait,
960 xfs_healthmon_has_eventdata(hm));
961 if (ret)
962 return ret;
963
964 inode_lock(inode);
965 }
966
967 if (hm->bufsize == 0) {
968 ret = xfs_healthmon_alloc_outbuf(hm, iov_iter_count(to));
969 if (ret)
970 goto out_unlock;
971 }
972
973 trace_xfs_healthmon_read_start(hm);
974
975 /*
976 * If there's anything left in the output buffer, copy that before
977 * formatting more events.
978 */
979 ret = xfs_healthmon_copybuf(hm, to);
980 if (ret < 0)
981 goto out_unlock;
982 copied += ret;
983
984 while (iov_iter_count(to) > 0) {
985 /* Format the next events into the outbuf until it's full. */
986 while ((event = xfs_healthmon_format_pop(hm)) != NULL) {
987 ret = xfs_healthmon_format_v0(hm, event);
988 kfree(event);
989 if (ret)
990 goto out_unlock;
991 }
992
993 /* Copy anything formatted into outbuf to userspace */
994 ret = xfs_healthmon_copybuf(hm, to);
995 if (ret <= 0)
996 break;
997
998 copied += ret;
999 }
1000
1001 out_unlock:
1002 trace_xfs_healthmon_read_finish(hm);
1003 inode_unlock(inode);
1004 return copied ?: ret;
1005 }
1006
1007 /* Poll for available events. */
1008 STATIC __poll_t
xfs_healthmon_poll(struct file * file,struct poll_table_struct * wait)1009 xfs_healthmon_poll(
1010 struct file *file,
1011 struct poll_table_struct *wait)
1012 {
1013 struct xfs_healthmon *hm = file->private_data;
1014 __poll_t mask = 0;
1015
1016 poll_wait(file, &hm->wait, wait);
1017
1018 if (xfs_healthmon_has_eventdata(hm))
1019 mask |= EPOLLIN;
1020 return mask;
1021 }
1022
/*
 * Free the health monitoring information when the fd is closed.  Detaches
 * from the mount (if still attached) and drops the fd's reference.
 */
STATIC int
xfs_healthmon_release(
	struct inode		*inode,
	struct file		*file)
{
	struct xfs_healthmon	*hm = file->private_data;

	trace_xfs_healthmon_release(hm);

	/*
	 * We might be closing the healthmon file before the filesystem
	 * unmounts, because userspace processes can terminate at any time and
	 * for any reason.  Null out xfs_mount::m_healthmon so that another
	 * process can create another health monitor file.
	 */
	xfs_healthmon_detach(hm);
	xfs_healthmon_put(hm);
	return 0;
}
1043
1044 /* Validate ioctl parameters. */
1045 static inline bool
xfs_healthmon_validate(const struct xfs_health_monitor * hmo)1046 xfs_healthmon_validate(
1047 const struct xfs_health_monitor *hmo)
1048 {
1049 if (hmo->flags & ~XFS_HEALTH_MONITOR_ALL)
1050 return false;
1051 if (hmo->format != XFS_HEALTH_MONITOR_FMT_V0)
1052 return false;
1053 if (memchr_inv(&hmo->pad, 0, sizeof(hmo->pad)))
1054 return false;
1055 return true;
1056 }
1057
/*
 * Emit some data about the health monitoring fd for /proc/<pid>/fdinfo.
 * The mutex stabilizes the cookie and counters while we format them.
 */
static void
xfs_healthmon_show_fdinfo(
	struct seq_file		*m,
	struct file		*file)
{
	struct xfs_healthmon	*hm = file->private_data;

	mutex_lock(&hm->lock);
	seq_printf(m, "state:\t%s\ndev:\t%d:%d\nformat:\tv0\nevents:\t%llu\nlost:\t%llu\n",
			hm->mount_cookie == DETACHED_MOUNT_COOKIE ?
				"dead" : "alive",
			MAJOR(hm->dev), MINOR(hm->dev),
			hm->total_events,
			hm->total_lost);
	mutex_unlock(&hm->lock);
}
1075
/*
 * Reconfigure the health monitor from an XFS_IOC_HEALTH_MONITOR ioctl.
 * Currently only the verbose reporting flag can be changed.
 */
STATIC long
xfs_healthmon_reconfigure(
	struct file		*file,
	unsigned int		cmd,
	void __user		*arg)
{
	struct xfs_health_monitor hmo;
	struct xfs_healthmon	*hm = file->private_data;

	if (copy_from_user(&hmo, arg, sizeof(hmo)))
		return -EFAULT;

	/* reject unknown flags, formats, or nonzero padding */
	if (!xfs_healthmon_validate(&hmo))
		return -EINVAL;

	mutex_lock(&hm->lock);
	hm->verbose = !!(hmo.flags & XFS_HEALTH_MONITOR_VERBOSE);
	mutex_unlock(&hm->lock);

	return 0;
}
1098
1099 /* Does the fd point to the same filesystem as the one we're monitoring? */
1100 STATIC long
xfs_healthmon_file_on_monitored_fs(struct file * file,unsigned int cmd,void __user * arg)1101 xfs_healthmon_file_on_monitored_fs(
1102 struct file *file,
1103 unsigned int cmd,
1104 void __user *arg)
1105 {
1106 struct xfs_health_file_on_monitored_fs hms;
1107 struct xfs_healthmon *hm = file->private_data;
1108 struct inode *hms_inode;
1109
1110 if (copy_from_user(&hms, arg, sizeof(hms)))
1111 return -EFAULT;
1112
1113 if (hms.flags)
1114 return -EINVAL;
1115
1116 CLASS(fd, hms_fd)(hms.fd);
1117 if (fd_empty(hms_fd))
1118 return -EBADF;
1119
1120 hms_inode = file_inode(fd_file(hms_fd));
1121 mutex_lock(&hm->lock);
1122 if (hm->mount_cookie != (uintptr_t)hms_inode->i_sb) {
1123 mutex_unlock(&hm->lock);
1124 return -ESTALE;
1125 }
1126
1127 mutex_unlock(&hm->lock);
1128 return 0;
1129 }
1130
1131 /* Handle ioctls for the health monitoring thread. */
1132 STATIC long
xfs_healthmon_ioctl(struct file * file,unsigned int cmd,unsigned long p)1133 xfs_healthmon_ioctl(
1134 struct file *file,
1135 unsigned int cmd,
1136 unsigned long p)
1137 {
1138 void __user *arg = (void __user *)p;
1139
1140 switch (cmd) {
1141 case XFS_IOC_HEALTH_MONITOR:
1142 return xfs_healthmon_reconfigure(file, cmd, arg);
1143 case XFS_IOC_HEALTH_FD_ON_MONITORED_FS:
1144 return xfs_healthmon_file_on_monitored_fs(file, cmd, arg);
1145 default:
1146 break;
1147 }
1148
1149 return -ENOTTY;
1150 }
1151
/*
 * Operations for the anonymous health monitoring file.  The fd holds a
 * reference to the xfs_healthmon object through file->private_data;
 * xfs_healthmon_release drops it when the file is closed.
 */
static const struct file_operations xfs_healthmon_fops = {
	.owner			= THIS_MODULE,
	.show_fdinfo		= xfs_healthmon_show_fdinfo,
	.read_iter		= xfs_healthmon_read_iter,
	.poll			= xfs_healthmon_poll,
	.release		= xfs_healthmon_release,
	.unlocked_ioctl		= xfs_healthmon_ioctl,
};
1160
/*
 * Create a health monitoring file.  Returns an index to the fd table or a
 * negative errno.
 */
long
xfs_ioc_health_monitor(
	struct file			*file,
	struct xfs_health_monitor __user *arg)
{
	struct xfs_health_monitor	hmo;
	struct xfs_healthmon_event	*running_event;
	struct xfs_healthmon		*hm;
	struct xfs_inode		*ip = XFS_I(file_inode(file));
	struct xfs_mount		*mp = ip->i_mount;
	int				ret;

	/*
	 * The only intended user of the health monitoring system should be the
	 * xfs_healer daemon running on behalf of the whole filesystem in the
	 * initial user namespace.  IOWs, we don't allow unprivileged userspace
	 * (they can use fsnotify) nor do we allow containers.
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	/* Must be invoked on the root inode of the filesystem. */
	if (ip->i_ino != mp->m_sb.sb_rootino)
		return -EPERM;
	if (current_user_ns() != &init_user_ns)
		return -EPERM;

	if (copy_from_user(&hmo, arg, sizeof(hmo)))
		return -EFAULT;

	if (!xfs_healthmon_validate(&hmo))
		return -EINVAL;

	/*
	 * Set up the monitor with a single reference, held by this function
	 * until ownership transfers to the installed file.
	 * NOTE(review): kzalloc_obj(*hm) passes no GFP flags while the event
	 * allocations below use GFP_NOFS -- confirm the one-argument form of
	 * the macro supplies a suitable default.
	 */
	hm = kzalloc_obj(*hm);
	if (!hm)
		return -ENOMEM;
	hm->dev = mp->m_super->s_dev;
	refcount_set(&hm->ref, 1);

	mutex_init(&hm->lock);
	init_waitqueue_head(&hm->wait);

	if (hmo.flags & XFS_HEALTH_MONITOR_VERBOSE)
		hm->verbose = true;

	/* Queue up the first event that lets the client know we're running. */
	running_event = kzalloc_obj(struct xfs_healthmon_event, GFP_NOFS);
	if (!running_event) {
		ret = -ENOMEM;
		goto out_hm;
	}
	running_event->type = XFS_HEALTHMON_RUNNING;
	running_event->domain = XFS_HEALTHMON_MOUNT;
	__xfs_healthmon_insert(hm, running_event);

	/*
	 * Preallocate the unmount event so that we can't fail to notify the
	 * filesystem later.  This is key for triggering fast exit of the
	 * xfs_healer daemon.
	 */
	hm->unmount_event = kzalloc_obj(struct xfs_healthmon_event, GFP_NOFS);
	if (!hm->unmount_event) {
		ret = -ENOMEM;
		goto out_hm;
	}
	hm->unmount_event->type = XFS_HEALTHMON_UNMOUNT;
	hm->unmount_event->domain = XFS_HEALTHMON_MOUNT;

	/*
	 * Try to attach this health monitor to the xfs_mount.  The monitor is
	 * considered live and will receive events if this succeeds.
	 */
	ret = xfs_healthmon_attach(mp, hm);
	if (ret)
		goto out_hm;

	/*
	 * Create the anonymous file and install a fd for it.  If it succeeds,
	 * the file owns hm and can go away at any time, so we must not access
	 * it again.  This must go last because we can't undo a fd table
	 * installation.
	 */
	ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm,
			O_CLOEXEC | O_RDONLY);
	if (ret < 0)
		goto out_mp;

	trace_xfs_healthmon_create(mp->m_super->s_dev, hmo.flags, hmo.format);

	return ret;

out_mp:
	/* Unhook from the mount so the final put below frees the monitor. */
	xfs_healthmon_detach(hm);
out_hm:
	/* The ASSERT checks that ours is the last reference before the put. */
	ASSERT(refcount_read(&hm->ref) == 1);
	xfs_healthmon_put(hm);
	return ret;
}
1261