xref: /linux/fs/xfs/xfs_healthmon.c (revision bf4afc53b77aeaa48b5409da5c8da6bb4eff7f43)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2024-2026 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs_platform.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trace.h"
15 #include "xfs_ag.h"
16 #include "xfs_btree.h"
17 #include "xfs_da_format.h"
18 #include "xfs_da_btree.h"
19 #include "xfs_quota_defs.h"
20 #include "xfs_rtgroup.h"
21 #include "xfs_health.h"
22 #include "xfs_healthmon.h"
23 #include "xfs_fsops.h"
24 #include "xfs_notify_failure.h"
25 #include "xfs_file.h"
26 #include "xfs_ioctl.h"
27 
28 #include <linux/anon_inodes.h>
29 #include <linux/eventpoll.h>
30 #include <linux/poll.h>
31 #include <linux/fserror.h>
32 
33 /*
34  * Live Health Monitoring
35  * ======================
36  *
37  * Autonomous self-healing of XFS filesystems requires a means for the kernel
38  * to send filesystem health events to a monitoring daemon in userspace.  To
39  * accomplish this, we establish a thread_with_file kthread object to handle
40  * translating internal events about filesystem health into a format that can
41  * be parsed easily by userspace.  When those internal events occur, the core
42  * filesystem code calls this health monitor to convey the events to userspace.
43  * Userspace reads events from the file descriptor returned by the ioctl.
44  *
45  * The healthmon abstraction has a weak reference to the host filesystem mount
46  * so that the queueing and processing of the events do not pin the mount and
47  * cannot slow down the main filesystem.  The healthmon object can exist past
48  * the end of the filesystem mount.
49  */
50 
/*
 * Sentinel cookie value meaning this health monitor is no longer attached
 * to any xfs_mount (see xfs_healthmon_detach).
 */
#define DETACHED_MOUNT_COOKIE		((uintptr_t)0)

/*
 * Constrain the number of event objects that can build up in memory; caps
 * the queue footprint at roughly 32KiB of event structures.
 */
#define XFS_HEALTHMON_MAX_EVENTS	(SZ_32K / \
					 sizeof(struct xfs_healthmon_event))

/* Constrain the size of the output buffer for read_iter. */
#define XFS_HEALTHMON_MAX_OUTBUF	SZ_64K

/* spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers */
static DEFINE_SPINLOCK(xfs_healthmon_lock);
63 
/*
 * Grab a reference to the healthmon object for a given mount, if any.
 * Returns NULL if no monitor is attached or if we raced with the final
 * reference drop (the monitor is being torn down).
 */
static struct xfs_healthmon *
xfs_healthmon_get(
	struct xfs_mount		*mp)
{
	struct xfs_healthmon		*hm;

	/*
	 * The RCU read section keeps hm from being freed underneath us: the
	 * object is only released after a grace period (see
	 * xfs_healthmon_put).  refcount_inc_not_zero fails if the final put
	 * already happened, in which case we report no monitor attached.
	 *
	 * NOTE(review): m_healthmon is read without rcu_dereference() and
	 * published without rcu_assign_pointer() in xfs_healthmon_attach --
	 * confirm whether the plain accesses are intentional here.
	 */
	rcu_read_lock();
	hm = mp->m_healthmon;
	if (hm && !refcount_inc_not_zero(&hm->ref))
		hm = NULL;
	rcu_read_unlock();

	return hm;
}
79 
80 /*
81  * Release the reference to a healthmon object.  If there are no more holders,
82  * free the health monitor after an RCU grace period to eliminate possibility
83  * of races with xfs_healthmon_get.
84  */
85 static void
86 xfs_healthmon_put(
87 	struct xfs_healthmon		*hm)
88 {
89 	if (refcount_dec_and_test(&hm->ref)) {
90 		struct xfs_healthmon_event	*event;
91 		struct xfs_healthmon_event	*next = hm->first_event;
92 
93 		while ((event = next) != NULL) {
94 			trace_xfs_healthmon_drop(hm, event);
95 			next = event->next;
96 			kfree(event);
97 		}
98 
99 		kfree(hm->unmount_event);
100 		kfree(hm->buffer);
101 		mutex_destroy(&hm->lock);
102 		kfree_rcu_mightsleep(hm);
103 	}
104 }
105 
106 /* Attach a health monitor to an xfs_mount.  Only one allowed at a time. */
107 STATIC int
108 xfs_healthmon_attach(
109 	struct xfs_mount	*mp,
110 	struct xfs_healthmon	*hm)
111 {
112 	spin_lock(&xfs_healthmon_lock);
113 	if (mp->m_healthmon != NULL) {
114 		spin_unlock(&xfs_healthmon_lock);
115 		return -EEXIST;
116 	}
117 
118 	refcount_inc(&hm->ref);
119 	mp->m_healthmon = hm;
120 	hm->mount_cookie = (uintptr_t)mp->m_super;
121 	spin_unlock(&xfs_healthmon_lock);
122 
123 	return 0;
124 }
125 
/*
 * Detach a xfs mount from a specific healthmon instance.  Severs the
 * mount <-> monitor association (if still present), marks the monitor
 * detached, and drops the reference that the mount held.  Safe to call
 * multiple times; later calls are no-ops.
 */
STATIC void
xfs_healthmon_detach(
	struct xfs_healthmon	*hm)
{
	spin_lock(&xfs_healthmon_lock);
	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) {
		/* already detached by unmount or a racing release */
		spin_unlock(&xfs_healthmon_lock);
		return;
	}

	/* While attached, the cookie doubles as the super_block pointer. */
	XFS_M((struct super_block *)hm->mount_cookie)->m_healthmon = NULL;
	hm->mount_cookie = DETACHED_MOUNT_COOKIE;
	spin_unlock(&xfs_healthmon_lock);

	trace_xfs_healthmon_detach(hm);

	/* Drop the reference taken in xfs_healthmon_attach. */
	xfs_healthmon_put(hm);
}
144 
145 static inline void xfs_healthmon_bump_events(struct xfs_healthmon *hm)
146 {
147 	hm->events++;
148 	hm->total_events++;
149 }
150 
151 static inline void xfs_healthmon_bump_lost(struct xfs_healthmon *hm)
152 {
153 	hm->lost_prev_event++;
154 	hm->total_lost++;
155 }
156 
157 /*
158  * If possible, merge a new event into an existing event.  Returns whether or
159  * not it merged anything.
160  */
161 static bool
162 xfs_healthmon_merge_events(
163 	struct xfs_healthmon_event		*existing,
164 	const struct xfs_healthmon_event	*new)
165 {
166 	if (!existing)
167 		return false;
168 
169 	/* type and domain must match to merge events */
170 	if (existing->type != new->type ||
171 	    existing->domain != new->domain)
172 		return false;
173 
174 	switch (existing->type) {
175 	case XFS_HEALTHMON_RUNNING:
176 	case XFS_HEALTHMON_UNMOUNT:
177 		/* should only ever be one of these events anyway */
178 		return false;
179 
180 	case XFS_HEALTHMON_LOST:
181 		existing->lostcount += new->lostcount;
182 		return true;
183 
184 	case XFS_HEALTHMON_SICK:
185 	case XFS_HEALTHMON_CORRUPT:
186 	case XFS_HEALTHMON_HEALTHY:
187 		switch (existing->domain) {
188 		case XFS_HEALTHMON_FS:
189 			existing->fsmask |= new->fsmask;
190 			return true;
191 		case XFS_HEALTHMON_AG:
192 		case XFS_HEALTHMON_RTGROUP:
193 			if (existing->group == new->group){
194 				existing->grpmask |= new->grpmask;
195 				return true;
196 			}
197 			return false;
198 		case XFS_HEALTHMON_INODE:
199 			if (existing->ino == new->ino &&
200 			    existing->gen == new->gen) {
201 				existing->imask |= new->imask;
202 				return true;
203 			}
204 			return false;
205 		default:
206 			ASSERT(0);
207 			return false;
208 		}
209 		return false;
210 
211 	case XFS_HEALTHMON_SHUTDOWN:
212 		/* yes, we can race to shutdown */
213 		existing->flags |= new->flags;
214 		return true;
215 
216 	case XFS_HEALTHMON_MEDIA_ERROR:
217 		/* physically adjacent errors can merge */
218 		if (existing->daddr + existing->bbcount == new->daddr) {
219 			existing->bbcount += new->bbcount;
220 			return true;
221 		}
222 		if (new->daddr + new->bbcount == existing->daddr) {
223 			existing->daddr = new->daddr;
224 			existing->bbcount += new->bbcount;
225 			return true;
226 		}
227 		return false;
228 
229 	case XFS_HEALTHMON_BUFREAD:
230 	case XFS_HEALTHMON_BUFWRITE:
231 	case XFS_HEALTHMON_DIOREAD:
232 	case XFS_HEALTHMON_DIOWRITE:
233 	case XFS_HEALTHMON_DATALOST:
234 		/* logically adjacent file ranges can merge */
235 		if (existing->fino != new->fino || existing->fgen != new->fgen)
236 			return false;
237 
238 		if (existing->fpos + existing->flen == new->fpos) {
239 			existing->flen += new->flen;
240 			return true;
241 		}
242 
243 		if (new->fpos + new->flen == existing->fpos) {
244 			existing->fpos = new->fpos;
245 			existing->flen += new->flen;
246 			return true;
247 		}
248 		return false;
249 	}
250 
251 	return false;
252 }
253 
254 /* Insert an event onto the start of the queue. */
255 static inline void
256 __xfs_healthmon_insert(
257 	struct xfs_healthmon		*hm,
258 	struct xfs_healthmon_event	*event)
259 {
260 	struct timespec64		now;
261 
262 	ktime_get_coarse_real_ts64(&now);
263 	event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;
264 
265 	event->next = hm->first_event;
266 	if (!hm->first_event)
267 		hm->first_event = event;
268 	if (!hm->last_event)
269 		hm->last_event = event;
270 	xfs_healthmon_bump_events(hm);
271 	wake_up(&hm->wait);
272 
273 	trace_xfs_healthmon_insert(hm, event);
274 }
275 
276 /* Push an event onto the end of the queue. */
277 static inline void
278 __xfs_healthmon_push(
279 	struct xfs_healthmon		*hm,
280 	struct xfs_healthmon_event	*event)
281 {
282 	struct timespec64		now;
283 
284 	ktime_get_coarse_real_ts64(&now);
285 	event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;
286 
287 	if (!hm->first_event)
288 		hm->first_event = event;
289 	if (hm->last_event)
290 		hm->last_event->next = event;
291 	hm->last_event = event;
292 	event->next = NULL;
293 	xfs_healthmon_bump_events(hm);
294 	wake_up(&hm->wait);
295 
296 	trace_xfs_healthmon_push(hm, event);
297 }
298 
/*
 * Deal with any previously lost events by merging or queueing a LOST event
 * carrying the accumulated count.  Returns 0 if the count was conveyed, or
 * -ENOMEM if the notification still cannot be queued.  Called with hm->lock
 * held (see xfs_healthmon_push).
 */
static int
xfs_healthmon_clear_lost_prev(
	struct xfs_healthmon		*hm)
{
	struct xfs_healthmon_event	lost_event = {
		.type			= XFS_HEALTHMON_LOST,
		.domain			= XFS_HEALTHMON_MOUNT,
		.lostcount		= hm->lost_prev_event,
	};
	struct xfs_healthmon_event	*event = NULL;

	/* If the newest queued event is also a LOST event, fold into it. */
	if (xfs_healthmon_merge_events(hm->last_event, &lost_event)) {
		trace_xfs_healthmon_merge(hm, hm->last_event);
		wake_up(&hm->wait);
		goto cleared;
	}

	/* Only allocate a new event if the queue isn't already at capacity. */
	if (hm->events < XFS_HEALTHMON_MAX_EVENTS)
		event = kmemdup(&lost_event, sizeof(struct xfs_healthmon_event),
				GFP_NOFS);
	if (!event)
		return -ENOMEM;

	__xfs_healthmon_push(hm, event);
cleared:
	hm->lost_prev_event = 0;
	return 0;
}
328 
/*
 * Push an event onto the end of the list after dealing with lost events and
 * possibly full queues.  Returns 0 if the event was queued or merged into an
 * existing event; a negative errno means the event was not delivered.
 */
STATIC int
xfs_healthmon_push(
	struct xfs_healthmon			*hm,
	const struct xfs_healthmon_event	*template)
{
	struct xfs_healthmon_event		*event = NULL;
	int					error = 0;

	/*
	 * Locklessly check if the health monitor has already detached from the
	 * mount.  If so, ignore the event.  If we race with deactivation,
	 * we'll queue the event but never send it.
	 */
	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
		return -ESHUTDOWN;

	mutex_lock(&hm->lock);

	/* Report previously lost events before we do anything else */
	if (hm->lost_prev_event) {
		error = xfs_healthmon_clear_lost_prev(hm);
		/*
		 * NOTE(review): on failure the incoming event is dropped
		 * without bumping the lost counters -- confirm that's
		 * intentional.
		 */
		if (error)
			goto out_unlock;
	}

	/* Try to merge with the newest event */
	if (xfs_healthmon_merge_events(hm->last_event, template)) {
		trace_xfs_healthmon_merge(hm, hm->last_event);
		wake_up(&hm->wait);
		goto out_unlock;
	}

	/* Only create a heap event object if we're not already at capacity. */
	if (hm->events < XFS_HEALTHMON_MAX_EVENTS)
		event = kmemdup(template, sizeof(struct xfs_healthmon_event),
				GFP_NOFS);
	if (!event) {
		/* No memory means we lose the event */
		trace_xfs_healthmon_lost_event(hm);
		xfs_healthmon_bump_lost(hm);
		error = -ENOMEM;
		goto out_unlock;
	}

	__xfs_healthmon_push(hm, event);

out_unlock:
	mutex_unlock(&hm->lock);
	return error;
}
383 
/*
 * Report that the filesystem is being unmounted, then detach the xfs mount
 * from this healthmon instance.  Readers drain the remaining events, see the
 * unmount notification, then get EOF.
 */
void
xfs_healthmon_unmount(
	struct xfs_mount		*mp)
{
	struct xfs_healthmon		*hm = xfs_healthmon_get(mp);

	/* No monitor attached (or it's already being torn down). */
	if (!hm)
		return;

	trace_xfs_healthmon_report_unmount(hm);

	/*
	 * Insert the unmount notification at the start of the event queue so
	 * that userspace knows the filesystem went away as soon as possible.
	 * There's nothing actionable for userspace after an unmount.  Once
	 * we've inserted the unmount event, hm no longer owns that event.
	 * hm->unmount_event is presumably preallocated at monitor setup so
	 * this path cannot fail -- the allocation is not visible here.
	 */
	__xfs_healthmon_insert(hm, hm->unmount_event);
	hm->unmount_event = NULL;

	xfs_healthmon_detach(hm);
	xfs_healthmon_put(hm);
}
411 
412 /* Compute the reporting mask for non-unmount metadata health events. */
413 static inline unsigned int
414 metadata_event_mask(
415 	struct xfs_healthmon		*hm,
416 	enum xfs_healthmon_type		type,
417 	unsigned int			old_mask,
418 	unsigned int			new_mask)
419 {
420 	/* If we want all events, return all events. */
421 	if (hm->verbose)
422 		return new_mask;
423 
424 	switch (type) {
425 	case XFS_HEALTHMON_SICK:
426 		/* Always report runtime corruptions */
427 		return new_mask;
428 	case XFS_HEALTHMON_CORRUPT:
429 		/* Only report new fsck errors */
430 		return new_mask & ~old_mask;
431 	case XFS_HEALTHMON_HEALTHY:
432 		/* Only report healthy metadata that got fixed */
433 		return new_mask & old_mask;
434 	default:
435 		ASSERT(0);
436 		break;
437 	}
438 
439 	return 0;
440 }
441 
442 /* Report XFS_FS_SICK_* events to healthmon */
443 void
444 xfs_healthmon_report_fs(
445 	struct xfs_mount		*mp,
446 	enum xfs_healthmon_type		type,
447 	unsigned int			old_mask,
448 	unsigned int			new_mask)
449 {
450 	struct xfs_healthmon_event	event = {
451 		.type			= type,
452 		.domain			= XFS_HEALTHMON_FS,
453 	};
454 	struct xfs_healthmon		*hm = xfs_healthmon_get(mp);
455 
456 	if (!hm)
457 		return;
458 
459 	event.fsmask = metadata_event_mask(hm, type, old_mask, new_mask) &
460 			~XFS_SICK_FS_SECONDARY;
461 	trace_xfs_healthmon_report_fs(hm, old_mask, new_mask, &event);
462 
463 	if (event.fsmask)
464 		xfs_healthmon_push(hm, &event);
465 
466 	xfs_healthmon_put(hm);
467 }
468 
469 /* Report XFS_SICK_(AG|RG)* flags to healthmon */
470 void
471 xfs_healthmon_report_group(
472 	struct xfs_group		*xg,
473 	enum xfs_healthmon_type		type,
474 	unsigned int			old_mask,
475 	unsigned int			new_mask)
476 {
477 	struct xfs_healthmon_event	event = {
478 		.type			= type,
479 		.group			= xg->xg_gno,
480 	};
481 	struct xfs_healthmon		*hm = xfs_healthmon_get(xg->xg_mount);
482 
483 	if (!hm)
484 		return;
485 
486 	switch (xg->xg_type) {
487 	case XG_TYPE_RTG:
488 		event.domain = XFS_HEALTHMON_RTGROUP;
489 		event.grpmask = metadata_event_mask(hm, type, old_mask,
490 						    new_mask) &
491 				~XFS_SICK_RG_SECONDARY;
492 		break;
493 	case XG_TYPE_AG:
494 		event.domain = XFS_HEALTHMON_AG;
495 		event.grpmask = metadata_event_mask(hm, type, old_mask,
496 						    new_mask) &
497 				~XFS_SICK_AG_SECONDARY;
498 		break;
499 	default:
500 		ASSERT(0);
501 		break;
502 	}
503 
504 	trace_xfs_healthmon_report_group(hm, old_mask, new_mask, &event);
505 
506 	if (event.grpmask)
507 		xfs_healthmon_push(hm, &event);
508 
509 	xfs_healthmon_put(hm);
510 }
511 
/* Report XFS_SICK_INO_* flags to healthmon */
void
xfs_healthmon_report_inode(
	struct xfs_inode		*ip,
	enum xfs_healthmon_type		type,
	unsigned int			old_mask,
	unsigned int			new_mask)
{
	struct xfs_healthmon_event	event = {
		.type			= type,
		.domain			= XFS_HEALTHMON_INODE,
		.ino			= ip->i_ino,
		.gen			= VFS_I(ip)->i_generation,
	};
	struct xfs_healthmon		*hm = xfs_healthmon_get(ip->i_mount);

	/* Nobody is listening; drop the report. */
	if (!hm)
		return;

	/* Secondary state is internal bookkeeping; never sent to userspace. */
	event.imask = metadata_event_mask(hm, type, old_mask, new_mask) &
			~XFS_SICK_INO_SECONDARY;
	/*
	 * NOTE(review): this passes the filtered event.imask where the
	 * fs/group variants pass the raw new_mask as the third trace
	 * argument; confirm the asymmetry is intentional.
	 */
	trace_xfs_healthmon_report_inode(hm, old_mask, event.imask, &event);

	/* An empty mask after filtering means nothing to send. */
	if (event.imask)
		xfs_healthmon_push(hm, &event);

	xfs_healthmon_put(hm);
}
540 
541 /* Add a shutdown event to the reporting queue. */
542 void
543 xfs_healthmon_report_shutdown(
544 	struct xfs_mount		*mp,
545 	uint32_t			flags)
546 {
547 	struct xfs_healthmon_event	event = {
548 		.type			= XFS_HEALTHMON_SHUTDOWN,
549 		.domain			= XFS_HEALTHMON_MOUNT,
550 		.flags			= flags,
551 	};
552 	struct xfs_healthmon		*hm = xfs_healthmon_get(mp);
553 
554 	if (!hm)
555 		return;
556 
557 	trace_xfs_healthmon_report_shutdown(hm, flags);
558 
559 	xfs_healthmon_push(hm, &event);
560 	xfs_healthmon_put(hm);
561 }
562 
/* Map an xfs_device to the healthmon event domain for media errors. */
static inline enum xfs_healthmon_domain
media_error_domain(
	enum xfs_device			fdev)
{
	switch (fdev) {
	case XFS_DEV_DATA:
		return XFS_HEALTHMON_DATADEV;
	case XFS_DEV_LOG:
		return XFS_HEALTHMON_LOGDEV;
	case XFS_DEV_RT:
		return XFS_HEALTHMON_RTDEV;
	}

	/* unknown device; debug builds complain, production returns 0 */
	ASSERT(0);
	return 0;
}
579 
580 /* Add a media error event to the reporting queue. */
581 void
582 xfs_healthmon_report_media(
583 	struct xfs_mount		*mp,
584 	enum xfs_device			fdev,
585 	xfs_daddr_t			daddr,
586 	uint64_t			bbcount)
587 {
588 	struct xfs_healthmon_event	event = {
589 		.type			= XFS_HEALTHMON_MEDIA_ERROR,
590 		.domain			= media_error_domain(fdev),
591 		.daddr			= daddr,
592 		.bbcount		= bbcount,
593 	};
594 	struct xfs_healthmon		*hm = xfs_healthmon_get(mp);
595 
596 	if (!hm)
597 		return;
598 
599 	trace_xfs_healthmon_report_media(hm, fdev, &event);
600 
601 	xfs_healthmon_push(hm, &event);
602 	xfs_healthmon_put(hm);
603 }
604 
/* Map an fserror action to the corresponding healthmon event type. */
static inline enum xfs_healthmon_type file_ioerr_type(enum fserror_type action)
{
	switch (action) {
	case FSERR_BUFFERED_READ:
		return XFS_HEALTHMON_BUFREAD;
	case FSERR_BUFFERED_WRITE:
		return XFS_HEALTHMON_BUFWRITE;
	case FSERR_DIRECTIO_READ:
		return XFS_HEALTHMON_DIOREAD;
	case FSERR_DIRECTIO_WRITE:
		return XFS_HEALTHMON_DIOWRITE;
	case FSERR_DATA_LOST:
		return XFS_HEALTHMON_DATALOST;
	case FSERR_METADATA:
		/* filtered out by xfs_fs_report_error */
		break;
	}

	/* unreachable for valid input; -1 is not a valid event type */
	ASSERT(0);
	return -1;
}
626 
627 /* Add a file io error event to the reporting queue. */
628 void
629 xfs_healthmon_report_file_ioerror(
630 	struct xfs_inode		*ip,
631 	const struct fserror_event	*p)
632 {
633 	struct xfs_healthmon_event	event = {
634 		.type			= file_ioerr_type(p->type),
635 		.domain			= XFS_HEALTHMON_FILERANGE,
636 		.fino			= ip->i_ino,
637 		.fgen			= VFS_I(ip)->i_generation,
638 		.fpos			= p->pos,
639 		.flen			= p->len,
640 		/* send positive error number to userspace */
641 		.error			= -p->error,
642 	};
643 	struct xfs_healthmon		*hm = xfs_healthmon_get(ip->i_mount);
644 
645 	if (!hm)
646 		return;
647 
648 	trace_xfs_healthmon_report_file_ioerror(hm, p);
649 
650 	xfs_healthmon_push(hm, &event);
651 	xfs_healthmon_put(hm);
652 }
653 
654 static inline void
655 xfs_healthmon_reset_outbuf(
656 	struct xfs_healthmon		*hm)
657 {
658 	hm->buftail = 0;
659 	hm->bufhead = 0;
660 }
661 
/* One translation entry from kernel flag bits to uapi wire bits. */
struct flags_map {
	unsigned int		in_mask;	/* kernel-internal flag */
	unsigned int		out_mask;	/* uapi wire-format flag */
};

/* SHUTDOWN_* reasons -> XFS_HEALTH_SHUTDOWN_* uapi bits */
static const struct flags_map shutdown_map[] = {
	{ SHUTDOWN_META_IO_ERROR,	XFS_HEALTH_SHUTDOWN_META_IO_ERROR },
	{ SHUTDOWN_LOG_IO_ERROR,	XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR },
	{ SHUTDOWN_FORCE_UMOUNT,	XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT },
	{ SHUTDOWN_CORRUPT_INCORE,	XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE },
	{ SHUTDOWN_CORRUPT_ONDISK,	XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK },
	{ SHUTDOWN_DEVICE_REMOVED,	XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED },
};
675 
676 static inline unsigned int
677 __map_flags(
678 	const struct flags_map	*map,
679 	size_t			array_len,
680 	unsigned int		flags)
681 {
682 	const struct flags_map	*m;
683 	unsigned int		ret = 0;
684 
685 	for (m = map; m < map + array_len; m++) {
686 		if (flags & m->in_mask)
687 			ret |= m->out_mask;
688 	}
689 
690 	return ret;
691 }
692 
/* Translate @flags through @map, which must be a real array. */
#define map_flags(map, flags) __map_flags((map), ARRAY_SIZE(map), (flags))

/* Convert SHUTDOWN_* reasons to the XFS_HEALTH_SHUTDOWN_* wire bits. */
static inline unsigned int shutdown_mask(unsigned int in)
{
	return map_flags(shutdown_map, in);
}
699 
/* In-kernel event domains -> XFS_HEALTH_MONITOR_DOMAIN_* wire values */
static const unsigned int domain_map[] = {
	[XFS_HEALTHMON_MOUNT]		= XFS_HEALTH_MONITOR_DOMAIN_MOUNT,
	[XFS_HEALTHMON_FS]		= XFS_HEALTH_MONITOR_DOMAIN_FS,
	[XFS_HEALTHMON_AG]		= XFS_HEALTH_MONITOR_DOMAIN_AG,
	[XFS_HEALTHMON_INODE]		= XFS_HEALTH_MONITOR_DOMAIN_INODE,
	[XFS_HEALTHMON_RTGROUP]		= XFS_HEALTH_MONITOR_DOMAIN_RTGROUP,
	[XFS_HEALTHMON_DATADEV]		= XFS_HEALTH_MONITOR_DOMAIN_DATADEV,
	[XFS_HEALTHMON_RTDEV]		= XFS_HEALTH_MONITOR_DOMAIN_RTDEV,
	[XFS_HEALTHMON_LOGDEV]		= XFS_HEALTH_MONITOR_DOMAIN_LOGDEV,
	[XFS_HEALTHMON_FILERANGE]	= XFS_HEALTH_MONITOR_DOMAIN_FILERANGE,
};
711 
/* In-kernel event types -> XFS_HEALTH_MONITOR_TYPE_* wire values */
static const unsigned int type_map[] = {
	[XFS_HEALTHMON_RUNNING]		= XFS_HEALTH_MONITOR_TYPE_RUNNING,
	[XFS_HEALTHMON_LOST]		= XFS_HEALTH_MONITOR_TYPE_LOST,
	[XFS_HEALTHMON_SICK]		= XFS_HEALTH_MONITOR_TYPE_SICK,
	[XFS_HEALTHMON_CORRUPT]		= XFS_HEALTH_MONITOR_TYPE_CORRUPT,
	[XFS_HEALTHMON_HEALTHY]		= XFS_HEALTH_MONITOR_TYPE_HEALTHY,
	[XFS_HEALTHMON_UNMOUNT]		= XFS_HEALTH_MONITOR_TYPE_UNMOUNT,
	[XFS_HEALTHMON_SHUTDOWN]	= XFS_HEALTH_MONITOR_TYPE_SHUTDOWN,
	[XFS_HEALTHMON_MEDIA_ERROR]	= XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR,
	[XFS_HEALTHMON_BUFREAD]		= XFS_HEALTH_MONITOR_TYPE_BUFREAD,
	[XFS_HEALTHMON_BUFWRITE]	= XFS_HEALTH_MONITOR_TYPE_BUFWRITE,
	[XFS_HEALTHMON_DIOREAD]		= XFS_HEALTH_MONITOR_TYPE_DIOREAD,
	[XFS_HEALTHMON_DIOWRITE]	= XFS_HEALTH_MONITOR_TYPE_DIOWRITE,
	[XFS_HEALTHMON_DATALOST]	= XFS_HEALTH_MONITOR_TYPE_DATALOST,
};
727 
/*
 * Render event as a V0 structure and append it to the output buffer.
 * Returns 0 on success, or -EFSCORRUPTED if the event's type or domain is
 * outside the translation tables.
 */
STATIC int
xfs_healthmon_format_v0(
	struct xfs_healthmon		*hm,
	const struct xfs_healthmon_event *event)
{
	struct xfs_health_monitor_event	hme = {
		.time_ns		= event->time_ns,
	};

	trace_xfs_healthmon_format(hm, event);

	/* Reject events we don't know how to translate. */
	if (event->domain < 0 || event->domain >= ARRAY_SIZE(domain_map) ||
	    event->type < 0   || event->type >= ARRAY_SIZE(type_map))
		return -EFSCORRUPTED;

	hme.domain = domain_map[event->domain];
	hme.type = type_map[event->type];

	/* fill in the event-specific details */
	switch (event->domain) {
	case XFS_HEALTHMON_MOUNT:
		switch (event->type) {
		case XFS_HEALTHMON_LOST:
			hme.e.lost.count = event->lostcount;
			break;
		case XFS_HEALTHMON_SHUTDOWN:
			hme.e.shutdown.reasons = shutdown_mask(event->flags);
			break;
		default:
			/* other mount events carry no extra payload */
			break;
		}
		break;
	case XFS_HEALTHMON_FS:
		hme.e.fs.mask = xfs_healthmon_fs_mask(event->fsmask);
		break;
	case XFS_HEALTHMON_RTGROUP:
		hme.e.group.mask = xfs_healthmon_rtgroup_mask(event->grpmask);
		hme.e.group.gno = event->group;
		break;
	case XFS_HEALTHMON_AG:
		hme.e.group.mask = xfs_healthmon_perag_mask(event->grpmask);
		hme.e.group.gno = event->group;
		break;
	case XFS_HEALTHMON_INODE:
		hme.e.inode.mask = xfs_healthmon_inode_mask(event->imask);
		hme.e.inode.ino = event->ino;
		hme.e.inode.gen = event->gen;
		break;
	case XFS_HEALTHMON_DATADEV:
	case XFS_HEALTHMON_LOGDEV:
	case XFS_HEALTHMON_RTDEV:
		hme.e.media.daddr = event->daddr;
		hme.e.media.bbcount = event->bbcount;
		break;
	case XFS_HEALTHMON_FILERANGE:
		hme.e.filerange.ino = event->fino;
		hme.e.filerange.gen = event->fgen;
		hme.e.filerange.pos = event->fpos;
		hme.e.filerange.len = event->flen;
		hme.e.filerange.error = abs(event->error);
		break;
	default:
		break;
	}

	/*
	 * The caller only pops an event when there is space in the outbuf
	 * (see xfs_healthmon_format_pop), so running out here would be a
	 * logic bug.  NOTE(review): that space check uses sizeof(struct
	 * xfs_healthmon_event) while we write sizeof(hme); confirm the wire
	 * struct is not larger than the internal one, or a full buffer would
	 * silently drop this event below.
	 */
	ASSERT(hm->bufhead + sizeof(hme) <= hm->bufsize);

	/* copy formatted object to the outbuf */
	if (hm->bufhead + sizeof(hme) <= hm->bufsize) {
		memcpy(hm->buffer + hm->bufhead, &hme, sizeof(hme));
		hm->bufhead += sizeof(hme);
	}

	return 0;
}
804 
805 /* How many bytes are waiting in the outbuf to be copied? */
806 static inline size_t
807 xfs_healthmon_outbuf_bytes(
808 	struct xfs_healthmon	*hm)
809 {
810 	if (hm->bufhead > hm->buftail)
811 		return hm->bufhead - hm->buftail;
812 	return 0;
813 }
814 
/*
 * Do we have something for userspace to read?  This can mean unmount events,
 * events pending in the queue, or pending bytes in the outbuf.  Used as the
 * wakeup condition by both poll and read_iter.
 */
static inline bool
xfs_healthmon_has_eventdata(
	struct xfs_healthmon	*hm)
{
	/*
	 * If the health monitor is already detached from the xfs_mount, we
	 * want reads to return 0 bytes even if there are no events, because
	 * userspace interprets that as EOF.  If we race with deactivation,
	 * read_iter will take the necessary locks to discover that there are
	 * no events to send.
	 */
	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
		return true;

	/*
	 * Either there are events waiting to be formatted into the buffer, or
	 * there's unread bytes in the buffer.
	 */
	return hm->events > 0 || xfs_healthmon_outbuf_bytes(hm) > 0;
}
839 
840 /* Try to copy the rest of the outbuf to the iov iter. */
841 STATIC ssize_t
842 xfs_healthmon_copybuf(
843 	struct xfs_healthmon	*hm,
844 	struct iov_iter		*to)
845 {
846 	size_t			to_copy;
847 	size_t			w = 0;
848 
849 	trace_xfs_healthmon_copybuf(hm, to);
850 
851 	to_copy = xfs_healthmon_outbuf_bytes(hm);
852 	if (to_copy) {
853 		w = copy_to_iter(hm->buffer + hm->buftail, to_copy, to);
854 		if (!w)
855 			return -EFAULT;
856 
857 		hm->buftail += w;
858 	}
859 
860 	/*
861 	 * Nothing left to copy?  Reset the output buffer cursors to the start
862 	 * since there's no live data in the buffer.
863 	 */
864 	if (xfs_healthmon_outbuf_bytes(hm) == 0)
865 		xfs_healthmon_reset_outbuf(hm);
866 	return w;
867 }
868 
/*
 * Return a health monitoring event for formatting into the output buffer if
 * there's enough space in the outbuf and an event waiting for us.  Caller
 * must hold i_rwsem on the healthmon file.  The returned event is unlinked
 * from the queue; ownership passes to the caller, who must kfree it.
 */
static inline struct xfs_healthmon_event *
xfs_healthmon_format_pop(
	struct xfs_healthmon	*hm)
{
	struct xfs_healthmon_event *event;

	/* No room in the outbuf for another formatted record. */
	if (hm->bufhead + sizeof(*event) > hm->bufsize)
		return NULL;

	/* The queue itself is protected by hm->lock, not i_rwsem. */
	mutex_lock(&hm->lock);
	event = hm->first_event;
	if (event) {
		/* Unlink the oldest event, maintaining the tail pointer. */
		if (hm->last_event == event)
			hm->last_event = NULL;
		hm->first_event = event->next;
		hm->events--;

		trace_xfs_healthmon_pop(hm, event);
	}
	mutex_unlock(&hm->lock);
	return event;
}
896 
/*
 * Allocate formatting buffer sized between one page and
 * XFS_HEALTHMON_MAX_OUTBUF based on the userspace read size, falling back
 * to a single page if the larger allocation fails.  Returns 0 or -ENOMEM.
 */
STATIC int
xfs_healthmon_alloc_outbuf(
	struct xfs_healthmon	*hm,
	size_t			user_bufsize)
{
	void			*outbuf;
	size_t			bufsize =
		min(XFS_HEALTHMON_MAX_OUTBUF, max(PAGE_SIZE, user_bufsize));

	outbuf = kzalloc(bufsize, GFP_KERNEL);
	if (!outbuf) {
		/* Already asked for the minimum?  Then give up. */
		if (bufsize == PAGE_SIZE)
			return -ENOMEM;

		/* Retry with the smallest buffer we're willing to use. */
		bufsize = PAGE_SIZE;
		outbuf = kzalloc(bufsize, GFP_KERNEL);
		if (!outbuf)
			return -ENOMEM;
	}

	hm->buffer = outbuf;
	hm->bufsize = bufsize;
	hm->bufhead = 0;
	hm->buftail = 0;

	return 0;
}
925 
/*
 * Convey queued event data to userspace.  First copy any remaining bytes in
 * the outbuf, then format the oldest event into the outbuf and copy that too.
 * Returns bytes copied, zero at EOF (monitor detached and drained), or a
 * negative errno.
 */
STATIC ssize_t
xfs_healthmon_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file_inode(file);
	struct xfs_healthmon	*hm = file->private_data;
	struct xfs_healthmon_event *event;
	size_t			copied = 0;
	ssize_t			ret = 0;

	/*
	 * Nonblocking readers bail unless there's something to read and
	 * nobody else holds i_rwsem; blocking readers sleep for event data,
	 * then take i_rwsem to serialize access to the outbuf.
	 */
	if (file->f_flags & O_NONBLOCK) {
		if (!xfs_healthmon_has_eventdata(hm) || !inode_trylock(inode))
			return -EAGAIN;
	} else {
		ret = wait_event_interruptible(hm->wait,
				xfs_healthmon_has_eventdata(hm));
		if (ret)
			return ret;

		inode_lock(inode);
	}

	/* Allocate the outbuf lazily on the first read. */
	if (hm->bufsize == 0) {
		ret = xfs_healthmon_alloc_outbuf(hm, iov_iter_count(to));
		if (ret)
			goto out_unlock;
	}

	trace_xfs_healthmon_read_start(hm);

	/*
	 * If there's anything left in the output buffer, copy that before
	 * formatting more events.
	 */
	ret = xfs_healthmon_copybuf(hm, to);
	if (ret < 0)
		goto out_unlock;
	copied += ret;

	while (iov_iter_count(to) > 0) {
		/* Format the next events into the outbuf until it's full. */
		while ((event = xfs_healthmon_format_pop(hm)) != NULL) {
			ret = xfs_healthmon_format_v0(hm, event);
			kfree(event);
			if (ret)
				goto out_unlock;
		}

		/* Copy anything formatted into outbuf to userspace */
		ret = xfs_healthmon_copybuf(hm, to);
		if (ret <= 0)
			break;

		copied += ret;
	}

out_unlock:
	trace_xfs_healthmon_read_finish(hm);
	inode_unlock(inode);
	/* Partial progress wins over a trailing error. */
	return copied ?: ret;
}
993 
994 /* Poll for available events. */
995 STATIC __poll_t
996 xfs_healthmon_poll(
997 	struct file			*file,
998 	struct poll_table_struct	*wait)
999 {
1000 	struct xfs_healthmon		*hm = file->private_data;
1001 	__poll_t			mask = 0;
1002 
1003 	poll_wait(file, &hm->wait, wait);
1004 
1005 	if (xfs_healthmon_has_eventdata(hm))
1006 		mask |= EPOLLIN;
1007 	return mask;
1008 }
1009 
/*
 * Free the health monitoring information when the file is closed for the
 * last time.
 */
STATIC int
xfs_healthmon_release(
	struct inode		*inode,
	struct file		*file)
{
	struct xfs_healthmon	*hm = file->private_data;

	trace_xfs_healthmon_release(hm);

	/*
	 * We might be closing the healthmon file before the filesystem
	 * unmounts, because userspace processes can terminate at any time and
	 * for any reason.  Null out xfs_mount::m_healthmon so that another
	 * process can create another health monitor file.
	 */
	xfs_healthmon_detach(hm);

	/*
	 * Wake up any readers that might be left.  There shouldn't be any
	 * because the only users of the waiter are read and poll.
	 */
	wake_up_all(&hm->wait);

	/* Drop the file's reference; this may free hm. */
	xfs_healthmon_put(hm);
	return 0;
}
1037 
1038 /* Validate ioctl parameters. */
1039 static inline bool
1040 xfs_healthmon_validate(
1041 	const struct xfs_health_monitor	*hmo)
1042 {
1043 	if (hmo->flags & ~XFS_HEALTH_MONITOR_ALL)
1044 		return false;
1045 	if (hmo->format != XFS_HEALTH_MONITOR_FMT_V0)
1046 		return false;
1047 	if (memchr_inv(&hmo->pad, 0, sizeof(hmo->pad)))
1048 		return false;
1049 	return true;
1050 }
1051 
/*
 * Emit some data about the health monitoring fd for /proc fdinfo:
 * attachment state, device numbers, wire format, and event statistics.
 */
static void
xfs_healthmon_show_fdinfo(
	struct seq_file		*m,
	struct file		*file)
{
	struct xfs_healthmon	*hm = file->private_data;

	/* hm->lock stabilizes the event counters while we print. */
	mutex_lock(&hm->lock);
	seq_printf(m, "state:\t%s\ndev:\t%d:%d\nformat:\tv0\nevents:\t%llu\nlost:\t%llu\n",
			hm->mount_cookie == DETACHED_MOUNT_COOKIE ?
				"dead" : "alive",
			MAJOR(hm->dev), MINOR(hm->dev),
			hm->total_events,
			hm->total_lost);
	mutex_unlock(&hm->lock);
}
1069 
1070 /* Reconfigure the health monitor. */
1071 STATIC long
1072 xfs_healthmon_reconfigure(
1073 	struct file			*file,
1074 	unsigned int			cmd,
1075 	void __user			*arg)
1076 {
1077 	struct xfs_health_monitor	hmo;
1078 	struct xfs_healthmon		*hm = file->private_data;
1079 
1080 	if (copy_from_user(&hmo, arg, sizeof(hmo)))
1081 		return -EFAULT;
1082 
1083 	if (!xfs_healthmon_validate(&hmo))
1084 		return -EINVAL;
1085 
1086 	mutex_lock(&hm->lock);
1087 	hm->verbose = !!(hmo.flags & XFS_HEALTH_MONITOR_VERBOSE);
1088 	mutex_unlock(&hm->lock);
1089 
1090 	return 0;
1091 }
1092 
1093 /* Does the fd point to the same filesystem as the one we're monitoring? */
1094 STATIC long
1095 xfs_healthmon_file_on_monitored_fs(
1096 	struct file			*file,
1097 	unsigned int			cmd,
1098 	void __user			*arg)
1099 {
1100 	struct xfs_health_file_on_monitored_fs hms;
1101 	struct xfs_healthmon		*hm = file->private_data;
1102 	struct inode			*hms_inode;
1103 
1104 	if (copy_from_user(&hms, arg, sizeof(hms)))
1105 		return -EFAULT;
1106 
1107 	if (hms.flags)
1108 		return -EINVAL;
1109 
1110 	CLASS(fd, hms_fd)(hms.fd);
1111 	if (fd_empty(hms_fd))
1112 		return -EBADF;
1113 
1114 	hms_inode = file_inode(fd_file(hms_fd));
1115 	mutex_lock(&hm->lock);
1116 	if (hm->mount_cookie != (uintptr_t)hms_inode->i_sb) {
1117 		mutex_unlock(&hm->lock);
1118 		return -ESTALE;
1119 	}
1120 
1121 	mutex_unlock(&hm->lock);
1122 	return 0;
1123 }
1124 
1125 /* Handle ioctls for the health monitoring thread. */
1126 STATIC long
1127 xfs_healthmon_ioctl(
1128 	struct file			*file,
1129 	unsigned int			cmd,
1130 	unsigned long			p)
1131 {
1132 	void __user			*arg = (void __user *)p;
1133 
1134 	switch (cmd) {
1135 	case XFS_IOC_HEALTH_MONITOR:
1136 		return xfs_healthmon_reconfigure(file, cmd, arg);
1137 	case XFS_IOC_HEALTH_FD_ON_MONITORED_FS:
1138 		return xfs_healthmon_file_on_monitored_fs(file, cmd, arg);
1139 	default:
1140 		break;
1141 	}
1142 
1143 	return -ENOTTY;
1144 }
1145 
/*
 * File operations for the anonymous health monitoring fd handed out by
 * xfs_ioc_health_monitor().  file->private_data is the struct xfs_healthmon.
 */
static const struct file_operations xfs_healthmon_fops = {
	.owner		= THIS_MODULE,
	.show_fdinfo	= xfs_healthmon_show_fdinfo,
	.read_iter	= xfs_healthmon_read_iter,
	.poll		= xfs_healthmon_poll,
	.release	= xfs_healthmon_release,
	.unlocked_ioctl	= xfs_healthmon_ioctl,
};
1154 
1155 /*
1156  * Create a health monitoring file.  Returns an index to the fd table or a
1157  * negative errno.
1158  */
1159 long
1160 xfs_ioc_health_monitor(
1161 	struct file			*file,
1162 	struct xfs_health_monitor __user *arg)
1163 {
1164 	struct xfs_health_monitor	hmo;
1165 	struct xfs_healthmon_event	*running_event;
1166 	struct xfs_healthmon		*hm;
1167 	struct xfs_inode		*ip = XFS_I(file_inode(file));
1168 	struct xfs_mount		*mp = ip->i_mount;
1169 	int				ret;
1170 
1171 	/*
1172 	 * The only intended user of the health monitoring system should be the
1173 	 * xfs_healer daemon running on behalf of the whole filesystem in the
1174 	 * initial user namespace.  IOWs, we don't allow unprivileged userspace
1175 	 * (they can use fsnotify) nor do we allow containers.
1176 	 */
1177 	if (!capable(CAP_SYS_ADMIN))
1178 		return -EPERM;
1179 	if (ip->i_ino != mp->m_sb.sb_rootino)
1180 		return -EPERM;
1181 	if (current_user_ns() != &init_user_ns)
1182 		return -EPERM;
1183 
1184 	if (copy_from_user(&hmo, arg, sizeof(hmo)))
1185 		return -EFAULT;
1186 
1187 	if (!xfs_healthmon_validate(&hmo))
1188 		return -EINVAL;
1189 
1190 	hm = kzalloc_obj(*hm);
1191 	if (!hm)
1192 		return -ENOMEM;
1193 	hm->dev = mp->m_super->s_dev;
1194 	refcount_set(&hm->ref, 1);
1195 
1196 	mutex_init(&hm->lock);
1197 	init_waitqueue_head(&hm->wait);
1198 
1199 	if (hmo.flags & XFS_HEALTH_MONITOR_VERBOSE)
1200 		hm->verbose = true;
1201 
1202 	/* Queue up the first event that lets the client know we're running. */
1203 	running_event = kzalloc_obj(struct xfs_healthmon_event, GFP_NOFS);
1204 	if (!running_event) {
1205 		ret = -ENOMEM;
1206 		goto out_hm;
1207 	}
1208 	running_event->type = XFS_HEALTHMON_RUNNING;
1209 	running_event->domain = XFS_HEALTHMON_MOUNT;
1210 	__xfs_healthmon_insert(hm, running_event);
1211 
1212 	/*
1213 	 * Preallocate the unmount event so that we can't fail to notify the
1214 	 * filesystem later.  This is key for triggering fast exit of the
1215 	 * xfs_healer daemon.
1216 	 */
1217 	hm->unmount_event = kzalloc_obj(struct xfs_healthmon_event, GFP_NOFS);
1218 	if (!hm->unmount_event) {
1219 		ret = -ENOMEM;
1220 		goto out_hm;
1221 	}
1222 	hm->unmount_event->type = XFS_HEALTHMON_UNMOUNT;
1223 	hm->unmount_event->domain = XFS_HEALTHMON_MOUNT;
1224 
1225 	/*
1226 	 * Try to attach this health monitor to the xfs_mount.  The monitor is
1227 	 * considered live and will receive events if this succeeds.
1228 	 */
1229 	ret = xfs_healthmon_attach(mp, hm);
1230 	if (ret)
1231 		goto out_hm;
1232 
1233 	/*
1234 	 * Create the anonymous file and install a fd for it.  If it succeeds,
1235 	 * the file owns hm and can go away at any time, so we must not access
1236 	 * it again.  This must go last because we can't undo a fd table
1237 	 * installation.
1238 	 */
1239 	ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm,
1240 			O_CLOEXEC | O_RDONLY);
1241 	if (ret < 0)
1242 		goto out_mp;
1243 
1244 	trace_xfs_healthmon_create(mp->m_super->s_dev, hmo.flags, hmo.format);
1245 
1246 	return ret;
1247 
1248 out_mp:
1249 	xfs_healthmon_detach(hm);
1250 out_hm:
1251 	ASSERT(refcount_read(&hm->ref) == 1);
1252 	xfs_healthmon_put(hm);
1253 	return ret;
1254 }
1255