xref: /titanic_41/usr/src/cmd/fm/fmd/common/fmd_self.c (revision 5f8171005a0c33f3c67f7da52d41c2362c3fd891)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/fm/protocol.h>
28 
29 #include <fmd_api.h>
30 #include <fmd_subr.h>
31 #include <fmd_string.h>
32 #include <fmd_protocol.h>
33 #include <fmd_module.h>
34 #include <fmd_error.h>
35 
/*
 * Statistics kept by the self-diagnosis module.  The structure consists of
 * consecutive fmd_stat_t members so that self_init() can hand it to
 * fmd_stat_create() as an array of sizeof (self_stats) / sizeof (fmd_stat_t)
 * entries.  Both counters are bumped from self_recv().
 */
static struct {
	fmd_stat_t nosub;	/* bumped when a per-class case is opened */
	fmd_stat_t module;	/* bumped per module error report received */
} self_stats = {
	{ "nosub", FMD_TYPE_UINT64, "event classes with no subscribers seen" },
	{ "module", FMD_TYPE_UINT64, "error events received from fmd modules" },
};
43 
/*
 * Module-specific data attached to every case opened by this module: it is
 * passed to fmd_case_open() and read back with fmd_case_getspecific().  A
 * case is identified by the pair (sc_kind, sc_name); see self_case_lookup().
 */
typedef struct self_case {
	/* SC_CLASS: sc_name is an event class; SC_MODULE: a module name */
	enum { SC_CLASS, SC_MODULE } sc_kind;
	/* private copy made with fmd_hdl_strdup() in self_case_create() */
	char *sc_name;
} self_case_t;
48 
49 static self_case_t *
50 self_case_create(fmd_hdl_t *hdl, int kind, const char *name)
51 {
52 	self_case_t *scp = fmd_hdl_alloc(hdl, sizeof (self_case_t), FMD_SLEEP);
53 
54 	scp->sc_kind = kind;
55 	scp->sc_name = fmd_hdl_strdup(hdl, name, FMD_SLEEP);
56 
57 	return (scp);
58 }
59 
60 static void
61 self_case_destroy(fmd_hdl_t *hdl, self_case_t *scp)
62 {
63 	fmd_hdl_strfree(hdl, scp->sc_name);
64 	fmd_hdl_free(hdl, scp, sizeof (self_case_t));
65 }
66 
67 static fmd_case_t *
68 self_case_lookup(fmd_hdl_t *hdl, int kind, const char *name)
69 {
70 	fmd_case_t *cp = NULL;
71 
72 	while ((cp = fmd_case_next(hdl, cp)) != NULL) {
73 		self_case_t *scp = fmd_case_getspecific(hdl, cp);
74 		if (scp->sc_kind == kind && strcmp(scp->sc_name, name) == 0)
75 			break;
76 	}
77 
78 	return (cp);
79 }
80 
81 /*ARGSUSED*/
82 static void
83 self_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
84 {
85 	fmd_case_t *cp;
86 	nvlist_t *flt, *mod;
87 	char *name;
88 	int err = 0;
89 
90 	/*
91 	 * If we get an error report from another fmd module, then create a
92 	 * case for the module and add the ereport to it.  The error is either
93 	 * from fmd_hdl_error() or from fmd_api_error().  If it is the latter,
94 	 * fmd_module_error() will send another event of class EFMD_MOD_FAIL
95 	 * when the module has failed, at which point we can solve the case.
96 	 * We can also close the case on EFMD_MOD_CONF (bad config file).
97 	 */
98 	if (strcmp(class, fmd_errclass(EFMD_MODULE)) == 0 &&
99 	    nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, &mod) == 0 &&
100 	    nvlist_lookup_string(mod, FM_FMRI_FMD_NAME, &name) == 0) {
101 
102 		if ((cp = self_case_lookup(hdl, SC_MODULE, name)) == NULL) {
103 			cp = fmd_case_open(hdl,
104 			    self_case_create(hdl, SC_MODULE, name));
105 		}
106 
107 		fmd_case_add_ereport(hdl, cp, ep);
108 		self_stats.module.fmds_value.ui64++;
109 		(void) nvlist_lookup_int32(nvl, FMD_ERR_MOD_ERRNO, &err);
110 
111 		if (err != EFMD_MOD_FAIL && err != EFMD_MOD_CONF)
112 			return; /* module is still active, so keep case open */
113 
114 		if (fmd_case_solved(hdl, cp))
115 			return; /* case is already closed but error in _fini */
116 
117 		class = err == EFMD_MOD_FAIL ? FMD_FLT_MOD : FMD_FLT_CONF;
118 		flt = fmd_protocol_fault(class, 100, mod, NULL, NULL, NULL);
119 
120 		fmd_case_add_suspect(hdl, cp, flt);
121 		fmd_case_solve(hdl, cp);
122 
123 		return;
124 	}
125 
126 	/*
127 	 * If we get an I/O DDI ereport, drop it for now until the I/O DE is
128 	 * implemented and integrated.  Existing drivers in O/N have bugs that
129 	 * will trigger these and we don't want this producing FMD_FLT_NOSUB.
130 	 */
131 	if (strncmp(class, "ereport.io.ddi.", strlen("ereport.io.ddi.")) == 0)
132 		return; /* if we got a DDI ereport, drop it for now */
133 
134 	/*
135 	 * If we get any other type of event then it is of a class for which
136 	 * there are no subscribers.  Some of these correspond to internal fmd
137 	 * errors, which we ignore.  Otherwise we keep one case per class and
138 	 * use it to produce a message indicating that something is awry.
139 	 */
140 	if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 ||
141 	    strcmp(class, FM_LIST_ISOLATED_CLASS) == 0 ||
142 	    strcmp(class, FM_LIST_UPDATED_CLASS) == 0 ||
143 	    strcmp(class, FM_LIST_RESOLVED_CLASS) == 0 ||
144 	    strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 ||
145 	    strncmp(class, FM_FAULT_CLASS, strlen(FM_FAULT_CLASS)) == 0 ||
146 	    strncmp(class, FM_DEFECT_CLASS, strlen(FM_DEFECT_CLASS)) == 0)
147 		return; /* if no agents are present just drop list.* */
148 
149 	if (strncmp(class, FMD_ERR_CLASS, FMD_ERR_CLASS_LEN) == 0)
150 		return; /* if fmd itself produced the error just drop it */
151 
152 	if (strncmp(class, FMD_RSRC_CLASS, FMD_RSRC_CLASS_LEN) == 0)
153 		return; /* if fmd itself produced the event just drop it */
154 
155 	if (strncmp(class, SYSEVENT_RSRC_CLASS, SYSEVENT_RSRC_CLASS_LEN) == 0)
156 		return; /* sysvent resources are auto generated by fmd */
157 
158 	if (self_case_lookup(hdl, SC_CLASS, class) != NULL)
159 		return; /* case is already open against this class */
160 
161 	cp = fmd_case_open(hdl, self_case_create(hdl, SC_CLASS, class));
162 	fmd_case_add_ereport(hdl, cp, ep);
163 	self_stats.nosub.fmds_value.ui64++;
164 
165 	flt = fmd_protocol_fault(FMD_FLT_NOSUB, 100, NULL, NULL, NULL, NULL);
166 	(void) nvlist_add_string(flt, "nosub_class", class);
167 	fmd_case_add_suspect(hdl, cp, flt);
168 	fmd_case_solve(hdl, cp);
169 }
170 
171 static void
172 self_close(fmd_hdl_t *hdl, fmd_case_t *cp)
173 {
174 	self_case_destroy(hdl, fmd_case_getspecific(hdl, cp));
175 }
176 
/*
 * Entry point table for this module.  self_init() places a pointer to it in
 * the fmd_hdl_info_t passed to fmd_hdl_register().  Only the receive and
 * close entry points are implemented; the remaining slots are NULL.
 */
static const fmd_hdl_ops_t self_ops = {
	self_recv,	/* fmdo_recv */
	NULL,		/* fmdo_timeout */
	self_close,	/* fmdo_close */
	NULL,		/* fmdo_stats */
	NULL,		/* fmdo_gc */
};
184 
185 void
186 self_init(fmd_hdl_t *hdl)
187 {
188 	fmd_module_t *mp = (fmd_module_t *)hdl; /* see below */
189 
190 	fmd_hdl_info_t info = {
191 	    "Fault Manager Self-Diagnosis", "1.0", &self_ops, NULL
192 	};
193 
194 	/*
195 	 * Unlike other modules, fmd-self-diagnosis has some special needs that
196 	 * fall outside of what we want in the module API.  Manually disable
197 	 * checkpointing for this module by tweaking the mod_stats values.
198 	 * The self-diagnosis world relates to fmd's running state and modules
199 	 * which all change when it restarts, so don't bother w/ checkpointing.
200 	 */
201 	(void) pthread_mutex_lock(&mp->mod_stats_lock);
202 	mp->mod_stats->ms_ckpt_save.fmds_value.bool = FMD_B_FALSE;
203 	mp->mod_stats->ms_ckpt_restore.fmds_value.bool = FMD_B_FALSE;
204 	(void) pthread_mutex_unlock(&mp->mod_stats_lock);
205 
206 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &info) != 0)
207 		return; /* failed to register with fmd */
208 
209 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (self_stats) /
210 	    sizeof (fmd_stat_t), (fmd_stat_t *)&self_stats);
211 }
212 
213 void
214 self_fini(fmd_hdl_t *hdl)
215 {
216 	fmd_case_t *cp = NULL;
217 
218 	while ((cp = fmd_case_next(hdl, cp)) != NULL)
219 		self_case_destroy(hdl, fmd_case_getspecific(hdl, cp));
220 }
221