xref: /illumos-gate/usr/src/cmd/fm/modules/common/sw-diag-response/subsidiary/smf/smf_diag.c (revision 3ce5372277f4657ad0e52d36c979527c4ca22de2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * SMF software-diagnosis subsidiary
28  *
29  * We model service instances in maintenance state as a defect diagnosis
30  * in FMA.  When an instance transitions to maintenance state the SMF
31  * graph engine publishes an event which we subscribe to here, and diagnose
32  * a corresponding defect.
33  *
34  * We always solve a case immediately after opening it.  But we leave the
35  * case close action to the response agent which needs to cache case UUIDs.
36  * So in the normal case, where software-response is loaded and operational,
37  * our cases will transition to CLOSED state moments after we solve them.
38  * But if fmd restarts in the interim or if software-response is not loaded
39  * then our cases may hang around in SOLVED state for a while, which means
40  * we could iterate over them on receipt of new events.  But we don't -
41  * we blindly solve a new case for every new maintenance event received,
42  * and leave it to the fmd duplicate detection and history-based diagnosis
43  * logic to do the right thing.
44  *
 * Our sibling SMF response subsidiary propagates fmadm-initiated repairs
46  * into SMF, and svcadm-initiated clears back into FMA.  In both cases
47  * the case is moved on to the RESOLVED state, even if fmd is unable to
48  * verify that the service is out of maintenance state (i.e., no longer
49  * isolated).  If the service immediately re-enters maintenance state then
50  * we diagnose a fresh case.  The history-based diagnosis changes in fmd
51  * "do the right thing" and avoid throwing away new cases as duplicates
52  * of old ones hanging around in the "resolved but not all usable again"
53  * state.
54  */
55 
56 #include <strings.h>
57 #include <fm/libtopo.h>
58 #include <fm/fmd_fmri.h>
59 
60 #include "../../common/sw.h"
61 #include "smf.h"
62 
/*
 * Identifier handed to us in swde_smf_init(); swde_smf_recv() passes it
 * to swde_case_open() when opening a case.
 */
static id_t myid;
64 
/*
 * Statistics kept by the SMF diagnosis subsidiary.  The whole table is
 * handed to fmd_stat_create() in swde_smf_init(), and individual counters
 * are incremented with BUMPSTAT() as events are accepted or rejected in
 * swde_smf_recv().
 *
 * The initializers below are positional: they must appear in the same
 * order as the members are declared, and each member must be a
 * fmd_stat_t because swde_smf_init() computes the number of statistics
 * as sizeof (swde_smf_stats) / sizeof (fmd_stat_t).
 */
static struct {
	fmd_stat_t swde_smf_diagnosed;
	fmd_stat_t swde_smf_bad_class;
	fmd_stat_t swde_smf_no_attr;
	fmd_stat_t swde_smf_bad_attr;
	/* NOTE(review): no BUMPSTAT of swde_smf_bad_fmri is visible here */
	fmd_stat_t swde_smf_bad_fmri;
	fmd_stat_t swde_smf_no_uuid;
	fmd_stat_t swde_smf_no_reason_short;
	fmd_stat_t swde_smf_no_reason_long;
	fmd_stat_t swde_smf_no_svcname;
	fmd_stat_t swde_smf_admin_maint_drop;
	fmd_stat_t swde_smf_bad_nvlist_pack;
	fmd_stat_t swde_smf_dupuuid;
} swde_smf_stats = {
	/* one { name, type, description } entry per member, in order */
	{ "swde_smf_diagnosed", FMD_TYPE_UINT64,
	    "maintenance state defects published" },
	{ "swde_smf_bad_class", FMD_TYPE_UINT64,
	    "incorrect event class received" },
	{ "swde_smf_no_attr", FMD_TYPE_UINT64,
	    "malformed event - missing attr nvlist" },
	{ "swde_smf_bad_attr", FMD_TYPE_UINT64,
	    "malformed event - invalid attr list" },
	{ "swde_smf_bad_fmri", FMD_TYPE_UINT64,
	    "malformed event - fmri2str fails" },
	{ "swde_smf_no_uuid", FMD_TYPE_UINT64,
	    "malformed event - missing uuid" },
	{ "swde_smf_no_reason_short", FMD_TYPE_UINT64,
	    "SMF transition event had no reason-short" },
	{ "swde_smf_no_reason_long", FMD_TYPE_UINT64,
	    "SMF transition event had no reason-long" },
	{ "swde_smf_no_svcname", FMD_TYPE_UINT64,
	    "SMF transition event had no svc-string" },
	{ "swde_smf_admin_maint_drop", FMD_TYPE_UINT64,
	    "maintenance transitions requested by admin - no diagnosis" },
	{ "swde_smf_bad_nvlist_pack", FMD_TYPE_UINT64,
	    "failed nvlist_size or nvlist_pack" },
	{ "swde_smf_dupuuid", FMD_TYPE_UINT64,
	    "duplicate events received" },
};
104 
/*
 * Version of the case data layout declared below.  swde_smf_recv() stores
 * it in scd_vers and also passes it to swde_case_open().
 */
#define	SWDE_SMF_CASEDATA_VERS		1

/*
 * Header of the per-case data associated with each case we open.
 *
 * swde_smf_recv() allocates a single buffer of
 * sizeof (swde_smf_casedata_t) + scd_nvlbufsz bytes: this header followed
 * immediately by the service fmri nvlist packed with NV_ENCODE_NATIVE.
 * swde_smf_vrfy() unpacks the nvlist from that same offset, so the two
 * functions must agree on this layout.
 */
typedef struct swde_smf_casedata {
	uint32_t scd_vers;		/* must be first member */
	size_t scd_nvlbufsz;		/* size of following buffer */
					/* packed fmri nvlist follows */
} swde_smf_casedata_t;
112 
/*
 * Increment the 64-bit counter of the named member of swde_smf_stats.
 * The expansion is parenthesized so that it is always a single
 * self-contained expression wherever it is used.
 */
#define	BUMPSTAT(stat)		(swde_smf_stats.stat.fmds_value.ui64++)
114 
115 /*ARGSUSED*/
116 void
117 swde_smf_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
118     const char *class, void *arg)
119 {
120 	char *rsn = NULL, *rsnl = NULL, *svcname = NULL;
121 	nvlist_t *attr, *svcfmri, *defect;
122 	swde_smf_casedata_t *cdp;
123 	fmd_case_t *cp;
124 	char *fmribuf;
125 	char *uuid;
126 	size_t sz;
127 
128 	if (!fmd_nvl_class_match(hdl, nvl, TRANCLASS("maintenance"))) {
129 		BUMPSTAT(swde_smf_bad_class);
130 		return;
131 	}
132 
133 	if (nvlist_lookup_nvlist(nvl, FM_IREPORT_ATTRIBUTES, &attr) != 0) {
134 		BUMPSTAT(swde_smf_no_attr);
135 		return;
136 	}
137 
138 	if (nvlist_lookup_string(nvl, FM_IREPORT_UUID, &uuid) != 0) {
139 		BUMPSTAT(swde_smf_no_uuid);
140 		return;
141 	}
142 
143 	if (nvlist_lookup_nvlist(attr, "svc", &svcfmri) != 0) {
144 		BUMPSTAT(swde_smf_bad_attr);
145 		return;
146 	}
147 
148 	if (nvlist_lookup_string(attr, "reason-short", &rsn) != 0) {
149 		BUMPSTAT(swde_smf_no_reason_short);
150 		return;
151 	}
152 
153 	if (nvlist_lookup_string(attr, "reason-long", &rsnl) != 0) {
154 		BUMPSTAT(swde_smf_no_reason_long);
155 		return;
156 	}
157 
158 	if (nvlist_lookup_string(attr, "svc-string", &svcname) != 0) {
159 		BUMPSTAT(swde_smf_no_svcname);
160 		return;
161 	}
162 
163 	if (strcmp(rsn, "administrative_request") == 0) {
164 		BUMPSTAT(swde_smf_admin_maint_drop);
165 		return;
166 	}
167 
168 	/*
169 	 * Our case checkpoint data, version 1.
170 	 */
171 	if (nvlist_size(svcfmri, &sz, NV_ENCODE_NATIVE) != 0) {
172 		BUMPSTAT(swde_smf_bad_nvlist_pack);
173 		return;
174 	}
175 	cdp = fmd_hdl_zalloc(hdl, sizeof (*cdp) + sz, FMD_SLEEP);
176 	cdp->scd_vers = SWDE_SMF_CASEDATA_VERS;
177 	fmribuf = (char *)cdp + sizeof (*cdp);
178 	cdp->scd_nvlbufsz = sz;
179 	(void) nvlist_pack(svcfmri, &fmribuf, &sz, NV_ENCODE_NATIVE, 0);
180 
181 	/*
182 	 * Open a case with UUID matching the originating event, and no
183 	 * associated serialization data.  Create a defect and add it to
184 	 * the case, and link the originating event to the case.  This
185 	 * call will return NULL if a case with the requested UUID already
186 	 * exists, which would mean we are processing an event twice so
187 	 * we can discard.
188 	 */
189 	if ((cp = swde_case_open(hdl, myid, uuid, SWDE_SMF_CASEDATA_VERS,
190 	    (void *)cdp, sizeof (*cdp) + sz)) == NULL) {
191 		BUMPSTAT(swde_smf_dupuuid);
192 		fmd_hdl_free(hdl, cdp, sizeof (*cdp) + sz);
193 		return;
194 	}
195 
196 	defect = fmd_nvl_create_defect(hdl, SW_SMF_MAINT_DEFECT,
197 	    100, svcfmri, NULL, svcfmri);
198 	if (rsn != NULL)
199 		(void) nvlist_add_string(defect, "reason-short", rsn);
200 	if (rsnl != NULL)
201 		(void) nvlist_add_string(defect, "reason-long", rsnl);
202 	if (svcname != NULL)
203 		(void) nvlist_add_string(defect, "svc-string", svcname);
204 	fmd_case_add_suspect(hdl, cp, defect);
205 	fmd_case_add_ereport(hdl, cp, ep);
206 
207 	/*
208 	 * Now solve the case, and immediately close it.  Although the
209 	 * resource is already isolated (SMF put it in maintenance state)
210 	 * we do not immediately close the case here - our sibling response
211 	 * logic will do that after caching the case UUID.
212 	 */
213 	fmd_case_solve(hdl, cp);
214 	BUMPSTAT(swde_smf_diagnosed);
215 }
216 
217 /*
218  * In the normal course of events we keep in sync with SMF through the
219  * maintenance enter/clear events it raises.  Even if a maintenance
220  * state is cleared using svcadm while fmd is not running, the event
221  * will pend and be consumed when fmd does start and we'll close the
222  * case (in the response agent).
223  *
224  * But is is possible for discontinuities to produce some confusion:
225  *
226  *	- if an instance is in maintenance state (and so shown in svcs -x
227  *	  and fmadm faulty output) at the time we clone a new boot
228  *	  environment then when we boot the new BE we can be out of
229  *	  sync if the instance is cleared when we boot there
230  *
231  *	- meddling with /var/fm state - eg manual clear of files there,
232  *	  or restore of old state
233  *
234  * So as an extra guard we have a case verify function which is called
235  * at fmd restart (module load for software-diagnosis).  We must
236  * return 0 to close the case, non-zero to retain it.
237  */
238 int
239 swde_smf_vrfy(fmd_hdl_t *hdl, fmd_case_t *cp)
240 {
241 	swde_smf_casedata_t *cdp;
242 	nvlist_t *svcfmri;
243 	uint32_t v;
244 	int rv;
245 
246 	cdp = swde_case_data(hdl, cp, &v);
247 
248 	if (cdp == NULL || v != 1)
249 		return (0);	/* bad or damaged - just close */
250 
251 	if (nvlist_unpack((char *)cdp + sizeof (*cdp),
252 	    cdp->scd_nvlbufsz, &svcfmri, 0) != 0)
253 		return (0);	/* ditto */
254 
255 	switch (fmd_nvl_fmri_service_state(hdl, svcfmri)) {
256 	case FMD_SERVICE_STATE_UNUSABLE:
257 		/*
258 		 * Keep case iff in maintenance state
259 		 */
260 		rv = 1;
261 		break;
262 
263 	default:
264 		/*
265 		 * Discard the case for all other states - cleared,
266 		 * service no longer exists, ... whatever.
267 		 */
268 		rv = 0;
269 		break;
270 	}
271 
272 	nvlist_free(svcfmri);
273 	return (rv);
274 }
275 
/*
 * Dispatch table handed back to the caller of swde_smf_init().  The one
 * real entry routes TRANCLASS("maintenance") events to swde_smf_recv();
 * its third member is NULL (presumably the argument passed to the
 * handler - confirm against the struct sw_disp declaration).
 *
 * The final all-NULL entry looks like a terminator; note that
 * swde_smf_init() includes it in the element count it reports.
 */
const struct sw_disp swde_smf_disp[] = {
	{ TRANCLASS("maintenance"), swde_smf_recv, NULL },
	{ NULL, NULL, NULL }
};
280 
281 /*ARGSUSED*/
282 int
283 swde_smf_init(fmd_hdl_t *hdl, id_t id, const struct sw_disp **dpp, int *nelemp)
284 {
285 	myid = id;
286 
287 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (swde_smf_stats) /
288 	    sizeof (fmd_stat_t), (fmd_stat_t *)&swde_smf_stats);
289 
290 	fmd_hdl_subscribe(hdl, TRANCLASS("maintenance"));
291 
292 	*dpp = &swde_smf_disp[0];
293 	*nelemp = sizeof (swde_smf_disp) / sizeof (swde_smf_disp[0]);
294 	return (SW_SUB_INIT_SUCCESS);
295 }
296 
/*
 * Description of the SMF diagnosis subsidiary.  Only the init and case
 * verify entry points are supplied; the fini, timeout and case close
 * members are left NULL.  The initializer is positional, so the order
 * here must match the member order of struct sw_subinfo.
 */
const struct sw_subinfo smf_diag_info = {
	"smf diagnosis",		/* swsub_name */
	SW_CASE_SMF,			/* swsub_casetype */
	swde_smf_init,			/* swsub_init */
	NULL,				/* swsub_fini */
	NULL,				/* swsub_timeout */
	NULL,				/* swsub_case_close */
	swde_smf_vrfy,			/* swsub_case_vrfy */
};
306