xref: /illumos-gate/usr/src/cmd/fm/fmd/common/fmd_case.c (revision 1743a90d9abccc6cd1e57ef89729c674b859fc6e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * FMD Case Subsystem
28  *
29  * Diagnosis engines are expected to group telemetry events related to the
30  * diagnosis of a particular problem on the system into a set of cases.  The
31  * diagnosis engine may have any number of cases open at a given point in time.
32  * Some cases may eventually be *solved* by associating a suspect list of one
33  * or more problems with the case, at which point fmd publishes a list.suspect
34  * event for the case and it becomes visible to administrators and agents.
35  *
36  * Every case is named using a UUID, and is globally visible in the case hash.
37  * Cases are reference-counted, except for the reference from the case hash
38  * itself.  Consumers of case references include modules, which store active
39  * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
40  *
41  * Cases obey the following state machine.  In states UNSOLVED, SOLVED, and
42  * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
43  * or transport) and the case is referenced by the mod_cases list.  Once the
44  * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
45  * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
46  *
47  *			+------------+
48  *	     +----------|  UNSOLVED  |
49  *	     |		+------------+
50  *	     |		      1 |
51  *	     |			|
52  *	     |		+-------v----+
53  *	   2 |		|    SOLVED  |
54  *	     |		+------------+
55  *	     |		    3 |  5 |
56  *	     +------------+   |    |
57  *			  |   |    |
58  *			+-v---v----v-+
59  *			| CLOSE_WAIT |
60  *			+------------+
61  *			  |   |    |
62  *	      +-----------+   |    +------------+
63  *	      |		    4 |			|
64  *	      v		+-----v------+		|
65  *	   discard      |   CLOSED   |	      6	|
66  *			+------------+		|
67  *			      |			|
68  *			      |	   +------------+
69  *			    7 |	   |
70  *			+-----v----v-+
71  *			|  REPAIRED  |
72  *			+------------+
73  *			      |
74  *			    8 |
75  *			+-----v------+
76  *			|  RESOLVED  |
77  *			+------------+
78  *			      |
79  *			      v
80  *			   discard
81  *
82  * The state machine changes are triggered by calls to fmd_case_transition()
83  * from various locations inside of fmd, as described below:
84  *
85  * [1] Called by: fmd_case_solve()
86  *       Actions: FMD_CF_SOLVED flag is set in ci_flags
87  *                conviction policy is applied to suspect list
88  *                suspects convicted are marked faulty (F) in R$
89  *                list.suspect event logged and dispatched
90  *
91  * [2] Called by: fmd_case_close(), fmd_case_uuclose()
92  *       Actions: diagnosis engine fmdo_close() entry point scheduled
93  *                case discarded upon exit from CLOSE_WAIT
94  *
95  * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
96  *       Actions: FMD_CF_ISOLATED flag is set in ci_flags
97  *                suspects convicted (F) are marked unusable (U) in R$
98  *                diagnosis engine fmdo_close() entry point scheduled
99  *                case transitions to CLOSED [4] upon exit from CLOSE_WAIT
100  *
101  * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
102  *       Actions: list.isolated event dispatched
103  *                case deleted from module's list of open cases
104  *
105  * [5] Called by: fmd_case_repair(), fmd_case_update()
106  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
107  *                diagnosis engine fmdo_close() entry point scheduled
108  *                case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
109  *
110  * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
111  *       Actions: suspects convicted are marked non faulty (!F) in R$
112  *                list.repaired or list.updated event dispatched
113  *
114  * [7] Called by: fmd_case_repair(), fmd_case_update()
115  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
116  *                suspects convicted are marked non faulty (!F) in R$
117  *                list.repaired or list.updated event dispatched
118  *
119  * [8] Called by: fmd_case_uuresolve()
120  *       Actions: list.resolved event dispatched
121  *		  case is discarded
122  */
123 
124 #include <sys/fm/protocol.h>
125 #include <uuid/uuid.h>
126 #include <alloca.h>
127 
128 #include <fmd_alloc.h>
129 #include <fmd_module.h>
130 #include <fmd_error.h>
131 #include <fmd_conf.h>
132 #include <fmd_case.h>
133 #include <fmd_string.h>
134 #include <fmd_subr.h>
135 #include <fmd_protocol.h>
136 #include <fmd_event.h>
137 #include <fmd_eventq.h>
138 #include <fmd_dispq.h>
139 #include <fmd_buf.h>
140 #include <fmd_log.h>
141 #include <fmd_asru.h>
142 #include <fmd_fmri.h>
143 #include <fmd_xprt.h>
144 
145 #include <fmd.h>
146 
/*
 * Printable names for the case states.  Each entry is annotated with the
 * FMD_CASE_* constant it corresponds to; presumably the table is indexed by
 * the numeric state value -- TODO confirm that the FMD_CASE_* values follow
 * this ordering.
 */
147 static const char *const _fmd_case_snames[] = {
148 	"UNSOLVED",	/* FMD_CASE_UNSOLVED */
149 	"SOLVED",	/* FMD_CASE_SOLVED */
150 	"CLOSE_WAIT",	/* FMD_CASE_CLOSE_WAIT */
151 	"CLOSED",	/* FMD_CASE_CLOSED */
152 	"REPAIRED",	/* FMD_CASE_REPAIRED */
153 	"RESOLVED"	/* FMD_CASE_RESOLVED */
154 };
155 
156 static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *);
157 
158 fmd_case_hash_t *
fmd_case_hash_create(void)159 fmd_case_hash_create(void)
160 {
161 	fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP);
162 
163 	(void) pthread_rwlock_init(&chp->ch_lock, NULL);
164 	chp->ch_hashlen = fmd.d_str_buckets;
165 	chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP);
166 	chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen,
167 	    FMD_SLEEP);
168 	chp->ch_count = 0;
169 
170 	return (chp);
171 }
172 
173 /*
174  * Destroy the case hash.  Unlike most of our hash tables, no active references
175  * are kept by the case hash itself; all references come from other subsystems.
176  * The hash must be destroyed after all modules are unloaded; if anything was
177  * present in the hash it would be by definition a reference count leak.
178  */
179 void
fmd_case_hash_destroy(fmd_case_hash_t * chp)180 fmd_case_hash_destroy(fmd_case_hash_t *chp)
181 {
182 	fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen);
183 	fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen);
184 	fmd_free(chp, sizeof (fmd_case_hash_t));
185 }
186 
187 /*
188  * Take a snapshot of the case hash by placing an additional hold on each
189  * member in an auxiliary array, and then call 'func' for each case.
190  */
191 void
fmd_case_hash_apply(fmd_case_hash_t * chp,void (* func)(fmd_case_t *,void *),void * arg)192 fmd_case_hash_apply(fmd_case_hash_t *chp,
193     void (*func)(fmd_case_t *, void *), void *arg)
194 {
195 	fmd_case_impl_t *cp, **cps, **cpp;
196 	uint_t cpc, i;
197 
198 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
199 
200 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
201 	cpc = chp->ch_count;
202 
203 	for (i = 0; i < chp->ch_hashlen; i++) {
204 		for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next)
205 			*cpp++ = fmd_case_tryhold(cp);
206 	}
207 
208 	ASSERT(cpp == cps + cpc);
209 	(void) pthread_rwlock_unlock(&chp->ch_lock);
210 
211 	for (i = 0; i < cpc; i++) {
212 		if (cps[i] != NULL) {
213 			func((fmd_case_t *)cps[i], arg);
214 			fmd_case_rele((fmd_case_t *)cps[i]);
215 		}
216 	}
217 
218 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
219 }
220 
221 static void
fmd_case_code_hash_insert(fmd_case_hash_t * chp,fmd_case_impl_t * cip)222 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
223 {
224 	uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
225 
226 	cip->ci_code_next = chp->ch_code_hash[h];
227 	chp->ch_code_hash[h] = cip;
228 }
229 
230 static void
fmd_case_code_hash_delete(fmd_case_hash_t * chp,fmd_case_impl_t * cip)231 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
232 {
233 	fmd_case_impl_t **pp, *cp;
234 
235 	if (cip->ci_code) {
236 		uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
237 
238 		pp = &chp->ch_code_hash[h];
239 		for (cp = *pp; cp != NULL; cp = cp->ci_code_next) {
240 			if (cp != cip)
241 				pp = &cp->ci_code_next;
242 			else
243 				break;
244 		}
245 		if (cp != NULL) {
246 			*pp = cp->ci_code_next;
247 			cp->ci_code_next = NULL;
248 		}
249 	}
250 }
251 
252 /*
253  * Look up the diagcode for this case and cache it in ci_code.  If no suspects
254  * were defined for this case or if the lookup fails, the event dictionary or
255  * module code is broken, and we set the event code to a precomputed default.
256  */
257 static const char *
fmd_case_mkcode(fmd_case_t * cp)258 fmd_case_mkcode(fmd_case_t *cp)
259 {
260 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
261 	fmd_case_susp_t *cis;
262 	fmd_case_hash_t *chp = fmd.d_cases;
263 
264 	char **keys, **keyp;
265 	const char *s;
266 
267 	ASSERT(MUTEX_HELD(&cip->ci_lock));
268 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
269 
270 	/*
271 	 * delete any existing entry from code hash if it is on it
272 	 */
273 	fmd_case_code_hash_delete(chp, cip);
274 
275 	fmd_free(cip->ci_code, cip->ci_codelen);
276 	cip->ci_codelen = cip->ci_mod->mod_codelen;
277 	cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
278 	keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1));
279 
280 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
281 		if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0)
282 			keyp++;
283 	}
284 
285 	*keyp = NULL; /* mark end of keys[] array for libdiagcode */
286 
287 	if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code(
288 	    cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) {
289 		(void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s);
290 		fmd_free(cip->ci_code, cip->ci_codelen);
291 		cip->ci_codelen = strlen(s) + 1;
292 		cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
293 		(void) strcpy(cip->ci_code, s);
294 	}
295 
296 	/*
297 	 * add into hash of solved cases
298 	 */
299 	fmd_case_code_hash_insert(chp, cip);
300 
301 	return (cip->ci_code);
302 }
303 
/*
 * Argument block handed to fmd_case_set_lst(), which fills one slot of the
 * parallel arrays fcl_ba[] and fcl_nva[] for each ASRU link it is called on.
 */
304 typedef struct {
305 	int	*fcl_countp;	/* number of slots filled so far */
306 	int	fcl_maxcount;	/* slot limit; further links are ignored */
307 	uint8_t *fcl_ba;	/* per-slot FM_SUSPECT_* status bits */
308 	nvlist_t **fcl_nva;	/* per-slot fault event (link's al_event) */
309 	int	*fcl_msgp;	/* set to B_FALSE if a suspect opts out */
310 } fmd_case_lst_t;
311 
312 static void
fmd_case_set_lst(fmd_asru_link_t * alp,void * arg)313 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg)
314 {
315 	fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg;
316 	boolean_t b;
317 	int state;
318 
319 	if (*entryp->fcl_countp >= entryp->fcl_maxcount)
320 		return;
321 	if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE,
322 	    &b) == 0 && b == B_FALSE)
323 		*entryp->fcl_msgp = B_FALSE;
324 	entryp->fcl_ba[*entryp->fcl_countp] = 0;
325 	state = fmd_asru_al_getstate(alp);
326 	if (state & FMD_ASRU_DEGRADED)
327 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED;
328 	if (state & FMD_ASRU_UNUSABLE)
329 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE;
330 	if (state & FMD_ASRU_FAULTY)
331 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY;
332 	if (!(state & FMD_ASRU_PRESENT))
333 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT;
334 	if (alp->al_reason == FMD_ASRU_REPAIRED)
335 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED;
336 	else if (alp->al_reason == FMD_ASRU_REPLACED)
337 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED;
338 	else if (alp->al_reason == FMD_ASRU_ACQUITTED)
339 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED;
340 	entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event;
341 	(*entryp->fcl_countp)++;
342 }
343 
344 static void
fmd_case_faulty(fmd_asru_link_t * alp,void * arg)345 fmd_case_faulty(fmd_asru_link_t *alp, void *arg)
346 {
347 	int *faultyp = (int *)arg;
348 
349 	*faultyp |= (alp->al_flags & FMD_ASRU_FAULTY);
350 }
351 
352 static void
fmd_case_usable(fmd_asru_link_t * alp,void * arg)353 fmd_case_usable(fmd_asru_link_t *alp, void *arg)
354 {
355 	int *usablep = (int *)arg;
356 
357 	*usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE);
358 }
359 
360 static void
fmd_case_not_faulty(fmd_asru_link_t * alp,void * arg)361 fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg)
362 {
363 	int *not_faultyp = (int *)arg;
364 
365 	*not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY);
366 }
367 
368 /*
369  * Have we got any suspects with an asru that are still unusable and present?
370  */
371 static void
fmd_case_unusable_and_present(fmd_asru_link_t * alp,void * arg)372 fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg)
373 {
374 	int *rvalp = (int *)arg;
375 	int state;
376 	nvlist_t *asru;
377 
378 	/*
379 	 * if this a proxy case and this suspect doesn't have an local asru
380 	 * then state is unknown so we must assume it may still be unusable.
381 	 */
382 	if ((alp->al_flags & FMD_ASRU_PROXY) &&
383 	    !(alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU)) {
384 		*rvalp |= B_TRUE;
385 		return;
386 	}
387 
388 	state = fmd_asru_al_getstate(alp);
389 	if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0)
390 		return;
391 	*rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT));
392 }
393 
394 nvlist_t *
fmd_case_mkevent(fmd_case_t * cp,const char * class)395 fmd_case_mkevent(fmd_case_t *cp, const char *class)
396 {
397 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
398 	nvlist_t **nva, *nvl;
399 	uint8_t *ba;
400 	int msg = B_TRUE;
401 	const char *code;
402 	fmd_case_lst_t fcl;
403 	int count = 0;
404 
405 	(void) pthread_mutex_lock(&cip->ci_lock);
406 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
407 
408 	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
409 	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
410 
411 	/*
412 	 * For each suspect associated with the case, store its fault event
413 	 * nvlist in 'nva'.  We also look to see if any of the suspect faults
414 	 * have asked not to be messaged.  If any of them have made such a
415 	 * request, propagate that attribute to the composite list.* event.
416 	 * Finally, store each suspect's faulty status into the bitmap 'ba'.
417 	 */
418 	fcl.fcl_countp = &count;
419 	fcl.fcl_maxcount = cip->ci_nsuspects;
420 	fcl.fcl_msgp = &msg;
421 	fcl.fcl_ba = ba;
422 	fcl.fcl_nva = nva;
423 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
424 
425 	if (cip->ci_code == NULL)
426 		(void) fmd_case_mkcode(cp);
427 	/*
428 	 * For repair and updated event, we lookup diagcode from dict using key
429 	 * "list.repaired" or "list.updated" or "list.resolved".
430 	 */
431 	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
432 		(void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code);
433 	else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
434 		(void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code);
435 	else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0)
436 		(void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code);
437 	else
438 		code = cip->ci_code;
439 
440 	if (msg == B_FALSE)
441 		cip->ci_flags |= FMD_CF_INVISIBLE;
442 
443 	/*
444 	 * Use the ci_diag_de if one has been saved (eg for an injected fault).
445 	 * Otherwise use the authority for the current module.
446 	 */
447 	nvl = fmd_protocol_list(class, cip->ci_diag_de == NULL ?
448 	    cip->ci_mod->mod_fmri : cip->ci_diag_de, cip->ci_uuid, code, count,
449 	    nva, ba, msg, &cip->ci_tv, cip->ci_injected);
450 
451 	(void) pthread_mutex_unlock(&cip->ci_lock);
452 	return (nvl);
453 }
454 
/*
 * Tunables consulted by fmd_case_check_for_dups() when a new case is compared
 * against an existing one.
 */
/* treat an overlapping faulty, non-isolated suspect as a match */
455 static int fmd_case_match_on_faulty_overlap = 1;
/* treat an overlapping acquitted suspect as a match (no faulty in old case) */
456 static int fmd_case_match_on_acquit_overlap = 1;
/* allow suspects of the old case to be acquitted or re-faulted on a match */
457 static int fmd_case_auto_acquit_isolated = 1;
/* on a match with a repaired/resolved old case, flag the new case to adjust */
458 static int fmd_case_auto_acquit_non_acquitted = 1;
/* a new case this close to the old case's last change skips auto-acquit */
459 static int fmd_case_too_recent = 10; /* time in seconds */
460 
461 static boolean_t
fmd_case_compare_elem(nvlist_t * nvl,nvlist_t * xnvl,const char * elem)462 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem)
463 {
464 	nvlist_t *new_rsrc;
465 	nvlist_t *rsrc;
466 	char *new_name = NULL;
467 	char *name = NULL;
468 	ssize_t new_namelen;
469 	ssize_t namelen;
470 	int fmri_present = 1;
471 	int new_fmri_present = 1;
472 	int match = B_FALSE;
473 	fmd_topo_t *ftp = fmd_topo_hold();
474 
475 	if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0)
476 		fmri_present = 0;
477 	else {
478 		if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1)
479 			goto done;
480 		name = fmd_alloc(namelen + 1, FMD_SLEEP);
481 		if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1)
482 			goto done;
483 	}
484 	if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0)
485 		new_fmri_present = 0;
486 	else {
487 		if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1)
488 			goto done;
489 		new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP);
490 		if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1)
491 			goto done;
492 	}
493 	match = (fmri_present == new_fmri_present &&
494 	    (fmri_present == 0 ||
495 	    topo_fmri_strcmp(ftp->ft_hdl, name, new_name)));
496 done:
497 	if (name != NULL)
498 		fmd_free(name, namelen + 1);
499 	if (new_name != NULL)
500 		fmd_free(new_name, new_namelen + 1);
501 	fmd_topo_rele(ftp);
502 	return (match);
503 }
504 
505 static int
fmd_case_match_suspect(nvlist_t * nvl1,nvlist_t * nvl2)506 fmd_case_match_suspect(nvlist_t *nvl1, nvlist_t *nvl2)
507 {
508 	char *class, *new_class;
509 
510 	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_ASRU))
511 		return (0);
512 	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_RESOURCE))
513 		return (0);
514 	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_FRU))
515 		return (0);
516 	(void) nvlist_lookup_string(nvl2, FM_CLASS, &class);
517 	(void) nvlist_lookup_string(nvl1, FM_CLASS, &new_class);
518 	return (strcmp(class, new_class) == 0);
519 }
520 
/*
 * Argument block for fmd_case_match_suspects().  The "old" arrays have one
 * slot per suspect of the existing case, filled in callback order; the "new"
 * array has one slot per suspect of the new case, in ci_suspects list order.
 */
521 typedef struct {
522 	int	*fcms_countp;		/* old-case slots filled so far */
523 	int	fcms_maxcount;		/* limit on old-case slots */
524 	fmd_case_impl_t *fcms_cip;	/* the new case being compared */
525 	uint8_t *fcms_new_susp_state;	/* state of matching old suspect */
526 	uint8_t *fcms_old_susp_state;	/* SUSPECT_STATE_* of old suspect */
527 	uint8_t *fcms_old_match_state;	/* SUSPECT_STATE_NO_MATCH if unmatched */
528 } fcms_t;
/*
 * Values stored in the *_susp_state arrays.  SUSPECT_STATE_NO_MATCH is the
 * value used in the match-state array; note that it is numerically equal to
 * SUSPECT_STATE_FAULTY.
 */
529 #define	SUSPECT_STATE_FAULTY				0x1
530 #define	SUSPECT_STATE_ISOLATED				0x2
531 #define	SUSPECT_STATE_REMOVED				0x4
532 #define	SUSPECT_STATE_ACQUITED				0x8
533 #define	SUSPECT_STATE_REPAIRED				0x10
534 #define	SUSPECT_STATE_REPLACED				0x20
535 #define	SUSPECT_STATE_NO_MATCH				0x1
536 
537 /*
538  * This is called for each suspect in the old case. Compare it against each
539  * suspect in the new case, setting fcms_old_susp_state and fcms_new_susp_state
540  * as appropriate. fcms_new_susp_state will left as 0 if the suspect is not
541  * found in the old case.
542  */
543 static void
fmd_case_match_suspects(fmd_asru_link_t * alp,void * arg)544 fmd_case_match_suspects(fmd_asru_link_t *alp, void *arg)
545 {
546 	fcms_t *fcmsp = (fcms_t *)arg;
547 	fmd_case_impl_t *cip = fcmsp->fcms_cip;
548 	fmd_case_susp_t *cis;
549 	int i = 0;
550 	int state = fmd_asru_al_getstate(alp);
551 
552 	if (*fcmsp->fcms_countp >= fcmsp->fcms_maxcount)
553 		return;
554 
555 	if (!(state & FMD_ASRU_PRESENT) || (!(state & FMD_ASRU_FAULTY) &&
556 	    alp->al_reason == FMD_ASRU_REMOVED))
557 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
558 		    SUSPECT_STATE_REMOVED;
559 	else if ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_FAULTY))
560 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
561 		    SUSPECT_STATE_ISOLATED;
562 	else if (state & FMD_ASRU_FAULTY)
563 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
564 		    SUSPECT_STATE_FAULTY;
565 	else if (alp->al_reason == FMD_ASRU_REPLACED)
566 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
567 		    SUSPECT_STATE_REPLACED;
568 	else if (alp->al_reason == FMD_ASRU_ACQUITTED)
569 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
570 		    SUSPECT_STATE_ACQUITED;
571 	else
572 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
573 		    SUSPECT_STATE_REPAIRED;
574 
575 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next, i++)
576 		if (fmd_case_match_suspect(cis->cis_nvl, alp->al_event) == 1)
577 			break;
578 	if (cis != NULL)
579 		fcmsp->fcms_new_susp_state[i] =
580 		    fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp];
581 	else
582 		fcmsp->fcms_old_match_state[*fcmsp->fcms_countp] |=
583 		    SUSPECT_STATE_NO_MATCH;
584 	(*fcmsp->fcms_countp)++;
585 }
586 
/*
 * Argument block for the re-fault and acquit callbacks that adjust the
 * suspects of an old case against the suspect list of a new case.
 */
587 typedef struct {
588 	int	*fca_do_update;		/* set to 1 when a suspect is changed */
589 	fmd_case_impl_t *fca_cip;	/* new case whose suspects are matched */
590 } fca_t;
591 
592 /*
593  * Re-fault all acquitted suspects that are still present in the new list.
594  */
595 static void
fmd_case_fault_acquitted_matching(fmd_asru_link_t * alp,void * arg)596 fmd_case_fault_acquitted_matching(fmd_asru_link_t *alp, void *arg)
597 {
598 	fca_t *fcap = (fca_t *)arg;
599 	fmd_case_impl_t *cip = fcap->fca_cip;
600 	fmd_case_susp_t *cis;
601 	int state = fmd_asru_al_getstate(alp);
602 
603 	if (!(state & FMD_ASRU_FAULTY) &&
604 	    alp->al_reason == FMD_ASRU_ACQUITTED) {
605 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
606 			if (fmd_case_match_suspect(cis->cis_nvl,
607 			    alp->al_event) == 1)
608 				break;
609 		if (cis != NULL) {
610 			(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
611 			*fcap->fca_do_update = 1;
612 		}
613 	}
614 }
615 
616 /*
617  * Re-fault all suspects that are still present in the new list.
618  */
619 static void
fmd_case_fault_all_matching(fmd_asru_link_t * alp,void * arg)620 fmd_case_fault_all_matching(fmd_asru_link_t *alp, void *arg)
621 {
622 	fca_t *fcap = (fca_t *)arg;
623 	fmd_case_impl_t *cip = fcap->fca_cip;
624 	fmd_case_susp_t *cis;
625 	int state = fmd_asru_al_getstate(alp);
626 
627 	if (!(state & FMD_ASRU_FAULTY)) {
628 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
629 			if (fmd_case_match_suspect(cis->cis_nvl,
630 			    alp->al_event) == 1)
631 				break;
632 		if (cis != NULL) {
633 			(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
634 			*fcap->fca_do_update = 1;
635 		}
636 	}
637 }
638 
639 /*
640  * Acquit all suspects that are no longer present in the new list.
641  */
642 static void
fmd_case_acquit_no_match(fmd_asru_link_t * alp,void * arg)643 fmd_case_acquit_no_match(fmd_asru_link_t *alp, void *arg)
644 {
645 	fca_t *fcap = (fca_t *)arg;
646 	fmd_case_impl_t *cip = fcap->fca_cip;
647 	fmd_case_susp_t *cis;
648 	int state = fmd_asru_al_getstate(alp);
649 
650 	if (state & FMD_ASRU_FAULTY) {
651 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
652 			if (fmd_case_match_suspect(cis->cis_nvl,
653 			    alp->al_event) == 1)
654 				break;
655 		if (cis == NULL) {
656 			(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
657 			    FMD_ASRU_ACQUITTED);
658 			*fcap->fca_do_update = 1;
659 		}
660 	}
661 }
662 
663 /*
664  * Acquit all isolated suspects.
665  */
666 static void
fmd_case_acquit_isolated(fmd_asru_link_t * alp,void * arg)667 fmd_case_acquit_isolated(fmd_asru_link_t *alp, void *arg)
668 {
669 	int *do_update = (int *)arg;
670 	int state = fmd_asru_al_getstate(alp);
671 
672 	if ((state & FMD_ASRU_PRESENT) && (state & FMD_ASRU_UNUSABLE) &&
673 	    (state & FMD_ASRU_FAULTY)) {
674 		(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
675 		    FMD_ASRU_ACQUITTED);
676 		*do_update = 1;
677 	}
678 }
679 
680 /*
681  * Acquit suspect which matches specified nvlist
682  */
683 static void
fmd_case_acquit_suspect(fmd_asru_link_t * alp,void * arg)684 fmd_case_acquit_suspect(fmd_asru_link_t *alp, void *arg)
685 {
686 	nvlist_t *nvl = (nvlist_t *)arg;
687 	int state = fmd_asru_al_getstate(alp);
688 
689 	if ((state & FMD_ASRU_FAULTY) &&
690 	    fmd_case_match_suspect(nvl, alp->al_event) == 1)
691 		(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
692 		    FMD_ASRU_ACQUITTED);
693 }
694 
/*
 * Argument block for fmd_case_check_for_dups(), carrying the new case and the
 * places where the outcome of the comparison is reported.
 */
695 typedef struct {
696 	fmd_case_impl_t *fccd_cip;	/* the new case being checked */
697 	uint8_t *fccd_new_susp_state;	/* per-new-suspect state, accumulated */
698 	uint8_t *fccd_new_match_state;	/* not used by the code shown here */
699 	int *fccd_discard_new;		/* set to 1: drop new, keep old case */
700 	int *fccd_adjust_new;		/* set to 1: keep new case, adjust it */
701 } fccd_t;
702 
703 /*
704  * see if a matching suspect list already exists in the cache
705  */
706 static void
fmd_case_check_for_dups(fmd_case_t * old_cp,void * arg)707 fmd_case_check_for_dups(fmd_case_t *old_cp, void *arg)
708 {
709 	fccd_t *fccdp = (fccd_t *)arg;
710 	fmd_case_impl_t *new_cip = fccdp->fccd_cip;
711 	fmd_case_impl_t *old_cip = (fmd_case_impl_t *)old_cp;
712 	int i, count = 0, do_update = 0, got_isolated_overlap = 0;
713 	int got_faulty_overlap = 0;
714 	int got_acquit_overlap = 0;
715 	boolean_t too_recent;
716 	uint64_t most_recent = 0;
717 	fcms_t fcms;
718 	fca_t fca;
719 	uint8_t *new_susp_state;
720 	uint8_t *old_susp_state;
721 	uint8_t *old_match_state;
722 
723 	new_susp_state = alloca(new_cip->ci_nsuspects * sizeof (uint8_t));
724 	for (i = 0; i < new_cip->ci_nsuspects; i++)
725 		new_susp_state[i] = 0;
726 	old_susp_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
727 	for (i = 0; i < old_cip->ci_nsuspects; i++)
728 		old_susp_state[i] = 0;
729 	old_match_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
730 	for (i = 0; i < old_cip->ci_nsuspects; i++)
731 		old_match_state[i] = 0;
732 
733 	/*
734 	 * Compare with each suspect in the existing case.
735 	 */
736 	fcms.fcms_countp = &count;
737 	fcms.fcms_maxcount = old_cip->ci_nsuspects;
738 	fcms.fcms_cip = new_cip;
739 	fcms.fcms_new_susp_state = new_susp_state;
740 	fcms.fcms_old_susp_state = old_susp_state;
741 	fcms.fcms_old_match_state = old_match_state;
742 	fmd_asru_hash_apply_by_case(fmd.d_asrus, (fmd_case_t *)old_cip,
743 	    fmd_case_match_suspects, &fcms);
744 
745 	/*
746 	 * If we have some faulty, non-isolated suspects that overlap, then most
747 	 * likely it is the suspects that overlap in the suspect lists that are
748 	 * to blame. So we can consider this to be a match.
749 	 */
750 	for (i = 0; i < new_cip->ci_nsuspects; i++)
751 		if (new_susp_state[i] == SUSPECT_STATE_FAULTY)
752 			got_faulty_overlap = 1;
753 	if (got_faulty_overlap && fmd_case_match_on_faulty_overlap)
754 		goto got_match;
755 
756 	/*
757 	 * If we have no faulty, non-isolated suspects in the old case, but we
758 	 * do have some acquitted suspects that overlap, then most likely it is
759 	 * the acquitted suspects that overlap in the suspect lists that are
760 	 * to blame. So we can consider this to be a match.
761 	 */
762 	for (i = 0; i < new_cip->ci_nsuspects; i++)
763 		if (new_susp_state[i] == SUSPECT_STATE_ACQUITED)
764 			got_acquit_overlap = 1;
765 	for (i = 0; i < old_cip->ci_nsuspects; i++)
766 		if (old_susp_state[i] == SUSPECT_STATE_FAULTY)
767 			got_acquit_overlap = 0;
768 	if (got_acquit_overlap && fmd_case_match_on_acquit_overlap)
769 		goto got_match;
770 
771 	/*
772 	 * Check that all suspects in the new list are present in the old list.
773 	 * Return if we find one that isn't.
774 	 */
775 	for (i = 0; i < new_cip->ci_nsuspects; i++)
776 		if (new_susp_state[i] == 0)
777 			return;
778 
779 	/*
780 	 * Check that all suspects in the old list are present in the new list
781 	 * *or* they are isolated or removed/replaced (which would explain why
782 	 * they are not present in the new list). Return if we find one that is
783 	 * faulty and unisolated or repaired or acquitted, and that is not
784 	 * present in the new case.
785 	 */
786 	for (i = 0; i < old_cip->ci_nsuspects; i++)
787 		if (old_match_state[i] == SUSPECT_STATE_NO_MATCH &&
788 		    (old_susp_state[i] == SUSPECT_STATE_FAULTY ||
789 		    old_susp_state[i] == SUSPECT_STATE_ACQUITED ||
790 		    old_susp_state[i] == SUSPECT_STATE_REPAIRED))
791 			return;
792 
793 got_match:
794 	/*
795 	 * If the old case is already in repaired/resolved state, we can't
796 	 * do anything more with it, so keep the new case, but acquit some
797 	 * of the suspects if appropriate.
798 	 */
799 	if (old_cip->ci_state >= FMD_CASE_REPAIRED) {
800 		if (fmd_case_auto_acquit_non_acquitted) {
801 			*fccdp->fccd_adjust_new = 1;
802 			for (i = 0; i < new_cip->ci_nsuspects; i++) {
803 				fccdp->fccd_new_susp_state[i] |=
804 				    new_susp_state[i];
805 				if (new_susp_state[i] == 0)
806 					fccdp->fccd_new_susp_state[i] =
807 					    SUSPECT_STATE_NO_MATCH;
808 			}
809 		}
810 		return;
811 	}
812 
813 	/*
814 	 * Otherwise discard the new case and keep the old, again updating the
815 	 * state of the suspects as appropriate
816 	 */
817 	*fccdp->fccd_discard_new = 1;
818 	fca.fca_cip = new_cip;
819 	fca.fca_do_update = &do_update;
820 
821 	/*
822 	 * See if new case occurred within fmd_case_too_recent seconds of the
823 	 * most recent modification to the old case and if so don't do
824 	 * auto-acquit. This avoids problems if a flood of ereports come in and
825 	 * they don't all get diagnosed before the first case causes some of
826 	 * the devices to be isolated making it appear that an isolated device
827 	 * was in the suspect list.
828 	 */
829 	fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
830 	    fmd_asru_most_recent, &most_recent);
831 	too_recent = (new_cip->ci_tv.tv_sec - most_recent <
832 	    fmd_case_too_recent);
833 
834 	if (got_faulty_overlap) {
835 		/*
836 		 * Acquit any suspects not present in the new list, plus
837 		 * any that are are present but are isolated.
838 		 */
839 		fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
840 		    fmd_case_acquit_no_match, &fca);
841 		if (fmd_case_auto_acquit_isolated && !too_recent)
842 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
843 			    fmd_case_acquit_isolated, &do_update);
844 	} else if (got_acquit_overlap) {
845 		/*
846 		 * Re-fault the acquitted matching suspects and acquit all
847 		 * isolated suspects.
848 		 */
849 		if (fmd_case_auto_acquit_isolated && !too_recent) {
850 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
851 			    fmd_case_fault_acquitted_matching, &fca);
852 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
853 			    fmd_case_acquit_isolated, &do_update);
854 		}
855 	} else if (fmd_case_auto_acquit_isolated) {
856 		/*
857 		 * To get here, there must be no faulty or acquitted suspects,
858 		 * but there must be at least one isolated suspect. Just acquit
859 		 * non-matching isolated suspects. If there are no matching
860 		 * isolated suspects, then re-fault all matching suspects.
861 		 */
862 		for (i = 0; i < new_cip->ci_nsuspects; i++)
863 			if (new_susp_state[i] == SUSPECT_STATE_ISOLATED)
864 				got_isolated_overlap = 1;
865 		if (!got_isolated_overlap)
866 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
867 			    fmd_case_fault_all_matching, &fca);
868 		fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
869 		    fmd_case_acquit_no_match, &fca);
870 	}
871 
872 	/*
873 	 * If we've updated anything in the old case, call fmd_case_update()
874 	 */
875 	if (do_update)
876 		fmd_case_update(old_cp);
877 }
878 
879 /*
880  * Convict suspects in a case by applying a conviction policy and updating the
881  * resource cache prior to emitting the list.suspect event for the given case.
882  * At present, our policy is very simple: convict every suspect in the case.
883  * In the future, this policy can be extended and made configurable to permit:
884  *
885  * - convicting the suspect with the highest FIT rate
886  * - convicting the suspect with the cheapest FRU
887  * - convicting the suspect with the FRU that is in a depot's inventory
888  * - convicting the suspect with the longest lifetime
889  *
890  * and so forth.  A word to the wise: this problem is significantly harder that
891  * it seems at first glance.  Future work should heed the following advice:
892  *
893  * Hacking the policy into C code here is a very bad idea.  The policy needs to
894  * be decided upon very carefully and fundamentally encodes knowledge of what
895  * suspect list combinations can be emitted by what diagnosis engines.  As such
896  * fmd's code is the wrong location, because that would require fmd itself to
897  * be updated for every diagnosis engine change, defeating the entire design.
898  * The FMA Event Registry knows the suspect list combinations: policy inputs
899  * can be derived from it and used to produce per-module policy configuration.
900  *
901  * If the policy needs to be dynamic and not statically fixed at either fmd
902  * startup or module load time, any implementation of dynamic policy retrieval
903  * must employ some kind of caching mechanism or be part of a built-in module.
904  * The fmd_case_convict() function is called with locks held inside of fmd and
905  * is not a place where unbounded blocking on some inter-process or inter-
906  * system communication to another service (e.g. another daemon) can occur.
907  */
908 static int
fmd_case_convict(fmd_case_t * cp)909 fmd_case_convict(fmd_case_t *cp)
910 {
911 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
912 	fmd_asru_hash_t *ahp = fmd.d_asrus;
913 	int discard_new = 0, i;
914 	fmd_case_susp_t *cis;
915 	fmd_asru_link_t *alp;
916 	uint8_t *new_susp_state;
917 	uint8_t *new_match_state;
918 	int adjust_new = 0;
919 	fccd_t fccd;
920 	fmd_case_impl_t *ncp, **cps, **cpp;
921 	uint_t cpc;
922 	fmd_case_hash_t *chp;
923 
924 	/*
925 	 * First we must see if any matching cases already exist.
926 	 */
927 	new_susp_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
928 	for (i = 0; i < cip->ci_nsuspects; i++)
929 		new_susp_state[i] = 0;
930 	new_match_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
931 	for (i = 0; i < cip->ci_nsuspects; i++)
932 		new_match_state[i] = 0;
933 	fccd.fccd_cip = cip;
934 	fccd.fccd_adjust_new = &adjust_new;
935 	fccd.fccd_new_susp_state = new_susp_state;
936 	fccd.fccd_new_match_state = new_match_state;
937 	fccd.fccd_discard_new = &discard_new;
938 
939 	/*
940 	 * Hold all cases
941 	 */
942 	chp = fmd.d_cases;
943 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
944 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
945 	cpc = chp->ch_count;
946 	for (i = 0; i < chp->ch_hashlen; i++)
947 		for (ncp = chp->ch_hash[i]; ncp != NULL; ncp = ncp->ci_next)
948 			*cpp++ = fmd_case_tryhold(ncp);
949 	ASSERT(cpp == cps + cpc);
950 	(void) pthread_rwlock_unlock(&chp->ch_lock);
951 
952 	/*
953 	 * Run fmd_case_check_for_dups() on all cases except the current one.
954 	 */
955 	for (i = 0; i < cpc; i++) {
956 		if (cps[i] != NULL) {
957 			if (cps[i] != (fmd_case_impl_t *)cp)
958 				fmd_case_check_for_dups((fmd_case_t *)cps[i],
959 				    &fccd);
960 			fmd_case_rele((fmd_case_t *)cps[i]);
961 		}
962 	}
963 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
964 
965 	(void) pthread_mutex_lock(&cip->ci_lock);
966 	if (cip->ci_code == NULL)
967 		(void) fmd_case_mkcode(cp);
968 	else if (cip->ci_precanned)
969 		fmd_case_code_hash_insert(fmd.d_cases, cip);
970 
971 	if (discard_new) {
972 		/*
973 		 * We've found an existing case that is a match and it is not
974 		 * already in repaired or resolved state. So we can close this
975 		 * one as a duplicate.
976 		 */
977 		(void) pthread_mutex_unlock(&cip->ci_lock);
978 		return (1);
979 	}
980 
981 	/*
982 	 * Allocate new cache entries
983 	 */
984 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
985 		if ((alp = fmd_asru_hash_create_entry(ahp,
986 		    cp, cis->cis_nvl)) == NULL) {
987 			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
988 			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
989 			continue;
990 		}
991 		alp->al_flags |= FMD_ASRU_PRESENT;
992 		alp->al_asru->asru_flags |= FMD_ASRU_PRESENT;
993 		(void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0);
994 		(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
995 	}
996 
997 	if (adjust_new) {
998 		int some_suspect = 0, some_not_suspect = 0;
999 
1000 		/*
1001 		 * There is one or more matching case but they are already in
1002 		 * repaired or resolved state. So we need to keep the new
1003 		 * case, but we can adjust it. Repaired/removed/replaced
1004 		 * suspects are unlikely to be to blame (unless there are
1005 		 * actually two separate faults). So if we have a combination of
1006 		 * repaired/replaced/removed suspects and acquitted suspects in
1007 		 * the old lists, then we should acquit in the new list those
1008 		 * that were repaired/replaced/removed in the old.
1009 		 */
1010 		for (i = 0; i < cip->ci_nsuspects; i++) {
1011 			if ((new_susp_state[i] & SUSPECT_STATE_REPLACED) ||
1012 			    (new_susp_state[i] & SUSPECT_STATE_REPAIRED) ||
1013 			    (new_susp_state[i] & SUSPECT_STATE_REMOVED) ||
1014 			    (new_match_state[i] & SUSPECT_STATE_NO_MATCH))
1015 				some_not_suspect = 1;
1016 			else
1017 				some_suspect = 1;
1018 		}
1019 		if (some_suspect && some_not_suspect) {
1020 			for (cis = cip->ci_suspects, i = 0; cis != NULL;
1021 			    cis = cis->cis_next, i++)
1022 				if ((new_susp_state[i] &
1023 				    SUSPECT_STATE_REPLACED) ||
1024 				    (new_susp_state[i] &
1025 				    SUSPECT_STATE_REPAIRED) ||
1026 				    (new_susp_state[i] &
1027 				    SUSPECT_STATE_REMOVED) ||
1028 				    (new_match_state[i] &
1029 				    SUSPECT_STATE_NO_MATCH))
1030 					fmd_asru_hash_apply_by_case(fmd.d_asrus,
1031 					    cp, fmd_case_acquit_suspect,
1032 					    cis->cis_nvl);
1033 		}
1034 	}
1035 
1036 	(void) pthread_mutex_unlock(&cip->ci_lock);
1037 	return (0);
1038 }
1039 
1040 void
fmd_case_publish(fmd_case_t * cp,uint_t state)1041 fmd_case_publish(fmd_case_t *cp, uint_t state)
1042 {
1043 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1044 	fmd_event_t *e;
1045 	nvlist_t *nvl;
1046 	char *class;
1047 
1048 	if (state == FMD_CASE_CURRENT)
1049 		state = cip->ci_state; /* use current state */
1050 
1051 	switch (state) {
1052 	case FMD_CASE_SOLVED:
1053 		(void) pthread_mutex_lock(&cip->ci_lock);
1054 
1055 		/*
1056 		 * If we already have a code, then case is already solved.
1057 		 */
1058 		if (cip->ci_precanned == 0 && cip->ci_xprt == NULL &&
1059 		    cip->ci_code != NULL) {
1060 			(void) pthread_mutex_unlock(&cip->ci_lock);
1061 			break;
1062 		}
1063 
1064 		if (cip->ci_tv_valid == 0) {
1065 			fmd_time_gettimeofday(&cip->ci_tv);
1066 			cip->ci_tv_valid = 1;
1067 		}
1068 		(void) pthread_mutex_unlock(&cip->ci_lock);
1069 
1070 		if (fmd_case_convict(cp) == 1) { /* dupclose */
1071 			cip->ci_flags &= ~FMD_CF_SOLVED;
1072 			fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0);
1073 			break;
1074 		}
1075 		if (cip->ci_xprt != NULL) {
1076 			/*
1077 			 * For proxy, save some information about the transport
1078 			 * in the resource cache.
1079 			 */
1080 			int count = 0;
1081 			fmd_asru_set_on_proxy_t fasp;
1082 			fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)cip->ci_xprt;
1083 
1084 			fasp.fasp_countp = &count;
1085 			fasp.fasp_maxcount = cip->ci_nsuspects;
1086 			fasp.fasp_proxy_asru = cip->ci_proxy_asru;
1087 			fasp.fasp_proxy_external = xip->xi_flags &
1088 			    FMD_XPRT_EXTERNAL;
1089 			fasp.fasp_proxy_rdonly = ((xip->xi_flags &
1090 			    FMD_XPRT_RDWR) == FMD_XPRT_RDONLY);
1091 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1092 			    fmd_asru_set_on_proxy, &fasp);
1093 		}
1094 		nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
1095 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1096 
1097 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1098 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1099 		fmd_log_append(fmd.d_fltlog, e, cp);
1100 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
1101 		fmd_dispq_dispatch(fmd.d_disp, e, class);
1102 
1103 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1104 		cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
1105 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1106 
1107 		break;
1108 
1109 	case FMD_CASE_CLOSE_WAIT:
1110 		fmd_case_hold(cp);
1111 		e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
1112 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
1113 
1114 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1115 		cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
1116 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1117 
1118 		break;
1119 
1120 	case FMD_CASE_CLOSED:
1121 		nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
1122 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1123 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1124 		fmd_dispq_dispatch(fmd.d_disp, e, class);
1125 		break;
1126 
1127 	case FMD_CASE_REPAIRED:
1128 		nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
1129 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1130 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1131 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1132 		fmd_log_append(fmd.d_fltlog, e, cp);
1133 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
1134 		fmd_dispq_dispatch(fmd.d_disp, e, class);
1135 		break;
1136 
1137 	case FMD_CASE_RESOLVED:
1138 		nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS);
1139 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1140 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1141 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1142 		fmd_log_append(fmd.d_fltlog, e, cp);
1143 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
1144 		fmd_dispq_dispatch(fmd.d_disp, e, class);
1145 		break;
1146 	}
1147 }
1148 
1149 fmd_case_t *
fmd_case_hash_lookup(fmd_case_hash_t * chp,const char * uuid)1150 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
1151 {
1152 	fmd_case_impl_t *cip;
1153 	uint_t h;
1154 
1155 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
1156 	h = fmd_strhash(uuid) % chp->ch_hashlen;
1157 
1158 	for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
1159 		if (strcmp(cip->ci_uuid, uuid) == 0)
1160 			break;
1161 	}
1162 
1163 	/*
1164 	 * If deleting bit is set, treat the case as if it doesn't exist.
1165 	 */
1166 	if (cip != NULL)
1167 		cip = fmd_case_tryhold(cip);
1168 
1169 	if (cip == NULL)
1170 		(void) fmd_set_errno(EFMD_CASE_INVAL);
1171 
1172 	(void) pthread_rwlock_unlock(&chp->ch_lock);
1173 	return ((fmd_case_t *)cip);
1174 }
1175 
1176 static fmd_case_impl_t *
fmd_case_hash_insert(fmd_case_hash_t * chp,fmd_case_impl_t * cip)1177 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
1178 {
1179 	fmd_case_impl_t *eip;
1180 	uint_t h;
1181 
1182 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
1183 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
1184 
1185 	for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
1186 		if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 &&
1187 		    fmd_case_tryhold(eip) != NULL) {
1188 			(void) pthread_rwlock_unlock(&chp->ch_lock);
1189 			return (eip); /* uuid already present */
1190 		}
1191 	}
1192 
1193 	cip->ci_next = chp->ch_hash[h];
1194 	chp->ch_hash[h] = cip;
1195 
1196 	chp->ch_count++;
1197 	ASSERT(chp->ch_count != 0);
1198 
1199 	(void) pthread_rwlock_unlock(&chp->ch_lock);
1200 	return (cip);
1201 }
1202 
1203 static void
fmd_case_hash_delete(fmd_case_hash_t * chp,fmd_case_impl_t * cip)1204 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
1205 {
1206 	fmd_case_impl_t *cp, **pp;
1207 	uint_t h;
1208 
1209 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1210 
1211 	cip->ci_flags |= FMD_CF_DELETING;
1212 	(void) pthread_mutex_unlock(&cip->ci_lock);
1213 
1214 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
1215 
1216 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
1217 	pp = &chp->ch_hash[h];
1218 
1219 	for (cp = *pp; cp != NULL; cp = cp->ci_next) {
1220 		if (cp != cip)
1221 			pp = &cp->ci_next;
1222 		else
1223 			break;
1224 	}
1225 
1226 	if (cp == NULL) {
1227 		fmd_panic("case %p (%s) not found on hash chain %u\n",
1228 		    (void *)cip, cip->ci_uuid, h);
1229 	}
1230 
1231 	*pp = cp->ci_next;
1232 	cp->ci_next = NULL;
1233 
1234 	/*
1235 	 * delete from code hash if it is on it
1236 	 */
1237 	fmd_case_code_hash_delete(chp, cip);
1238 
1239 	ASSERT(chp->ch_count != 0);
1240 	chp->ch_count--;
1241 
1242 	(void) pthread_rwlock_unlock(&chp->ch_lock);
1243 
1244 	(void) pthread_mutex_lock(&cip->ci_lock);
1245 	ASSERT(cip->ci_flags & FMD_CF_DELETING);
1246 }
1247 
1248 fmd_case_t *
fmd_case_create(fmd_module_t * mp,const char * uuidstr,void * data)1249 fmd_case_create(fmd_module_t *mp, const char *uuidstr, void *data)
1250 {
1251 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
1252 	fmd_case_impl_t *eip = NULL;
1253 	uuid_t uuid;
1254 
1255 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
1256 	fmd_buf_hash_create(&cip->ci_bufs);
1257 
1258 	fmd_module_hold(mp);
1259 	cip->ci_mod = mp;
1260 	cip->ci_refs = 1;
1261 	cip->ci_state = FMD_CASE_UNSOLVED;
1262 	cip->ci_flags = FMD_CF_DIRTY;
1263 	cip->ci_data = data;
1264 
1265 	/*
1266 	 * Calling libuuid: get a clue.  The library interfaces cleverly do not
1267 	 * define any constant for the length of an unparse string, and do not
1268 	 * permit the caller to specify a buffer length for safety.  The spec
1269 	 * says it will be 36 bytes, but we make it tunable just in case.
1270 	 */
1271 	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
1272 	cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);
1273 
1274 	if (uuidstr == NULL) {
1275 		/*
1276 		 * We expect this loop to execute only once, but code it
1277 		 * defensively against the possibility of libuuid bugs.
1278 		 * Keep generating uuids and attempting to do a hash insert
1279 		 * until we get a unique one.
1280 		 */
1281 		do {
1282 			if (eip != NULL)
1283 				fmd_case_rele((fmd_case_t *)eip);
1284 			uuid_generate(uuid);
1285 			uuid_unparse(uuid, cip->ci_uuid);
1286 		} while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);
1287 	} else {
1288 		/*
1289 		 * If a uuid was specified we must succeed with that uuid,
1290 		 * or return NULL indicating a case with that uuid already
1291 		 * exists.
1292 		 */
1293 		(void) strncpy(cip->ci_uuid, uuidstr, cip->ci_uuidlen + 1);
1294 		if (fmd_case_hash_insert(fmd.d_cases, cip) != cip) {
1295 			fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
1296 			(void) fmd_buf_hash_destroy(&cip->ci_bufs);
1297 			fmd_module_rele(mp);
1298 			pthread_mutex_destroy(&cip->ci_lock);
1299 			fmd_free(cip, sizeof (*cip));
1300 			return (NULL);
1301 		}
1302 	}
1303 
1304 	ASSERT(fmd_module_locked(mp));
1305 	fmd_list_append(&mp->mod_cases, cip);
1306 	fmd_module_setcdirty(mp);
1307 
1308 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1309 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
1310 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1311 
1312 	return ((fmd_case_t *)cip);
1313 }
1314 
1315 static void
fmd_case_destroy_suspects(fmd_case_impl_t * cip)1316 fmd_case_destroy_suspects(fmd_case_impl_t *cip)
1317 {
1318 	fmd_case_susp_t *cis, *ncis;
1319 
1320 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1321 
1322 	if (cip->ci_proxy_asru)
1323 		fmd_free(cip->ci_proxy_asru, sizeof (uint8_t) *
1324 		    cip->ci_nsuspects);
1325 	nvlist_free(cip->ci_diag_de);
1326 	if (cip->ci_diag_asru)
1327 		fmd_free(cip->ci_diag_asru, sizeof (uint8_t) *
1328 		    cip->ci_nsuspects);
1329 
1330 	for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
1331 		ncis = cis->cis_next;
1332 		nvlist_free(cis->cis_nvl);
1333 		fmd_free(cis, sizeof (fmd_case_susp_t));
1334 	}
1335 
1336 	cip->ci_suspects = NULL;
1337 	cip->ci_nsuspects = 0;
1338 }
1339 
1340 fmd_case_t *
fmd_case_recreate(fmd_module_t * mp,fmd_xprt_t * xp,uint_t state,const char * uuid,const char * code)1341 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
1342     uint_t state, const char *uuid, const char *code)
1343 {
1344 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
1345 	fmd_case_impl_t *eip;
1346 
1347 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
1348 	fmd_buf_hash_create(&cip->ci_bufs);
1349 
1350 	fmd_module_hold(mp);
1351 	cip->ci_mod = mp;
1352 	cip->ci_xprt = xp;
1353 	cip->ci_refs = 1;
1354 	cip->ci_state = state;
1355 	cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
1356 	cip->ci_uuidlen = strlen(cip->ci_uuid);
1357 	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
1358 	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;
1359 
1360 	if (state > FMD_CASE_CLOSE_WAIT)
1361 		cip->ci_flags |= FMD_CF_SOLVED;
1362 
1363 	/*
1364 	 * Insert the case into the global case hash.  If the specified UUID is
1365 	 * already present, check to see if it is an orphan: if so, reclaim it;
1366 	 * otherwise if it is owned by a different module then return NULL.
1367 	 */
1368 	if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
1369 		(void) pthread_mutex_lock(&cip->ci_lock);
1370 		cip->ci_refs--; /* decrement to zero */
1371 		fmd_case_destroy((fmd_case_t *)cip, B_FALSE);
1372 
1373 		cip = eip; /* switch 'cip' to the existing case */
1374 		(void) pthread_mutex_lock(&cip->ci_lock);
1375 
1376 		/*
1377 		 * If the ASRU cache is trying to recreate an orphan, then just
1378 		 * return the existing case that we found without changing it.
1379 		 */
1380 		if (mp == fmd.d_rmod) {
1381 			/*
1382 			 * In case the case has already been created from
1383 			 * a checkpoint file we need to set up code now.
1384 			 */
1385 			if (cip->ci_state < FMD_CASE_CLOSED) {
1386 				if (code != NULL && cip->ci_code == NULL) {
1387 					cip->ci_code = fmd_strdup(code,
1388 					    FMD_SLEEP);
1389 					cip->ci_codelen = cip->ci_code ?
1390 					    strlen(cip->ci_code) + 1 : 0;
1391 					fmd_case_code_hash_insert(fmd.d_cases,
1392 					    cip);
1393 				}
1394 			}
1395 
1396 			/*
1397 			 * When recreating an orphan case, state passed in may
1398 			 * be CLOSED (faulty) or REPAIRED/RESOLVED (!faulty). If
1399 			 * any suspects are still CLOSED (faulty) then the
1400 			 * overall state needs to be CLOSED.
1401 			 */
1402 			if ((cip->ci_state == FMD_CASE_REPAIRED ||
1403 			    cip->ci_state == FMD_CASE_RESOLVED) &&
1404 			    state == FMD_CASE_CLOSED)
1405 				cip->ci_state = FMD_CASE_CLOSED;
1406 			(void) pthread_mutex_unlock(&cip->ci_lock);
1407 			fmd_case_rele((fmd_case_t *)cip);
1408 			return ((fmd_case_t *)cip);
1409 		}
1410 
1411 		/*
1412 		 * If the existing case isn't an orphan or is being proxied,
1413 		 * then we have a UUID conflict: return failure to the caller.
1414 		 */
1415 		if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
1416 			(void) pthread_mutex_unlock(&cip->ci_lock);
1417 			fmd_case_rele((fmd_case_t *)cip);
1418 			return (NULL);
1419 		}
1420 
1421 		/*
1422 		 * If the new module is reclaiming an orphaned case, remove
1423 		 * the case from the root module, switch ci_mod, and then fall
1424 		 * through to adding the case to the new owner module 'mp'.
1425 		 */
1426 		fmd_module_lock(cip->ci_mod);
1427 		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1428 		fmd_module_unlock(cip->ci_mod);
1429 
1430 		fmd_module_rele(cip->ci_mod);
1431 		cip->ci_mod = mp;
1432 		fmd_module_hold(mp);
1433 
1434 		/*
1435 		 * It's possible that fmd crashed or was restarted during a
1436 		 * previous solve operation between the asru cache being created
1437 		 * and the ckpt file being updated to SOLVED. Thus when the DE
1438 		 * recreates the case here from the checkpoint file, the state
1439 		 * will be UNSOLVED and yet we are having to reclaim because
1440 		 * the case was in the asru cache. If this happens, revert the
1441 		 * case back to the UNSOLVED state and let the DE solve it again
1442 		 */
1443 		if (state == FMD_CASE_UNSOLVED) {
1444 			fmd_asru_hash_delete_case(fmd.d_asrus,
1445 			    (fmd_case_t *)cip);
1446 			fmd_case_destroy_suspects(cip);
1447 			fmd_case_code_hash_delete(fmd.d_cases, cip);
1448 			fmd_free(cip->ci_code, cip->ci_codelen);
1449 			cip->ci_code = NULL;
1450 			cip->ci_codelen = 0;
1451 			cip->ci_tv_valid = 0;
1452 		}
1453 
1454 		cip->ci_state = state;
1455 
1456 		(void) pthread_mutex_unlock(&cip->ci_lock);
1457 		fmd_case_rele((fmd_case_t *)cip);
1458 	} else {
1459 		/*
1460 		 * add into hash of solved cases
1461 		 */
1462 		if (cip->ci_code)
1463 			fmd_case_code_hash_insert(fmd.d_cases, cip);
1464 	}
1465 
1466 	ASSERT(fmd_module_locked(mp));
1467 	fmd_list_append(&mp->mod_cases, cip);
1468 
1469 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1470 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
1471 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1472 
1473 	return ((fmd_case_t *)cip);
1474 }
1475 
1476 void
fmd_case_destroy(fmd_case_t * cp,int visible)1477 fmd_case_destroy(fmd_case_t *cp, int visible)
1478 {
1479 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1480 	fmd_case_item_t *cit, *ncit;
1481 
1482 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1483 	ASSERT(cip->ci_refs == 0);
1484 
1485 	if (visible) {
1486 		TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
1487 		fmd_case_hash_delete(fmd.d_cases, cip);
1488 	}
1489 
1490 	for (cit = cip->ci_items; cit != NULL; cit = ncit) {
1491 		ncit = cit->cit_next;
1492 		fmd_event_rele(cit->cit_event);
1493 		fmd_free(cit, sizeof (fmd_case_item_t));
1494 	}
1495 
1496 	fmd_case_destroy_suspects(cip);
1497 
1498 	if (cip->ci_principal != NULL)
1499 		fmd_event_rele(cip->ci_principal);
1500 
1501 	fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
1502 	fmd_free(cip->ci_code, cip->ci_codelen);
1503 	(void) fmd_buf_hash_destroy(&cip->ci_bufs);
1504 
1505 	fmd_module_rele(cip->ci_mod);
1506 	fmd_free(cip, sizeof (fmd_case_impl_t));
1507 }
1508 
1509 void
fmd_case_hold(fmd_case_t * cp)1510 fmd_case_hold(fmd_case_t *cp)
1511 {
1512 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1513 
1514 	(void) pthread_mutex_lock(&cip->ci_lock);
1515 	fmd_case_hold_locked(cp);
1516 	(void) pthread_mutex_unlock(&cip->ci_lock);
1517 }
1518 
1519 void
fmd_case_hold_locked(fmd_case_t * cp)1520 fmd_case_hold_locked(fmd_case_t *cp)
1521 {
1522 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1523 
1524 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1525 	if (cip->ci_flags & FMD_CF_DELETING)
1526 		fmd_panic("attempt to hold a deleting case %p (%s)\n",
1527 		    (void *)cip, cip->ci_uuid);
1528 	cip->ci_refs++;
1529 	ASSERT(cip->ci_refs != 0);
1530 }
1531 
1532 static fmd_case_impl_t *
fmd_case_tryhold(fmd_case_impl_t * cip)1533 fmd_case_tryhold(fmd_case_impl_t *cip)
1534 {
1535 	/*
1536 	 * If the case's "deleting" bit is unset, hold and return case,
1537 	 * otherwise, return NULL.
1538 	 */
1539 	(void) pthread_mutex_lock(&cip->ci_lock);
1540 	if (cip->ci_flags & FMD_CF_DELETING) {
1541 		(void) pthread_mutex_unlock(&cip->ci_lock);
1542 		cip = NULL;
1543 	} else {
1544 		fmd_case_hold_locked((fmd_case_t *)cip);
1545 		(void) pthread_mutex_unlock(&cip->ci_lock);
1546 	}
1547 	return (cip);
1548 }
1549 
1550 void
fmd_case_rele(fmd_case_t * cp)1551 fmd_case_rele(fmd_case_t *cp)
1552 {
1553 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1554 
1555 	(void) pthread_mutex_lock(&cip->ci_lock);
1556 	ASSERT(cip->ci_refs != 0);
1557 
1558 	if (--cip->ci_refs == 0)
1559 		fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
1560 	else
1561 		(void) pthread_mutex_unlock(&cip->ci_lock);
1562 }
1563 
1564 void
fmd_case_rele_locked(fmd_case_t * cp)1565 fmd_case_rele_locked(fmd_case_t *cp)
1566 {
1567 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1568 
1569 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1570 	--cip->ci_refs;
1571 	ASSERT(cip->ci_refs != 0);
1572 }
1573 
1574 int
fmd_case_insert_principal(void * cp,fmd_event_t * ep)1575 fmd_case_insert_principal(void *cp, fmd_event_t *ep)
1576 {
1577 	fmd_case_impl_t *cip = cp;
1578 	fmd_case_item_t *cit;
1579 	fmd_event_t *oep;
1580 	uint_t state;
1581 	int new;
1582 
1583 	fmd_event_hold(ep);
1584 	(void) pthread_mutex_lock(&cip->ci_lock);
1585 
1586 	if (cip->ci_flags & FMD_CF_SOLVED)
1587 		state = FMD_EVS_DIAGNOSED;
1588 	else
1589 		state = FMD_EVS_ACCEPTED;
1590 
1591 	oep = cip->ci_principal;
1592 	cip->ci_principal = ep;
1593 
1594 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1595 		if (cit->cit_event == ep)
1596 			break;
1597 	}
1598 
1599 	cip->ci_flags |= FMD_CF_DIRTY;
1600 	new = cit == NULL && ep != oep;
1601 
1602 	(void) pthread_mutex_unlock(&cip->ci_lock);
1603 
1604 	fmd_module_setcdirty(cip->ci_mod);
1605 	fmd_event_transition(ep, state);
1606 
1607 	if (oep != NULL)
1608 		fmd_event_rele(oep);
1609 
1610 	return (new);
1611 }
1612 
1613 int
fmd_case_insert_event(void * cp,fmd_event_t * ep)1614 fmd_case_insert_event(void *cp, fmd_event_t *ep)
1615 {
1616 	fmd_case_impl_t *cip = cp;
1617 	fmd_case_item_t *cit;
1618 	uint_t state;
1619 	int new;
1620 	boolean_t injected;
1621 
1622 	(void) pthread_mutex_lock(&cip->ci_lock);
1623 
1624 	if (cip->ci_flags & FMD_CF_SOLVED)
1625 		state = FMD_EVS_DIAGNOSED;
1626 	else
1627 		state = FMD_EVS_ACCEPTED;
1628 
1629 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1630 		if (cit->cit_event == ep)
1631 			break;
1632 	}
1633 
1634 	new = cit == NULL && ep != cip->ci_principal;
1635 
1636 	/*
1637 	 * If the event is already in the case or the case is already solved,
1638 	 * there is no reason to save it: just transition it appropriately.
1639 	 */
1640 	if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
1641 		(void) pthread_mutex_unlock(&cip->ci_lock);
1642 		fmd_event_transition(ep, state);
1643 		return (new);
1644 	}
1645 
1646 	cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
1647 	fmd_event_hold(ep);
1648 
1649 	if (nvlist_lookup_boolean_value(((fmd_event_impl_t *)ep)->ev_nvl,
1650 	    "__injected", &injected) == 0 && injected)
1651 		fmd_case_set_injected(cp);
1652 
1653 	cit->cit_next = cip->ci_items;
1654 	cit->cit_event = ep;
1655 
1656 	cip->ci_items = cit;
1657 	cip->ci_nitems++;
1658 
1659 	cip->ci_flags |= FMD_CF_DIRTY;
1660 	(void) pthread_mutex_unlock(&cip->ci_lock);
1661 
1662 	fmd_module_setcdirty(cip->ci_mod);
1663 	fmd_event_transition(ep, state);
1664 
1665 	return (new);
1666 }
1667 
1668 void
fmd_case_insert_suspect(fmd_case_t * cp,nvlist_t * nvl)1669 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
1670 {
1671 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1672 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1673 
1674 	(void) pthread_mutex_lock(&cip->ci_lock);
1675 	ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT);
1676 	cip->ci_flags |= FMD_CF_DIRTY;
1677 
1678 	cis->cis_next = cip->ci_suspects;
1679 	cis->cis_nvl = nvl;
1680 
1681 	cip->ci_suspects = cis;
1682 	cip->ci_nsuspects++;
1683 
1684 	(void) pthread_mutex_unlock(&cip->ci_lock);
1685 	if (cip->ci_xprt == NULL)
1686 		fmd_module_setcdirty(cip->ci_mod);
1687 }
1688 
1689 void
fmd_case_recreate_suspect(fmd_case_t * cp,nvlist_t * nvl)1690 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
1691 {
1692 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1693 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1694 	boolean_t b;
1695 
1696 	(void) pthread_mutex_lock(&cip->ci_lock);
1697 
1698 	cis->cis_next = cip->ci_suspects;
1699 	cis->cis_nvl = nvl;
1700 
1701 	if (nvlist_lookup_boolean_value(nvl,
1702 	    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
1703 		cip->ci_flags |= FMD_CF_INVISIBLE;
1704 
1705 	cip->ci_suspects = cis;
1706 	cip->ci_nsuspects++;
1707 
1708 	(void) pthread_mutex_unlock(&cip->ci_lock);
1709 }
1710 
1711 void
fmd_case_reset_suspects(fmd_case_t * cp)1712 fmd_case_reset_suspects(fmd_case_t *cp)
1713 {
1714 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1715 
1716 	(void) pthread_mutex_lock(&cip->ci_lock);
1717 	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
1718 
1719 	fmd_case_destroy_suspects(cip);
1720 	cip->ci_flags |= FMD_CF_DIRTY;
1721 
1722 	(void) pthread_mutex_unlock(&cip->ci_lock);
1723 	fmd_module_setcdirty(cip->ci_mod);
1724 }
1725 
1726 /*ARGSUSED*/
1727 static void
fmd_case_unusable(fmd_asru_link_t * alp,void * arg)1728 fmd_case_unusable(fmd_asru_link_t *alp, void *arg)
1729 {
1730 	(void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE);
1731 }
1732 
1733 /*
1734  * Grab ci_lock and update the case state and set the dirty bit.  Then perform
1735  * whatever actions and emit whatever events are appropriate for the state.
1736  * Refer to the topmost block comment explaining the state machine for details.
1737  */
1738 void
fmd_case_transition(fmd_case_t * cp,uint_t state,uint_t flags)1739 fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
1740 {
1741 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1742 	fmd_case_item_t *cit;
1743 	fmd_event_t *e;
1744 	int resolved = 0;
1745 	int any_unusable_and_present = 0;
1746 
1747 	ASSERT(state <= FMD_CASE_RESOLVED);
1748 	(void) pthread_mutex_lock(&cip->ci_lock);
1749 
1750 	if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED))
1751 		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED | FMD_CF_RESOLVED);
1752 
1753 	cip->ci_flags |= flags;
1754 
1755 	if (cip->ci_state >= state) {
1756 		(void) pthread_mutex_unlock(&cip->ci_lock);
1757 		return; /* already in specified state */
1758 	}
1759 
1760 	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
1761 	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));
1762 
1763 	cip->ci_state = state;
1764 	cip->ci_flags |= FMD_CF_DIRTY;
1765 
1766 	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
1767 		fmd_module_setcdirty(cip->ci_mod);
1768 
1769 	switch (state) {
1770 	case FMD_CASE_SOLVED:
1771 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
1772 			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);
1773 
1774 		if (cip->ci_principal != NULL) {
1775 			fmd_event_transition(cip->ci_principal,
1776 			    FMD_EVS_DIAGNOSED);
1777 		}
1778 		break;
1779 
1780 	case FMD_CASE_CLOSE_WAIT:
1781 		/*
1782 		 * If the case was never solved, do not change ASRUs.
1783 		 * If the case was never fmd_case_closed, do not change ASRUs.
1784 		 * If the case was repaired, do not change ASRUs.
1785 		 */
1786 		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
1787 		    FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED))
1788 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1789 			    fmd_case_unusable, NULL);
1790 
1791 		/*
1792 		 * If an orphaned case transitions to CLOSE_WAIT, the owning
1793 		 * module is no longer loaded: continue on to CASE_CLOSED or
1794 		 * CASE_REPAIRED as appropriate.
1795 		 */
1796 		if (fmd_case_orphaned(cp)) {
1797 			if (cip->ci_flags & FMD_CF_REPAIRED) {
1798 				state = cip->ci_state = FMD_CASE_REPAIRED;
1799 				TRACE((FMD_DBG_CASE, "case %s %s->%s",
1800 				    cip->ci_uuid,
1801 				    _fmd_case_snames[FMD_CASE_CLOSE_WAIT],
1802 				    _fmd_case_snames[FMD_CASE_REPAIRED]));
1803 				goto do_repair;
1804 			} else {
1805 				state = cip->ci_state = FMD_CASE_CLOSED;
1806 				TRACE((FMD_DBG_CASE, "case %s %s->%s",
1807 				    cip->ci_uuid,
1808 				    _fmd_case_snames[FMD_CASE_CLOSE_WAIT],
1809 				    _fmd_case_snames[FMD_CASE_CLOSED]));
1810 			}
1811 		}
1812 		break;
1813 
1814 	case FMD_CASE_REPAIRED:
1815 do_repair:
1816 		ASSERT(cip->ci_xprt != NULL || fmd_case_orphaned(cp));
1817 
1818 		/*
1819 		 * If we've been requested to transition straight on to the
1820 		 * RESOLVED state (which can happen with fault proxying where a
1821 		 * list.resolved or a uuresolved is received from the other
1822 		 * side), or if all suspects are already either usable or not
1823 		 * present then transition straight to RESOLVED state,
1824 		 * publishing both the list.repaired and list.resolved. For a
1825 		 * proxy, if we discover here that all suspects are already
1826 		 * either usable or not present, notify the diag side instead
1827 		 * using fmd_xprt_uuresolved().
1828 		 */
1829 		if (flags & FMD_CF_RESOLVED) {
1830 			if (cip->ci_xprt != NULL)
1831 				fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1832 		} else {
1833 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1834 			    fmd_case_unusable_and_present,
1835 			    &any_unusable_and_present);
1836 			if (any_unusable_and_present)
1837 				break;
1838 			if (cip->ci_xprt != NULL) {
1839 				fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid);
1840 				break;
1841 			}
1842 		}
1843 
1844 		cip->ci_state = FMD_CASE_RESOLVED;
1845 		(void) pthread_mutex_unlock(&cip->ci_lock);
1846 		fmd_case_publish(cp, state);
1847 		TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
1848 		    _fmd_case_snames[FMD_CASE_REPAIRED],
1849 		    _fmd_case_snames[FMD_CASE_RESOLVED]));
1850 		state = FMD_CASE_RESOLVED;
1851 		resolved = 1;
1852 		(void) pthread_mutex_lock(&cip->ci_lock);
1853 		break;
1854 
1855 	case FMD_CASE_RESOLVED:
1856 		/*
1857 		 * For a proxy, no need to check that all suspects are already
1858 		 * either usable or not present - this request has come from
1859 		 * the diagnosing side which makes the final decision on this.
1860 		 */
1861 		if (cip->ci_xprt != NULL) {
1862 			fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1863 			resolved = 1;
1864 			break;
1865 		}
1866 
1867 		ASSERT(fmd_case_orphaned(cp));
1868 
1869 		/*
1870 		 * If all suspects are already either usable or not present then
1871 		 * carry on, publish list.resolved and discard the case.
1872 		 */
1873 		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1874 		    fmd_case_unusable_and_present, &any_unusable_and_present);
1875 		if (any_unusable_and_present) {
1876 			(void) pthread_mutex_unlock(&cip->ci_lock);
1877 			return;
1878 		}
1879 
1880 		resolved = 1;
1881 		break;
1882 	}
1883 
1884 	(void) pthread_mutex_unlock(&cip->ci_lock);
1885 
1886 	/*
1887 	 * If the module has initialized, then publish the appropriate event
1888 	 * for the new case state.  If not, we are being called from the
1889 	 * checkpoint code during module load, in which case the module's
1890 	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
1891 	 * may not be open yet, which will prevent us from computing the event
1892 	 * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
1893 	 * event in our queue: this won't be processed until _fmd_init is done.
1894 	 */
1895 	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
1896 		fmd_case_publish(cp, state);
1897 	else {
1898 		fmd_case_hold(cp);
1899 		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
1900 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
1901 	}
1902 
1903 	if (resolved) {
1904 		if (cip->ci_xprt != NULL) {
1905 			/*
1906 			 * If we transitioned to RESOLVED, adjust the reference
1907 			 * count to reflect our removal from
1908 			 * fmd.d_rmod->mod_cases above.  If the caller has not
1909 			 * placed an additional hold on the case, it will now
1910 			 * be freed.
1911 			 */
1912 			(void) pthread_mutex_lock(&cip->ci_lock);
1913 			fmd_asru_hash_delete_case(fmd.d_asrus, cp);
1914 			(void) pthread_mutex_unlock(&cip->ci_lock);
1915 			fmd_case_rele(cp);
1916 		} else {
1917 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1918 			    fmd_asru_log_resolved, NULL);
1919 			(void) pthread_mutex_lock(&cip->ci_lock);
1920 			/* mark as "ready to be discarded */
1921 			cip->ci_flags |= FMD_CF_RES_CMPL;
1922 			(void) pthread_mutex_unlock(&cip->ci_lock);
1923 		}
1924 	}
1925 }
1926 
1927 /*
1928  * Discard any case if it is in RESOLVED state (and if check_if_aged argument
1929  * is set if all suspects have passed the rsrc.aged time).
1930  */
1931 void
fmd_case_discard_resolved(fmd_case_t * cp,void * arg)1932 fmd_case_discard_resolved(fmd_case_t *cp, void *arg)
1933 {
1934 	int check_if_aged = *(int *)arg;
1935 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1936 
1937 	/*
1938 	 * First check if case has completed transition to resolved.
1939 	 */
1940 	(void) pthread_mutex_lock(&cip->ci_lock);
1941 	if (!(cip->ci_flags & FMD_CF_RES_CMPL)) {
1942 		(void) pthread_mutex_unlock(&cip->ci_lock);
1943 		return;
1944 	}
1945 
1946 	/*
1947 	 * Now if check_is_aged is set, see if all suspects have aged.
1948 	 */
1949 	if (check_if_aged) {
1950 		int aged = 1;
1951 
1952 		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1953 		    fmd_asru_check_if_aged, &aged);
1954 		if (!aged) {
1955 			(void) pthread_mutex_unlock(&cip->ci_lock);
1956 			return;
1957 		}
1958 	}
1959 
1960 	/*
1961 	 * Finally discard the case, clearing FMD_CF_RES_CMPL so we don't
1962 	 * do it twice.
1963 	 */
1964 	fmd_module_lock(cip->ci_mod);
1965 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1966 	fmd_module_unlock(cip->ci_mod);
1967 	fmd_asru_hash_delete_case(fmd.d_asrus, cp);
1968 	cip->ci_flags &= ~FMD_CF_RES_CMPL;
1969 	(void) pthread_mutex_unlock(&cip->ci_lock);
1970 	fmd_case_rele(cp);
1971 }
1972 
1973 /*
1974  * Transition the specified case to *at least* the specified state by first
1975  * re-validating the suspect list using the resource cache.  This function is
1976  * employed by the checkpoint code when restoring a saved, solved case to see
1977  * if the state of the case has effectively changed while fmd was not running
1978  * or the module was not loaded.
1979  */
1980 void
fmd_case_transition_update(fmd_case_t * cp,uint_t state,uint_t flags)1981 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
1982 {
1983 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1984 
1985 	int usable = 0;		/* are any suspects usable? */
1986 
1987 	ASSERT(state >= FMD_CASE_SOLVED);
1988 	(void) pthread_mutex_lock(&cip->ci_lock);
1989 
1990 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable);
1991 
1992 	(void) pthread_mutex_unlock(&cip->ci_lock);
1993 
1994 	if (!usable) {
1995 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
1996 		flags |= FMD_CF_ISOLATED;
1997 	}
1998 
1999 	fmd_case_transition(cp, state, flags);
2000 }
2001 
2002 void
fmd_case_setdirty(fmd_case_t * cp)2003 fmd_case_setdirty(fmd_case_t *cp)
2004 {
2005 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2006 
2007 	(void) pthread_mutex_lock(&cip->ci_lock);
2008 	cip->ci_flags |= FMD_CF_DIRTY;
2009 	(void) pthread_mutex_unlock(&cip->ci_lock);
2010 
2011 	fmd_module_setcdirty(cip->ci_mod);
2012 }
2013 
2014 void
fmd_case_clrdirty(fmd_case_t * cp)2015 fmd_case_clrdirty(fmd_case_t *cp)
2016 {
2017 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2018 
2019 	(void) pthread_mutex_lock(&cip->ci_lock);
2020 	cip->ci_flags &= ~FMD_CF_DIRTY;
2021 	(void) pthread_mutex_unlock(&cip->ci_lock);
2022 }
2023 
2024 void
fmd_case_commit(fmd_case_t * cp)2025 fmd_case_commit(fmd_case_t *cp)
2026 {
2027 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2028 	fmd_case_item_t *cit;
2029 
2030 	(void) pthread_mutex_lock(&cip->ci_lock);
2031 
2032 	if (cip->ci_flags & FMD_CF_DIRTY) {
2033 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
2034 			fmd_event_commit(cit->cit_event);
2035 
2036 		if (cip->ci_principal != NULL)
2037 			fmd_event_commit(cip->ci_principal);
2038 
2039 		fmd_buf_hash_commit(&cip->ci_bufs);
2040 		cip->ci_flags &= ~FMD_CF_DIRTY;
2041 	}
2042 
2043 	(void) pthread_mutex_unlock(&cip->ci_lock);
2044 }
2045 
2046 /*
2047  * On proxy side, send back repair/acquit/etc request to diagnosing side
2048  */
2049 void
fmd_case_xprt_updated(fmd_case_t * cp)2050 fmd_case_xprt_updated(fmd_case_t *cp)
2051 {
2052 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2053 	nvlist_t **nva;
2054 	uint8_t *ba;
2055 	int msg = B_TRUE;
2056 	int count = 0;
2057 	fmd_case_lst_t fcl;
2058 
2059 	ASSERT(cip->ci_xprt != NULL);
2060 	(void) pthread_mutex_lock(&cip->ci_lock);
2061 	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
2062 	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
2063 	fcl.fcl_countp = &count;
2064 	fcl.fcl_maxcount = cip->ci_nsuspects;
2065 	fcl.fcl_msgp = &msg;
2066 	fcl.fcl_ba = ba;
2067 	fcl.fcl_nva = nva;
2068 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
2069 	(void) pthread_mutex_unlock(&cip->ci_lock);
2070 	fmd_xprt_updated(cip->ci_xprt, cip->ci_uuid, ba, cip->ci_proxy_asru,
2071 	    count);
2072 }
2073 
2074 /*
2075  * fmd_case_update_status() can be called on either the proxy side when a
2076  * list.suspect is received, or on the diagnosing side when an update request
2077  * is received from the proxy. It updates the status in the resource cache.
2078  */
2079 void
fmd_case_update_status(fmd_case_t * cp,uint8_t * statusp,uint8_t * proxy_asrup,uint8_t * diag_asrup)2080 fmd_case_update_status(fmd_case_t *cp, uint8_t *statusp, uint8_t *proxy_asrup,
2081     uint8_t *diag_asrup)
2082 {
2083 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2084 	int count = 0;
2085 	fmd_asru_update_status_t faus;
2086 
2087 	/*
2088 	 * update status of resource cache entries
2089 	 */
2090 	faus.faus_countp = &count;
2091 	faus.faus_maxcount = cip->ci_nsuspects;
2092 	faus.faus_ba = statusp;
2093 	faus.faus_proxy_asru = proxy_asrup;
2094 	faus.faus_diag_asru = diag_asrup;
2095 	faus.faus_is_proxy = (cip->ci_xprt != NULL);
2096 	(void) pthread_mutex_lock(&cip->ci_lock);
2097 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_update_status,
2098 	    &faus);
2099 	(void) pthread_mutex_unlock(&cip->ci_lock);
2100 }
2101 
2102 /*
2103  * Called on either the proxy side or the diag side when a repair has taken
2104  * place on the other side but this side may know the asru "contains"
2105  * relationships.
2106  */
2107 void
fmd_case_update_containees(fmd_case_t * cp)2108 fmd_case_update_containees(fmd_case_t *cp)
2109 {
2110 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2111 
2112 	(void) pthread_mutex_lock(&cip->ci_lock);
2113 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2114 	    fmd_asru_update_containees, NULL);
2115 	(void) pthread_mutex_unlock(&cip->ci_lock);
2116 }
2117 
2118 /*
2119  * fmd_case_close_status() is called on diagnosing side when proxy side
2120  * has had a uuclose. It updates the status in the resource cache.
2121  */
2122 void
fmd_case_close_status(fmd_case_t * cp)2123 fmd_case_close_status(fmd_case_t *cp)
2124 {
2125 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2126 	int count = 0;
2127 	fmd_asru_close_status_t facs;
2128 
2129 	/*
2130 	 * update status of resource cache entries
2131 	 */
2132 	facs.facs_countp = &count;
2133 	facs.facs_maxcount = cip->ci_nsuspects;
2134 	(void) pthread_mutex_lock(&cip->ci_lock);
2135 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_close_status,
2136 	    &facs);
2137 	(void) pthread_mutex_unlock(&cip->ci_lock);
2138 }
2139 
2140 /*
2141  * Indicate that the case may need to change state because one or more of the
2142  * ASRUs named as a suspect has changed state.  We examine all the suspects
2143  * and if none are still faulty, we initiate a case close transition.
2144  */
2145 void
fmd_case_update(fmd_case_t * cp)2146 fmd_case_update(fmd_case_t *cp)
2147 {
2148 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2149 	uint_t cstate;
2150 	int faulty = 0;
2151 
2152 	(void) pthread_mutex_lock(&cip->ci_lock);
2153 	cstate = cip->ci_state;
2154 
2155 	if (cip->ci_state < FMD_CASE_SOLVED) {
2156 		(void) pthread_mutex_unlock(&cip->ci_lock);
2157 		return; /* update is not appropriate */
2158 	}
2159 
2160 	if (cip->ci_flags & FMD_CF_REPAIRED) {
2161 		(void) pthread_mutex_unlock(&cip->ci_lock);
2162 		return; /* already repaired */
2163 	}
2164 
2165 	TRACE((FMD_DBG_CASE, "case update %s", cip->ci_uuid));
2166 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
2167 	(void) pthread_mutex_unlock(&cip->ci_lock);
2168 
2169 	if (faulty) {
2170 		nvlist_t *nvl;
2171 		fmd_event_t *e;
2172 		char *class;
2173 
2174 		TRACE((FMD_DBG_CASE, "sending list.updated %s", cip->ci_uuid));
2175 		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
2176 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2177 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
2178 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
2179 		fmd_log_append(fmd.d_fltlog, e, cp);
2180 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
2181 		fmd_dispq_dispatch(fmd.d_disp, e, class);
2182 		return; /* one or more suspects are still marked faulty */
2183 	}
2184 
2185 	if (cstate == FMD_CASE_CLOSED)
2186 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2187 	else
2188 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2189 }
2190 
2191 /*
2192  * Delete a closed case from the module's case list once the fmdo_close() entry
2193  * point has run to completion.  If the case is owned by a transport module,
2194  * tell the transport to proxy a case close on the other end of the transport.
2195  * Transition to the appropriate next state based on ci_flags.  This
2196  * function represents the end of CLOSE_WAIT and transitions the case to either
2197  * CLOSED or REPAIRED or discards it entirely because it was never solved;
2198  * refer to the topmost block comment explaining the state machine for details.
2199  */
2200 void
fmd_case_delete(fmd_case_t * cp)2201 fmd_case_delete(fmd_case_t *cp)
2202 {
2203 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2204 	fmd_modstat_t *msp;
2205 	size_t buftotal;
2206 
2207 	TRACE((FMD_DBG_CASE, "case delete %s", cip->ci_uuid));
2208 	ASSERT(fmd_module_locked(cip->ci_mod));
2209 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
2210 	buftotal = fmd_buf_hash_destroy(&cip->ci_bufs);
2211 
2212 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
2213 	msp = cip->ci_mod->mod_stats;
2214 
2215 	ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0);
2216 	msp->ms_caseopen.fmds_value.ui64--;
2217 
2218 	ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal);
2219 	msp->ms_buftotal.fmds_value.ui64 -= buftotal;
2220 
2221 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
2222 
2223 	if (cip->ci_xprt == NULL)
2224 		fmd_module_setcdirty(cip->ci_mod);
2225 
2226 	fmd_module_rele(cip->ci_mod);
2227 	cip->ci_mod = fmd.d_rmod;
2228 	fmd_module_hold(cip->ci_mod);
2229 
2230 	/*
2231 	 * If the case has been solved, then retain it
2232 	 * on the root module's case list at least until we're transitioned.
2233 	 * Otherwise free the case with our final fmd_case_rele() below.
2234 	 */
2235 	if (cip->ci_flags & FMD_CF_SOLVED) {
2236 		fmd_module_lock(cip->ci_mod);
2237 		fmd_list_append(&cip->ci_mod->mod_cases, cip);
2238 		fmd_module_unlock(cip->ci_mod);
2239 		fmd_case_hold(cp);
2240 	}
2241 
2242 	/*
2243 	 * Transition onwards to REPAIRED or CLOSED as originally requested.
2244 	 * Note that for proxy case if we're transitioning to CLOSED it means
2245 	 * the case was isolated locally, so call fmd_xprt_uuclose() to notify
2246 	 * the diagnosing side. No need to notify the diagnosing side if we are
2247 	 * transitioning to REPAIRED as we only do this when requested to do
2248 	 * so by the diagnosing side anyway.
2249 	 */
2250 	if (cip->ci_flags & FMD_CF_REPAIRED)
2251 		fmd_case_transition(cp, FMD_CASE_REPAIRED, 0);
2252 	else if (cip->ci_flags & FMD_CF_ISOLATED) {
2253 		fmd_case_transition(cp, FMD_CASE_CLOSED, 0);
2254 		if (cip->ci_xprt != NULL)
2255 			fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid);
2256 	}
2257 
2258 	fmd_case_rele(cp);
2259 }
2260 
2261 void
fmd_case_discard(fmd_case_t * cp,boolean_t delete_from_asru_cache)2262 fmd_case_discard(fmd_case_t *cp, boolean_t delete_from_asru_cache)
2263 {
2264 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2265 
2266 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
2267 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--;
2268 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
2269 
2270 	ASSERT(fmd_module_locked(cip->ci_mod));
2271 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
2272 	if (delete_from_asru_cache) {
2273 		(void) pthread_mutex_lock(&cip->ci_lock);
2274 		fmd_asru_hash_delete_case(fmd.d_asrus, cp);
2275 		(void) pthread_mutex_unlock(&cip->ci_lock);
2276 	}
2277 	fmd_case_rele(cp);
2278 }
2279 
2280 /*
2281  * Indicate that the problem corresponding to a case has been repaired by
2282  * clearing the faulty bit on each ASRU named as a suspect.  If the case hasn't
2283  * already been closed, this function initiates the transition to CLOSE_WAIT.
2284  * The caller must have the case held from fmd_case_hash_lookup(), so we can
2285  * grab and drop ci_lock without the case being able to be freed in between.
2286  */
2287 int
fmd_case_repair(fmd_case_t * cp)2288 fmd_case_repair(fmd_case_t *cp)
2289 {
2290 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2291 	uint_t cstate;
2292 	fmd_asru_rep_arg_t fara;
2293 
2294 	(void) pthread_mutex_lock(&cip->ci_lock);
2295 	cstate = cip->ci_state;
2296 
2297 	if (cstate < FMD_CASE_SOLVED) {
2298 		(void) pthread_mutex_unlock(&cip->ci_lock);
2299 		return (fmd_set_errno(EFMD_CASE_STATE));
2300 	}
2301 
2302 	if (cip->ci_flags & FMD_CF_REPAIRED) {
2303 		(void) pthread_mutex_unlock(&cip->ci_lock);
2304 		return (0); /* already repaired */
2305 	}
2306 
2307 	TRACE((FMD_DBG_CASE, "case repair %s", cip->ci_uuid));
2308 	fara.fara_reason = FMD_ASRU_REPAIRED;
2309 	fara.fara_bywhat = FARA_BY_CASE;
2310 	fara.fara_rval = NULL;
2311 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara);
2312 	(void) pthread_mutex_unlock(&cip->ci_lock);
2313 
2314 	/*
2315 	 * if this is a proxied case, send the repair across the transport.
2316 	 * The remote side will then do the repair and send a list.repaired back
2317 	 * again such that we can finally repair the case on this side.
2318 	 */
2319 	if (cip->ci_xprt != NULL) {
2320 		fmd_case_xprt_updated(cp);
2321 		return (0);
2322 	}
2323 
2324 	if (cstate == FMD_CASE_CLOSED)
2325 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2326 	else
2327 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2328 
2329 	return (0);
2330 }
2331 
2332 int
fmd_case_acquit(fmd_case_t * cp)2333 fmd_case_acquit(fmd_case_t *cp)
2334 {
2335 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2336 	uint_t cstate;
2337 	fmd_asru_rep_arg_t fara;
2338 
2339 	(void) pthread_mutex_lock(&cip->ci_lock);
2340 	cstate = cip->ci_state;
2341 
2342 	if (cstate < FMD_CASE_SOLVED) {
2343 		(void) pthread_mutex_unlock(&cip->ci_lock);
2344 		return (fmd_set_errno(EFMD_CASE_STATE));
2345 	}
2346 
2347 	if (cip->ci_flags & FMD_CF_REPAIRED) {
2348 		(void) pthread_mutex_unlock(&cip->ci_lock);
2349 		return (0); /* already repaired */
2350 	}
2351 
2352 	TRACE((FMD_DBG_CASE, "case acquit %s", cip->ci_uuid));
2353 	fara.fara_reason = FMD_ASRU_ACQUITTED;
2354 	fara.fara_bywhat = FARA_BY_CASE;
2355 	fara.fara_rval = NULL;
2356 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara);
2357 	(void) pthread_mutex_unlock(&cip->ci_lock);
2358 
2359 	/*
2360 	 * if this is a proxied case, send the repair across the transport.
2361 	 * The remote side will then do the repair and send a list.repaired back
2362 	 * again such that we can finally repair the case on this side.
2363 	 */
2364 	if (cip->ci_xprt != NULL) {
2365 		fmd_case_xprt_updated(cp);
2366 		return (0);
2367 	}
2368 
2369 	if (cstate == FMD_CASE_CLOSED)
2370 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2371 	else
2372 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2373 
2374 	return (0);
2375 }
2376 
2377 int
fmd_case_contains(fmd_case_t * cp,fmd_event_t * ep)2378 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep)
2379 {
2380 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2381 	fmd_case_item_t *cit;
2382 	uint_t state;
2383 	int rv = 0;
2384 
2385 	(void) pthread_mutex_lock(&cip->ci_lock);
2386 
2387 	if (cip->ci_state >= FMD_CASE_SOLVED)
2388 		state = FMD_EVS_DIAGNOSED;
2389 	else
2390 		state = FMD_EVS_ACCEPTED;
2391 
2392 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
2393 		if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0)
2394 			break;
2395 	}
2396 
2397 	if (rv == 0 && cip->ci_principal != NULL)
2398 		rv = fmd_event_equal(ep, cip->ci_principal);
2399 
2400 	(void) pthread_mutex_unlock(&cip->ci_lock);
2401 
2402 	if (rv != 0)
2403 		fmd_event_transition(ep, state);
2404 
2405 	return (rv);
2406 }
2407 
2408 int
fmd_case_orphaned(fmd_case_t * cp)2409 fmd_case_orphaned(fmd_case_t *cp)
2410 {
2411 	return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod);
2412 }
2413 
2414 void
fmd_case_settime(fmd_case_t * cp,time_t tv_sec,suseconds_t tv_usec)2415 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec)
2416 {
2417 	((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec;
2418 	((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec;
2419 	((fmd_case_impl_t *)cp)->ci_tv_valid = 1;
2420 }
2421 
2422 void
fmd_case_set_injected(fmd_case_t * cp)2423 fmd_case_set_injected(fmd_case_t *cp)
2424 {
2425 	((fmd_case_impl_t *)cp)->ci_injected = 1;
2426 }
2427 
2428 void
fmd_case_set_de_fmri(fmd_case_t * cp,nvlist_t * nvl)2429 fmd_case_set_de_fmri(fmd_case_t *cp, nvlist_t *nvl)
2430 {
2431 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2432 
2433 	nvlist_free(cip->ci_diag_de);
2434 	cip->ci_diag_de = nvl;
2435 }
2436 
2437 void
fmd_case_setcode(fmd_case_t * cp,char * code)2438 fmd_case_setcode(fmd_case_t *cp, char *code)
2439 {
2440 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2441 
2442 	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
2443 	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;
2444 }
2445 
2446 /*ARGSUSED*/
2447 static void
fmd_case_repair_replay_case(fmd_case_t * cp,void * arg)2448 fmd_case_repair_replay_case(fmd_case_t *cp, void *arg)
2449 {
2450 	int not_faulty = 0;
2451 	int faulty = 0;
2452 	nvlist_t *nvl;
2453 	fmd_event_t *e;
2454 	char *class;
2455 	int any_unusable_and_present = 0;
2456 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2457 
2458 	if (cip->ci_state < FMD_CASE_SOLVED || cip->ci_xprt != NULL)
2459 		return;
2460 
2461 	if (cip->ci_state == FMD_CASE_RESOLVED) {
2462 		cip->ci_flags |= FMD_CF_RES_CMPL;
2463 		return;
2464 	}
2465 
2466 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
2467 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty,
2468 	    &not_faulty);
2469 
2470 	if (cip->ci_state >= FMD_CASE_REPAIRED && !faulty) {
2471 		/*
2472 		 * If none of the suspects is faulty, replay the list.repaired.
2473 		 * If all suspects are already either usable or not present then
2474 		 * also transition straight to RESOLVED state.
2475 		 */
2476 		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2477 		    fmd_case_unusable_and_present, &any_unusable_and_present);
2478 		if (!any_unusable_and_present) {
2479 			cip->ci_state = FMD_CASE_RESOLVED;
2480 
2481 			TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
2482 			    cip->ci_uuid));
2483 			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
2484 			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2485 			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
2486 			    class);
2487 			fmd_dispq_dispatch(fmd.d_disp, e, class);
2488 
2489 			TRACE((FMD_DBG_CASE, "replay sending list.resolved %s",
2490 			    cip->ci_uuid));
2491 			fmd_case_publish(cp, FMD_CASE_RESOLVED);
2492 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2493 			    fmd_asru_log_resolved, NULL);
2494 			cip->ci_flags |= FMD_CF_RES_CMPL;
2495 		} else {
2496 			TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
2497 			    cip->ci_uuid));
2498 			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
2499 			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2500 			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
2501 			    class);
2502 			fmd_dispq_dispatch(fmd.d_disp, e, class);
2503 		}
2504 	} else if (faulty && not_faulty) {
2505 		/*
2506 		 * if some but not all of the suspects are not faulty, replay
2507 		 * the list.updated.
2508 		 */
2509 		TRACE((FMD_DBG_CASE, "replay sending list.updated %s",
2510 		    cip->ci_uuid));
2511 		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
2512 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2513 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
2514 		fmd_dispq_dispatch(fmd.d_disp, e, class);
2515 	}
2516 }
2517 
2518 void
fmd_case_repair_replay()2519 fmd_case_repair_replay()
2520 {
2521 	fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL);
2522 }
2523