xref: /titanic_44/usr/src/cmd/fm/fmd/common/fmd_case.c (revision 2b4a78020b9c38d1b95e2f3fefa6d6e4be382d1f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * FMD Case Subsystem
29  *
30  * Diagnosis engines are expected to group telemetry events related to the
31  * diagnosis of a particular problem on the system into a set of cases.  The
32  * diagnosis engine may have any number of cases open at a given point in time.
33  * Some cases may eventually be *solved* by associating a suspect list of one
34  * or more problems with the case, at which point fmd publishes a list.suspect
35  * event for the case and it becomes visible to administrators and agents.
36  *
37  * Every case is named using a UUID, and is globally visible in the case hash.
38  * Cases are reference-counted, except for the reference from the case hash
39  * itself.  Consumers of case references include modules, which store active
40  * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
41  *
42  * Cases obey the following state machine.  In states UNSOLVED, SOLVED, and
43  * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
44  * or transport) and the case is referenced by the mod_cases list.  Once the
45  * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
46  * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
47  *
48  *			+------------+
49  *	     +----------|  UNSOLVED  |
50  *	     |		+------------+
51  *	     |		      1 |
52  *	     |			|
53  *	     |		+-------v----+
54  *	   2 |		|    SOLVED  |
55  *	     |		+------------+
56  *	     |		    3 |  5 |
57  *	     +------------+   |    |
58  *			  |   |    |
59  *			+-v---v----v-+
60  *			| CLOSE_WAIT |
61  *			+------------+
62  *			  |   |    |
63  *	      +-----------+   |    +------------+
64  *	      |		    4 |			|
65  *	      v		+-----v------+		|
66  *	   discard      |   CLOSED   |	      6	|
67  *			+------------+		|
68  *			      |			|
69  *			      |	   +------------+
70  *			    7 |	   |
71  *			+-----v----v-+
72  *			|  REPAIRED  |
73  *			+------------+
74  *			      |
75  *			    8 |
76  *			+-----v------+
77  *			|  RESOLVED  |
78  *			+------------+
79  *			      |
80  *			      v
81  *			   discard
82  *
83  * The state machine changes are triggered by calls to fmd_case_transition()
84  * from various locations inside of fmd, as described below:
85  *
86  * [1] Called by: fmd_case_solve()
87  *       Actions: FMD_CF_SOLVED flag is set in ci_flags
88  *                conviction policy is applied to suspect list
89  *                suspects convicted are marked faulty (F) in R$
90  *                list.suspect event logged and dispatched
91  *
92  * [2] Called by: fmd_case_close(), fmd_case_uuclose()
93  *       Actions: diagnosis engine fmdo_close() entry point scheduled
94  *                case discarded upon exit from CLOSE_WAIT
95  *
96  * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
97  *       Actions: FMD_CF_ISOLATED flag is set in ci_flags
98  *                suspects convicted (F) are marked unusable (U) in R$
99  *                diagnosis engine fmdo_close() entry point scheduled
100  *                case transitions to CLOSED [4] upon exit from CLOSE_WAIT
101  *
102  * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
103  *       Actions: list.isolated event dispatched
104  *                case deleted from module's list of open cases
105  *
106  * [5] Called by: fmd_case_repair(), fmd_case_update()
107  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
108  *                diagnosis engine fmdo_close() entry point scheduled
109  *                case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
110  *
111  * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
112  *       Actions: suspects convicted are marked non faulty (!F) in R$
113  *                list.repaired or list.updated event dispatched
114  *
115  * [7] Called by: fmd_case_repair(), fmd_case_update()
116  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
117  *                suspects convicted are marked non faulty (!F) in R$
118  *                list.repaired or list.updated event dispatched
119  *
120  * [8] Called by: fmd_case_uuresolve()
121  *       Actions: list.resolved event dispatched
122  *		  case is discarded
123  */
124 
125 #include <sys/fm/protocol.h>
126 #include <uuid/uuid.h>
127 #include <alloca.h>
128 
129 #include <fmd_alloc.h>
130 #include <fmd_module.h>
131 #include <fmd_error.h>
132 #include <fmd_conf.h>
133 #include <fmd_case.h>
134 #include <fmd_string.h>
135 #include <fmd_subr.h>
136 #include <fmd_protocol.h>
137 #include <fmd_event.h>
138 #include <fmd_eventq.h>
139 #include <fmd_dispq.h>
140 #include <fmd_buf.h>
141 #include <fmd_log.h>
142 #include <fmd_asru.h>
143 #include <fmd_fmri.h>
144 #include <fmd_xprt.h>
145 
146 #include <fmd.h>
147 
/*
 * Printable names for the case states.  There is one string per FMD_CASE_*
 * state, and the comment beside each string names the state it describes.
 */
static const char *const _fmd_case_snames[] = {
	"UNSOLVED",	/* FMD_CASE_UNSOLVED */
	"SOLVED",	/* FMD_CASE_SOLVED */
	"CLOSE_WAIT",	/* FMD_CASE_CLOSE_WAIT */
	"CLOSED",	/* FMD_CASE_CLOSED */
	"REPAIRED",	/* FMD_CASE_REPAIRED */
	"RESOLVED"	/* FMD_CASE_RESOLVED */
};

/*
 * Forward declaration: the hash routines below call fmd_case_tryhold() before
 * it is defined.  Callers in this file treat a NULL return as "this case
 * cannot be held" and skip the case.
 */
static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *);
158 
159 fmd_case_hash_t *
160 fmd_case_hash_create(void)
161 {
162 	fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP);
163 
164 	(void) pthread_rwlock_init(&chp->ch_lock, NULL);
165 	chp->ch_hashlen = fmd.d_str_buckets;
166 	chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP);
167 	chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen,
168 	    FMD_SLEEP);
169 	chp->ch_count = 0;
170 
171 	return (chp);
172 }
173 
174 /*
175  * Destroy the case hash.  Unlike most of our hash tables, no active references
176  * are kept by the case hash itself; all references come from other subsystems.
177  * The hash must be destroyed after all modules are unloaded; if anything was
178  * present in the hash it would be by definition a reference count leak.
179  */
180 void
181 fmd_case_hash_destroy(fmd_case_hash_t *chp)
182 {
183 	fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen);
184 	fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen);
185 	fmd_free(chp, sizeof (fmd_case_hash_t));
186 }
187 
188 /*
189  * Take a snapshot of the case hash by placing an additional hold on each
190  * member in an auxiliary array, and then call 'func' for each case.
191  */
192 void
193 fmd_case_hash_apply(fmd_case_hash_t *chp,
194     void (*func)(fmd_case_t *, void *), void *arg)
195 {
196 	fmd_case_impl_t *cp, **cps, **cpp;
197 	uint_t cpc, i;
198 
199 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
200 
201 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
202 	cpc = chp->ch_count;
203 
204 	for (i = 0; i < chp->ch_hashlen; i++) {
205 		for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next)
206 			*cpp++ = fmd_case_tryhold(cp);
207 	}
208 
209 	ASSERT(cpp == cps + cpc);
210 	(void) pthread_rwlock_unlock(&chp->ch_lock);
211 
212 	for (i = 0; i < cpc; i++) {
213 		if (cps[i] != NULL) {
214 			func((fmd_case_t *)cps[i], arg);
215 			fmd_case_rele((fmd_case_t *)cps[i]);
216 		}
217 	}
218 
219 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
220 }
221 
222 static void
223 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
224 {
225 	uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
226 
227 	cip->ci_code_next = chp->ch_code_hash[h];
228 	chp->ch_code_hash[h] = cip;
229 }
230 
231 static void
232 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
233 {
234 	fmd_case_impl_t **pp, *cp;
235 
236 	if (cip->ci_code) {
237 		uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
238 
239 		pp = &chp->ch_code_hash[h];
240 		for (cp = *pp; cp != NULL; cp = cp->ci_code_next) {
241 			if (cp != cip)
242 				pp = &cp->ci_code_next;
243 			else
244 				break;
245 		}
246 		if (cp != NULL) {
247 			*pp = cp->ci_code_next;
248 			cp->ci_code_next = NULL;
249 		}
250 	}
251 }
252 
253 /*
254  * Look up the diagcode for this case and cache it in ci_code.  If no suspects
255  * were defined for this case or if the lookup fails, the event dictionary or
256  * module code is broken, and we set the event code to a precomputed default.
257  */
258 static const char *
259 fmd_case_mkcode(fmd_case_t *cp)
260 {
261 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
262 	fmd_case_susp_t *cis;
263 	fmd_case_hash_t *chp = fmd.d_cases;
264 
265 	char **keys, **keyp;
266 	const char *s;
267 
268 	ASSERT(MUTEX_HELD(&cip->ci_lock));
269 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
270 
271 	/*
272 	 * delete any existing entry from code hash if it is on it
273 	 */
274 	fmd_case_code_hash_delete(chp, cip);
275 
276 	fmd_free(cip->ci_code, cip->ci_codelen);
277 	cip->ci_codelen = cip->ci_mod->mod_codelen;
278 	cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
279 	keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1));
280 
281 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
282 		if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0)
283 			keyp++;
284 	}
285 
286 	*keyp = NULL; /* mark end of keys[] array for libdiagcode */
287 
288 	if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code(
289 	    cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) {
290 		(void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s);
291 		fmd_free(cip->ci_code, cip->ci_codelen);
292 		cip->ci_codelen = strlen(s) + 1;
293 		cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
294 		(void) strcpy(cip->ci_code, s);
295 	}
296 
297 	/*
298 	 * add into hash of solved cases
299 	 */
300 	fmd_case_code_hash_insert(chp, cip);
301 
302 	return (cip->ci_code);
303 }
304 
/*
 * Argument block passed to fmd_case_set_lst() while gathering the per-suspect
 * data that goes into a list.* event.
 */
typedef struct {
	int	*fcl_countp;	/* number of entries filled in so far */
	int	fcl_maxcount;	/* capacity of fcl_ba[] and fcl_nva[] */
	uint8_t *fcl_ba;	/* per-suspect FM_SUSPECT_* status bits */
	nvlist_t **fcl_nva;	/* per-suspect fault event nvlists */
	int	*fcl_msgp;	/* set to B_FALSE if a suspect declines msgs */
} fmd_case_lst_t;
312 
313 static void
314 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg)
315 {
316 	fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg;
317 	boolean_t b;
318 	int state;
319 
320 	if (*entryp->fcl_countp >= entryp->fcl_maxcount)
321 		return;
322 	if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE,
323 	    &b) == 0 && b == B_FALSE)
324 		*entryp->fcl_msgp = B_FALSE;
325 	entryp->fcl_ba[*entryp->fcl_countp] = 0;
326 	state = fmd_asru_al_getstate(alp);
327 	if (state & FMD_ASRU_DEGRADED)
328 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED;
329 	if (state & FMD_ASRU_UNUSABLE)
330 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE;
331 	if (state & FMD_ASRU_FAULTY)
332 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY;
333 	if (!(state & FMD_ASRU_PRESENT))
334 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT;
335 	if (alp->al_reason == FMD_ASRU_REPAIRED)
336 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED;
337 	else if (alp->al_reason == FMD_ASRU_REPLACED)
338 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED;
339 	else if (alp->al_reason == FMD_ASRU_ACQUITTED)
340 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED;
341 	entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event;
342 	(*entryp->fcl_countp)++;
343 }
344 
345 static void
346 fmd_case_faulty(fmd_asru_link_t *alp, void *arg)
347 {
348 	int *faultyp = (int *)arg;
349 
350 	*faultyp |= (alp->al_flags & FMD_ASRU_FAULTY);
351 }
352 
353 static void
354 fmd_case_usable(fmd_asru_link_t *alp, void *arg)
355 {
356 	int *usablep = (int *)arg;
357 
358 	*usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE);
359 }
360 
361 static void
362 fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg)
363 {
364 	int *not_faultyp = (int *)arg;
365 
366 	*not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY);
367 }
368 
369 /*
370  * Have we got any suspects with an asru that are still unusable and present?
371  */
372 static void
373 fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg)
374 {
375 	int *rvalp = (int *)arg;
376 	int state = fmd_asru_al_getstate(alp);
377 	nvlist_t *asru;
378 
379 	if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0)
380 		return;
381 	*rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT));
382 }
383 
384 nvlist_t *
385 fmd_case_mkevent(fmd_case_t *cp, const char *class)
386 {
387 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
388 	nvlist_t **nva, *nvl;
389 	uint8_t *ba;
390 	int msg = B_TRUE;
391 	const char *code;
392 	fmd_case_lst_t fcl;
393 	int count = 0;
394 
395 	(void) pthread_mutex_lock(&cip->ci_lock);
396 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
397 
398 	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
399 	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
400 
401 	/*
402 	 * For each suspect associated with the case, store its fault event
403 	 * nvlist in 'nva'.  We also look to see if any of the suspect faults
404 	 * have asked not to be messaged.  If any of them have made such a
405 	 * request, propagate that attribute to the composite list.* event.
406 	 * Finally, store each suspect's faulty status into the bitmap 'ba'.
407 	 */
408 	fcl.fcl_countp = &count;
409 	fcl.fcl_maxcount = cip->ci_nsuspects;
410 	fcl.fcl_msgp = &msg;
411 	fcl.fcl_ba = ba;
412 	fcl.fcl_nva = nva;
413 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
414 
415 	if (cip->ci_code == NULL)
416 		(void) fmd_case_mkcode(cp);
417 	/*
418 	 * For repair and updated event, we lookup diagcode from dict using key
419 	 * "list.repaired" or "list.updated" or "list.resolved".
420 	 */
421 	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
422 		(void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code);
423 	else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
424 		(void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code);
425 	else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0)
426 		(void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code);
427 	else
428 		code = cip->ci_code;
429 
430 	if (msg == B_FALSE)
431 		cip->ci_flags |= FMD_CF_INVISIBLE;
432 
433 	nvl = fmd_protocol_list(class, cip->ci_mod->mod_fmri, cip->ci_uuid,
434 	    code, count, nva, ba, msg, &cip->ci_tv);
435 
436 	(void) pthread_mutex_unlock(&cip->ci_lock);
437 	return (nvl);
438 }
439 
440 static boolean_t
441 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem)
442 {
443 	nvlist_t *new_rsrc;
444 	nvlist_t *rsrc;
445 	char *new_name = NULL;
446 	char *name = NULL;
447 	ssize_t new_namelen;
448 	ssize_t namelen;
449 	int fmri_present = 1;
450 	int new_fmri_present = 1;
451 	int match = B_FALSE;
452 	fmd_topo_t *ftp = fmd_topo_hold();
453 
454 	if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0)
455 		fmri_present = 0;
456 	else {
457 		if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1)
458 			goto done;
459 		name = fmd_alloc(namelen + 1, FMD_SLEEP);
460 		if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1)
461 			goto done;
462 	}
463 	if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0)
464 		new_fmri_present = 0;
465 	else {
466 		if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1)
467 			goto done;
468 		new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP);
469 		if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1)
470 			goto done;
471 	}
472 	match = (fmri_present == new_fmri_present &&
473 	    (fmri_present == 0 ||
474 	    topo_fmri_strcmp(ftp->ft_hdl, name, new_name)));
475 done:
476 	if (name != NULL)
477 		fmd_free(name, namelen + 1);
478 	if (new_name != NULL)
479 		fmd_free(new_name, new_namelen + 1);
480 	fmd_topo_rele(ftp);
481 	return (match);
482 }
483 
484 static int
485 fmd_case_match_suspect(fmd_case_susp_t *cis, fmd_case_susp_t *xcis)
486 {
487 	char *class, *new_class;
488 
489 	if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_ASRU))
490 		return (0);
491 	if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl,
492 	    FM_FAULT_RESOURCE))
493 		return (0);
494 	if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_FRU))
495 		return (0);
496 	(void) nvlist_lookup_string(xcis->cis_nvl, FM_CLASS, &class);
497 	(void) nvlist_lookup_string(cis->cis_nvl, FM_CLASS, &new_class);
498 	return (strcmp(class, new_class) == 0);
499 }
500 
501 /*
502  * see if an identical suspect list already exists in the cache
503  */
504 static int
505 fmd_case_check_for_dups(fmd_case_t *cp)
506 {
507 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp, *xcip;
508 	fmd_case_hash_t *chp = fmd.d_cases;
509 	fmd_case_susp_t *xcis, *cis;
510 	int match = 0, match_susp;
511 	uint_t h;
512 
513 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
514 
515 	/*
516 	 * Find all cases with this code
517 	 */
518 	h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
519 	for (xcip = chp->ch_code_hash[h]; xcip != NULL;
520 	    xcip = xcip->ci_code_next) {
521 		/*
522 		 * only look for any cases (apart from this one)
523 		 * whose code and number of suspects match
524 		 */
525 		if (xcip == cip || fmd_case_tryhold(xcip) == NULL)
526 			continue;
527 		if (strcmp(xcip->ci_code, cip->ci_code) != 0 ||
528 		    xcip->ci_nsuspects != cip->ci_nsuspects) {
529 			fmd_case_rele((fmd_case_t *)xcip);
530 			continue;
531 		}
532 
533 		/*
534 		 * For each suspect in one list, check if there
535 		 * is an identical suspect in the other list
536 		 */
537 		match = 1;
538 		for (xcis = xcip->ci_suspects; xcis != NULL;
539 		    xcis = xcis->cis_next) {
540 			match_susp = 0;
541 			for (cis = cip->ci_suspects; cis != NULL;
542 			    cis = cis->cis_next) {
543 				if (fmd_case_match_suspect(cis, xcis) == 1) {
544 					match_susp = 1;
545 					break;
546 				}
547 			}
548 			if (match_susp == 0) {
549 				match = 0;
550 				break;
551 			}
552 		}
553 		fmd_case_rele((fmd_case_t *)xcip);
554 		if (match) {
555 			(void) pthread_rwlock_unlock(&chp->ch_lock);
556 			return (1);
557 		}
558 	}
559 	(void) pthread_rwlock_unlock(&chp->ch_lock);
560 	return (0);
561 }
562 
563 /*
564  * Convict suspects in a case by applying a conviction policy and updating the
565  * resource cache prior to emitting the list.suspect event for the given case.
566  * At present, our policy is very simple: convict every suspect in the case.
567  * In the future, this policy can be extended and made configurable to permit:
568  *
569  * - convicting the suspect with the highest FIT rate
570  * - convicting the suspect with the cheapest FRU
571  * - convicting the suspect with the FRU that is in a depot's inventory
572  * - convicting the suspect with the longest lifetime
573  *
 * and so forth.  A word to the wise: this problem is significantly harder than
575  * it seems at first glance.  Future work should heed the following advice:
576  *
577  * Hacking the policy into C code here is a very bad idea.  The policy needs to
578  * be decided upon very carefully and fundamentally encodes knowledge of what
579  * suspect list combinations can be emitted by what diagnosis engines.  As such
580  * fmd's code is the wrong location, because that would require fmd itself to
581  * be updated for every diagnosis engine change, defeating the entire design.
582  * The FMA Event Registry knows the suspect list combinations: policy inputs
583  * can be derived from it and used to produce per-module policy configuration.
584  *
585  * If the policy needs to be dynamic and not statically fixed at either fmd
586  * startup or module load time, any implementation of dynamic policy retrieval
587  * must employ some kind of caching mechanism or be part of a built-in module.
588  * The fmd_case_convict() function is called with locks held inside of fmd and
589  * is not a place where unbounded blocking on some inter-process or inter-
590  * system communication to another service (e.g. another daemon) can occur.
591  */
592 static int
593 fmd_case_convict(fmd_case_t *cp)
594 {
595 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
596 	fmd_asru_hash_t *ahp = fmd.d_asrus;
597 
598 	fmd_case_susp_t *cis;
599 	fmd_asru_link_t *alp;
600 
601 	(void) pthread_mutex_lock(&cip->ci_lock);
602 	(void) fmd_case_mkcode(cp);
603 	if (fmd_case_check_for_dups(cp) == 1) {
604 		(void) pthread_mutex_unlock(&cip->ci_lock);
605 		return (1);
606 	}
607 
608 	/*
609 	 * no suspect list already exists  - allocate new cache entries
610 	 */
611 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
612 		if ((alp = fmd_asru_hash_create_entry(ahp,
613 		    cp, cis->cis_nvl)) == NULL) {
614 			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
615 			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
616 			continue;
617 		}
618 		(void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0);
619 		(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
620 	}
621 
622 	(void) pthread_mutex_unlock(&cip->ci_lock);
623 	return (0);
624 }
625 
626 void
627 fmd_case_publish(fmd_case_t *cp, uint_t state)
628 {
629 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
630 	fmd_event_t *e;
631 	nvlist_t *nvl;
632 	char *class;
633 
634 	if (state == FMD_CASE_CURRENT)
635 		state = cip->ci_state; /* use current state */
636 
637 	switch (state) {
638 	case FMD_CASE_SOLVED:
639 		(void) pthread_mutex_lock(&cip->ci_lock);
640 
641 		/*
642 		 * If we already have a code, then case is already solved.
643 		 */
644 		if (cip->ci_code != NULL) {
645 			(void) pthread_mutex_unlock(&cip->ci_lock);
646 			break;
647 		}
648 
649 		if (cip->ci_tv_valid == 0) {
650 			fmd_time_gettimeofday(&cip->ci_tv);
651 			cip->ci_tv_valid = 1;
652 		}
653 		(void) pthread_mutex_unlock(&cip->ci_lock);
654 
655 		if (fmd_case_convict(cp) == 1) { /* dupclose */
656 			cip->ci_flags &= ~FMD_CF_SOLVED;
657 			fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0);
658 			break;
659 		}
660 		nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
661 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
662 
663 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
664 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
665 		fmd_log_append(fmd.d_fltlog, e, cp);
666 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
667 		fmd_dispq_dispatch(fmd.d_disp, e, class);
668 
669 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
670 		cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
671 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
672 
673 		break;
674 
675 	case FMD_CASE_CLOSE_WAIT:
676 		fmd_case_hold(cp);
677 		e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
678 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
679 
680 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
681 		cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
682 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
683 
684 		break;
685 
686 	case FMD_CASE_CLOSED:
687 		nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
688 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
689 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
690 		fmd_dispq_dispatch(fmd.d_disp, e, class);
691 		break;
692 
693 	case FMD_CASE_REPAIRED:
694 		nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
695 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
696 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
697 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
698 		fmd_log_append(fmd.d_fltlog, e, cp);
699 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
700 		fmd_dispq_dispatch(fmd.d_disp, e, class);
701 		break;
702 
703 	case FMD_CASE_RESOLVED:
704 		nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS);
705 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
706 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
707 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
708 		fmd_log_append(fmd.d_fltlog, e, cp);
709 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
710 		fmd_dispq_dispatch(fmd.d_disp, e, class);
711 		break;
712 	}
713 }
714 
715 fmd_case_t *
716 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
717 {
718 	fmd_case_impl_t *cip;
719 	uint_t h;
720 
721 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
722 	h = fmd_strhash(uuid) % chp->ch_hashlen;
723 
724 	for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
725 		if (strcmp(cip->ci_uuid, uuid) == 0)
726 			break;
727 	}
728 
729 	/*
730 	 * If deleting bit is set, treat the case as if it doesn't exist.
731 	 */
732 	if (cip != NULL)
733 		cip = fmd_case_tryhold(cip);
734 
735 	if (cip == NULL)
736 		(void) fmd_set_errno(EFMD_CASE_INVAL);
737 
738 	(void) pthread_rwlock_unlock(&chp->ch_lock);
739 	return ((fmd_case_t *)cip);
740 }
741 
742 static fmd_case_impl_t *
743 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
744 {
745 	fmd_case_impl_t *eip;
746 	uint_t h;
747 
748 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
749 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
750 
751 	for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
752 		if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 &&
753 		    fmd_case_tryhold(eip) != NULL) {
754 			(void) pthread_rwlock_unlock(&chp->ch_lock);
755 			return (eip); /* uuid already present */
756 		}
757 	}
758 
759 	cip->ci_next = chp->ch_hash[h];
760 	chp->ch_hash[h] = cip;
761 
762 	chp->ch_count++;
763 	ASSERT(chp->ch_count != 0);
764 
765 	(void) pthread_rwlock_unlock(&chp->ch_lock);
766 	return (cip);
767 }
768 
769 static void
770 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
771 {
772 	fmd_case_impl_t *cp, **pp;
773 	uint_t h;
774 
775 	ASSERT(MUTEX_HELD(&cip->ci_lock));
776 
777 	cip->ci_flags |= FMD_CF_DELETING;
778 	(void) pthread_mutex_unlock(&cip->ci_lock);
779 
780 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
781 
782 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
783 	pp = &chp->ch_hash[h];
784 
785 	for (cp = *pp; cp != NULL; cp = cp->ci_next) {
786 		if (cp != cip)
787 			pp = &cp->ci_next;
788 		else
789 			break;
790 	}
791 
792 	if (cp == NULL) {
793 		fmd_panic("case %p (%s) not found on hash chain %u\n",
794 		    (void *)cip, cip->ci_uuid, h);
795 	}
796 
797 	*pp = cp->ci_next;
798 	cp->ci_next = NULL;
799 
800 	/*
801 	 * delete from code hash if it is on it
802 	 */
803 	fmd_case_code_hash_delete(chp, cip);
804 
805 	ASSERT(chp->ch_count != 0);
806 	chp->ch_count--;
807 
808 	(void) pthread_rwlock_unlock(&chp->ch_lock);
809 
810 	(void) pthread_mutex_lock(&cip->ci_lock);
811 	ASSERT(cip->ci_flags & FMD_CF_DELETING);
812 }
813 
814 fmd_case_t *
815 fmd_case_create(fmd_module_t *mp, void *data)
816 {
817 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
818 	fmd_case_impl_t *eip = NULL;
819 	uuid_t uuid;
820 
821 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
822 	fmd_buf_hash_create(&cip->ci_bufs);
823 
824 	fmd_module_hold(mp);
825 	cip->ci_mod = mp;
826 	cip->ci_refs = 1;
827 	cip->ci_state = FMD_CASE_UNSOLVED;
828 	cip->ci_flags = FMD_CF_DIRTY;
829 	cip->ci_data = data;
830 
831 	/*
832 	 * Calling libuuid: get a clue.  The library interfaces cleverly do not
833 	 * define any constant for the length of an unparse string, and do not
834 	 * permit the caller to specify a buffer length for safety.  The spec
835 	 * says it will be 36 bytes, but we make it tunable just in case.
836 	 */
837 	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
838 	cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);
839 
840 	/*
841 	 * We expect this loop to execute only once, but code it defensively
842 	 * against the possibility of libuuid bugs.  Keep generating uuids and
843 	 * attempting to do a hash insert until we get a unique one.
844 	 */
845 	do {
846 		if (eip != NULL)
847 			fmd_case_rele((fmd_case_t *)eip);
848 		uuid_generate(uuid);
849 		uuid_unparse(uuid, cip->ci_uuid);
850 	} while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);
851 
852 	ASSERT(fmd_module_locked(mp));
853 	fmd_list_append(&mp->mod_cases, cip);
854 	fmd_module_setcdirty(mp);
855 
856 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
857 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
858 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
859 
860 	return ((fmd_case_t *)cip);
861 }
862 
863 static void
864 fmd_case_destroy_suspects(fmd_case_impl_t *cip)
865 {
866 	fmd_case_susp_t *cis, *ncis;
867 
868 	ASSERT(MUTEX_HELD(&cip->ci_lock));
869 
870 	for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
871 		ncis = cis->cis_next;
872 		nvlist_free(cis->cis_nvl);
873 		fmd_free(cis, sizeof (fmd_case_susp_t));
874 	}
875 
876 	cip->ci_suspects = NULL;
877 	cip->ci_nsuspects = 0;
878 }
879 
880 fmd_case_t *
881 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
882     uint_t state, const char *uuid, const char *code)
883 {
884 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
885 	fmd_case_impl_t *eip;
886 
887 	ASSERT(state < FMD_CASE_RESOLVED);
888 
889 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
890 	fmd_buf_hash_create(&cip->ci_bufs);
891 
892 	fmd_module_hold(mp);
893 	cip->ci_mod = mp;
894 	cip->ci_xprt = xp;
895 	cip->ci_refs = 1;
896 	cip->ci_state = state;
897 	cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
898 	cip->ci_uuidlen = strlen(cip->ci_uuid);
899 	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
900 	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;
901 
902 	if (state > FMD_CASE_CLOSE_WAIT)
903 		cip->ci_flags |= FMD_CF_SOLVED;
904 
905 	/*
906 	 * Insert the case into the global case hash.  If the specified UUID is
907 	 * already present, check to see if it is an orphan: if so, reclaim it;
908 	 * otherwise if it is owned by a different module then return NULL.
909 	 */
910 	if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
911 		(void) pthread_mutex_lock(&cip->ci_lock);
912 		cip->ci_refs--; /* decrement to zero */
913 		fmd_case_destroy((fmd_case_t *)cip, B_FALSE);
914 
915 		cip = eip; /* switch 'cip' to the existing case */
916 		(void) pthread_mutex_lock(&cip->ci_lock);
917 
918 		/*
919 		 * If the ASRU cache is trying to recreate an orphan, then just
920 		 * return the existing case that we found without changing it.
921 		 */
922 		if (mp == fmd.d_rmod) {
923 			/*
924 			 * When recreating an orphan case, state passed in may
925 			 * either be CLOSED (faulty) or REPAIRED (!faulty). If
926 			 * any suspects are still CLOSED (faulty) then the
927 			 * overall state needs to be CLOSED.
928 			 */
929 			if (state == FMD_CASE_CLOSED)
930 				cip->ci_state = FMD_CASE_CLOSED;
931 			(void) pthread_mutex_unlock(&cip->ci_lock);
932 			fmd_case_rele((fmd_case_t *)cip);
933 			return ((fmd_case_t *)cip);
934 		}
935 
936 		/*
937 		 * If the existing case isn't an orphan or is being proxied,
938 		 * then we have a UUID conflict: return failure to the caller.
939 		 */
940 		if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
941 			(void) pthread_mutex_unlock(&cip->ci_lock);
942 			fmd_case_rele((fmd_case_t *)cip);
943 			return (NULL);
944 		}
945 
946 		/*
947 		 * If the new module is reclaiming an orphaned case, remove
948 		 * the case from the root module, switch ci_mod, and then fall
949 		 * through to adding the case to the new owner module 'mp'.
950 		 */
951 		fmd_module_lock(cip->ci_mod);
952 		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
953 		fmd_module_unlock(cip->ci_mod);
954 
955 		fmd_module_rele(cip->ci_mod);
956 		cip->ci_mod = mp;
957 		fmd_module_hold(mp);
958 
959 		/*
960 		 * It's possible that fmd crashed or was restarted during a
961 		 * previous solve operation between the asru cache being created
962 		 * and the ckpt file being updated to SOLVED. Thus when the DE
963 		 * recreates the case here from the checkpoint file, the state
964 		 * will be UNSOLVED and yet we are having to reclaim because
965 		 * the case was in the asru cache. If this happens, revert the
966 		 * case back to the UNSOLVED state and let the DE solve it again
967 		 */
968 		if (state == FMD_CASE_UNSOLVED) {
969 			fmd_asru_hash_delete_case(fmd.d_asrus,
970 			    (fmd_case_t *)cip);
971 			fmd_case_destroy_suspects(cip);
972 			fmd_case_code_hash_delete(fmd.d_cases, cip);
973 			fmd_free(cip->ci_code, cip->ci_codelen);
974 			cip->ci_code = NULL;
975 			cip->ci_codelen = 0;
976 			cip->ci_tv_valid = 0;
977 		}
978 
979 		cip->ci_state = state;
980 
981 		(void) pthread_mutex_unlock(&cip->ci_lock);
982 		fmd_case_rele((fmd_case_t *)cip);
983 	} else {
984 		/*
985 		 * add into hash of solved cases
986 		 */
987 		if (cip->ci_code)
988 			fmd_case_code_hash_insert(fmd.d_cases, cip);
989 	}
990 
991 	ASSERT(fmd_module_locked(mp));
992 	fmd_list_append(&mp->mod_cases, cip);
993 
994 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
995 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
996 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
997 
998 	return ((fmd_case_t *)cip);
999 }
1000 
1001 void
1002 fmd_case_destroy(fmd_case_t *cp, int visible)
1003 {
1004 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1005 	fmd_case_item_t *cit, *ncit;
1006 
1007 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1008 	ASSERT(cip->ci_refs == 0);
1009 
1010 	if (visible) {
1011 		TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
1012 		fmd_case_hash_delete(fmd.d_cases, cip);
1013 	}
1014 
1015 	for (cit = cip->ci_items; cit != NULL; cit = ncit) {
1016 		ncit = cit->cit_next;
1017 		fmd_event_rele(cit->cit_event);
1018 		fmd_free(cit, sizeof (fmd_case_item_t));
1019 	}
1020 
1021 	fmd_case_destroy_suspects(cip);
1022 
1023 	if (cip->ci_principal != NULL)
1024 		fmd_event_rele(cip->ci_principal);
1025 
1026 	fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
1027 	fmd_free(cip->ci_code, cip->ci_codelen);
1028 	(void) fmd_buf_hash_destroy(&cip->ci_bufs);
1029 
1030 	fmd_module_rele(cip->ci_mod);
1031 	fmd_free(cip, sizeof (fmd_case_impl_t));
1032 }
1033 
1034 void
1035 fmd_case_hold(fmd_case_t *cp)
1036 {
1037 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1038 
1039 	(void) pthread_mutex_lock(&cip->ci_lock);
1040 	fmd_case_hold_locked(cp);
1041 	(void) pthread_mutex_unlock(&cip->ci_lock);
1042 }
1043 
1044 void
1045 fmd_case_hold_locked(fmd_case_t *cp)
1046 {
1047 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1048 
1049 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1050 	if (cip->ci_flags & FMD_CF_DELETING)
1051 		fmd_panic("attempt to hold a deleting case %p (%s)\n",
1052 		    (void *)cip, cip->ci_uuid);
1053 	cip->ci_refs++;
1054 	ASSERT(cip->ci_refs != 0);
1055 }
1056 
1057 static fmd_case_impl_t *
1058 fmd_case_tryhold(fmd_case_impl_t *cip)
1059 {
1060 	/*
1061 	 * If the case's "deleting" bit is unset, hold and return case,
1062 	 * otherwise, return NULL.
1063 	 */
1064 	(void) pthread_mutex_lock(&cip->ci_lock);
1065 	if (cip->ci_flags & FMD_CF_DELETING) {
1066 		(void) pthread_mutex_unlock(&cip->ci_lock);
1067 		cip = NULL;
1068 	} else {
1069 		fmd_case_hold_locked((fmd_case_t *)cip);
1070 		(void) pthread_mutex_unlock(&cip->ci_lock);
1071 	}
1072 	return (cip);
1073 }
1074 
1075 void
1076 fmd_case_rele(fmd_case_t *cp)
1077 {
1078 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1079 
1080 	(void) pthread_mutex_lock(&cip->ci_lock);
1081 	ASSERT(cip->ci_refs != 0);
1082 
1083 	if (--cip->ci_refs == 0)
1084 		fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
1085 	else
1086 		(void) pthread_mutex_unlock(&cip->ci_lock);
1087 }
1088 
1089 void
1090 fmd_case_rele_locked(fmd_case_t *cp)
1091 {
1092 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1093 
1094 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1095 	--cip->ci_refs;
1096 	ASSERT(cip->ci_refs != 0);
1097 }
1098 
1099 int
1100 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep)
1101 {
1102 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1103 	fmd_case_item_t *cit;
1104 	fmd_event_t *oep;
1105 	uint_t state;
1106 	int new;
1107 
1108 	fmd_event_hold(ep);
1109 	(void) pthread_mutex_lock(&cip->ci_lock);
1110 
1111 	if (cip->ci_flags & FMD_CF_SOLVED)
1112 		state = FMD_EVS_DIAGNOSED;
1113 	else
1114 		state = FMD_EVS_ACCEPTED;
1115 
1116 	oep = cip->ci_principal;
1117 	cip->ci_principal = ep;
1118 
1119 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1120 		if (cit->cit_event == ep)
1121 			break;
1122 	}
1123 
1124 	cip->ci_flags |= FMD_CF_DIRTY;
1125 	new = cit == NULL && ep != oep;
1126 
1127 	(void) pthread_mutex_unlock(&cip->ci_lock);
1128 
1129 	fmd_module_setcdirty(cip->ci_mod);
1130 	fmd_event_transition(ep, state);
1131 
1132 	if (oep != NULL)
1133 		fmd_event_rele(oep);
1134 
1135 	return (new);
1136 }
1137 
1138 int
1139 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep)
1140 {
1141 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1142 	fmd_case_item_t *cit;
1143 	uint_t state;
1144 	int new;
1145 
1146 	(void) pthread_mutex_lock(&cip->ci_lock);
1147 
1148 	if (cip->ci_flags & FMD_CF_SOLVED)
1149 		state = FMD_EVS_DIAGNOSED;
1150 	else
1151 		state = FMD_EVS_ACCEPTED;
1152 
1153 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1154 		if (cit->cit_event == ep)
1155 			break;
1156 	}
1157 
1158 	new = cit == NULL && ep != cip->ci_principal;
1159 
1160 	/*
1161 	 * If the event is already in the case or the case is already solved,
1162 	 * there is no reason to save it: just transition it appropriately.
1163 	 */
1164 	if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
1165 		(void) pthread_mutex_unlock(&cip->ci_lock);
1166 		fmd_event_transition(ep, state);
1167 		return (new);
1168 	}
1169 
1170 	cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
1171 	fmd_event_hold(ep);
1172 
1173 	cit->cit_next = cip->ci_items;
1174 	cit->cit_event = ep;
1175 
1176 	cip->ci_items = cit;
1177 	cip->ci_nitems++;
1178 
1179 	cip->ci_flags |= FMD_CF_DIRTY;
1180 	(void) pthread_mutex_unlock(&cip->ci_lock);
1181 
1182 	fmd_module_setcdirty(cip->ci_mod);
1183 	fmd_event_transition(ep, state);
1184 
1185 	return (new);
1186 }
1187 
1188 void
1189 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
1190 {
1191 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1192 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1193 
1194 	(void) pthread_mutex_lock(&cip->ci_lock);
1195 	ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT);
1196 	cip->ci_flags |= FMD_CF_DIRTY;
1197 
1198 	cis->cis_next = cip->ci_suspects;
1199 	cis->cis_nvl = nvl;
1200 
1201 	cip->ci_suspects = cis;
1202 	cip->ci_nsuspects++;
1203 
1204 	(void) pthread_mutex_unlock(&cip->ci_lock);
1205 	fmd_module_setcdirty(cip->ci_mod);
1206 }
1207 
1208 void
1209 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
1210 {
1211 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1212 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1213 	boolean_t b;
1214 
1215 	(void) pthread_mutex_lock(&cip->ci_lock);
1216 	ASSERT(cip->ci_state == FMD_CASE_CLOSED ||
1217 	    cip->ci_state == FMD_CASE_REPAIRED);
1218 	ASSERT(cip->ci_mod == fmd.d_rmod);
1219 
1220 	cis->cis_next = cip->ci_suspects;
1221 	cis->cis_nvl = nvl;
1222 
1223 	if (nvlist_lookup_boolean_value(nvl,
1224 	    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
1225 		cip->ci_flags |= FMD_CF_INVISIBLE;
1226 
1227 	cip->ci_suspects = cis;
1228 	cip->ci_nsuspects++;
1229 
1230 	(void) pthread_mutex_unlock(&cip->ci_lock);
1231 }
1232 
1233 void
1234 fmd_case_reset_suspects(fmd_case_t *cp)
1235 {
1236 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1237 
1238 	(void) pthread_mutex_lock(&cip->ci_lock);
1239 	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
1240 
1241 	fmd_case_destroy_suspects(cip);
1242 	cip->ci_flags |= FMD_CF_DIRTY;
1243 
1244 	(void) pthread_mutex_unlock(&cip->ci_lock);
1245 	fmd_module_setcdirty(cip->ci_mod);
1246 }
1247 
1248 /*ARGSUSED*/
1249 static void
1250 fmd_case_unusable(fmd_asru_link_t *alp, void *arg)
1251 {
1252 	(void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE);
1253 }
1254 
/*
 * Grab ci_lock and update the case state and set the dirty bit.  Then perform
 * whatever actions and emit whatever events are appropriate for the state.
 * Refer to the topmost block comment explaining the state machine for details.
 *
 * 'state' is the requested new state and must be <= FMD_CASE_RESOLVED
 * (asserted).  'flags' are FMD_CF_* bits to OR into ci_flags; the ISOLATED
 * and REPAIRED bits are dropped unless the case is already marked SOLVED or
 * SOLVED is among the bits passed in.  The flags are merged even when the
 * case is already at or beyond 'state', in which case nothing else is done.
 */
void
fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit;
	fmd_event_t *e;
	int resolved = 0;
	int any_unusable_and_present = 0;

	ASSERT(state <= FMD_CASE_RESOLVED);
	(void) pthread_mutex_lock(&cip->ci_lock);

	if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED))
		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED);

	cip->ci_flags |= flags;

	/* The state only ever moves forward; ignore backward/no-op requests. */
	if (cip->ci_state >= state) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return; /* already in specified state */
	}

	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));

	cip->ci_state = state;
	cip->ci_flags |= FMD_CF_DIRTY;

	/*
	 * The owning module's case list is only marked dirty when the case
	 * has no transport (ci_xprt) and is not owned by the root module.
	 */
	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
		fmd_module_setcdirty(cip->ci_mod);

	switch (state) {
	case FMD_CASE_SOLVED:
		/* Every event in the case, and the principal, is DIAGNOSED. */
		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);

		if (cip->ci_principal != NULL) {
			fmd_event_transition(cip->ci_principal,
			    FMD_EVS_DIAGNOSED);
		}
		break;

	case FMD_CASE_CLOSE_WAIT:
		/*
		 * If the case was never solved, do not change ASRUs.
		 * If the case was never fmd_case_closed, do not change ASRUs.
		 * If the case was repaired, do not change ASRUs.
		 */
		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
		    FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED))
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_case_unusable, NULL);

		/*
		 * If an orphaned case transitions to CLOSE_WAIT, the owning
		 * module is no longer loaded: continue on to CASE_CLOSED.
		 */
		if (fmd_case_orphaned(cp))
			state = cip->ci_state = FMD_CASE_CLOSED;
		break;

	case FMD_CASE_REPAIRED:
		ASSERT(fmd_case_orphaned(cp));

		/*
		 * If all suspects are already either usable or not present then
		 * transition straight to RESOLVED state, publishing both the
		 * list.repaired and list.resolved.
		 */
		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
		    fmd_case_unusable_and_present, &any_unusable_and_present);
		if (any_unusable_and_present)
			break;

		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);
		cip->ci_state = FMD_CASE_RESOLVED;
		/*
		 * ci_lock is dropped around fmd_case_publish() and re-taken
		 * before the break so the common unlock below stays balanced.
		 * 'state' is still FMD_CASE_REPAIRED for this first publish;
		 * it is then advanced so the publish after the switch is for
		 * FMD_CASE_RESOLVED.
		 */
		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_publish(cp, state);
		TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
		    _fmd_case_snames[FMD_CASE_REPAIRED],
		    _fmd_case_snames[FMD_CASE_RESOLVED]));
		state = FMD_CASE_RESOLVED;
		resolved = 1;
		(void) pthread_mutex_lock(&cip->ci_lock);
		break;

	case FMD_CASE_RESOLVED:
		ASSERT(fmd_case_orphaned(cp));

		/*
		 * If all suspects are already either usable or not present then
		 * carry on, publish list.resolved and discard the case.
		 */
		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
		    fmd_case_unusable_and_present, &any_unusable_and_present);
		if (any_unusable_and_present) {
			/*
			 * Nothing is published on this path, although
			 * ci_state was already set to RESOLVED above.
			 */
			(void) pthread_mutex_unlock(&cip->ci_lock);
			return;
		}

		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);
		resolved = 1;
		break;
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);

	/*
	 * If the module has initialized, then publish the appropriate event
	 * for the new case state.  If not, we are being called from the
	 * checkpoint code during module load, in which case the module's
	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
	 * may not be open yet, which will prevent us from computing the event
	 * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
	 * event in our queue: this won't be processed until _fmd_init is done.
	 */
	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
		fmd_case_publish(cp, state);
	else {
		/*
		 * NOTE(review): this hold is presumably owned by the queued
		 * PUBLISH event and dropped when it is processed -- confirm.
		 */
		fmd_case_hold(cp);
		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
	}

	if (resolved) {
		/*
		 * If we transitioned to RESOLVED, adjust the reference count to
		 * reflect our removal from fmd.d_rmod->mod_cases above.  If the
		 * caller has not placed an additional hold on the case, it
		 * will now be freed.
		 */
		(void) pthread_mutex_lock(&cip->ci_lock);
		fmd_asru_hash_delete_case(fmd.d_asrus, cp);
		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_rele(cp);
	}
}
1401 
1402 /*
1403  * Transition the specified case to *at least* the specified state by first
1404  * re-validating the suspect list using the resource cache.  This function is
1405  * employed by the checkpoint code when restoring a saved, solved case to see
1406  * if the state of the case has effectively changed while fmd was not running
1407  * or the module was not loaded.
1408  */
1409 void
1410 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
1411 {
1412 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1413 
1414 	int usable = 0;		/* are any suspects usable? */
1415 
1416 	ASSERT(state >= FMD_CASE_SOLVED);
1417 	(void) pthread_mutex_lock(&cip->ci_lock);
1418 
1419 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable);
1420 
1421 	(void) pthread_mutex_unlock(&cip->ci_lock);
1422 
1423 	if (!usable) {
1424 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
1425 		flags |= FMD_CF_ISOLATED;
1426 	}
1427 
1428 	fmd_case_transition(cp, state, flags);
1429 }
1430 
1431 void
1432 fmd_case_setdirty(fmd_case_t *cp)
1433 {
1434 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1435 
1436 	(void) pthread_mutex_lock(&cip->ci_lock);
1437 	cip->ci_flags |= FMD_CF_DIRTY;
1438 	(void) pthread_mutex_unlock(&cip->ci_lock);
1439 
1440 	fmd_module_setcdirty(cip->ci_mod);
1441 }
1442 
1443 void
1444 fmd_case_clrdirty(fmd_case_t *cp)
1445 {
1446 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1447 
1448 	(void) pthread_mutex_lock(&cip->ci_lock);
1449 	cip->ci_flags &= ~FMD_CF_DIRTY;
1450 	(void) pthread_mutex_unlock(&cip->ci_lock);
1451 }
1452 
1453 void
1454 fmd_case_commit(fmd_case_t *cp)
1455 {
1456 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1457 	fmd_case_item_t *cit;
1458 
1459 	(void) pthread_mutex_lock(&cip->ci_lock);
1460 
1461 	if (cip->ci_flags & FMD_CF_DIRTY) {
1462 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
1463 			fmd_event_commit(cit->cit_event);
1464 
1465 		if (cip->ci_principal != NULL)
1466 			fmd_event_commit(cip->ci_principal);
1467 
1468 		fmd_buf_hash_commit(&cip->ci_bufs);
1469 		cip->ci_flags &= ~FMD_CF_DIRTY;
1470 	}
1471 
1472 	(void) pthread_mutex_unlock(&cip->ci_lock);
1473 }
1474 
1475 /*
1476  * Indicate that the case may need to change state because one or more of the
1477  * ASRUs named as a suspect has changed state.  We examine all the suspects
1478  * and if none are still faulty, we initiate a case close transition.
1479  */
1480 void
1481 fmd_case_update(fmd_case_t *cp)
1482 {
1483 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1484 	uint_t cstate;
1485 	int faulty = 0;
1486 
1487 	(void) pthread_mutex_lock(&cip->ci_lock);
1488 	cstate = cip->ci_state;
1489 
1490 	if (cip->ci_xprt != NULL || cip->ci_state < FMD_CASE_SOLVED) {
1491 		(void) pthread_mutex_unlock(&cip->ci_lock);
1492 		return; /* update is not appropriate */
1493 	}
1494 
1495 	if (cip->ci_flags & FMD_CF_REPAIRED) {
1496 		(void) pthread_mutex_unlock(&cip->ci_lock);
1497 		return; /* already repaired */
1498 	}
1499 
1500 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
1501 	(void) pthread_mutex_unlock(&cip->ci_lock);
1502 
1503 	if (faulty) {
1504 		nvlist_t *nvl;
1505 		fmd_event_t *e;
1506 		char *class;
1507 
1508 		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
1509 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1510 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1511 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1512 		fmd_log_append(fmd.d_fltlog, e, cp);
1513 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
1514 		fmd_dispq_dispatch(fmd.d_disp, e, class);
1515 		return; /* one or more suspects are still marked faulty */
1516 	}
1517 
1518 	if (cstate == FMD_CASE_CLOSED)
1519 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1520 	else
1521 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1522 }
1523 
/*
 * Delete a closed case from the module's case list once the fmdo_close() entry
 * point has run to completion.  If the case is owned by a transport module,
 * tell the transport to proxy a case close on the other end of the transport.
 * If not, transition to the appropriate next state based on ci_flags.  This
 * function represents the end of CLOSE_WAIT and transitions the case to either
 * CLOSED or REPAIRED or discards it entirely because it was never solved;
 * refer to the topmost block comment explaining the state machine for details.
 *
 * The caller must hold the lock of the case's current module (asserted).
 */
void
fmd_case_delete(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_modstat_t *msp;
	size_t buftotal;

	ASSERT(fmd_module_locked(cip->ci_mod));
	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
	/* Remember how much was destroyed so the module stats can be fixed. */
	buftotal = fmd_buf_hash_destroy(&cip->ci_bufs);

	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
	msp = cip->ci_mod->mod_stats;

	ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0);
	msp->ms_caseopen.fmds_value.ui64--;

	ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal);
	msp->ms_buftotal.fmds_value.ui64 -= buftotal;

	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

	if (cip->ci_xprt == NULL)
		fmd_module_setcdirty(cip->ci_mod);

	/*
	 * Hand ownership to the root module: drop the hold on the old module
	 * and take one on fmd.d_rmod.  From here on fmd_case_orphaned() is
	 * true for this case.
	 */
	fmd_module_rele(cip->ci_mod);
	cip->ci_mod = fmd.d_rmod;
	fmd_module_hold(cip->ci_mod);

	/*
	 * If the case is not proxied and it has been solved, then retain it
	 * on the root module's case list at least until we're transitioned.
	 * Otherwise free the case with our final fmd_case_rele() below.
	 */
	if (cip->ci_xprt == NULL && (cip->ci_flags & FMD_CF_SOLVED)) {
		fmd_module_lock(cip->ci_mod);
		fmd_list_append(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);
		fmd_case_hold(cp);
	}

	/*
	 * If a proxied case finishes CLOSE_WAIT, then it can be discarded
	 * rather than orphaned because by definition it can have no entries
	 * in the resource cache of the current fault manager.
	 */
	if (cip->ci_xprt != NULL)
		fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid);
	else if (cip->ci_flags & FMD_CF_REPAIRED)
		fmd_case_transition(cp, FMD_CASE_REPAIRED, 0);
	else if (cip->ci_flags & FMD_CF_ISOLATED)
		fmd_case_transition(cp, FMD_CASE_CLOSED, 0);

	/*
	 * NOTE(review): presumably this drops the reference that represented
	 * membership on the original module's case list -- confirm.
	 */
	fmd_case_rele(cp);
}
1588 
1589 void
1590 fmd_case_discard(fmd_case_t *cp)
1591 {
1592 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1593 
1594 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1595 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--;
1596 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1597 
1598 	ASSERT(fmd_module_locked(cip->ci_mod));
1599 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1600 	fmd_case_rele(cp);
1601 }
1602 
1603 /*
1604  * Indicate that the problem corresponding to a case has been repaired by
1605  * clearing the faulty bit on each ASRU named as a suspect.  If the case hasn't
1606  * already been closed, this function initiates the transition to CLOSE_WAIT.
1607  * The caller must have the case held from fmd_case_hash_lookup(), so we can
1608  * grab and drop ci_lock without the case being able to be freed in between.
1609  */
1610 int
1611 fmd_case_repair(fmd_case_t *cp)
1612 {
1613 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1614 	uint_t cstate;
1615 
1616 	(void) pthread_mutex_lock(&cip->ci_lock);
1617 	cstate = cip->ci_state;
1618 
1619 	if (cip->ci_xprt != NULL) {
1620 		(void) pthread_mutex_unlock(&cip->ci_lock);
1621 		return (fmd_set_errno(EFMD_CASE_OWNER));
1622 	}
1623 
1624 	if (cstate < FMD_CASE_SOLVED) {
1625 		(void) pthread_mutex_unlock(&cip->ci_lock);
1626 		return (fmd_set_errno(EFMD_CASE_STATE));
1627 	}
1628 
1629 	if (cip->ci_flags & FMD_CF_REPAIRED) {
1630 		(void) pthread_mutex_unlock(&cip->ci_lock);
1631 		return (0); /* already repaired */
1632 	}
1633 
1634 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, NULL);
1635 	(void) pthread_mutex_unlock(&cip->ci_lock);
1636 
1637 	if (cstate == FMD_CASE_CLOSED)
1638 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1639 	else
1640 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1641 
1642 	return (0);
1643 }
1644 
1645 int
1646 fmd_case_acquit(fmd_case_t *cp)
1647 {
1648 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1649 	uint_t cstate;
1650 
1651 	(void) pthread_mutex_lock(&cip->ci_lock);
1652 	cstate = cip->ci_state;
1653 
1654 	if (cip->ci_xprt != NULL) {
1655 		(void) pthread_mutex_unlock(&cip->ci_lock);
1656 		return (fmd_set_errno(EFMD_CASE_OWNER));
1657 	}
1658 
1659 	if (cstate < FMD_CASE_SOLVED) {
1660 		(void) pthread_mutex_unlock(&cip->ci_lock);
1661 		return (fmd_set_errno(EFMD_CASE_STATE));
1662 	}
1663 
1664 	if (cip->ci_flags & FMD_CF_REPAIRED) {
1665 		(void) pthread_mutex_unlock(&cip->ci_lock);
1666 		return (0); /* already repaired */
1667 	}
1668 
1669 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_acquit, NULL);
1670 	(void) pthread_mutex_unlock(&cip->ci_lock);
1671 
1672 	if (cstate == FMD_CASE_CLOSED)
1673 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1674 	else
1675 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1676 
1677 	return (0);
1678 }
1679 
1680 int
1681 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep)
1682 {
1683 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1684 	fmd_case_item_t *cit;
1685 	uint_t state;
1686 	int rv = 0;
1687 
1688 	(void) pthread_mutex_lock(&cip->ci_lock);
1689 
1690 	if (cip->ci_state >= FMD_CASE_SOLVED)
1691 		state = FMD_EVS_DIAGNOSED;
1692 	else
1693 		state = FMD_EVS_ACCEPTED;
1694 
1695 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1696 		if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0)
1697 			break;
1698 	}
1699 
1700 	if (rv == 0 && cip->ci_principal != NULL)
1701 		rv = fmd_event_equal(ep, cip->ci_principal);
1702 
1703 	(void) pthread_mutex_unlock(&cip->ci_lock);
1704 
1705 	if (rv != 0)
1706 		fmd_event_transition(ep, state);
1707 
1708 	return (rv);
1709 }
1710 
1711 int
1712 fmd_case_orphaned(fmd_case_t *cp)
1713 {
1714 	return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod);
1715 }
1716 
1717 void
1718 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec)
1719 {
1720 	((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec;
1721 	((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec;
1722 	((fmd_case_impl_t *)cp)->ci_tv_valid = 1;
1723 }
1724 
/*
 * Per-case callback handed to fmd_case_hash_apply() by
 * fmd_case_repair_replay().  Cases that are not yet solved are skipped.
 * For the rest, the resource cache is consulted and one of the following
 * is replayed:
 *
 *  - state >= REPAIRED and no suspect faulty: dispatch list.repaired; and,
 *    if no suspect is both unusable and present, also move the case to
 *    RESOLVED, publish that state and drop a hold on the case.
 *  - some suspects faulty and some not: dispatch list.updated.
 *
 * 'arg' is unused.
 *
 * NOTE(review): ci_state is read and written here without taking ci_lock;
 * confirm that callers serialize access appropriately.
 */
/*ARGSUSED*/
void
fmd_case_repair_replay_case(fmd_case_t *cp, void *arg)
{
	int not_faulty = 0;
	int faulty = 0;
	nvlist_t *nvl;
	fmd_event_t *e;
	char *class;
	int any_unusable_and_present = 0;
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	if (cip->ci_state < FMD_CASE_SOLVED)
		return;

	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty,
	    &not_faulty);

	if (cip->ci_state >= FMD_CASE_REPAIRED && !faulty) {
		/*
		 * If none of the suspects is faulty, replay the list.repaired.
		 * If all suspects are already either usable or not present then
		 * also transition straight to RESOLVED state.
		 */
		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
		    fmd_case_unusable_and_present, &any_unusable_and_present);
		if (!any_unusable_and_present) {
			/*
			 * Take the case off its module's list and mark it
			 * RESOLVED before the events are generated.
			 */
			fmd_module_lock(cip->ci_mod);
			fmd_list_delete(&cip->ci_mod->mod_cases, cip);
			fmd_module_unlock(cip->ci_mod);
			cip->ci_state = FMD_CASE_RESOLVED;

			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
			    class);
			fmd_dispq_dispatch(fmd.d_disp, e, class);

			fmd_case_publish(cp, FMD_CASE_RESOLVED);
			(void) pthread_mutex_lock(&cip->ci_lock);
			fmd_asru_hash_delete_case(fmd.d_asrus, cp);
			(void) pthread_mutex_unlock(&cip->ci_lock);
			/*
			 * NOTE(review): presumably the reference that went
			 * with membership on mod_cases -- confirm.
			 */
			fmd_case_rele(cp);
		} else {
			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
			    class);
			fmd_dispq_dispatch(fmd.d_disp, e, class);
		}
	} else if (faulty && not_faulty) {
		/*
		 * if some but not all of the suspects are not faulty, replay
		 * the list.updated.
		 */
		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
		fmd_dispq_dispatch(fmd.d_disp, e, class);
	}
}
1787 
1788 void
1789 fmd_case_repair_replay()
1790 {
1791 	fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL);
1792 }
1793