xref: /titanic_50/usr/src/cmd/fm/fmd/common/fmd_case.c (revision 39b361b2ebefcef5612a54ae5cbd2179e19be296)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * FMD Case Subsystem
29  *
30  * Diagnosis engines are expected to group telemetry events related to the
31  * diagnosis of a particular problem on the system into a set of cases.  The
32  * diagnosis engine may have any number of cases open at a given point in time.
33  * Some cases may eventually be *solved* by associating a suspect list of one
34  * or more problems with the case, at which point fmd publishes a list.suspect
35  * event for the case and it becomes visible to administrators and agents.
36  *
37  * Every case is named using a UUID, and is globally visible in the case hash.
38  * Cases are reference-counted, except for the reference from the case hash
39  * itself.  Consumers of case references include modules, which store active
40  * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
41  *
42  * Cases obey the following state machine.  In states UNSOLVED, SOLVED, and
43  * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
44  * or transport) and the case is referenced by the mod_cases list.  Once the
45  * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
46  * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
47  *
48  *			+------------+
49  *	     +----------|  UNSOLVED  |
50  *	     |		+------------+
51  *	     |		      1 |
52  *	     |			|
53  *	     |		+-------v----+
54  *	   2 |		|    SOLVED  |
55  *	     |		+------------+
56  *	     |		    3 |  5 |
57  *	     +------------+   |    |
58  *			  |   |    |
59  *			+-v---v----v-+
60  *			| CLOSE_WAIT |
61  *			+------------+
62  *			  |   |    |
63  *	      +-----------+   |    +------------+
64  *	      |		    4 |			|
65  *	      v		+-----v------+		|
66  *	   discard      |   CLOSED   |	      6	|
67  *			+------------+		|
68  *			      |			|
69  *			      |	   +------------+
70  *			    7 |	   |
71  *			+-----v----v-+
72  *			|  REPAIRED  |
73  *			+------------+
74  *			      |
75  *			    8 |
76  *			+-----v------+
77  *			|  RESOLVED  |
78  *			+------------+
79  *			      |
80  *			      v
81  *			   discard
82  *
83  * The state machine changes are triggered by calls to fmd_case_transition()
84  * from various locations inside of fmd, as described below:
85  *
86  * [1] Called by: fmd_case_solve()
87  *       Actions: FMD_CF_SOLVED flag is set in ci_flags
88  *                conviction policy is applied to suspect list
89  *                suspects convicted are marked faulty (F) in R$
90  *                list.suspect event logged and dispatched
91  *
92  * [2] Called by: fmd_case_close(), fmd_case_uuclose()
93  *       Actions: diagnosis engine fmdo_close() entry point scheduled
94  *                case discarded upon exit from CLOSE_WAIT
95  *
96  * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
97  *       Actions: FMD_CF_ISOLATED flag is set in ci_flags
98  *                suspects convicted (F) are marked unusable (U) in R$
99  *                diagnosis engine fmdo_close() entry point scheduled
100  *                case transitions to CLOSED [4] upon exit from CLOSE_WAIT
101  *
102  * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
103  *       Actions: list.isolated event dispatched
104  *                case deleted from module's list of open cases
105  *
106  * [5] Called by: fmd_case_repair(), fmd_case_update()
107  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
108  *                diagnosis engine fmdo_close() entry point scheduled
109  *                case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
110  *
111  * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
112  *       Actions: suspects convicted are marked non faulty (!F) in R$
113  *                list.repaired or list.updated event dispatched
114  *
115  * [7] Called by: fmd_case_repair(), fmd_case_update()
116  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
117  *                suspects convicted are marked non faulty (!F) in R$
118  *                list.repaired or list.updated event dispatched
119  *
120  * [8] Called by: fmd_case_uuresolve()
121  *       Actions: list.resolved event dispatched
122  *		  case is discarded
123  */
124 
125 #include <sys/fm/protocol.h>
126 #include <uuid/uuid.h>
127 #include <alloca.h>
128 
129 #include <fmd_alloc.h>
130 #include <fmd_module.h>
131 #include <fmd_error.h>
132 #include <fmd_conf.h>
133 #include <fmd_case.h>
134 #include <fmd_string.h>
135 #include <fmd_subr.h>
136 #include <fmd_protocol.h>
137 #include <fmd_event.h>
138 #include <fmd_eventq.h>
139 #include <fmd_dispq.h>
140 #include <fmd_buf.h>
141 #include <fmd_log.h>
142 #include <fmd_asru.h>
143 #include <fmd_fmri.h>
144 #include <fmd_xprt.h>
145 
146 #include <fmd.h>
147 
/*
 * Human-readable names for the case states.  Each entry is annotated with the
 * FMD_CASE_* constant it names; presumably the array is indexed by that
 * constant's value -- confirm against the state definitions in fmd_case.h.
 */
static const char *const _fmd_case_snames[] = {
	"UNSOLVED",	/* FMD_CASE_UNSOLVED */
	"SOLVED",	/* FMD_CASE_SOLVED */
	"CLOSE_WAIT",	/* FMD_CASE_CLOSE_WAIT */
	"CLOSED",	/* FMD_CASE_CLOSED */
	"REPAIRED",	/* FMD_CASE_REPAIRED */
	"RESOLVED"	/* FMD_CASE_RESOLVED */
};

/*
 * Forward declaration: fmd_case_tryhold() is called by the hash routines in
 * this file ahead of its definition.
 */
static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *);
158 
159 fmd_case_hash_t *
160 fmd_case_hash_create(void)
161 {
162 	fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP);
163 
164 	(void) pthread_rwlock_init(&chp->ch_lock, NULL);
165 	chp->ch_hashlen = fmd.d_str_buckets;
166 	chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP);
167 	chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen,
168 	    FMD_SLEEP);
169 	chp->ch_count = 0;
170 
171 	return (chp);
172 }
173 
174 /*
175  * Destroy the case hash.  Unlike most of our hash tables, no active references
176  * are kept by the case hash itself; all references come from other subsystems.
177  * The hash must be destroyed after all modules are unloaded; if anything was
178  * present in the hash it would be by definition a reference count leak.
179  */
180 void
181 fmd_case_hash_destroy(fmd_case_hash_t *chp)
182 {
183 	fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen);
184 	fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen);
185 	fmd_free(chp, sizeof (fmd_case_hash_t));
186 }
187 
188 /*
189  * Take a snapshot of the case hash by placing an additional hold on each
190  * member in an auxiliary array, and then call 'func' for each case.
191  */
192 void
193 fmd_case_hash_apply(fmd_case_hash_t *chp,
194     void (*func)(fmd_case_t *, void *), void *arg)
195 {
196 	fmd_case_impl_t *cp, **cps, **cpp;
197 	uint_t cpc, i;
198 
199 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
200 
201 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
202 	cpc = chp->ch_count;
203 
204 	for (i = 0; i < chp->ch_hashlen; i++) {
205 		for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next)
206 			*cpp++ = fmd_case_tryhold(cp);
207 	}
208 
209 	ASSERT(cpp == cps + cpc);
210 	(void) pthread_rwlock_unlock(&chp->ch_lock);
211 
212 	for (i = 0; i < cpc; i++) {
213 		if (cps[i] != NULL) {
214 			func((fmd_case_t *)cps[i], arg);
215 			fmd_case_rele((fmd_case_t *)cps[i]);
216 		}
217 	}
218 
219 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
220 }
221 
222 static void
223 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
224 {
225 	uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
226 
227 	cip->ci_code_next = chp->ch_code_hash[h];
228 	chp->ch_code_hash[h] = cip;
229 }
230 
231 static void
232 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
233 {
234 	fmd_case_impl_t **pp, *cp;
235 
236 	if (cip->ci_code) {
237 		uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
238 
239 		pp = &chp->ch_code_hash[h];
240 		for (cp = *pp; cp != NULL; cp = cp->ci_code_next) {
241 			if (cp != cip)
242 				pp = &cp->ci_code_next;
243 			else
244 				break;
245 		}
246 		if (cp != NULL) {
247 			*pp = cp->ci_code_next;
248 			cp->ci_code_next = NULL;
249 		}
250 	}
251 }
252 
253 /*
254  * Look up the diagcode for this case and cache it in ci_code.  If no suspects
255  * were defined for this case or if the lookup fails, the event dictionary or
256  * module code is broken, and we set the event code to a precomputed default.
257  */
258 static const char *
259 fmd_case_mkcode(fmd_case_t *cp)
260 {
261 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
262 	fmd_case_susp_t *cis;
263 	fmd_case_hash_t *chp = fmd.d_cases;
264 
265 	char **keys, **keyp;
266 	const char *s;
267 
268 	ASSERT(MUTEX_HELD(&cip->ci_lock));
269 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
270 
271 	/*
272 	 * delete any existing entry from code hash if it is on it
273 	 */
274 	fmd_case_code_hash_delete(chp, cip);
275 
276 	fmd_free(cip->ci_code, cip->ci_codelen);
277 	cip->ci_codelen = cip->ci_mod->mod_codelen;
278 	cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
279 	keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1));
280 
281 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
282 		if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0)
283 			keyp++;
284 	}
285 
286 	*keyp = NULL; /* mark end of keys[] array for libdiagcode */
287 
288 	if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code(
289 	    cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) {
290 		(void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s);
291 		fmd_free(cip->ci_code, cip->ci_codelen);
292 		cip->ci_codelen = strlen(s) + 1;
293 		cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
294 		(void) strcpy(cip->ci_code, s);
295 	}
296 
297 	/*
298 	 * add into hash of solved cases
299 	 */
300 	fmd_case_code_hash_insert(chp, cip);
301 
302 	return (cip->ci_code);
303 }
304 
/*
 * Accumulator handed to fmd_case_set_lst() for each ASRU link of a case while
 * fmd_case_mkevent() gathers the per-suspect data for a list.* event.
 */
typedef struct {
	int	*fcl_countp;	/* number of entries filled in so far */
	uint8_t *fcl_ba;	/* per-suspect FM_SUSPECT_* status bytes */
	nvlist_t **fcl_nva;	/* per-suspect fault event nvlists */
	int	*fcl_msgp;	/* set to B_FALSE if any suspect declines msg */
} fmd_case_lst_t;
311 
312 static void
313 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg)
314 {
315 	fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg;
316 	boolean_t b;
317 	int state;
318 
319 	if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE,
320 	    &b) == 0 && b == B_FALSE)
321 		*entryp->fcl_msgp = B_FALSE;
322 	entryp->fcl_ba[*entryp->fcl_countp] = 0;
323 	state = fmd_asru_al_getstate(alp);
324 	if (state & FMD_ASRU_DEGRADED)
325 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED;
326 	if (state & FMD_ASRU_UNUSABLE)
327 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE;
328 	if (state & FMD_ASRU_FAULTY)
329 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY;
330 	if (!(state & FMD_ASRU_PRESENT))
331 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT;
332 	if (alp->al_reason == FMD_ASRU_REPAIRED)
333 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED;
334 	else if (alp->al_reason == FMD_ASRU_REPLACED)
335 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED;
336 	else if (alp->al_reason == FMD_ASRU_ACQUITTED)
337 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED;
338 	entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event;
339 	(*entryp->fcl_countp)++;
340 }
341 
342 static void
343 fmd_case_faulty(fmd_asru_link_t *alp, void *arg)
344 {
345 	int *faultyp = (int *)arg;
346 
347 	*faultyp |= (alp->al_flags & FMD_ASRU_FAULTY);
348 }
349 
350 static void
351 fmd_case_usable(fmd_asru_link_t *alp, void *arg)
352 {
353 	int *usablep = (int *)arg;
354 
355 	*usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE);
356 }
357 
358 static void
359 fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg)
360 {
361 	int *not_faultyp = (int *)arg;
362 
363 	*not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY);
364 }
365 
366 /*
367  * Have we got any suspects with an asru that are still unusable and present?
368  */
369 static void
370 fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg)
371 {
372 	int *rvalp = (int *)arg;
373 	int state = fmd_asru_al_getstate(alp);
374 	nvlist_t *asru;
375 
376 	if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0)
377 		return;
378 	*rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT));
379 }
380 
381 nvlist_t *
382 fmd_case_mkevent(fmd_case_t *cp, const char *class)
383 {
384 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
385 	nvlist_t **nva, *nvl;
386 	uint8_t *ba;
387 	int msg = B_TRUE;
388 	const char *code;
389 	fmd_case_lst_t fcl;
390 	int count = 0;
391 
392 	(void) pthread_mutex_lock(&cip->ci_lock);
393 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
394 
395 	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
396 	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
397 
398 	/*
399 	 * For each suspect associated with the case, store its fault event
400 	 * nvlist in 'nva'.  We also look to see if any of the suspect faults
401 	 * have asked not to be messaged.  If any of them have made such a
402 	 * request, propagate that attribute to the composite list.* event.
403 	 * Finally, store each suspect's faulty status into the bitmap 'ba'.
404 	 */
405 	fcl.fcl_countp = &count;
406 	fcl.fcl_msgp = &msg;
407 	fcl.fcl_ba = ba;
408 	fcl.fcl_nva = nva;
409 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
410 
411 	if (cip->ci_code == NULL)
412 		(void) fmd_case_mkcode(cp);
413 	/*
414 	 * For repair and updated event, we lookup diagcode from dict using key
415 	 * "list.repaired" or "list.updated" or "list.resolved".
416 	 */
417 	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
418 		(void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code);
419 	else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
420 		(void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code);
421 	else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0)
422 		(void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code);
423 	else
424 		code = cip->ci_code;
425 
426 	if (msg == B_FALSE)
427 		cip->ci_flags |= FMD_CF_INVISIBLE;
428 
429 	nvl = fmd_protocol_list(class, cip->ci_mod->mod_fmri, cip->ci_uuid,
430 	    code, count, nva, ba, msg, &cip->ci_tv);
431 
432 	(void) pthread_mutex_unlock(&cip->ci_lock);
433 	return (nvl);
434 }
435 
436 static boolean_t
437 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem)
438 {
439 	nvlist_t *new_rsrc;
440 	nvlist_t *rsrc;
441 	char *new_name = NULL;
442 	char *name = NULL;
443 	ssize_t new_namelen;
444 	ssize_t namelen;
445 	int fmri_present = 1;
446 	int new_fmri_present = 1;
447 	int match = B_FALSE;
448 	fmd_topo_t *ftp = fmd_topo_hold();
449 
450 	if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0)
451 		fmri_present = 0;
452 	else {
453 		if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1)
454 			goto done;
455 		name = fmd_alloc(namelen + 1, FMD_SLEEP);
456 		if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1)
457 			goto done;
458 	}
459 	if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0)
460 		new_fmri_present = 0;
461 	else {
462 		if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1)
463 			goto done;
464 		new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP);
465 		if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1)
466 			goto done;
467 	}
468 	match = (fmri_present == new_fmri_present &&
469 	    (fmri_present == 0 ||
470 	    topo_fmri_strcmp(ftp->ft_hdl, name, new_name)));
471 done:
472 	if (name != NULL)
473 		fmd_free(name, namelen + 1);
474 	if (new_name != NULL)
475 		fmd_free(new_name, new_namelen + 1);
476 	fmd_topo_rele(ftp);
477 	return (match);
478 }
479 
480 static int
481 fmd_case_match_suspect(fmd_case_susp_t *cis, fmd_case_susp_t *xcis)
482 {
483 	char *class, *new_class;
484 
485 	if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_ASRU))
486 		return (0);
487 	if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl,
488 	    FM_FAULT_RESOURCE))
489 		return (0);
490 	if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_FRU))
491 		return (0);
492 	(void) nvlist_lookup_string(xcis->cis_nvl, FM_CLASS, &class);
493 	(void) nvlist_lookup_string(cis->cis_nvl, FM_CLASS, &new_class);
494 	return (strcmp(class, new_class) == 0);
495 }
496 
497 /*
498  * see if an identical suspect list already exists in the cache
499  */
500 static int
501 fmd_case_check_for_dups(fmd_case_t *cp)
502 {
503 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp, *xcip;
504 	fmd_case_hash_t *chp = fmd.d_cases;
505 	fmd_case_susp_t *xcis, *cis;
506 	int match = 0, match_susp;
507 	uint_t h;
508 
509 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
510 
511 	/*
512 	 * Find all cases with this code
513 	 */
514 	h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
515 	for (xcip = chp->ch_code_hash[h]; xcip != NULL;
516 	    xcip = xcip->ci_code_next) {
517 		/*
518 		 * only look for any cases (apart from this one)
519 		 * whose code and number of suspects match
520 		 */
521 		if (xcip == cip || fmd_case_tryhold(xcip) == NULL)
522 			continue;
523 		if (strcmp(xcip->ci_code, cip->ci_code) != 0 ||
524 		    xcip->ci_nsuspects != cip->ci_nsuspects) {
525 			fmd_case_rele((fmd_case_t *)xcip);
526 			continue;
527 		}
528 
529 		/*
530 		 * For each suspect in one list, check if there
531 		 * is an identical suspect in the other list
532 		 */
533 		match = 1;
534 		for (xcis = xcip->ci_suspects; xcis != NULL;
535 		    xcis = xcis->cis_next) {
536 			match_susp = 0;
537 			for (cis = cip->ci_suspects; cis != NULL;
538 			    cis = cis->cis_next) {
539 				if (fmd_case_match_suspect(cis, xcis) == 1) {
540 					match_susp = 1;
541 					break;
542 				}
543 			}
544 			if (match_susp == 0) {
545 				match = 0;
546 				break;
547 			}
548 		}
549 		fmd_case_rele((fmd_case_t *)xcip);
550 		if (match) {
551 			(void) pthread_rwlock_unlock(&chp->ch_lock);
552 			return (1);
553 		}
554 	}
555 	(void) pthread_rwlock_unlock(&chp->ch_lock);
556 	return (0);
557 }
558 
559 /*
560  * Convict suspects in a case by applying a conviction policy and updating the
561  * resource cache prior to emitting the list.suspect event for the given case.
562  * At present, our policy is very simple: convict every suspect in the case.
563  * In the future, this policy can be extended and made configurable to permit:
564  *
565  * - convicting the suspect with the highest FIT rate
566  * - convicting the suspect with the cheapest FRU
567  * - convicting the suspect with the FRU that is in a depot's inventory
568  * - convicting the suspect with the longest lifetime
569  *
570  * and so forth.  A word to the wise: this problem is significantly harder that
571  * it seems at first glance.  Future work should heed the following advice:
572  *
573  * Hacking the policy into C code here is a very bad idea.  The policy needs to
574  * be decided upon very carefully and fundamentally encodes knowledge of what
575  * suspect list combinations can be emitted by what diagnosis engines.  As such
576  * fmd's code is the wrong location, because that would require fmd itself to
577  * be updated for every diagnosis engine change, defeating the entire design.
578  * The FMA Event Registry knows the suspect list combinations: policy inputs
579  * can be derived from it and used to produce per-module policy configuration.
580  *
581  * If the policy needs to be dynamic and not statically fixed at either fmd
582  * startup or module load time, any implementation of dynamic policy retrieval
583  * must employ some kind of caching mechanism or be part of a built-in module.
584  * The fmd_case_convict() function is called with locks held inside of fmd and
585  * is not a place where unbounded blocking on some inter-process or inter-
586  * system communication to another service (e.g. another daemon) can occur.
587  */
588 static int
589 fmd_case_convict(fmd_case_t *cp)
590 {
591 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
592 	fmd_asru_hash_t *ahp = fmd.d_asrus;
593 
594 	fmd_case_susp_t *cis;
595 	fmd_asru_link_t *alp;
596 
597 	(void) pthread_mutex_lock(&cip->ci_lock);
598 	(void) fmd_case_mkcode(cp);
599 	if (fmd_case_check_for_dups(cp) == 1) {
600 		(void) pthread_mutex_unlock(&cip->ci_lock);
601 		return (1);
602 	}
603 
604 	/*
605 	 * no suspect list already exists  - allocate new cache entries
606 	 */
607 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
608 		if ((alp = fmd_asru_hash_create_entry(ahp,
609 		    cp, cis->cis_nvl)) == NULL) {
610 			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
611 			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
612 			continue;
613 		}
614 		(void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0);
615 		(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
616 	}
617 
618 	(void) pthread_mutex_unlock(&cip->ci_lock);
619 	return (0);
620 }
621 
622 void
623 fmd_case_publish(fmd_case_t *cp, uint_t state)
624 {
625 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
626 	fmd_event_t *e;
627 	nvlist_t *nvl;
628 	char *class;
629 
630 	if (state == FMD_CASE_CURRENT)
631 		state = cip->ci_state; /* use current state */
632 
633 	switch (state) {
634 	case FMD_CASE_SOLVED:
635 		(void) pthread_mutex_lock(&cip->ci_lock);
636 		if (cip->ci_tv_valid == 0) {
637 			fmd_time_gettimeofday(&cip->ci_tv);
638 			cip->ci_tv_valid = 1;
639 		}
640 		(void) pthread_mutex_unlock(&cip->ci_lock);
641 
642 		if (fmd_case_convict(cp) == 1) { /* dupclose */
643 			cip->ci_flags &= ~FMD_CF_SOLVED;
644 			fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0);
645 			break;
646 		}
647 		nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
648 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
649 
650 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
651 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
652 		fmd_log_append(fmd.d_fltlog, e, cp);
653 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
654 		fmd_dispq_dispatch(fmd.d_disp, e, class);
655 
656 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
657 		cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
658 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
659 
660 		break;
661 
662 	case FMD_CASE_CLOSE_WAIT:
663 		fmd_case_hold(cp);
664 		e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
665 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
666 
667 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
668 		cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
669 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
670 
671 		break;
672 
673 	case FMD_CASE_CLOSED:
674 		nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
675 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
676 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
677 		fmd_dispq_dispatch(fmd.d_disp, e, class);
678 		break;
679 
680 	case FMD_CASE_REPAIRED:
681 		nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
682 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
683 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
684 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
685 		fmd_log_append(fmd.d_fltlog, e, cp);
686 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
687 		fmd_dispq_dispatch(fmd.d_disp, e, class);
688 		break;
689 
690 	case FMD_CASE_RESOLVED:
691 		nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS);
692 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
693 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
694 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
695 		fmd_log_append(fmd.d_fltlog, e, cp);
696 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
697 		fmd_dispq_dispatch(fmd.d_disp, e, class);
698 		break;
699 	}
700 }
701 
702 fmd_case_t *
703 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
704 {
705 	fmd_case_impl_t *cip;
706 	uint_t h;
707 
708 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
709 	h = fmd_strhash(uuid) % chp->ch_hashlen;
710 
711 	for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
712 		if (strcmp(cip->ci_uuid, uuid) == 0)
713 			break;
714 	}
715 
716 	/*
717 	 * If deleting bit is set, treat the case as if it doesn't exist.
718 	 */
719 	if (cip != NULL)
720 		cip = fmd_case_tryhold(cip);
721 
722 	if (cip == NULL)
723 		(void) fmd_set_errno(EFMD_CASE_INVAL);
724 
725 	(void) pthread_rwlock_unlock(&chp->ch_lock);
726 	return ((fmd_case_t *)cip);
727 }
728 
729 static fmd_case_impl_t *
730 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
731 {
732 	fmd_case_impl_t *eip;
733 	uint_t h;
734 
735 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
736 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
737 
738 	for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
739 		if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 &&
740 		    fmd_case_tryhold(eip) != NULL) {
741 			(void) pthread_rwlock_unlock(&chp->ch_lock);
742 			return (eip); /* uuid already present */
743 		}
744 	}
745 
746 	cip->ci_next = chp->ch_hash[h];
747 	chp->ch_hash[h] = cip;
748 
749 	chp->ch_count++;
750 	ASSERT(chp->ch_count != 0);
751 
752 	(void) pthread_rwlock_unlock(&chp->ch_lock);
753 	return (cip);
754 }
755 
756 static void
757 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
758 {
759 	fmd_case_impl_t *cp, **pp;
760 	uint_t h;
761 
762 	ASSERT(MUTEX_HELD(&cip->ci_lock));
763 
764 	cip->ci_flags |= FMD_CF_DELETING;
765 	(void) pthread_mutex_unlock(&cip->ci_lock);
766 
767 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
768 
769 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
770 	pp = &chp->ch_hash[h];
771 
772 	for (cp = *pp; cp != NULL; cp = cp->ci_next) {
773 		if (cp != cip)
774 			pp = &cp->ci_next;
775 		else
776 			break;
777 	}
778 
779 	if (cp == NULL) {
780 		fmd_panic("case %p (%s) not found on hash chain %u\n",
781 		    (void *)cip, cip->ci_uuid, h);
782 	}
783 
784 	*pp = cp->ci_next;
785 	cp->ci_next = NULL;
786 
787 	/*
788 	 * delete from code hash if it is on it
789 	 */
790 	fmd_case_code_hash_delete(chp, cip);
791 
792 	ASSERT(chp->ch_count != 0);
793 	chp->ch_count--;
794 
795 	(void) pthread_rwlock_unlock(&chp->ch_lock);
796 
797 	(void) pthread_mutex_lock(&cip->ci_lock);
798 	ASSERT(cip->ci_flags & FMD_CF_DELETING);
799 }
800 
801 fmd_case_t *
802 fmd_case_create(fmd_module_t *mp, void *data)
803 {
804 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
805 	fmd_case_impl_t *eip = NULL;
806 	uuid_t uuid;
807 
808 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
809 	fmd_buf_hash_create(&cip->ci_bufs);
810 
811 	fmd_module_hold(mp);
812 	cip->ci_mod = mp;
813 	cip->ci_refs = 1;
814 	cip->ci_state = FMD_CASE_UNSOLVED;
815 	cip->ci_flags = FMD_CF_DIRTY;
816 	cip->ci_data = data;
817 
818 	/*
819 	 * Calling libuuid: get a clue.  The library interfaces cleverly do not
820 	 * define any constant for the length of an unparse string, and do not
821 	 * permit the caller to specify a buffer length for safety.  The spec
822 	 * says it will be 36 bytes, but we make it tunable just in case.
823 	 */
824 	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
825 	cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);
826 
827 	/*
828 	 * We expect this loop to execute only once, but code it defensively
829 	 * against the possibility of libuuid bugs.  Keep generating uuids and
830 	 * attempting to do a hash insert until we get a unique one.
831 	 */
832 	do {
833 		if (eip != NULL)
834 			fmd_case_rele((fmd_case_t *)eip);
835 		uuid_generate(uuid);
836 		uuid_unparse(uuid, cip->ci_uuid);
837 	} while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);
838 
839 	ASSERT(fmd_module_locked(mp));
840 	fmd_list_append(&mp->mod_cases, cip);
841 	fmd_module_setcdirty(mp);
842 
843 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
844 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
845 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
846 
847 	return ((fmd_case_t *)cip);
848 }
849 
850 static void
851 fmd_case_destroy_suspects(fmd_case_impl_t *cip)
852 {
853 	fmd_case_susp_t *cis, *ncis;
854 
855 	ASSERT(MUTEX_HELD(&cip->ci_lock));
856 
857 	for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
858 		ncis = cis->cis_next;
859 		nvlist_free(cis->cis_nvl);
860 		fmd_free(cis, sizeof (fmd_case_susp_t));
861 	}
862 
863 	cip->ci_suspects = NULL;
864 	cip->ci_nsuspects = 0;
865 }
866 
867 fmd_case_t *
868 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
869     uint_t state, const char *uuid, const char *code)
870 {
871 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
872 	fmd_case_impl_t *eip;
873 
874 	ASSERT(state < FMD_CASE_RESOLVED);
875 
876 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
877 	fmd_buf_hash_create(&cip->ci_bufs);
878 
879 	fmd_module_hold(mp);
880 	cip->ci_mod = mp;
881 	cip->ci_xprt = xp;
882 	cip->ci_refs = 1;
883 	cip->ci_state = state;
884 	cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
885 	cip->ci_uuidlen = strlen(cip->ci_uuid);
886 	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
887 	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;
888 
889 	if (state > FMD_CASE_CLOSE_WAIT)
890 		cip->ci_flags |= FMD_CF_SOLVED;
891 
892 	/*
893 	 * Insert the case into the global case hash.  If the specified UUID is
894 	 * already present, check to see if it is an orphan: if so, reclaim it;
895 	 * otherwise if it is owned by a different module then return NULL.
896 	 */
897 	if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
898 		(void) pthread_mutex_lock(&cip->ci_lock);
899 		cip->ci_refs--; /* decrement to zero */
900 		fmd_case_destroy((fmd_case_t *)cip, B_FALSE);
901 
902 		cip = eip; /* switch 'cip' to the existing case */
903 		(void) pthread_mutex_lock(&cip->ci_lock);
904 
905 		/*
906 		 * If the ASRU cache is trying to recreate an orphan, then just
907 		 * return the existing case that we found without changing it.
908 		 */
909 		if (mp == fmd.d_rmod) {
910 			/*
911 			 * When recreating an orphan case, state passed in may
912 			 * either be CLOSED (faulty) or REPAIRED (!faulty). If
913 			 * any suspects are still CLOSED (faulty) then the
914 			 * overall state needs to be CLOSED.
915 			 */
916 			if (state == FMD_CASE_CLOSED)
917 				cip->ci_state = FMD_CASE_CLOSED;
918 			(void) pthread_mutex_unlock(&cip->ci_lock);
919 			fmd_case_rele((fmd_case_t *)cip);
920 			return ((fmd_case_t *)cip);
921 		}
922 
923 		/*
924 		 * If the existing case isn't an orphan or is being proxied,
925 		 * then we have a UUID conflict: return failure to the caller.
926 		 */
927 		if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
928 			(void) pthread_mutex_unlock(&cip->ci_lock);
929 			fmd_case_rele((fmd_case_t *)cip);
930 			return (NULL);
931 		}
932 
933 		/*
934 		 * If the new module is reclaiming an orphaned case, remove
935 		 * the case from the root module, switch ci_mod, and then fall
936 		 * through to adding the case to the new owner module 'mp'.
937 		 */
938 		fmd_module_lock(cip->ci_mod);
939 		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
940 		fmd_module_unlock(cip->ci_mod);
941 
942 		fmd_module_rele(cip->ci_mod);
943 		cip->ci_mod = mp;
944 		fmd_module_hold(mp);
945 
946 		fmd_case_destroy_suspects(cip);
947 		cip->ci_state = state;
948 
949 		(void) pthread_mutex_unlock(&cip->ci_lock);
950 		fmd_case_rele((fmd_case_t *)cip);
951 	} else {
952 		/*
953 		 * add into hash of solved cases
954 		 */
955 		if (cip->ci_code)
956 			fmd_case_code_hash_insert(fmd.d_cases, cip);
957 	}
958 
959 	ASSERT(fmd_module_locked(mp));
960 	fmd_list_append(&mp->mod_cases, cip);
961 
962 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
963 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
964 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
965 
966 	return ((fmd_case_t *)cip);
967 }
968 
969 void
970 fmd_case_destroy(fmd_case_t *cp, int visible)
971 {
972 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
973 	fmd_case_item_t *cit, *ncit;
974 
975 	ASSERT(MUTEX_HELD(&cip->ci_lock));
976 	ASSERT(cip->ci_refs == 0);
977 
978 	if (visible) {
979 		TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
980 		fmd_case_hash_delete(fmd.d_cases, cip);
981 	}
982 
983 	for (cit = cip->ci_items; cit != NULL; cit = ncit) {
984 		ncit = cit->cit_next;
985 		fmd_event_rele(cit->cit_event);
986 		fmd_free(cit, sizeof (fmd_case_item_t));
987 	}
988 
989 	fmd_case_destroy_suspects(cip);
990 
991 	if (cip->ci_principal != NULL)
992 		fmd_event_rele(cip->ci_principal);
993 
994 	fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
995 	fmd_free(cip->ci_code, cip->ci_codelen);
996 	(void) fmd_buf_hash_destroy(&cip->ci_bufs);
997 
998 	fmd_module_rele(cip->ci_mod);
999 	fmd_free(cip, sizeof (fmd_case_impl_t));
1000 }
1001 
1002 void
1003 fmd_case_hold(fmd_case_t *cp)
1004 {
1005 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1006 
1007 	(void) pthread_mutex_lock(&cip->ci_lock);
1008 	fmd_case_hold_locked(cp);
1009 	(void) pthread_mutex_unlock(&cip->ci_lock);
1010 }
1011 
1012 void
1013 fmd_case_hold_locked(fmd_case_t *cp)
1014 {
1015 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1016 
1017 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1018 	if (cip->ci_flags & FMD_CF_DELETING)
1019 		fmd_panic("attempt to hold a deleting case %p (%s)\n",
1020 		    (void *)cip, cip->ci_uuid);
1021 	cip->ci_refs++;
1022 	ASSERT(cip->ci_refs != 0);
1023 }
1024 
1025 static fmd_case_impl_t *
1026 fmd_case_tryhold(fmd_case_impl_t *cip)
1027 {
1028 	/*
1029 	 * If the case's "deleting" bit is unset, hold and return case,
1030 	 * otherwise, return NULL.
1031 	 */
1032 	(void) pthread_mutex_lock(&cip->ci_lock);
1033 	if (cip->ci_flags & FMD_CF_DELETING) {
1034 		(void) pthread_mutex_unlock(&cip->ci_lock);
1035 		cip = NULL;
1036 	} else {
1037 		fmd_case_hold_locked((fmd_case_t *)cip);
1038 		(void) pthread_mutex_unlock(&cip->ci_lock);
1039 	}
1040 	return (cip);
1041 }
1042 
1043 void
1044 fmd_case_rele(fmd_case_t *cp)
1045 {
1046 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1047 
1048 	(void) pthread_mutex_lock(&cip->ci_lock);
1049 	ASSERT(cip->ci_refs != 0);
1050 
1051 	if (--cip->ci_refs == 0)
1052 		fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
1053 	else
1054 		(void) pthread_mutex_unlock(&cip->ci_lock);
1055 }
1056 
1057 void
1058 fmd_case_rele_locked(fmd_case_t *cp)
1059 {
1060 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1061 
1062 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1063 	--cip->ci_refs;
1064 	ASSERT(cip->ci_refs != 0);
1065 }
1066 
1067 int
1068 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep)
1069 {
1070 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1071 	fmd_case_item_t *cit;
1072 	fmd_event_t *oep;
1073 	uint_t state;
1074 	int new;
1075 
1076 	fmd_event_hold(ep);
1077 	(void) pthread_mutex_lock(&cip->ci_lock);
1078 
1079 	if (cip->ci_flags & FMD_CF_SOLVED)
1080 		state = FMD_EVS_DIAGNOSED;
1081 	else
1082 		state = FMD_EVS_ACCEPTED;
1083 
1084 	oep = cip->ci_principal;
1085 	cip->ci_principal = ep;
1086 
1087 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1088 		if (cit->cit_event == ep)
1089 			break;
1090 	}
1091 
1092 	cip->ci_flags |= FMD_CF_DIRTY;
1093 	new = cit == NULL && ep != oep;
1094 
1095 	(void) pthread_mutex_unlock(&cip->ci_lock);
1096 
1097 	fmd_module_setcdirty(cip->ci_mod);
1098 	fmd_event_transition(ep, state);
1099 
1100 	if (oep != NULL)
1101 		fmd_event_rele(oep);
1102 
1103 	return (new);
1104 }
1105 
1106 int
1107 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep)
1108 {
1109 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1110 	fmd_case_item_t *cit;
1111 	uint_t state;
1112 	int new;
1113 
1114 	(void) pthread_mutex_lock(&cip->ci_lock);
1115 
1116 	if (cip->ci_flags & FMD_CF_SOLVED)
1117 		state = FMD_EVS_DIAGNOSED;
1118 	else
1119 		state = FMD_EVS_ACCEPTED;
1120 
1121 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1122 		if (cit->cit_event == ep)
1123 			break;
1124 	}
1125 
1126 	new = cit == NULL && ep != cip->ci_principal;
1127 
1128 	/*
1129 	 * If the event is already in the case or the case is already solved,
1130 	 * there is no reason to save it: just transition it appropriately.
1131 	 */
1132 	if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
1133 		(void) pthread_mutex_unlock(&cip->ci_lock);
1134 		fmd_event_transition(ep, state);
1135 		return (new);
1136 	}
1137 
1138 	cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
1139 	fmd_event_hold(ep);
1140 
1141 	cit->cit_next = cip->ci_items;
1142 	cit->cit_event = ep;
1143 
1144 	cip->ci_items = cit;
1145 	cip->ci_nitems++;
1146 
1147 	cip->ci_flags |= FMD_CF_DIRTY;
1148 	(void) pthread_mutex_unlock(&cip->ci_lock);
1149 
1150 	fmd_module_setcdirty(cip->ci_mod);
1151 	fmd_event_transition(ep, state);
1152 
1153 	return (new);
1154 }
1155 
1156 void
1157 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
1158 {
1159 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1160 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1161 
1162 	(void) pthread_mutex_lock(&cip->ci_lock);
1163 	ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT);
1164 	cip->ci_flags |= FMD_CF_DIRTY;
1165 
1166 	cis->cis_next = cip->ci_suspects;
1167 	cis->cis_nvl = nvl;
1168 
1169 	cip->ci_suspects = cis;
1170 	cip->ci_nsuspects++;
1171 
1172 	(void) pthread_mutex_unlock(&cip->ci_lock);
1173 	fmd_module_setcdirty(cip->ci_mod);
1174 }
1175 
1176 void
1177 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
1178 {
1179 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1180 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1181 	boolean_t b;
1182 
1183 	(void) pthread_mutex_lock(&cip->ci_lock);
1184 	ASSERT(cip->ci_state == FMD_CASE_CLOSED ||
1185 	    cip->ci_state == FMD_CASE_REPAIRED);
1186 	ASSERT(cip->ci_mod == fmd.d_rmod);
1187 
1188 	cis->cis_next = cip->ci_suspects;
1189 	cis->cis_nvl = nvl;
1190 
1191 	if (nvlist_lookup_boolean_value(nvl,
1192 	    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
1193 		cip->ci_flags |= FMD_CF_INVISIBLE;
1194 
1195 	cip->ci_suspects = cis;
1196 	cip->ci_nsuspects++;
1197 
1198 	(void) pthread_mutex_unlock(&cip->ci_lock);
1199 }
1200 
1201 void
1202 fmd_case_reset_suspects(fmd_case_t *cp)
1203 {
1204 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1205 
1206 	(void) pthread_mutex_lock(&cip->ci_lock);
1207 	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
1208 
1209 	fmd_case_destroy_suspects(cip);
1210 	cip->ci_flags |= FMD_CF_DIRTY;
1211 
1212 	(void) pthread_mutex_unlock(&cip->ci_lock);
1213 	fmd_module_setcdirty(cip->ci_mod);
1214 }
1215 
1216 /*ARGSUSED*/
1217 static void
1218 fmd_case_unusable(fmd_asru_link_t *alp, void *arg)
1219 {
1220 	(void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE);
1221 }
1222 
/*
 * Grab ci_lock and update the case state and set the dirty bit.  Then perform
 * whatever actions and emit whatever events are appropriate for the state.
 * Refer to the topmost block comment explaining the state machine for details.
 *
 * 'state' is the requested target state and must not exceed
 * FMD_CASE_RESOLVED.  'flags' are OR-ed into ci_flags even if no state change
 * takes place; however, FMD_CF_ISOLATED and FMD_CF_REPAIRED are dropped from
 * 'flags' first if the case is not solved and FMD_CF_SOLVED is not being set
 * by this call.  The state only ever moves forward: if the case is already
 * at or beyond 'state', the function returns after updating the flags.
 *
 * If the case ends up RESOLVED, the reference associated with its presence on
 * the owning module's case list is released at the end, which may free the
 * case unless the caller holds an additional reference.
 */
void
fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit;
	fmd_event_t *e;
	int resolved = 0;	/* set once the case is off mod_cases */
	int any_unusable_and_present = 0;

	ASSERT(state <= FMD_CASE_RESOLVED);
	(void) pthread_mutex_lock(&cip->ci_lock);

	/*
	 * An unsolved case that is not being solved by this call cannot be
	 * marked isolated or repaired: strip those bits from the request.
	 */
	if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED))
		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED);

	cip->ci_flags |= flags;

	if (cip->ci_state >= state) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return; /* already in specified state */
	}

	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));

	cip->ci_state = state;
	cip->ci_flags |= FMD_CF_DIRTY;

	/*
	 * The module is only marked dirty for cases that are neither proxied
	 * (ci_xprt set) nor owned by the root module.
	 */
	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
		fmd_module_setcdirty(cip->ci_mod);

	switch (state) {
	case FMD_CASE_SOLVED:
		/*
		 * Every event attached to the case, including the principal,
		 * is now considered diagnosed.
		 */
		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);

		if (cip->ci_principal != NULL) {
			fmd_event_transition(cip->ci_principal,
			    FMD_EVS_DIAGNOSED);
		}
		break;

	case FMD_CASE_CLOSE_WAIT:
		/*
		 * If the case was never solved, do not change ASRUs.
		 * If the case was never fmd_case_closed, do not change ASRUs.
		 * If the case was repaired, do not change ASRUs.
		 */
		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
		    FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED))
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_case_unusable, NULL);

		/*
		 * If an orphaned case transitions to CLOSE_WAIT, the owning
		 * module is no longer loaded: continue on to CASE_CLOSED.
		 */
		if (fmd_case_orphaned(cp))
			state = cip->ci_state = FMD_CASE_CLOSED;
		break;

	case FMD_CASE_REPAIRED:
		ASSERT(fmd_case_orphaned(cp));

		/*
		 * If all suspects are already either usable or not present then
		 * transition straight to RESOLVED state, publishing both the
		 * list.repaired and list.resolved.
		 */
		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
		    fmd_case_unusable_and_present, &any_unusable_and_present);
		if (any_unusable_and_present)
			break;

		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);
		cip->ci_state = FMD_CASE_RESOLVED;

		/*
		 * ci_lock is dropped around fmd_case_publish().  'state' is
		 * still FMD_CASE_REPAIRED at this point, so this call publishes
		 * for REPAIRED; 'state' is then advanced so that the common
		 * publish code below handles RESOLVED.
		 */
		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_publish(cp, state);
		TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
		    _fmd_case_snames[FMD_CASE_REPAIRED],
		    _fmd_case_snames[FMD_CASE_RESOLVED]));
		state = FMD_CASE_RESOLVED;
		resolved = 1;
		(void) pthread_mutex_lock(&cip->ci_lock);
		break;

	case FMD_CASE_RESOLVED:
		ASSERT(fmd_case_orphaned(cp));

		/*
		 * If all suspects are already either usable or not present then
		 * carry on, publish list.resolved and discard the case.
		 */
		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
		    fmd_case_unusable_and_present, &any_unusable_and_present);
		if (any_unusable_and_present) {
			/*
			 * NOTE(review): ci_state and FMD_CF_DIRTY were already
			 * updated above and are not rolled back on this early
			 * return; nothing is published and the case stays on
			 * the module's case list.
			 */
			(void) pthread_mutex_unlock(&cip->ci_lock);
			return;
		}

		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);
		resolved = 1;
		break;
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);

	/*
	 * If the module has initialized, then publish the appropriate event
	 * for the new case state.  If not, we are being called from the
	 * checkpoint code during module load, in which case the module's
	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
	 * may not be open yet, which will prevent us from computing the event
	 * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
	 * event in our queue: this won't be processed until _fmd_init is done.
	 */
	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
		fmd_case_publish(cp, state);
	else {
		/* the hold taken here accompanies the queued PUBLISH event */
		fmd_case_hold(cp);
		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
	}

	if (resolved) {
		/*
		 * If we transitioned to RESOLVED, adjust the reference count to
		 * reflect our removal from fmd.d_rmod->mod_cases above.  If the
		 * caller has not placed an additional hold on the case, it
		 * will now be freed.
		 */
		(void) pthread_mutex_lock(&cip->ci_lock);
		fmd_asru_hash_delete_case(fmd.d_asrus, cp);
		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_rele(cp);
	}
}
1369 
1370 /*
1371  * Transition the specified case to *at least* the specified state by first
1372  * re-validating the suspect list using the resource cache.  This function is
1373  * employed by the checkpoint code when restoring a saved, solved case to see
1374  * if the state of the case has effectively changed while fmd was not running
1375  * or the module was not loaded.
1376  */
1377 void
1378 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
1379 {
1380 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1381 
1382 	int usable = 0;		/* are any suspects usable? */
1383 
1384 	ASSERT(state >= FMD_CASE_SOLVED);
1385 	(void) pthread_mutex_lock(&cip->ci_lock);
1386 
1387 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable);
1388 
1389 	(void) pthread_mutex_unlock(&cip->ci_lock);
1390 
1391 	if (!usable) {
1392 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
1393 		flags |= FMD_CF_ISOLATED;
1394 	}
1395 
1396 	fmd_case_transition(cp, state, flags);
1397 }
1398 
1399 void
1400 fmd_case_setdirty(fmd_case_t *cp)
1401 {
1402 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1403 
1404 	(void) pthread_mutex_lock(&cip->ci_lock);
1405 	cip->ci_flags |= FMD_CF_DIRTY;
1406 	(void) pthread_mutex_unlock(&cip->ci_lock);
1407 
1408 	fmd_module_setcdirty(cip->ci_mod);
1409 }
1410 
1411 void
1412 fmd_case_clrdirty(fmd_case_t *cp)
1413 {
1414 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1415 
1416 	(void) pthread_mutex_lock(&cip->ci_lock);
1417 	cip->ci_flags &= ~FMD_CF_DIRTY;
1418 	(void) pthread_mutex_unlock(&cip->ci_lock);
1419 }
1420 
1421 void
1422 fmd_case_commit(fmd_case_t *cp)
1423 {
1424 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1425 	fmd_case_item_t *cit;
1426 
1427 	(void) pthread_mutex_lock(&cip->ci_lock);
1428 
1429 	if (cip->ci_flags & FMD_CF_DIRTY) {
1430 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
1431 			fmd_event_commit(cit->cit_event);
1432 
1433 		if (cip->ci_principal != NULL)
1434 			fmd_event_commit(cip->ci_principal);
1435 
1436 		fmd_buf_hash_commit(&cip->ci_bufs);
1437 		cip->ci_flags &= ~FMD_CF_DIRTY;
1438 	}
1439 
1440 	(void) pthread_mutex_unlock(&cip->ci_lock);
1441 }
1442 
1443 /*
1444  * Indicate that the case may need to change state because one or more of the
1445  * ASRUs named as a suspect has changed state.  We examine all the suspects
1446  * and if none are still faulty, we initiate a case close transition.
1447  */
1448 void
1449 fmd_case_update(fmd_case_t *cp)
1450 {
1451 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1452 	uint_t cstate;
1453 	int faulty = 0;
1454 
1455 	(void) pthread_mutex_lock(&cip->ci_lock);
1456 	cstate = cip->ci_state;
1457 
1458 	if (cip->ci_xprt != NULL || cip->ci_state < FMD_CASE_SOLVED) {
1459 		(void) pthread_mutex_unlock(&cip->ci_lock);
1460 		return; /* update is not appropriate */
1461 	}
1462 
1463 	if (cip->ci_flags & FMD_CF_REPAIRED) {
1464 		(void) pthread_mutex_unlock(&cip->ci_lock);
1465 		return; /* already repaired */
1466 	}
1467 
1468 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
1469 	(void) pthread_mutex_unlock(&cip->ci_lock);
1470 
1471 	if (faulty) {
1472 		nvlist_t *nvl;
1473 		fmd_event_t *e;
1474 		char *class;
1475 
1476 		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
1477 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1478 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1479 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1480 		fmd_log_append(fmd.d_fltlog, e, cp);
1481 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
1482 		fmd_dispq_dispatch(fmd.d_disp, e, class);
1483 		return; /* one or more suspects are still marked faulty */
1484 	}
1485 
1486 	if (cstate == FMD_CASE_CLOSED)
1487 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1488 	else
1489 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1490 }
1491 
1492 /*
1493  * Delete a closed case from the module's case list once the fmdo_close() entry
1494  * point has run to completion.  If the case is owned by a transport module,
1495  * tell the transport to proxy a case close on the other end of the transport.
1496  * If not, transition to the appropriate next state based on ci_flags.  This
1497  * function represents the end of CLOSE_WAIT and transitions the case to either
1498  * CLOSED or REPAIRED or discards it entirely because it was never solved;
1499  * refer to the topmost block comment explaining the state machine for details.
1500  */
1501 void
1502 fmd_case_delete(fmd_case_t *cp)
1503 {
1504 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1505 	fmd_modstat_t *msp;
1506 	size_t buftotal;
1507 
1508 	ASSERT(fmd_module_locked(cip->ci_mod));
1509 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1510 	buftotal = fmd_buf_hash_destroy(&cip->ci_bufs);
1511 
1512 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1513 	msp = cip->ci_mod->mod_stats;
1514 
1515 	ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0);
1516 	msp->ms_caseopen.fmds_value.ui64--;
1517 
1518 	ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal);
1519 	msp->ms_buftotal.fmds_value.ui64 -= buftotal;
1520 
1521 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1522 
1523 	if (cip->ci_xprt == NULL)
1524 		fmd_module_setcdirty(cip->ci_mod);
1525 
1526 	fmd_module_rele(cip->ci_mod);
1527 	cip->ci_mod = fmd.d_rmod;
1528 	fmd_module_hold(cip->ci_mod);
1529 
1530 	/*
1531 	 * If the case is not proxied and it has been solved, then retain it
1532 	 * on the root module's case list at least until we're transitioned.
1533 	 * Otherwise free the case with our final fmd_case_rele() below.
1534 	 */
1535 	if (cip->ci_xprt == NULL && (cip->ci_flags & FMD_CF_SOLVED)) {
1536 		fmd_module_lock(cip->ci_mod);
1537 		fmd_list_append(&cip->ci_mod->mod_cases, cip);
1538 		fmd_module_unlock(cip->ci_mod);
1539 		fmd_case_hold(cp);
1540 	}
1541 
1542 	/*
1543 	 * If a proxied case finishes CLOSE_WAIT, then it can be discarded
1544 	 * rather than orphaned because by definition it can have no entries
1545 	 * in the resource cache of the current fault manager.
1546 	 */
1547 	if (cip->ci_xprt != NULL)
1548 		fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid);
1549 	else if (cip->ci_flags & FMD_CF_REPAIRED)
1550 		fmd_case_transition(cp, FMD_CASE_REPAIRED, 0);
1551 	else if (cip->ci_flags & FMD_CF_ISOLATED)
1552 		fmd_case_transition(cp, FMD_CASE_CLOSED, 0);
1553 
1554 	fmd_case_rele(cp);
1555 }
1556 
1557 void
1558 fmd_case_discard(fmd_case_t *cp)
1559 {
1560 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1561 
1562 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1563 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--;
1564 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1565 
1566 	ASSERT(fmd_module_locked(cip->ci_mod));
1567 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1568 	fmd_case_rele(cp);
1569 }
1570 
1571 /*
1572  * Indicate that the problem corresponding to a case has been repaired by
1573  * clearing the faulty bit on each ASRU named as a suspect.  If the case hasn't
1574  * already been closed, this function initiates the transition to CLOSE_WAIT.
1575  * The caller must have the case held from fmd_case_hash_lookup(), so we can
1576  * grab and drop ci_lock without the case being able to be freed in between.
1577  */
1578 int
1579 fmd_case_repair(fmd_case_t *cp)
1580 {
1581 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1582 	uint_t cstate;
1583 
1584 	(void) pthread_mutex_lock(&cip->ci_lock);
1585 	cstate = cip->ci_state;
1586 
1587 	if (cip->ci_xprt != NULL) {
1588 		(void) pthread_mutex_unlock(&cip->ci_lock);
1589 		return (fmd_set_errno(EFMD_CASE_OWNER));
1590 	}
1591 
1592 	if (cstate < FMD_CASE_SOLVED) {
1593 		(void) pthread_mutex_unlock(&cip->ci_lock);
1594 		return (fmd_set_errno(EFMD_CASE_STATE));
1595 	}
1596 
1597 	if (cip->ci_flags & FMD_CF_REPAIRED) {
1598 		(void) pthread_mutex_unlock(&cip->ci_lock);
1599 		return (0); /* already repaired */
1600 	}
1601 
1602 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, NULL);
1603 	(void) pthread_mutex_unlock(&cip->ci_lock);
1604 
1605 	if (cstate == FMD_CASE_CLOSED)
1606 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1607 	else
1608 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1609 
1610 	return (0);
1611 }
1612 
1613 int
1614 fmd_case_acquit(fmd_case_t *cp)
1615 {
1616 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1617 	uint_t cstate;
1618 
1619 	(void) pthread_mutex_lock(&cip->ci_lock);
1620 	cstate = cip->ci_state;
1621 
1622 	if (cip->ci_xprt != NULL) {
1623 		(void) pthread_mutex_unlock(&cip->ci_lock);
1624 		return (fmd_set_errno(EFMD_CASE_OWNER));
1625 	}
1626 
1627 	if (cstate < FMD_CASE_SOLVED) {
1628 		(void) pthread_mutex_unlock(&cip->ci_lock);
1629 		return (fmd_set_errno(EFMD_CASE_STATE));
1630 	}
1631 
1632 	if (cip->ci_flags & FMD_CF_REPAIRED) {
1633 		(void) pthread_mutex_unlock(&cip->ci_lock);
1634 		return (0); /* already repaired */
1635 	}
1636 
1637 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_acquit, NULL);
1638 	(void) pthread_mutex_unlock(&cip->ci_lock);
1639 
1640 	if (cstate == FMD_CASE_CLOSED)
1641 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1642 	else
1643 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1644 
1645 	return (0);
1646 }
1647 
1648 int
1649 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep)
1650 {
1651 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1652 	fmd_case_item_t *cit;
1653 	uint_t state;
1654 	int rv = 0;
1655 
1656 	(void) pthread_mutex_lock(&cip->ci_lock);
1657 
1658 	if (cip->ci_state >= FMD_CASE_SOLVED)
1659 		state = FMD_EVS_DIAGNOSED;
1660 	else
1661 		state = FMD_EVS_ACCEPTED;
1662 
1663 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1664 		if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0)
1665 			break;
1666 	}
1667 
1668 	if (rv == 0 && cip->ci_principal != NULL)
1669 		rv = fmd_event_equal(ep, cip->ci_principal);
1670 
1671 	(void) pthread_mutex_unlock(&cip->ci_lock);
1672 
1673 	if (rv != 0)
1674 		fmd_event_transition(ep, state);
1675 
1676 	return (rv);
1677 }
1678 
1679 int
1680 fmd_case_orphaned(fmd_case_t *cp)
1681 {
1682 	return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod);
1683 }
1684 
1685 void
1686 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec)
1687 {
1688 	((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec;
1689 	((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec;
1690 	((fmd_case_impl_t *)cp)->ci_tv_valid = 1;
1691 }
1692 
1693 /*ARGSUSED*/
1694 void
1695 fmd_case_repair_replay_case(fmd_case_t *cp, void *arg)
1696 {
1697 	int not_faulty = 0;
1698 	int faulty = 0;
1699 	nvlist_t *nvl;
1700 	fmd_event_t *e;
1701 	char *class;
1702 	int any_unusable_and_present = 0;
1703 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1704 
1705 	if (cip->ci_state < FMD_CASE_SOLVED)
1706 		return;
1707 
1708 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
1709 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty,
1710 	    &not_faulty);
1711 
1712 	if (!faulty) {
1713 		/*
1714 		 * If none of the suspects is faulty, replay the list.repaired.
1715 		 * If all suspects are already either usable or not present then
1716 		 * also transition straight to RESOLVED state.
1717 		 */
1718 		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1719 		    fmd_case_unusable_and_present, &any_unusable_and_present);
1720 		if (!any_unusable_and_present) {
1721 			fmd_module_lock(cip->ci_mod);
1722 			fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1723 			fmd_module_unlock(cip->ci_mod);
1724 			cip->ci_state = FMD_CASE_RESOLVED;
1725 
1726 			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
1727 			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1728 			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
1729 			    class);
1730 			fmd_dispq_dispatch(fmd.d_disp, e, class);
1731 
1732 			fmd_case_publish(cp, FMD_CASE_RESOLVED);
1733 			(void) pthread_mutex_lock(&cip->ci_lock);
1734 			fmd_asru_hash_delete_case(fmd.d_asrus, cp);
1735 			(void) pthread_mutex_unlock(&cip->ci_lock);
1736 			fmd_case_rele(cp);
1737 		} else {
1738 			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
1739 			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1740 			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
1741 			    class);
1742 			fmd_dispq_dispatch(fmd.d_disp, e, class);
1743 		}
1744 	} else if (not_faulty) {
1745 		/*
1746 		 * if some but not all of the suspects are not faulty, replay
1747 		 * the list.updated.
1748 		 */
1749 		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
1750 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1751 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1752 		fmd_dispq_dispatch(fmd.d_disp, e, class);
1753 	}
1754 }
1755 
1756 void
1757 fmd_case_repair_replay()
1758 {
1759 	fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL);
1760 }
1761