xref: /titanic_52/usr/src/cmd/fm/fmd/common/fmd_case.c (revision 6d2259e1baf8d4ac11c96570f45ecdcd9771a68d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * FMD Case Subsystem
29  *
30  * Diagnosis engines are expected to group telemetry events related to the
31  * diagnosis of a particular problem on the system into a set of cases.  The
32  * diagnosis engine may have any number of cases open at a given point in time.
33  * Some cases may eventually be *solved* by associating a suspect list of one
34  * or more problems with the case, at which point fmd publishes a list.suspect
35  * event for the case and it becomes visible to administrators and agents.
36  *
37  * Every case is named using a UUID, and is globally visible in the case hash.
38  * Cases are reference-counted, except for the reference from the case hash
39  * itself.  Consumers of case references include modules, which store active
40  * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
41  *
42  * Cases obey the following state machine.  In states UNSOLVED, SOLVED, and
43  * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
44  * or transport) and the case is referenced by the mod_cases list.  Once the
45  * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
46  * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
47  *
48  *			+------------+
49  *	     +----------|  UNSOLVED  |
50  *	     |		+------------+
51  *	     |		      1 |
52  *	     |			|
53  *	     |		+-------v----+
54  *	   2 |		|    SOLVED  |
55  *	     |		+------------+
56  *	     |		    3 |  5 |
57  *	     +------------+   |    |
58  *			  |   |    |
59  *			+-v---v----v-+
60  *			| CLOSE_WAIT |
61  *			+------------+
62  *			  |   |    |
63  *	      +-----------+   |    +------------+
64  *	      |		    4 |			|
65  *	      v		+-----v------+		|
66  *	   discard      |   CLOSED   |	      6	|
67  *			+------------+		|
68  *			      |			|
69  *			      |	   +------------+
70  *			    7 |	   |
71  *			+-----v----v-+
72  *			|  REPAIRED  |
73  *			+------------+
74  *			      |
75  *			    8 |
76  *			+-----v------+
77  *			|  RESOLVED  |
78  *			+------------+
79  *			      |
80  *			      v
81  *			   discard
82  *
83  * The state machine changes are triggered by calls to fmd_case_transition()
84  * from various locations inside of fmd, as described below:
85  *
86  * [1] Called by: fmd_case_solve()
87  *       Actions: FMD_CF_SOLVED flag is set in ci_flags
88  *                conviction policy is applied to suspect list
89  *                suspects convicted are marked faulty (F) in R$
90  *                list.suspect event logged and dispatched
91  *
92  * [2] Called by: fmd_case_close(), fmd_case_uuclose()
93  *       Actions: diagnosis engine fmdo_close() entry point scheduled
94  *                case discarded upon exit from CLOSE_WAIT
95  *
96  * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
97  *       Actions: FMD_CF_ISOLATED flag is set in ci_flags
98  *                suspects convicted (F) are marked unusable (U) in R$
99  *                diagnosis engine fmdo_close() entry point scheduled
100  *                case transitions to CLOSED [4] upon exit from CLOSE_WAIT
101  *
102  * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
103  *       Actions: list.isolated event dispatched
104  *                case deleted from module's list of open cases
105  *
106  * [5] Called by: fmd_case_repair(), fmd_case_update()
107  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
108  *                diagnosis engine fmdo_close() entry point scheduled
109  *                case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
110  *
111  * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
112  *       Actions: suspects convicted are marked non faulty (!F) in R$
113  *                list.repaired or list.updated event dispatched
114  *
115  * [7] Called by: fmd_case_repair(), fmd_case_update()
116  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
117  *                suspects convicted are marked non faulty (!F) in R$
118  *                list.repaired or list.updated event dispatched
119  *
120  * [8] Called by: fmd_case_uuresolve()
121  *       Actions: list.resolved event dispatched
122  *		  case is discarded
123  */
124 
125 #include <sys/fm/protocol.h>
126 #include <uuid/uuid.h>
127 #include <alloca.h>
128 
129 #include <fmd_alloc.h>
130 #include <fmd_module.h>
131 #include <fmd_error.h>
132 #include <fmd_conf.h>
133 #include <fmd_case.h>
134 #include <fmd_string.h>
135 #include <fmd_subr.h>
136 #include <fmd_protocol.h>
137 #include <fmd_event.h>
138 #include <fmd_eventq.h>
139 #include <fmd_dispq.h>
140 #include <fmd_buf.h>
141 #include <fmd_log.h>
142 #include <fmd_asru.h>
143 #include <fmd_fmri.h>
144 #include <fmd_xprt.h>
145 
146 #include <fmd.h>
147 
/*
 * Printable names of the case states.  Each entry is annotated with the
 * FMD_CASE_* state it names; presumably the table is indexed by that state
 * value (confirm against the FMD_CASE_* definitions).
 */
static const char *const _fmd_case_snames[] = {
	"UNSOLVED",	/* FMD_CASE_UNSOLVED */
	"SOLVED",	/* FMD_CASE_SOLVED */
	"CLOSE_WAIT",	/* FMD_CASE_CLOSE_WAIT */
	"CLOSED",	/* FMD_CASE_CLOSED */
	"REPAIRED",	/* FMD_CASE_REPAIRED */
	"RESOLVED"	/* FMD_CASE_RESOLVED */
};
156 
/*
 * Forward declaration; the hash snapshot routines below call this on each
 * case they visit and skip any case for which it returns NULL.
 */
static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *);
158 
159 fmd_case_hash_t *
160 fmd_case_hash_create(void)
161 {
162 	fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP);
163 
164 	(void) pthread_rwlock_init(&chp->ch_lock, NULL);
165 	chp->ch_hashlen = fmd.d_str_buckets;
166 	chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP);
167 	chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen,
168 	    FMD_SLEEP);
169 	chp->ch_count = 0;
170 
171 	return (chp);
172 }
173 
174 /*
175  * Destroy the case hash.  Unlike most of our hash tables, no active references
176  * are kept by the case hash itself; all references come from other subsystems.
177  * The hash must be destroyed after all modules are unloaded; if anything was
178  * present in the hash it would be by definition a reference count leak.
179  */
180 void
181 fmd_case_hash_destroy(fmd_case_hash_t *chp)
182 {
183 	fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen);
184 	fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen);
185 	fmd_free(chp, sizeof (fmd_case_hash_t));
186 }
187 
188 /*
189  * Take a snapshot of the case hash by placing an additional hold on each
190  * member in an auxiliary array, and then call 'func' for each case.
191  */
192 void
193 fmd_case_hash_apply(fmd_case_hash_t *chp,
194     void (*func)(fmd_case_t *, void *), void *arg)
195 {
196 	fmd_case_impl_t *cp, **cps, **cpp;
197 	uint_t cpc, i;
198 
199 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
200 
201 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
202 	cpc = chp->ch_count;
203 
204 	for (i = 0; i < chp->ch_hashlen; i++) {
205 		for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next)
206 			*cpp++ = fmd_case_tryhold(cp);
207 	}
208 
209 	ASSERT(cpp == cps + cpc);
210 	(void) pthread_rwlock_unlock(&chp->ch_lock);
211 
212 	for (i = 0; i < cpc; i++) {
213 		if (cps[i] != NULL) {
214 			func((fmd_case_t *)cps[i], arg);
215 			fmd_case_rele((fmd_case_t *)cps[i]);
216 		}
217 	}
218 
219 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
220 }
221 
222 static void
223 fmd_case_hash_apply_except_current(fmd_case_hash_t *chp,
224     void (*func)(fmd_case_t *, void *), void *arg, fmd_case_t *current)
225 {
226 	fmd_case_impl_t *cp, **cps, **cpp;
227 	uint_t cpc, i;
228 
229 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
230 
231 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
232 	cpc = chp->ch_count;
233 
234 	for (i = 0; i < chp->ch_hashlen; i++) {
235 		for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next)
236 			if (cp != (fmd_case_impl_t *)current)
237 				*cpp++ = fmd_case_tryhold(cp);
238 			else
239 				*cpp++ = cp;
240 	}
241 
242 	ASSERT(cpp == cps + cpc);
243 	(void) pthread_rwlock_unlock(&chp->ch_lock);
244 
245 	for (i = 0; i < cpc; i++) {
246 		if (cps[i] != NULL && cps[i] != (fmd_case_impl_t *)current) {
247 			func((fmd_case_t *)cps[i], arg);
248 			fmd_case_rele((fmd_case_t *)cps[i]);
249 		}
250 	}
251 
252 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
253 }
254 
255 static void
256 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
257 {
258 	uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
259 
260 	cip->ci_code_next = chp->ch_code_hash[h];
261 	chp->ch_code_hash[h] = cip;
262 }
263 
264 static void
265 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
266 {
267 	fmd_case_impl_t **pp, *cp;
268 
269 	if (cip->ci_code) {
270 		uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
271 
272 		pp = &chp->ch_code_hash[h];
273 		for (cp = *pp; cp != NULL; cp = cp->ci_code_next) {
274 			if (cp != cip)
275 				pp = &cp->ci_code_next;
276 			else
277 				break;
278 		}
279 		if (cp != NULL) {
280 			*pp = cp->ci_code_next;
281 			cp->ci_code_next = NULL;
282 		}
283 	}
284 }
285 
286 /*
287  * Look up the diagcode for this case and cache it in ci_code.  If no suspects
288  * were defined for this case or if the lookup fails, the event dictionary or
289  * module code is broken, and we set the event code to a precomputed default.
290  */
291 static const char *
292 fmd_case_mkcode(fmd_case_t *cp)
293 {
294 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
295 	fmd_case_susp_t *cis;
296 	fmd_case_hash_t *chp = fmd.d_cases;
297 
298 	char **keys, **keyp;
299 	const char *s;
300 
301 	ASSERT(MUTEX_HELD(&cip->ci_lock));
302 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
303 
304 	/*
305 	 * delete any existing entry from code hash if it is on it
306 	 */
307 	fmd_case_code_hash_delete(chp, cip);
308 
309 	fmd_free(cip->ci_code, cip->ci_codelen);
310 	cip->ci_codelen = cip->ci_mod->mod_codelen;
311 	cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
312 	keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1));
313 
314 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
315 		if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0)
316 			keyp++;
317 	}
318 
319 	*keyp = NULL; /* mark end of keys[] array for libdiagcode */
320 
321 	if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code(
322 	    cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) {
323 		(void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s);
324 		fmd_free(cip->ci_code, cip->ci_codelen);
325 		cip->ci_codelen = strlen(s) + 1;
326 		cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
327 		(void) strcpy(cip->ci_code, s);
328 	}
329 
330 	/*
331 	 * add into hash of solved cases
332 	 */
333 	fmd_case_code_hash_insert(chp, cip);
334 
335 	return (cip->ci_code);
336 }
337 
/*
 * Argument block for fmd_case_set_lst(), which fills in one slot of the
 * fcl_ba[] and fcl_nva[] arrays for each ASRU link it is called on.
 */
typedef struct {
	/* number of array slots filled in so far */
	int	*fcl_countp;
	/* slot limit; links seen once the count reaches it are ignored */
	int	fcl_maxcount;
	/* per-suspect FM_SUSPECT_* status bits */
	uint8_t *fcl_ba;
	/* per-suspect fault event (the link's al_event) */
	nvlist_t **fcl_nva;
	/* set to B_FALSE if any suspect has FM_SUSPECT_MESSAGE false */
	int	*fcl_msgp;
} fmd_case_lst_t;
345 
346 static void
347 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg)
348 {
349 	fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg;
350 	boolean_t b;
351 	int state;
352 
353 	if (*entryp->fcl_countp >= entryp->fcl_maxcount)
354 		return;
355 	if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE,
356 	    &b) == 0 && b == B_FALSE)
357 		*entryp->fcl_msgp = B_FALSE;
358 	entryp->fcl_ba[*entryp->fcl_countp] = 0;
359 	state = fmd_asru_al_getstate(alp);
360 	if (state & FMD_ASRU_DEGRADED)
361 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED;
362 	if (state & FMD_ASRU_UNUSABLE)
363 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE;
364 	if (state & FMD_ASRU_FAULTY)
365 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY;
366 	if (!(state & FMD_ASRU_PRESENT))
367 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT;
368 	if (alp->al_reason == FMD_ASRU_REPAIRED)
369 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED;
370 	else if (alp->al_reason == FMD_ASRU_REPLACED)
371 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED;
372 	else if (alp->al_reason == FMD_ASRU_ACQUITTED)
373 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED;
374 	entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event;
375 	(*entryp->fcl_countp)++;
376 }
377 
378 static void
379 fmd_case_faulty(fmd_asru_link_t *alp, void *arg)
380 {
381 	int *faultyp = (int *)arg;
382 
383 	*faultyp |= (alp->al_flags & FMD_ASRU_FAULTY);
384 }
385 
386 static void
387 fmd_case_usable(fmd_asru_link_t *alp, void *arg)
388 {
389 	int *usablep = (int *)arg;
390 
391 	*usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE);
392 }
393 
394 static void
395 fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg)
396 {
397 	int *not_faultyp = (int *)arg;
398 
399 	*not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY);
400 }
401 
402 /*
403  * Have we got any suspects with an asru that are still unusable and present?
404  */
405 static void
406 fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg)
407 {
408 	int *rvalp = (int *)arg;
409 	int state;
410 	nvlist_t *asru;
411 
412 	/*
413 	 * if this a proxy case and this suspect doesn't have an local asru
414 	 * then state is unknown so we must assume it may still be unusable.
415 	 */
416 	if ((alp->al_flags & FMD_ASRU_PROXY) &&
417 	    !(alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU)) {
418 		*rvalp |= B_TRUE;
419 		return;
420 	}
421 
422 	state = fmd_asru_al_getstate(alp);
423 	if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0)
424 		return;
425 	*rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT));
426 }
427 
428 nvlist_t *
429 fmd_case_mkevent(fmd_case_t *cp, const char *class)
430 {
431 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
432 	nvlist_t **nva, *nvl;
433 	uint8_t *ba;
434 	int msg = B_TRUE;
435 	const char *code;
436 	fmd_case_lst_t fcl;
437 	int count = 0;
438 
439 	(void) pthread_mutex_lock(&cip->ci_lock);
440 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
441 
442 	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
443 	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
444 
445 	/*
446 	 * For each suspect associated with the case, store its fault event
447 	 * nvlist in 'nva'.  We also look to see if any of the suspect faults
448 	 * have asked not to be messaged.  If any of them have made such a
449 	 * request, propagate that attribute to the composite list.* event.
450 	 * Finally, store each suspect's faulty status into the bitmap 'ba'.
451 	 */
452 	fcl.fcl_countp = &count;
453 	fcl.fcl_maxcount = cip->ci_nsuspects;
454 	fcl.fcl_msgp = &msg;
455 	fcl.fcl_ba = ba;
456 	fcl.fcl_nva = nva;
457 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
458 
459 	if (cip->ci_code == NULL)
460 		(void) fmd_case_mkcode(cp);
461 	/*
462 	 * For repair and updated event, we lookup diagcode from dict using key
463 	 * "list.repaired" or "list.updated" or "list.resolved".
464 	 */
465 	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
466 		(void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code);
467 	else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
468 		(void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code);
469 	else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0)
470 		(void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code);
471 	else
472 		code = cip->ci_code;
473 
474 	if (msg == B_FALSE)
475 		cip->ci_flags |= FMD_CF_INVISIBLE;
476 
477 	/*
478 	 * Use the ci_diag_de if one has been saved (eg for an injected fault).
479 	 * Otherwise use the authority for the current module.
480 	 */
481 	nvl = fmd_protocol_list(class, cip->ci_diag_de == NULL ?
482 	    cip->ci_mod->mod_fmri : cip->ci_diag_de, cip->ci_uuid, code, count,
483 	    nva, ba, msg, &cip->ci_tv, cip->ci_injected);
484 
485 	(void) pthread_mutex_unlock(&cip->ci_lock);
486 	return (nvl);
487 }
488 
/*
 * Tunables consulted by fmd_case_check_for_dups() when deciding whether a
 * newly solved case duplicates an existing one, and what to do about it.
 */
/* treat overlapping faulty, non-isolated suspects as a match */
static int fmd_case_match_on_faulty_overlap = 1;
/* treat overlapping acquitted suspects as a match */
static int fmd_case_match_on_acquit_overlap = 1;
/* allow isolated suspects in the old case to be acquitted automatically */
static int fmd_case_auto_acquit_isolated = 1;
/* allow the new case to be adjusted when the old one is repaired/resolved */
static int fmd_case_auto_acquit_non_acquitted = 1;
static int fmd_case_too_recent = 10; /* time in seconds */
494 
495 static boolean_t
496 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem)
497 {
498 	nvlist_t *new_rsrc;
499 	nvlist_t *rsrc;
500 	char *new_name = NULL;
501 	char *name = NULL;
502 	ssize_t new_namelen;
503 	ssize_t namelen;
504 	int fmri_present = 1;
505 	int new_fmri_present = 1;
506 	int match = B_FALSE;
507 	fmd_topo_t *ftp = fmd_topo_hold();
508 
509 	if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0)
510 		fmri_present = 0;
511 	else {
512 		if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1)
513 			goto done;
514 		name = fmd_alloc(namelen + 1, FMD_SLEEP);
515 		if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1)
516 			goto done;
517 	}
518 	if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0)
519 		new_fmri_present = 0;
520 	else {
521 		if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1)
522 			goto done;
523 		new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP);
524 		if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1)
525 			goto done;
526 	}
527 	match = (fmri_present == new_fmri_present &&
528 	    (fmri_present == 0 ||
529 	    topo_fmri_strcmp(ftp->ft_hdl, name, new_name)));
530 done:
531 	if (name != NULL)
532 		fmd_free(name, namelen + 1);
533 	if (new_name != NULL)
534 		fmd_free(new_name, new_namelen + 1);
535 	fmd_topo_rele(ftp);
536 	return (match);
537 }
538 
539 static int
540 fmd_case_match_suspect(nvlist_t *nvl1, nvlist_t *nvl2)
541 {
542 	char *class, *new_class;
543 
544 	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_ASRU))
545 		return (0);
546 	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_RESOURCE))
547 		return (0);
548 	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_FRU))
549 		return (0);
550 	(void) nvlist_lookup_string(nvl2, FM_CLASS, &class);
551 	(void) nvlist_lookup_string(nvl1, FM_CLASS, &new_class);
552 	return (strcmp(class, new_class) == 0);
553 }
554 
/*
 * Argument block for fmd_case_match_suspects(), which is applied to each
 * suspect of an existing ("old") case while it is compared with a new case.
 */
typedef struct {
	/* number of old suspects processed so far */
	int	*fcms_countp;
	/* limit on the number of old suspects processed */
	int	fcms_maxcount;
	/* the new case whose ci_suspects list is searched */
	fmd_case_impl_t *fcms_cip;
	/* per new suspect: state of the matching old suspect, else 0 */
	uint8_t *fcms_new_susp_state;
	/* per old suspect: one SUSPECT_STATE_* classification */
	uint8_t *fcms_old_susp_state;
	/* per old suspect: SUSPECT_STATE_NO_MATCH if not in the new case */
	uint8_t *fcms_old_match_state;
} fcms_t;
/*
 * Suspect classifications.  Note that SUSPECT_STATE_NO_MATCH has the same
 * numeric value as SUSPECT_STATE_FAULTY, so the two can only be told apart
 * when they are kept in separate arrays.
 */
#define	SUSPECT_STATE_FAULTY				0x1
#define	SUSPECT_STATE_ISOLATED				0x2
#define	SUSPECT_STATE_REMOVED				0x4
#define	SUSPECT_STATE_ACQUITED				0x8
#define	SUSPECT_STATE_REPAIRED				0x10
#define	SUSPECT_STATE_REPLACED				0x20
#define	SUSPECT_STATE_NO_MATCH				0x1
570 
571 /*
572  * This is called for each suspect in the old case. Compare it against each
573  * suspect in the new case, setting fcms_old_susp_state and fcms_new_susp_state
574  * as appropriate. fcms_new_susp_state will left as 0 if the suspect is not
575  * found in the old case.
576  */
577 static void
578 fmd_case_match_suspects(fmd_asru_link_t *alp, void *arg)
579 {
580 	fcms_t *fcmsp = (fcms_t *)arg;
581 	fmd_case_impl_t *cip = fcmsp->fcms_cip;
582 	fmd_case_susp_t *cis;
583 	int i = 0;
584 	int state = fmd_asru_al_getstate(alp);
585 
586 	if (*fcmsp->fcms_countp >= fcmsp->fcms_maxcount)
587 		return;
588 
589 	if (!(state & FMD_ASRU_PRESENT) || (!(state & FMD_ASRU_FAULTY) &&
590 	    alp->al_reason == FMD_ASRU_REMOVED))
591 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
592 		    SUSPECT_STATE_REMOVED;
593 	else if ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_FAULTY))
594 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
595 		    SUSPECT_STATE_ISOLATED;
596 	else if (state & FMD_ASRU_FAULTY)
597 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
598 		    SUSPECT_STATE_FAULTY;
599 	else if (alp->al_reason == FMD_ASRU_REPLACED)
600 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
601 		    SUSPECT_STATE_REPLACED;
602 	else if (alp->al_reason == FMD_ASRU_ACQUITTED)
603 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
604 		    SUSPECT_STATE_ACQUITED;
605 	else
606 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
607 		    SUSPECT_STATE_REPAIRED;
608 
609 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next, i++)
610 		if (fmd_case_match_suspect(cis->cis_nvl, alp->al_event) == 1)
611 			break;
612 	if (cis != NULL)
613 		fcmsp->fcms_new_susp_state[i] =
614 		    fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp];
615 	else
616 		fcmsp->fcms_old_match_state[*fcmsp->fcms_countp] |=
617 		    SUSPECT_STATE_NO_MATCH;
618 	(*fcmsp->fcms_countp)++;
619 }
620 
/*
 * Argument block shared by the per-suspect callbacks below that re-fault or
 * acquit suspects of an old case based on the suspect list of a new case.
 */
typedef struct {
	/* set to 1 by a callback when it changes a suspect's flags */
	int	*fca_do_update;
	/* the new case whose ci_suspects list is searched for matches */
	fmd_case_impl_t *fca_cip;
} fca_t;
625 
626 /*
627  * Re-fault all acquitted suspects that are still present in the new list.
628  */
629 static void
630 fmd_case_fault_acquitted_matching(fmd_asru_link_t *alp, void *arg)
631 {
632 	fca_t *fcap = (fca_t *)arg;
633 	fmd_case_impl_t *cip = fcap->fca_cip;
634 	fmd_case_susp_t *cis;
635 	int state = fmd_asru_al_getstate(alp);
636 
637 	if (!(state & FMD_ASRU_FAULTY) &&
638 	    alp->al_reason == FMD_ASRU_ACQUITTED) {
639 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
640 			if (fmd_case_match_suspect(cis->cis_nvl,
641 			    alp->al_event) == 1)
642 				break;
643 		if (cis != NULL) {
644 			(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
645 			*fcap->fca_do_update = 1;
646 		}
647 	}
648 }
649 
650 /*
651  * Re-fault all suspects that are still present in the new list.
652  */
653 static void
654 fmd_case_fault_all_matching(fmd_asru_link_t *alp, void *arg)
655 {
656 	fca_t *fcap = (fca_t *)arg;
657 	fmd_case_impl_t *cip = fcap->fca_cip;
658 	fmd_case_susp_t *cis;
659 	int state = fmd_asru_al_getstate(alp);
660 
661 	if (!(state & FMD_ASRU_FAULTY)) {
662 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
663 			if (fmd_case_match_suspect(cis->cis_nvl,
664 			    alp->al_event) == 1)
665 				break;
666 		if (cis != NULL) {
667 			(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
668 			*fcap->fca_do_update = 1;
669 		}
670 	}
671 }
672 
673 /*
674  * Acquit all suspects that are no longer present in the new list.
675  */
676 static void
677 fmd_case_acquit_no_match(fmd_asru_link_t *alp, void *arg)
678 {
679 	fca_t *fcap = (fca_t *)arg;
680 	fmd_case_impl_t *cip = fcap->fca_cip;
681 	fmd_case_susp_t *cis;
682 	int state = fmd_asru_al_getstate(alp);
683 
684 	if (state & FMD_ASRU_FAULTY) {
685 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
686 			if (fmd_case_match_suspect(cis->cis_nvl,
687 			    alp->al_event) == 1)
688 				break;
689 		if (cis == NULL) {
690 			(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
691 			    FMD_ASRU_ACQUITTED);
692 			*fcap->fca_do_update = 1;
693 		}
694 	}
695 }
696 
697 /*
698  * Acquit all isolated suspects.
699  */
700 static void
701 fmd_case_acquit_isolated(fmd_asru_link_t *alp, void *arg)
702 {
703 	int *do_update = (int *)arg;
704 	int state = fmd_asru_al_getstate(alp);
705 
706 	if ((state & FMD_ASRU_PRESENT) && (state & FMD_ASRU_UNUSABLE) &&
707 	    (state & FMD_ASRU_FAULTY)) {
708 		(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
709 		    FMD_ASRU_ACQUITTED);
710 		*do_update = 1;
711 	}
712 }
713 
714 /*
715  * Acquit suspect which matches specified nvlist
716  */
717 static void
718 fmd_case_acquit_suspect(fmd_asru_link_t *alp, void *arg)
719 {
720 	nvlist_t *nvl = (nvlist_t *)arg;
721 	int state = fmd_asru_al_getstate(alp);
722 
723 	if ((state & FMD_ASRU_FAULTY) &&
724 	    fmd_case_match_suspect(nvl, alp->al_event) == 1)
725 		(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
726 		    FMD_ASRU_ACQUITTED);
727 }
728 
/*
 * Argument block for fmd_case_check_for_dups(), which is applied to each
 * existing case while looking for duplicates of a new case.
 */
typedef struct {
	/* the new case being compared against the existing ones */
	fmd_case_impl_t *fccd_cip;
	/* per new suspect: SUSPECT_STATE_* bits gathered from old cases */
	uint8_t *fccd_new_susp_state;
	/*
	 * per new suspect: match flags -- presumably SUSPECT_STATE_NO_MATCH
	 * markers; confirm against the consumer of this structure
	 */
	uint8_t *fccd_new_match_state;
	/* set to 1 when an old case is kept in place of the new one */
	int *fccd_discard_new;
	/* set to 1 when the new case is kept but should be adjusted */
	int *fccd_adjust_new;
} fccd_t;
736 
737 /*
738  * see if a matching suspect list already exists in the cache
739  */
740 static void
741 fmd_case_check_for_dups(fmd_case_t *old_cp, void *arg)
742 {
743 	fccd_t *fccdp = (fccd_t *)arg;
744 	fmd_case_impl_t *new_cip = fccdp->fccd_cip;
745 	fmd_case_impl_t *old_cip = (fmd_case_impl_t *)old_cp;
746 	int i, count = 0, do_update = 0, got_isolated_overlap = 0;
747 	int got_faulty_overlap = 0;
748 	int got_acquit_overlap = 0;
749 	boolean_t too_recent;
750 	uint64_t most_recent = 0;
751 	fcms_t fcms;
752 	fca_t fca;
753 	uint8_t *new_susp_state;
754 	uint8_t *old_susp_state;
755 	uint8_t *old_match_state;
756 
757 	new_susp_state = alloca(new_cip->ci_nsuspects * sizeof (uint8_t));
758 	for (i = 0; i < new_cip->ci_nsuspects; i++)
759 		new_susp_state[i] = 0;
760 	old_susp_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
761 	for (i = 0; i < old_cip->ci_nsuspects; i++)
762 		old_susp_state[i] = 0;
763 	old_match_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
764 	for (i = 0; i < old_cip->ci_nsuspects; i++)
765 		old_match_state[i] = 0;
766 
767 	/*
768 	 * Compare with each suspect in the existing case.
769 	 */
770 	fcms.fcms_countp = &count;
771 	fcms.fcms_maxcount = old_cip->ci_nsuspects;
772 	fcms.fcms_cip = new_cip;
773 	fcms.fcms_new_susp_state = new_susp_state;
774 	fcms.fcms_old_susp_state = old_susp_state;
775 	fcms.fcms_old_match_state = old_match_state;
776 	fmd_asru_hash_apply_by_case(fmd.d_asrus, (fmd_case_t *)old_cip,
777 	    fmd_case_match_suspects, &fcms);
778 
779 	/*
780 	 * If we have some faulty, non-isolated suspects that overlap, then most
781 	 * likely it is the suspects that overlap in the suspect lists that are
782 	 * to blame. So we can consider this to be a match.
783 	 */
784 	for (i = 0; i < new_cip->ci_nsuspects; i++)
785 		if (new_susp_state[i] == SUSPECT_STATE_FAULTY)
786 			got_faulty_overlap = 1;
787 	if (got_faulty_overlap && fmd_case_match_on_faulty_overlap)
788 		goto got_match;
789 
790 	/*
791 	 * If we have no faulty, non-isolated suspects in the old case, but we
792 	 * do have some acquitted suspects that overlap, then most likely it is
793 	 * the acquitted suspects that overlap in the suspect lists that are
794 	 * to blame. So we can consider this to be a match.
795 	 */
796 	for (i = 0; i < new_cip->ci_nsuspects; i++)
797 		if (new_susp_state[i] == SUSPECT_STATE_ACQUITED)
798 			got_acquit_overlap = 1;
799 	for (i = 0; i < old_cip->ci_nsuspects; i++)
800 		if (old_susp_state[i] == SUSPECT_STATE_FAULTY)
801 			got_acquit_overlap = 0;
802 	if (got_acquit_overlap && fmd_case_match_on_acquit_overlap)
803 		goto got_match;
804 
805 	/*
806 	 * Check that all suspects in the new list are present in the old list.
807 	 * Return if we find one that isn't.
808 	 */
809 	for (i = 0; i < new_cip->ci_nsuspects; i++)
810 		if (new_susp_state[i] == 0)
811 			return;
812 
813 	/*
814 	 * Check that all suspects in the old list are present in the new list
815 	 * *or* they are isolated or removed/replaced (which would explain why
816 	 * they are not present in the new list). Return if we find one that is
817 	 * faulty and unisolated or repaired or acquitted, and that is not
818 	 * present in the new case.
819 	 */
820 	for (i = 0; i < old_cip->ci_nsuspects; i++)
821 		if (old_match_state[i] == SUSPECT_STATE_NO_MATCH &&
822 		    (old_susp_state[i] == SUSPECT_STATE_FAULTY ||
823 		    old_susp_state[i] == SUSPECT_STATE_ACQUITED ||
824 		    old_susp_state[i] == SUSPECT_STATE_REPAIRED))
825 			return;
826 
827 got_match:
828 	/*
829 	 * If the old case is already in repaired/resolved state, we can't
830 	 * do anything more with it, so keep the new case, but acquit some
831 	 * of the suspects if appropriate.
832 	 */
833 	if (old_cip->ci_state >= FMD_CASE_REPAIRED) {
834 		if (fmd_case_auto_acquit_non_acquitted) {
835 			*fccdp->fccd_adjust_new = 1;
836 			for (i = 0; i < new_cip->ci_nsuspects; i++) {
837 				fccdp->fccd_new_susp_state[i] |=
838 				    new_susp_state[i];
839 				if (new_susp_state[i] == 0)
840 					fccdp->fccd_new_susp_state[i] =
841 					    SUSPECT_STATE_NO_MATCH;
842 			}
843 		}
844 		return;
845 	}
846 
847 	/*
848 	 * Otherwise discard the new case and keep the old, again updating the
849 	 * state of the suspects as appropriate
850 	 */
851 	*fccdp->fccd_discard_new = 1;
852 	fca.fca_cip = new_cip;
853 	fca.fca_do_update = &do_update;
854 
855 	/*
856 	 * See if new case occurred within fmd_case_too_recent seconds of the
857 	 * most recent modification to the old case and if so don't do
858 	 * auto-acquit. This avoids problems if a flood of ereports come in and
859 	 * they don't all get diagnosed before the first case causes some of
860 	 * the devices to be isolated making it appear that an isolated device
861 	 * was in the suspect list.
862 	 */
863 	fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
864 	    fmd_asru_most_recent, &most_recent);
865 	too_recent = (new_cip->ci_tv.tv_sec - most_recent <
866 	    fmd_case_too_recent);
867 
868 	if (got_faulty_overlap) {
869 		/*
870 		 * Acquit any suspects not present in the new list, plus
871 		 * any that are are present but are isolated.
872 		 */
873 		fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
874 		    fmd_case_acquit_no_match, &fca);
875 		if (fmd_case_auto_acquit_isolated && !too_recent)
876 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
877 			    fmd_case_acquit_isolated, &do_update);
878 	} else if (got_acquit_overlap) {
879 		/*
880 		 * Re-fault the acquitted matching suspects and acquit all
881 		 * isolated suspects.
882 		 */
883 		if (fmd_case_auto_acquit_isolated && !too_recent) {
884 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
885 			    fmd_case_fault_acquitted_matching, &fca);
886 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
887 			    fmd_case_acquit_isolated, &do_update);
888 		}
889 	} else if (fmd_case_auto_acquit_isolated) {
890 		/*
891 		 * To get here, there must be no faulty or acquitted suspects,
892 		 * but there must be at least one isolated suspect. Just acquit
893 		 * non-matching isolated suspects. If there are no matching
894 		 * isolated suspects, then re-fault all matching suspects.
895 		 */
896 		for (i = 0; i < new_cip->ci_nsuspects; i++)
897 			if (new_susp_state[i] == SUSPECT_STATE_ISOLATED)
898 				got_isolated_overlap = 1;
899 		if (!got_isolated_overlap)
900 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
901 			    fmd_case_fault_all_matching, &fca);
902 		fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
903 		    fmd_case_acquit_no_match, &fca);
904 	}
905 
906 	/*
907 	 * If we've updated anything in the old case, call fmd_case_update()
908 	 */
909 	if (do_update)
910 		fmd_case_update(old_cp);
911 }
912 
/*
 * Convict suspects in a case by applying a conviction policy and updating the
 * resource cache prior to emitting the list.suspect event for the given case.
 * At present, our policy is very simple: convict every suspect in the case.
 * In the future, this policy can be extended and made configurable to permit:
 *
 * - convicting the suspect with the highest FIT rate
 * - convicting the suspect with the cheapest FRU
 * - convicting the suspect with the FRU that is in a depot's inventory
 * - convicting the suspect with the longest lifetime
 *
 * and so forth.  A word to the wise: this problem is significantly harder than
 * it seems at first glance.  Future work should heed the following advice:
 *
 * Hacking the policy into C code here is a very bad idea.  The policy needs to
 * be decided upon very carefully and fundamentally encodes knowledge of what
 * suspect list combinations can be emitted by what diagnosis engines.  As such
 * fmd's code is the wrong location, because that would require fmd itself to
 * be updated for every diagnosis engine change, defeating the entire design.
 * The FMA Event Registry knows the suspect list combinations: policy inputs
 * can be derived from it and used to produce per-module policy configuration.
 *
 * If the policy needs to be dynamic and not statically fixed at either fmd
 * startup or module load time, any implementation of dynamic policy retrieval
 * must employ some kind of caching mechanism or be part of a built-in module.
 * The fmd_case_convict() function is called with locks held inside of fmd and
 * is not a place where unbounded blocking on some inter-process or inter-
 * system communication to another service (e.g. another daemon) can occur.
 *
 * Return value: 1 if the duplicate check requested that this new case be
 * discarded (no resource cache entries are created in that case), otherwise 0
 * once cache entries have been created for the suspects.  ci_lock is acquired
 * on entry and released on every return path.
 */
static int
fmd_case_convict(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_asru_hash_t *ahp = fmd.d_asrus;
	int discard_new = 0, i;
	fmd_case_susp_t *cis;
	fmd_asru_link_t *alp;
	uint8_t *new_susp_state;
	uint8_t *new_match_state;
	int adjust_new = 0;
	fccd_t fccd;

	(void) pthread_mutex_lock(&cip->ci_lock);
	/*
	 * A case with no code yet has one generated for it; a precanned case
	 * that already carries a code is entered into the code hash instead.
	 */
	if (cip->ci_code == NULL)
		(void) fmd_case_mkcode(cp);
	else if (cip->ci_precanned)
		fmd_case_code_hash_insert(fmd.d_cases, cip);

	/*
	 * First we must see if any matching cases already exist.
	 *
	 * Two zero-filled per-suspect byte arrays (one entry per suspect in
	 * this case) are placed on the stack and handed, together with the
	 * discard_new and adjust_new result flags, to
	 * fmd_case_check_for_dups() through the fccd argument block.
	 */
	new_susp_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
	for (i = 0; i < cip->ci_nsuspects; i++)
		new_susp_state[i] = 0;
	new_match_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
	for (i = 0; i < cip->ci_nsuspects; i++)
		new_match_state[i] = 0;
	fccd.fccd_cip = cip;
	fccd.fccd_adjust_new = &adjust_new;
	fccd.fccd_new_susp_state = new_susp_state;
	fccd.fccd_new_match_state = new_match_state;
	fccd.fccd_discard_new = &discard_new;
	fmd_case_hash_apply_except_current(fmd.d_cases, fmd_case_check_for_dups,
	    &fccd, cp);

	if (discard_new) {
		/*
		 * We've found an existing case that is a match and it is not
		 * already in repaired or resolved state. So we can close this
		 * one as a duplicate.
		 */
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return (1);
	}

	/*
	 * Allocate new cache entries
	 *
	 * A suspect whose cache entry cannot be created is reported via
	 * fmd_error() and skipped; the remaining suspects are still processed.
	 * Each created entry is marked present and faulty, with the unusable
	 * flag cleared.
	 */
	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
		if ((alp = fmd_asru_hash_create_entry(ahp,
		    cp, cis->cis_nvl)) == NULL) {
			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
			continue;
		}
		alp->al_flags |= FMD_ASRU_PRESENT;
		alp->al_asru->asru_flags |= FMD_ASRU_PRESENT;
		(void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0);
		(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
	}

	if (adjust_new) {
		int some_suspect = 0, some_not_suspect = 0;

		/*
		 * There is one or more matching case but they are already in
		 * repaired or resolved state. So we need to keep the new
		 * case, but we can adjust it. Repaired/removed/replaced
		 * suspects are unlikely to be to blame (unless there are
		 * actually two separate faults). So if we have a combination of
		 * repaired/replaced/removed suspects and acquitted suspects in
		 * the old lists, then we should acquit in the new list those
		 * that were repaired/replaced/removed in the old.
		 */
		for (i = 0; i < cip->ci_nsuspects; i++) {
			if ((new_susp_state[i] & SUSPECT_STATE_REPLACED) ||
			    (new_susp_state[i] & SUSPECT_STATE_REPAIRED) ||
			    (new_susp_state[i] & SUSPECT_STATE_REMOVED) ||
			    (new_match_state[i] & SUSPECT_STATE_NO_MATCH))
				some_not_suspect = 1;
			else
				some_suspect = 1;
		}
		/*
		 * Only acquit when the list is mixed: if every suspect (or no
		 * suspect) qualifies, the new case is left untouched.  Index i
		 * advances in step with the ci_suspects list walk so that
		 * state entry i is paired with the i-th suspect on the list.
		 */
		if (some_suspect && some_not_suspect) {
			for (cis = cip->ci_suspects, i = 0; cis != NULL;
			    cis = cis->cis_next, i++)
				if ((new_susp_state[i] &
				    SUSPECT_STATE_REPLACED) ||
				    (new_susp_state[i] &
				    SUSPECT_STATE_REPAIRED) ||
				    (new_susp_state[i] &
				    SUSPECT_STATE_REMOVED) ||
				    (new_match_state[i] &
				    SUSPECT_STATE_NO_MATCH))
					fmd_asru_hash_apply_by_case(fmd.d_asrus,
					    cp, fmd_case_acquit_suspect,
					    cis->cis_nvl);
		}
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);
	return (0);
}
1046 
/*
 * Publish the event(s) appropriate to 'state' for the given case.  Passing
 * FMD_CASE_CURRENT selects the case's current ci_state.  The per-state
 * actions are:
 *
 * SOLVED      - timestamp the case if needed, convict its suspects, then log
 *               and dispatch a list.suspect event (or close the case as a
 *               duplicate if fmd_case_convict() says so)
 * CLOSE_WAIT  - queue an FMD_EVT_CLOSE event at the head of the owning
 *               module's queue, holding the case on the event's behalf
 * CLOSED      - dispatch a list.isolated event
 * REPAIRED    - log and dispatch a list.repaired event
 * RESOLVED    - log and dispatch a list.resolved event
 *
 * Any other state value falls through the switch with no action.
 */
void
fmd_case_publish(fmd_case_t *cp, uint_t state)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_event_t *e;
	nvlist_t *nvl;
	char *class;

	if (state == FMD_CASE_CURRENT)
		state = cip->ci_state; /* use current state */

	switch (state) {
	case FMD_CASE_SOLVED:
		(void) pthread_mutex_lock(&cip->ci_lock);

		/*
		 * If we already have a code, then case is already solved.
		 * (This shortcut is not taken for precanned or proxied
		 * cases, which continue on to be convicted below.)
		 */
		if (cip->ci_precanned == 0 && cip->ci_xprt == NULL &&
		    cip->ci_code != NULL) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			break;
		}

		/* Record the solve time once, the first time through. */
		if (cip->ci_tv_valid == 0) {
			fmd_time_gettimeofday(&cip->ci_tv);
			cip->ci_tv_valid = 1;
		}
		/* fmd_case_convict() takes ci_lock itself, so drop it first */
		(void) pthread_mutex_unlock(&cip->ci_lock);

		if (fmd_case_convict(cp) == 1) { /* dupclose */
			cip->ci_flags &= ~FMD_CF_SOLVED;
			fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0);
			break;
		}
		if (cip->ci_xprt != NULL) {
			/*
			 * For proxy, save some information about the transport
			 * in the resource cache.
			 */
			int count = 0;
			fmd_asru_set_on_proxy_t fasp;
			fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)cip->ci_xprt;

			fasp.fasp_countp = &count;
			fasp.fasp_maxcount = cip->ci_nsuspects;
			fasp.fasp_proxy_asru = cip->ci_proxy_asru;
			fasp.fasp_proxy_external = xip->xi_flags &
			    FMD_XPRT_EXTERNAL;
			fasp.fasp_proxy_rdonly = ((xip->xi_flags &
			    FMD_XPRT_RDWR) == FMD_XPRT_RDONLY);
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_asru_set_on_proxy, &fasp);
		}
		nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);

		/* Append to the fault log under d_log_lock, then dispatch. */
		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
		fmd_log_append(fmd.d_fltlog, e, cp);
		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
		fmd_dispq_dispatch(fmd.d_disp, e, class);

		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
		cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

		break;

	case FMD_CASE_CLOSE_WAIT:
		/* The hold taken here accompanies the queued CLOSE event. */
		fmd_case_hold(cp);
		e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);

		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
		cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

		break;

	case FMD_CASE_CLOSED:
		/* Unlike the other list.* events, this one is not logged. */
		nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
		fmd_dispq_dispatch(fmd.d_disp, e, class);
		break;

	case FMD_CASE_REPAIRED:
		nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
		fmd_log_append(fmd.d_fltlog, e, cp);
		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
		fmd_dispq_dispatch(fmd.d_disp, e, class);
		break;

	case FMD_CASE_RESOLVED:
		nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS);
		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
		fmd_log_append(fmd.d_fltlog, e, cp);
		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
		fmd_dispq_dispatch(fmd.d_disp, e, class);
		break;
	}
}
1155 
1156 fmd_case_t *
1157 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
1158 {
1159 	fmd_case_impl_t *cip;
1160 	uint_t h;
1161 
1162 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
1163 	h = fmd_strhash(uuid) % chp->ch_hashlen;
1164 
1165 	for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
1166 		if (strcmp(cip->ci_uuid, uuid) == 0)
1167 			break;
1168 	}
1169 
1170 	/*
1171 	 * If deleting bit is set, treat the case as if it doesn't exist.
1172 	 */
1173 	if (cip != NULL)
1174 		cip = fmd_case_tryhold(cip);
1175 
1176 	if (cip == NULL)
1177 		(void) fmd_set_errno(EFMD_CASE_INVAL);
1178 
1179 	(void) pthread_rwlock_unlock(&chp->ch_lock);
1180 	return ((fmd_case_t *)cip);
1181 }
1182 
1183 static fmd_case_impl_t *
1184 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
1185 {
1186 	fmd_case_impl_t *eip;
1187 	uint_t h;
1188 
1189 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
1190 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
1191 
1192 	for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
1193 		if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 &&
1194 		    fmd_case_tryhold(eip) != NULL) {
1195 			(void) pthread_rwlock_unlock(&chp->ch_lock);
1196 			return (eip); /* uuid already present */
1197 		}
1198 	}
1199 
1200 	cip->ci_next = chp->ch_hash[h];
1201 	chp->ch_hash[h] = cip;
1202 
1203 	chp->ch_count++;
1204 	ASSERT(chp->ch_count != 0);
1205 
1206 	(void) pthread_rwlock_unlock(&chp->ch_lock);
1207 	return (cip);
1208 }
1209 
1210 static void
1211 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
1212 {
1213 	fmd_case_impl_t *cp, **pp;
1214 	uint_t h;
1215 
1216 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1217 
1218 	cip->ci_flags |= FMD_CF_DELETING;
1219 	(void) pthread_mutex_unlock(&cip->ci_lock);
1220 
1221 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
1222 
1223 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
1224 	pp = &chp->ch_hash[h];
1225 
1226 	for (cp = *pp; cp != NULL; cp = cp->ci_next) {
1227 		if (cp != cip)
1228 			pp = &cp->ci_next;
1229 		else
1230 			break;
1231 	}
1232 
1233 	if (cp == NULL) {
1234 		fmd_panic("case %p (%s) not found on hash chain %u\n",
1235 		    (void *)cip, cip->ci_uuid, h);
1236 	}
1237 
1238 	*pp = cp->ci_next;
1239 	cp->ci_next = NULL;
1240 
1241 	/*
1242 	 * delete from code hash if it is on it
1243 	 */
1244 	fmd_case_code_hash_delete(chp, cip);
1245 
1246 	ASSERT(chp->ch_count != 0);
1247 	chp->ch_count--;
1248 
1249 	(void) pthread_rwlock_unlock(&chp->ch_lock);
1250 
1251 	(void) pthread_mutex_lock(&cip->ci_lock);
1252 	ASSERT(cip->ci_flags & FMD_CF_DELETING);
1253 }
1254 
1255 fmd_case_t *
1256 fmd_case_create(fmd_module_t *mp, void *data)
1257 {
1258 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
1259 	fmd_case_impl_t *eip = NULL;
1260 	uuid_t uuid;
1261 
1262 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
1263 	fmd_buf_hash_create(&cip->ci_bufs);
1264 
1265 	fmd_module_hold(mp);
1266 	cip->ci_mod = mp;
1267 	cip->ci_refs = 1;
1268 	cip->ci_state = FMD_CASE_UNSOLVED;
1269 	cip->ci_flags = FMD_CF_DIRTY;
1270 	cip->ci_data = data;
1271 
1272 	/*
1273 	 * Calling libuuid: get a clue.  The library interfaces cleverly do not
1274 	 * define any constant for the length of an unparse string, and do not
1275 	 * permit the caller to specify a buffer length for safety.  The spec
1276 	 * says it will be 36 bytes, but we make it tunable just in case.
1277 	 */
1278 	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
1279 	cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);
1280 
1281 	/*
1282 	 * We expect this loop to execute only once, but code it defensively
1283 	 * against the possibility of libuuid bugs.  Keep generating uuids and
1284 	 * attempting to do a hash insert until we get a unique one.
1285 	 */
1286 	do {
1287 		if (eip != NULL)
1288 			fmd_case_rele((fmd_case_t *)eip);
1289 		uuid_generate(uuid);
1290 		uuid_unparse(uuid, cip->ci_uuid);
1291 	} while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);
1292 
1293 	ASSERT(fmd_module_locked(mp));
1294 	fmd_list_append(&mp->mod_cases, cip);
1295 	fmd_module_setcdirty(mp);
1296 
1297 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1298 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
1299 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1300 
1301 	return ((fmd_case_t *)cip);
1302 }
1303 
1304 static void
1305 fmd_case_destroy_suspects(fmd_case_impl_t *cip)
1306 {
1307 	fmd_case_susp_t *cis, *ncis;
1308 
1309 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1310 
1311 	if (cip->ci_proxy_asru)
1312 		fmd_free(cip->ci_proxy_asru, sizeof (uint8_t) *
1313 		    cip->ci_nsuspects);
1314 	if (cip->ci_diag_de)
1315 		nvlist_free(cip->ci_diag_de);
1316 	if (cip->ci_diag_asru)
1317 		fmd_free(cip->ci_diag_asru, sizeof (uint8_t) *
1318 		    cip->ci_nsuspects);
1319 
1320 	for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
1321 		ncis = cis->cis_next;
1322 		nvlist_free(cis->cis_nvl);
1323 		fmd_free(cis, sizeof (fmd_case_susp_t));
1324 	}
1325 
1326 	cip->ci_suspects = NULL;
1327 	cip->ci_nsuspects = 0;
1328 }
1329 
/*
 * Recreate a case with a known UUID, state and (optional) diagnosis code on
 * behalf of module 'mp', optionally associated with transport 'xp'.
 *
 * If the UUID is not yet in the case hash, a new case is built, inserted,
 * appended to mp's case list and returned.  If the UUID is already present:
 *
 * - when mp is the root module (fmd.d_rmod), the existing case is returned
 *   after at most filling in its code and adjusting its state to CLOSED;
 * - when the existing case is not owned by the root module, or xp is
 *   non-NULL, this is a UUID conflict and NULL is returned;
 * - otherwise the existing orphan is taken over by 'mp' and returned.
 *
 * The module 'mp' must be locked by the caller on the paths that append to
 * its case list.
 */
fmd_case_t *
fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
    uint_t state, const char *uuid, const char *code)
{
	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
	fmd_case_impl_t *eip;

	(void) pthread_mutex_init(&cip->ci_lock, NULL);
	fmd_buf_hash_create(&cip->ci_bufs);

	fmd_module_hold(mp);
	cip->ci_mod = mp;
	cip->ci_xprt = xp;
	cip->ci_refs = 1;
	cip->ci_state = state;
	cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
	cip->ci_uuidlen = strlen(cip->ci_uuid);
	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
	/* ci_codelen counts the terminating NUL; it is 0 when no code is set */
	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;

	/* Any state beyond CLOSE_WAIT implies the case was solved. */
	if (state > FMD_CASE_CLOSE_WAIT)
		cip->ci_flags |= FMD_CF_SOLVED;

	/*
	 * Insert the case into the global case hash.  If the specified UUID is
	 * already present, check to see if it is an orphan: if so, reclaim it;
	 * otherwise if it is owned by a different module then return NULL.
	 */
	if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
		/*
		 * The freshly built case is not needed.  It never entered the
		 * hash, so destroy it as not visible (B_FALSE); the destroy
		 * routine requires ci_lock held and a zero reference count.
		 */
		(void) pthread_mutex_lock(&cip->ci_lock);
		cip->ci_refs--; /* decrement to zero */
		fmd_case_destroy((fmd_case_t *)cip, B_FALSE);

		cip = eip; /* switch 'cip' to the existing case */
		(void) pthread_mutex_lock(&cip->ci_lock);

		/*
		 * If the ASRU cache is trying to recreate an orphan, then just
		 * return the existing case that we found without changing it.
		 */
		if (mp == fmd.d_rmod) {
			/*
			 * In case the case has already been created from
			 * a checkpoint file we need to set up code now.
			 */
			if (cip->ci_state < FMD_CASE_CLOSED) {
				if (code != NULL && cip->ci_code == NULL) {
					cip->ci_code = fmd_strdup(code,
					    FMD_SLEEP);
					cip->ci_codelen = cip->ci_code ?
					    strlen(cip->ci_code) + 1 : 0;
					fmd_case_code_hash_insert(fmd.d_cases,
					    cip);
				}
			}

			/*
			 * When recreating an orphan case, state passed in may
			 * be CLOSED (faulty) or REPAIRED/RESOLVED (!faulty). If
			 * any suspects are still CLOSED (faulty) then the
			 * overall state needs to be CLOSED.
			 */
			if ((cip->ci_state == FMD_CASE_REPAIRED ||
			    cip->ci_state == FMD_CASE_RESOLVED) &&
			    state == FMD_CASE_CLOSED)
				cip->ci_state = FMD_CASE_CLOSED;
			(void) pthread_mutex_unlock(&cip->ci_lock);
			/* drop the hold taken by fmd_case_hash_insert() */
			fmd_case_rele((fmd_case_t *)cip);
			return ((fmd_case_t *)cip);
		}

		/*
		 * If the existing case isn't an orphan or is being proxied,
		 * then we have a UUID conflict: return failure to the caller.
		 */
		if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele((fmd_case_t *)cip);
			return (NULL);
		}

		/*
		 * If the new module is reclaiming an orphaned case, remove
		 * the case from the root module, switch ci_mod, and then fall
		 * through to adding the case to the new owner module 'mp'.
		 */
		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);

		fmd_module_rele(cip->ci_mod);
		cip->ci_mod = mp;
		fmd_module_hold(mp);

		/*
		 * It's possible that fmd crashed or was restarted during a
		 * previous solve operation between the asru cache being created
		 * and the ckpt file being updated to SOLVED. Thus when the DE
		 * recreates the case here from the checkpoint file, the state
		 * will be UNSOLVED and yet we are having to reclaim because
		 * the case was in the asru cache. If this happens, revert the
		 * case back to the UNSOLVED state and let the DE solve it again
		 */
		if (state == FMD_CASE_UNSOLVED) {
			fmd_asru_hash_delete_case(fmd.d_asrus,
			    (fmd_case_t *)cip);
			fmd_case_destroy_suspects(cip);
			fmd_case_code_hash_delete(fmd.d_cases, cip);
			fmd_free(cip->ci_code, cip->ci_codelen);
			cip->ci_code = NULL;
			cip->ci_codelen = 0;
			cip->ci_tv_valid = 0;
		}

		cip->ci_state = state;

		(void) pthread_mutex_unlock(&cip->ci_lock);
		/* drop the hold taken by fmd_case_hash_insert() */
		fmd_case_rele((fmd_case_t *)cip);
	} else {
		/*
		 * add into hash of solved cases
		 */
		if (cip->ci_code)
			fmd_case_code_hash_insert(fmd.d_cases, cip);
	}

	/* Both the new-case and the reclaimed-orphan paths arrive here. */
	ASSERT(fmd_module_locked(mp));
	fmd_list_append(&mp->mod_cases, cip);

	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

	return ((fmd_case_t *)cip);
}
1465 
1466 void
1467 fmd_case_destroy(fmd_case_t *cp, int visible)
1468 {
1469 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1470 	fmd_case_item_t *cit, *ncit;
1471 
1472 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1473 	ASSERT(cip->ci_refs == 0);
1474 
1475 	if (visible) {
1476 		TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
1477 		fmd_case_hash_delete(fmd.d_cases, cip);
1478 	}
1479 
1480 	for (cit = cip->ci_items; cit != NULL; cit = ncit) {
1481 		ncit = cit->cit_next;
1482 		fmd_event_rele(cit->cit_event);
1483 		fmd_free(cit, sizeof (fmd_case_item_t));
1484 	}
1485 
1486 	fmd_case_destroy_suspects(cip);
1487 
1488 	if (cip->ci_principal != NULL)
1489 		fmd_event_rele(cip->ci_principal);
1490 
1491 	fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
1492 	fmd_free(cip->ci_code, cip->ci_codelen);
1493 	(void) fmd_buf_hash_destroy(&cip->ci_bufs);
1494 
1495 	fmd_module_rele(cip->ci_mod);
1496 	fmd_free(cip, sizeof (fmd_case_impl_t));
1497 }
1498 
1499 void
1500 fmd_case_hold(fmd_case_t *cp)
1501 {
1502 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1503 
1504 	(void) pthread_mutex_lock(&cip->ci_lock);
1505 	fmd_case_hold_locked(cp);
1506 	(void) pthread_mutex_unlock(&cip->ci_lock);
1507 }
1508 
1509 void
1510 fmd_case_hold_locked(fmd_case_t *cp)
1511 {
1512 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1513 
1514 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1515 	if (cip->ci_flags & FMD_CF_DELETING)
1516 		fmd_panic("attempt to hold a deleting case %p (%s)\n",
1517 		    (void *)cip, cip->ci_uuid);
1518 	cip->ci_refs++;
1519 	ASSERT(cip->ci_refs != 0);
1520 }
1521 
1522 static fmd_case_impl_t *
1523 fmd_case_tryhold(fmd_case_impl_t *cip)
1524 {
1525 	/*
1526 	 * If the case's "deleting" bit is unset, hold and return case,
1527 	 * otherwise, return NULL.
1528 	 */
1529 	(void) pthread_mutex_lock(&cip->ci_lock);
1530 	if (cip->ci_flags & FMD_CF_DELETING) {
1531 		(void) pthread_mutex_unlock(&cip->ci_lock);
1532 		cip = NULL;
1533 	} else {
1534 		fmd_case_hold_locked((fmd_case_t *)cip);
1535 		(void) pthread_mutex_unlock(&cip->ci_lock);
1536 	}
1537 	return (cip);
1538 }
1539 
1540 void
1541 fmd_case_rele(fmd_case_t *cp)
1542 {
1543 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1544 
1545 	(void) pthread_mutex_lock(&cip->ci_lock);
1546 	ASSERT(cip->ci_refs != 0);
1547 
1548 	if (--cip->ci_refs == 0)
1549 		fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
1550 	else
1551 		(void) pthread_mutex_unlock(&cip->ci_lock);
1552 }
1553 
1554 void
1555 fmd_case_rele_locked(fmd_case_t *cp)
1556 {
1557 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1558 
1559 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1560 	--cip->ci_refs;
1561 	ASSERT(cip->ci_refs != 0);
1562 }
1563 
1564 int
1565 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep)
1566 {
1567 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1568 	fmd_case_item_t *cit;
1569 	fmd_event_t *oep;
1570 	uint_t state;
1571 	int new;
1572 
1573 	fmd_event_hold(ep);
1574 	(void) pthread_mutex_lock(&cip->ci_lock);
1575 
1576 	if (cip->ci_flags & FMD_CF_SOLVED)
1577 		state = FMD_EVS_DIAGNOSED;
1578 	else
1579 		state = FMD_EVS_ACCEPTED;
1580 
1581 	oep = cip->ci_principal;
1582 	cip->ci_principal = ep;
1583 
1584 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1585 		if (cit->cit_event == ep)
1586 			break;
1587 	}
1588 
1589 	cip->ci_flags |= FMD_CF_DIRTY;
1590 	new = cit == NULL && ep != oep;
1591 
1592 	(void) pthread_mutex_unlock(&cip->ci_lock);
1593 
1594 	fmd_module_setcdirty(cip->ci_mod);
1595 	fmd_event_transition(ep, state);
1596 
1597 	if (oep != NULL)
1598 		fmd_event_rele(oep);
1599 
1600 	return (new);
1601 }
1602 
1603 int
1604 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep)
1605 {
1606 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1607 	fmd_case_item_t *cit;
1608 	uint_t state;
1609 	int new;
1610 	boolean_t injected;
1611 
1612 	(void) pthread_mutex_lock(&cip->ci_lock);
1613 
1614 	if (cip->ci_flags & FMD_CF_SOLVED)
1615 		state = FMD_EVS_DIAGNOSED;
1616 	else
1617 		state = FMD_EVS_ACCEPTED;
1618 
1619 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1620 		if (cit->cit_event == ep)
1621 			break;
1622 	}
1623 
1624 	new = cit == NULL && ep != cip->ci_principal;
1625 
1626 	/*
1627 	 * If the event is already in the case or the case is already solved,
1628 	 * there is no reason to save it: just transition it appropriately.
1629 	 */
1630 	if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
1631 		(void) pthread_mutex_unlock(&cip->ci_lock);
1632 		fmd_event_transition(ep, state);
1633 		return (new);
1634 	}
1635 
1636 	cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
1637 	fmd_event_hold(ep);
1638 
1639 	if (nvlist_lookup_boolean_value(((fmd_event_impl_t *)ep)->ev_nvl,
1640 	    "__injected", &injected) == 0 && injected)
1641 		fmd_case_set_injected(cp);
1642 
1643 	cit->cit_next = cip->ci_items;
1644 	cit->cit_event = ep;
1645 
1646 	cip->ci_items = cit;
1647 	cip->ci_nitems++;
1648 
1649 	cip->ci_flags |= FMD_CF_DIRTY;
1650 	(void) pthread_mutex_unlock(&cip->ci_lock);
1651 
1652 	fmd_module_setcdirty(cip->ci_mod);
1653 	fmd_event_transition(ep, state);
1654 
1655 	return (new);
1656 }
1657 
1658 void
1659 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
1660 {
1661 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1662 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1663 
1664 	(void) pthread_mutex_lock(&cip->ci_lock);
1665 	ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT);
1666 	cip->ci_flags |= FMD_CF_DIRTY;
1667 
1668 	cis->cis_next = cip->ci_suspects;
1669 	cis->cis_nvl = nvl;
1670 
1671 	cip->ci_suspects = cis;
1672 	cip->ci_nsuspects++;
1673 
1674 	(void) pthread_mutex_unlock(&cip->ci_lock);
1675 	if (cip->ci_xprt == NULL)
1676 		fmd_module_setcdirty(cip->ci_mod);
1677 }
1678 
1679 void
1680 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
1681 {
1682 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1683 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1684 	boolean_t b;
1685 
1686 	(void) pthread_mutex_lock(&cip->ci_lock);
1687 
1688 	cis->cis_next = cip->ci_suspects;
1689 	cis->cis_nvl = nvl;
1690 
1691 	if (nvlist_lookup_boolean_value(nvl,
1692 	    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
1693 		cip->ci_flags |= FMD_CF_INVISIBLE;
1694 
1695 	cip->ci_suspects = cis;
1696 	cip->ci_nsuspects++;
1697 
1698 	(void) pthread_mutex_unlock(&cip->ci_lock);
1699 }
1700 
1701 void
1702 fmd_case_reset_suspects(fmd_case_t *cp)
1703 {
1704 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1705 
1706 	(void) pthread_mutex_lock(&cip->ci_lock);
1707 	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
1708 
1709 	fmd_case_destroy_suspects(cip);
1710 	cip->ci_flags |= FMD_CF_DIRTY;
1711 
1712 	(void) pthread_mutex_unlock(&cip->ci_lock);
1713 	fmd_module_setcdirty(cip->ci_mod);
1714 }
1715 
/*
 * Resource cache callback: set FMD_ASRU_UNUSABLE on the given cache link.
 * The 'arg' parameter is not used; it is present only so that the signature
 * matches what fmd_asru_hash_apply_by_case() is given elsewhere in this file.
 */
/*ARGSUSED*/
static void
fmd_case_unusable(fmd_asru_link_t *alp, void *arg)
{
	(void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE);
}
1722 
/*
 * Grab ci_lock and update the case state and set the dirty bit.  Then perform
 * whatever actions and emit whatever events are appropriate for the state.
 * Refer to the topmost block comment explaining the state machine for details.
 *
 * 'state' is the requested new state and must not exceed FMD_CASE_RESOLVED.
 * 'flags' are FMD_CF_* bits to OR into ci_flags.  The state only ever moves
 * forward: if the case is already at or beyond 'state', only the flags are
 * applied and the function returns without publishing anything.
 */
void
fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit;
	fmd_event_t *e;
	int resolved = 0;
	int any_unusable_and_present = 0;

	ASSERT(state <= FMD_CASE_RESOLVED);
	(void) pthread_mutex_lock(&cip->ci_lock);

	/*
	 * A case that is not solved (and is not being marked solved by this
	 * call) cannot be isolated, repaired or resolved: strip those bits
	 * from the request.
	 */
	if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED))
		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED | FMD_CF_RESOLVED);

	cip->ci_flags |= flags;

	if (cip->ci_state >= state) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return; /* already in specified state */
	}

	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));

	cip->ci_state = state;
	cip->ci_flags |= FMD_CF_DIRTY;

	/* Proxied cases and cases owned by the root module are skipped. */
	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
		fmd_module_setcdirty(cip->ci_mod);

	switch (state) {
	case FMD_CASE_SOLVED:
		/* Every event in the case, and the principal, is diagnosed. */
		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);

		if (cip->ci_principal != NULL) {
			fmd_event_transition(cip->ci_principal,
			    FMD_EVS_DIAGNOSED);
		}
		break;

	case FMD_CASE_CLOSE_WAIT:
		/*
		 * If the case was never solved, do not change ASRUs.
		 * If the case was never fmd_case_closed, do not change ASRUs.
		 * If the case was repaired, do not change ASRUs.
		 */
		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
		    FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED))
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_case_unusable, NULL);

		/*
		 * If an orphaned case transitions to CLOSE_WAIT, the owning
		 * module is no longer loaded: continue on to CASE_CLOSED.
		 */
		if (fmd_case_orphaned(cp))
			state = cip->ci_state = FMD_CASE_CLOSED;
		break;

	case FMD_CASE_REPAIRED:
		ASSERT(cip->ci_xprt != NULL || fmd_case_orphaned(cp));

		/*
		 * If we've been requested to transition straight on to the
		 * RESOLVED state (which can happen with fault proxying where a
		 * list.resolved or a uuresolved is received from the other
		 * side), or if all suspects are already either usable or not
		 * present then transition straight to RESOLVED state,
		 * publishing both the list.repaired and list.resolved. For a
		 * proxy, if we discover here that all suspects are already
		 * either usable or not present, notify the diag side instead
		 * using fmd_xprt_uuresolved().
		 */
		if (flags & FMD_CF_RESOLVED) {
			if (cip->ci_xprt != NULL)
				fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		} else {
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_case_unusable_and_present,
			    &any_unusable_and_present);
			if (any_unusable_and_present)
				break;
			if (cip->ci_xprt != NULL) {
				fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid);
				break;
			}
		}

		/*
		 * Going straight on to RESOLVED.  'state' still holds REPAIRED
		 * at this point, so the publish call below emits the
		 * list.repaired event; ci_lock is dropped around that call and
		 * re-taken afterwards.  'state' is then advanced so that the
		 * common code after the switch publishes list.resolved.
		 */
		cip->ci_state = FMD_CASE_RESOLVED;
		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_publish(cp, state);
		TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
		    _fmd_case_snames[FMD_CASE_REPAIRED],
		    _fmd_case_snames[FMD_CASE_RESOLVED]));
		state = FMD_CASE_RESOLVED;
		resolved = 1;
		(void) pthread_mutex_lock(&cip->ci_lock);
		break;

	case FMD_CASE_RESOLVED:
		/*
		 * For a proxy, no need to check that all suspects are already
		 * either usable or not present - this request has come from
		 * the diagnosing side which makes the final decision on this.
		 */
		if (cip->ci_xprt != NULL) {
			fmd_list_delete(&cip->ci_mod->mod_cases, cip);
			resolved = 1;
			break;
		}

		ASSERT(fmd_case_orphaned(cp));

		/*
		 * If all suspects are already either usable or not present then
		 * carry on, publish list.resolved and discard the case.
		 *
		 * Otherwise return without publishing.  Note that ci_state has
		 * already been set to RESOLVED above by the time we get here.
		 */
		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
		    fmd_case_unusable_and_present, &any_unusable_and_present);
		if (any_unusable_and_present) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			return;
		}

		resolved = 1;
		break;
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);

	/*
	 * If the module has initialized, then publish the appropriate event
	 * for the new case state.  If not, we are being called from the
	 * checkpoint code during module load, in which case the module's
	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
	 * may not be open yet, which will prevent us from computing the event
	 * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
	 * event in our queue: this won't be processed until _fmd_init is done.
	 */
	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
		fmd_case_publish(cp, state);
	else {
		fmd_case_hold(cp);
		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
	}

	if (resolved) {
		if (cip->ci_xprt != NULL) {
			/*
			 * If we transitioned to RESOLVED, adjust the reference
			 * count to reflect our removal from
			 * fmd.d_rmod->mod_cases above.  If the caller has not
			 * placed an additional hold on the case, it will now
			 * be freed.
			 *
			 * NOTE(review): the deletions above are from
			 * cip->ci_mod->mod_cases; confirm that ci_mod is
			 * always fmd.d_rmod for a proxied case.
			 */
			(void) pthread_mutex_lock(&cip->ci_lock);
			fmd_asru_hash_delete_case(fmd.d_asrus, cp);
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele(cp);
		} else {
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_asru_log_resolved, NULL);
			(void) pthread_mutex_lock(&cip->ci_lock);
			/* mark as "ready to be discarded" */
			cip->ci_flags |= FMD_CF_RES_CMPL;
			(void) pthread_mutex_unlock(&cip->ci_lock);
		}
	}
}
1900 
1901 /*
1902  * Discard any case if it is in RESOLVED state (and if check_if_aged argument
1903  * is set if all suspects have passed the rsrc.aged time).
1904  */
1905 void
1906 fmd_case_discard_resolved(fmd_case_t *cp, void *arg)
1907 {
1908 	int check_if_aged = *(int *)arg;
1909 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1910 
1911 	/*
1912 	 * First check if case has completed transition to resolved.
1913 	 */
1914 	(void) pthread_mutex_lock(&cip->ci_lock);
1915 	if (!(cip->ci_flags & FMD_CF_RES_CMPL)) {
1916 		(void) pthread_mutex_unlock(&cip->ci_lock);
1917 		return;
1918 	}
1919 
1920 	/*
1921 	 * Now if check_is_aged is set, see if all suspects have aged.
1922 	 */
1923 	if (check_if_aged) {
1924 		int aged = 1;
1925 
1926 		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1927 		    fmd_asru_check_if_aged, &aged);
1928 		if (!aged) {
1929 			(void) pthread_mutex_unlock(&cip->ci_lock);
1930 			return;
1931 		}
1932 	}
1933 
1934 	/*
1935 	 * Finally discard the case, clearing FMD_CF_RES_CMPL so we don't
1936 	 * do it twice.
1937 	 */
1938 	fmd_module_lock(cip->ci_mod);
1939 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1940 	fmd_module_unlock(cip->ci_mod);
1941 	fmd_asru_hash_delete_case(fmd.d_asrus, cp);
1942 	cip->ci_flags &= ~FMD_CF_RES_CMPL;
1943 	(void) pthread_mutex_unlock(&cip->ci_lock);
1944 	fmd_case_rele(cp);
1945 }
1946 
1947 /*
1948  * Transition the specified case to *at least* the specified state by first
1949  * re-validating the suspect list using the resource cache.  This function is
1950  * employed by the checkpoint code when restoring a saved, solved case to see
1951  * if the state of the case has effectively changed while fmd was not running
1952  * or the module was not loaded.
1953  */
1954 void
1955 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
1956 {
1957 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1958 
1959 	int usable = 0;		/* are any suspects usable? */
1960 
1961 	ASSERT(state >= FMD_CASE_SOLVED);
1962 	(void) pthread_mutex_lock(&cip->ci_lock);
1963 
1964 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable);
1965 
1966 	(void) pthread_mutex_unlock(&cip->ci_lock);
1967 
1968 	if (!usable) {
1969 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
1970 		flags |= FMD_CF_ISOLATED;
1971 	}
1972 
1973 	fmd_case_transition(cp, state, flags);
1974 }
1975 
1976 void
1977 fmd_case_setdirty(fmd_case_t *cp)
1978 {
1979 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1980 
1981 	(void) pthread_mutex_lock(&cip->ci_lock);
1982 	cip->ci_flags |= FMD_CF_DIRTY;
1983 	(void) pthread_mutex_unlock(&cip->ci_lock);
1984 
1985 	fmd_module_setcdirty(cip->ci_mod);
1986 }
1987 
1988 void
1989 fmd_case_clrdirty(fmd_case_t *cp)
1990 {
1991 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1992 
1993 	(void) pthread_mutex_lock(&cip->ci_lock);
1994 	cip->ci_flags &= ~FMD_CF_DIRTY;
1995 	(void) pthread_mutex_unlock(&cip->ci_lock);
1996 }
1997 
1998 void
1999 fmd_case_commit(fmd_case_t *cp)
2000 {
2001 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2002 	fmd_case_item_t *cit;
2003 
2004 	(void) pthread_mutex_lock(&cip->ci_lock);
2005 
2006 	if (cip->ci_flags & FMD_CF_DIRTY) {
2007 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
2008 			fmd_event_commit(cit->cit_event);
2009 
2010 		if (cip->ci_principal != NULL)
2011 			fmd_event_commit(cip->ci_principal);
2012 
2013 		fmd_buf_hash_commit(&cip->ci_bufs);
2014 		cip->ci_flags &= ~FMD_CF_DIRTY;
2015 	}
2016 
2017 	(void) pthread_mutex_unlock(&cip->ci_lock);
2018 }
2019 
2020 /*
2021  * On proxy side, send back repair/acquit/etc request to diagnosing side
2022  */
2023 void
2024 fmd_case_xprt_updated(fmd_case_t *cp)
2025 {
2026 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2027 	nvlist_t **nva;
2028 	uint8_t *ba;
2029 	int msg = B_TRUE;
2030 	int count = 0;
2031 	fmd_case_lst_t fcl;
2032 
2033 	ASSERT(cip->ci_xprt != NULL);
2034 	(void) pthread_mutex_lock(&cip->ci_lock);
2035 	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
2036 	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
2037 	fcl.fcl_countp = &count;
2038 	fcl.fcl_maxcount = cip->ci_nsuspects;
2039 	fcl.fcl_msgp = &msg;
2040 	fcl.fcl_ba = ba;
2041 	fcl.fcl_nva = nva;
2042 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
2043 	(void) pthread_mutex_unlock(&cip->ci_lock);
2044 	fmd_xprt_updated(cip->ci_xprt, cip->ci_uuid, ba, cip->ci_proxy_asru,
2045 	    count);
2046 }
2047 
2048 /*
2049  * fmd_case_update_status() can be called on either the proxy side when a
2050  * list.suspect is received, or on the diagnosing side when an update request
2051  * is received from the proxy. It updates the status in the resource cache.
2052  */
2053 void
2054 fmd_case_update_status(fmd_case_t *cp, uint8_t *statusp, uint8_t *proxy_asrup,
2055     uint8_t *diag_asrup)
2056 {
2057 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2058 	int count = 0;
2059 	fmd_asru_update_status_t faus;
2060 
2061 	/*
2062 	 * update status of resource cache entries
2063 	 */
2064 	faus.faus_countp = &count;
2065 	faus.faus_maxcount = cip->ci_nsuspects;
2066 	faus.faus_ba = statusp;
2067 	faus.faus_proxy_asru = proxy_asrup;
2068 	faus.faus_diag_asru = diag_asrup;
2069 	faus.faus_is_proxy = (cip->ci_xprt != NULL);
2070 	(void) pthread_mutex_lock(&cip->ci_lock);
2071 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_update_status,
2072 	    &faus);
2073 	(void) pthread_mutex_unlock(&cip->ci_lock);
2074 }
2075 
2076 /*
2077  * Called on either the proxy side or the diag side when a repair has taken
2078  * place on the other side but this side may know the asru "contains"
2079  * relationships.
2080  */
2081 void
2082 fmd_case_update_containees(fmd_case_t *cp)
2083 {
2084 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2085 
2086 	(void) pthread_mutex_lock(&cip->ci_lock);
2087 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2088 	    fmd_asru_update_containees, NULL);
2089 	(void) pthread_mutex_unlock(&cip->ci_lock);
2090 }
2091 
2092 /*
2093  * fmd_case_close_status() is called on diagnosing side when proxy side
2094  * has had a uuclose. It updates the status in the resource cache.
2095  */
2096 void
2097 fmd_case_close_status(fmd_case_t *cp)
2098 {
2099 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2100 	int count = 0;
2101 	fmd_asru_close_status_t facs;
2102 
2103 	/*
2104 	 * update status of resource cache entries
2105 	 */
2106 	facs.facs_countp = &count;
2107 	facs.facs_maxcount = cip->ci_nsuspects;
2108 	(void) pthread_mutex_lock(&cip->ci_lock);
2109 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_close_status,
2110 	    &facs);
2111 	(void) pthread_mutex_unlock(&cip->ci_lock);
2112 }
2113 
2114 /*
2115  * Indicate that the case may need to change state because one or more of the
2116  * ASRUs named as a suspect has changed state.  We examine all the suspects
2117  * and if none are still faulty, we initiate a case close transition.
2118  */
2119 void
2120 fmd_case_update(fmd_case_t *cp)
2121 {
2122 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2123 	uint_t cstate;
2124 	int faulty = 0;
2125 
2126 	(void) pthread_mutex_lock(&cip->ci_lock);
2127 	cstate = cip->ci_state;
2128 
2129 	if (cip->ci_state < FMD_CASE_SOLVED) {
2130 		(void) pthread_mutex_unlock(&cip->ci_lock);
2131 		return; /* update is not appropriate */
2132 	}
2133 
2134 	if (cip->ci_flags & FMD_CF_REPAIRED) {
2135 		(void) pthread_mutex_unlock(&cip->ci_lock);
2136 		return; /* already repaired */
2137 	}
2138 
2139 	TRACE((FMD_DBG_CASE, "case update %s", cip->ci_uuid));
2140 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
2141 	(void) pthread_mutex_unlock(&cip->ci_lock);
2142 
2143 	if (faulty) {
2144 		nvlist_t *nvl;
2145 		fmd_event_t *e;
2146 		char *class;
2147 
2148 		TRACE((FMD_DBG_CASE, "sending list.updated %s", cip->ci_uuid));
2149 		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
2150 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2151 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
2152 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
2153 		fmd_log_append(fmd.d_fltlog, e, cp);
2154 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
2155 		fmd_dispq_dispatch(fmd.d_disp, e, class);
2156 		return; /* one or more suspects are still marked faulty */
2157 	}
2158 
2159 	if (cstate == FMD_CASE_CLOSED)
2160 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2161 	else
2162 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2163 }
2164 
2165 /*
2166  * Delete a closed case from the module's case list once the fmdo_close() entry
2167  * point has run to completion.  If the case is owned by a transport module,
2168  * tell the transport to proxy a case close on the other end of the transport.
2169  * Transition to the appropriate next state based on ci_flags.  This
2170  * function represents the end of CLOSE_WAIT and transitions the case to either
2171  * CLOSED or REPAIRED or discards it entirely because it was never solved;
2172  * refer to the topmost block comment explaining the state machine for details.
2173  */
2174 void
2175 fmd_case_delete(fmd_case_t *cp)
2176 {
2177 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2178 	fmd_modstat_t *msp;
2179 	size_t buftotal;
2180 
2181 	TRACE((FMD_DBG_CASE, "case delete %s", cip->ci_uuid));
2182 	ASSERT(fmd_module_locked(cip->ci_mod));
2183 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
2184 	buftotal = fmd_buf_hash_destroy(&cip->ci_bufs);
2185 
2186 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
2187 	msp = cip->ci_mod->mod_stats;
2188 
2189 	ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0);
2190 	msp->ms_caseopen.fmds_value.ui64--;
2191 
2192 	ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal);
2193 	msp->ms_buftotal.fmds_value.ui64 -= buftotal;
2194 
2195 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
2196 
2197 	if (cip->ci_xprt == NULL)
2198 		fmd_module_setcdirty(cip->ci_mod);
2199 
2200 	fmd_module_rele(cip->ci_mod);
2201 	cip->ci_mod = fmd.d_rmod;
2202 	fmd_module_hold(cip->ci_mod);
2203 
2204 	/*
2205 	 * If the case has been solved, then retain it
2206 	 * on the root module's case list at least until we're transitioned.
2207 	 * Otherwise free the case with our final fmd_case_rele() below.
2208 	 */
2209 	if (cip->ci_flags & FMD_CF_SOLVED) {
2210 		fmd_module_lock(cip->ci_mod);
2211 		fmd_list_append(&cip->ci_mod->mod_cases, cip);
2212 		fmd_module_unlock(cip->ci_mod);
2213 		fmd_case_hold(cp);
2214 	}
2215 
2216 	/*
2217 	 * Transition onwards to REPAIRED or CLOSED as originally requested.
2218 	 * Note that for proxy case if we're transitioning to CLOSED it means
2219 	 * the case was isolated locally, so call fmd_xprt_uuclose() to notify
2220 	 * the diagnosing side. No need to notify the diagnosing side if we are
2221 	 * transitioning to REPAIRED as we only do this when requested to do
2222 	 * so by the diagnosing side anyway.
2223 	 */
2224 	if (cip->ci_flags & FMD_CF_REPAIRED)
2225 		fmd_case_transition(cp, FMD_CASE_REPAIRED, 0);
2226 	else if (cip->ci_flags & FMD_CF_ISOLATED) {
2227 		fmd_case_transition(cp, FMD_CASE_CLOSED, 0);
2228 		if (cip->ci_xprt != NULL)
2229 			fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid);
2230 	}
2231 
2232 	fmd_case_rele(cp);
2233 }
2234 
2235 void
2236 fmd_case_discard(fmd_case_t *cp, boolean_t delete_from_asru_cache)
2237 {
2238 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2239 
2240 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
2241 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--;
2242 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
2243 
2244 	ASSERT(fmd_module_locked(cip->ci_mod));
2245 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
2246 	if (delete_from_asru_cache) {
2247 		(void) pthread_mutex_lock(&cip->ci_lock);
2248 		fmd_asru_hash_delete_case(fmd.d_asrus, cp);
2249 		(void) pthread_mutex_unlock(&cip->ci_lock);
2250 	}
2251 	fmd_case_rele(cp);
2252 }
2253 
2254 /*
2255  * Indicate that the problem corresponding to a case has been repaired by
2256  * clearing the faulty bit on each ASRU named as a suspect.  If the case hasn't
2257  * already been closed, this function initiates the transition to CLOSE_WAIT.
2258  * The caller must have the case held from fmd_case_hash_lookup(), so we can
2259  * grab and drop ci_lock without the case being able to be freed in between.
2260  */
2261 int
2262 fmd_case_repair(fmd_case_t *cp)
2263 {
2264 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2265 	uint_t cstate;
2266 	fmd_asru_rep_arg_t fara;
2267 
2268 	(void) pthread_mutex_lock(&cip->ci_lock);
2269 	cstate = cip->ci_state;
2270 
2271 	if (cstate < FMD_CASE_SOLVED) {
2272 		(void) pthread_mutex_unlock(&cip->ci_lock);
2273 		return (fmd_set_errno(EFMD_CASE_STATE));
2274 	}
2275 
2276 	if (cip->ci_flags & FMD_CF_REPAIRED) {
2277 		(void) pthread_mutex_unlock(&cip->ci_lock);
2278 		return (0); /* already repaired */
2279 	}
2280 
2281 	TRACE((FMD_DBG_CASE, "case repair %s", cip->ci_uuid));
2282 	fara.fara_reason = FMD_ASRU_REPAIRED;
2283 	fara.fara_bywhat = FARA_BY_CASE;
2284 	fara.fara_rval = NULL;
2285 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara);
2286 	(void) pthread_mutex_unlock(&cip->ci_lock);
2287 
2288 	/*
2289 	 * if this is a proxied case, send the repair across the transport.
2290 	 * The remote side will then do the repair and send a list.repaired back
2291 	 * again such that we can finally repair the case on this side.
2292 	 */
2293 	if (cip->ci_xprt != NULL) {
2294 		fmd_case_xprt_updated(cp);
2295 		return (0);
2296 	}
2297 
2298 	if (cstate == FMD_CASE_CLOSED)
2299 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2300 	else
2301 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2302 
2303 	return (0);
2304 }
2305 
2306 int
2307 fmd_case_acquit(fmd_case_t *cp)
2308 {
2309 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2310 	uint_t cstate;
2311 	fmd_asru_rep_arg_t fara;
2312 
2313 	(void) pthread_mutex_lock(&cip->ci_lock);
2314 	cstate = cip->ci_state;
2315 
2316 	if (cstate < FMD_CASE_SOLVED) {
2317 		(void) pthread_mutex_unlock(&cip->ci_lock);
2318 		return (fmd_set_errno(EFMD_CASE_STATE));
2319 	}
2320 
2321 	if (cip->ci_flags & FMD_CF_REPAIRED) {
2322 		(void) pthread_mutex_unlock(&cip->ci_lock);
2323 		return (0); /* already repaired */
2324 	}
2325 
2326 	TRACE((FMD_DBG_CASE, "case acquit %s", cip->ci_uuid));
2327 	fara.fara_reason = FMD_ASRU_ACQUITTED;
2328 	fara.fara_bywhat = FARA_BY_CASE;
2329 	fara.fara_rval = NULL;
2330 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara);
2331 	(void) pthread_mutex_unlock(&cip->ci_lock);
2332 
2333 	/*
2334 	 * if this is a proxied case, send the repair across the transport.
2335 	 * The remote side will then do the repair and send a list.repaired back
2336 	 * again such that we can finally repair the case on this side.
2337 	 */
2338 	if (cip->ci_xprt != NULL) {
2339 		fmd_case_xprt_updated(cp);
2340 		return (0);
2341 	}
2342 
2343 	if (cstate == FMD_CASE_CLOSED)
2344 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2345 	else
2346 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2347 
2348 	return (0);
2349 }
2350 
2351 int
2352 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep)
2353 {
2354 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2355 	fmd_case_item_t *cit;
2356 	uint_t state;
2357 	int rv = 0;
2358 
2359 	(void) pthread_mutex_lock(&cip->ci_lock);
2360 
2361 	if (cip->ci_state >= FMD_CASE_SOLVED)
2362 		state = FMD_EVS_DIAGNOSED;
2363 	else
2364 		state = FMD_EVS_ACCEPTED;
2365 
2366 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
2367 		if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0)
2368 			break;
2369 	}
2370 
2371 	if (rv == 0 && cip->ci_principal != NULL)
2372 		rv = fmd_event_equal(ep, cip->ci_principal);
2373 
2374 	(void) pthread_mutex_unlock(&cip->ci_lock);
2375 
2376 	if (rv != 0)
2377 		fmd_event_transition(ep, state);
2378 
2379 	return (rv);
2380 }
2381 
2382 int
2383 fmd_case_orphaned(fmd_case_t *cp)
2384 {
2385 	return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod);
2386 }
2387 
2388 void
2389 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec)
2390 {
2391 	((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec;
2392 	((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec;
2393 	((fmd_case_impl_t *)cp)->ci_tv_valid = 1;
2394 }
2395 
2396 void
2397 fmd_case_set_injected(fmd_case_t *cp)
2398 {
2399 	((fmd_case_impl_t *)cp)->ci_injected = 1;
2400 }
2401 
2402 void
2403 fmd_case_set_de_fmri(fmd_case_t *cp, nvlist_t *nvl)
2404 {
2405 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2406 
2407 	if (cip->ci_diag_de)
2408 		nvlist_free(cip->ci_diag_de);
2409 	cip->ci_diag_de = nvl;
2410 }
2411 
2412 void
2413 fmd_case_setcode(fmd_case_t *cp, char *code)
2414 {
2415 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2416 
2417 	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
2418 	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;
2419 }
2420 
2421 /*ARGSUSED*/
2422 static void
2423 fmd_case_repair_replay_case(fmd_case_t *cp, void *arg)
2424 {
2425 	int not_faulty = 0;
2426 	int faulty = 0;
2427 	nvlist_t *nvl;
2428 	fmd_event_t *e;
2429 	char *class;
2430 	int any_unusable_and_present = 0;
2431 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2432 
2433 	if (cip->ci_state < FMD_CASE_SOLVED || cip->ci_xprt != NULL)
2434 		return;
2435 
2436 	if (cip->ci_state == FMD_CASE_RESOLVED) {
2437 		cip->ci_flags |= FMD_CF_RES_CMPL;
2438 		return;
2439 	}
2440 
2441 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
2442 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty,
2443 	    &not_faulty);
2444 
2445 	if (cip->ci_state >= FMD_CASE_REPAIRED && !faulty) {
2446 		/*
2447 		 * If none of the suspects is faulty, replay the list.repaired.
2448 		 * If all suspects are already either usable or not present then
2449 		 * also transition straight to RESOLVED state.
2450 		 */
2451 		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2452 		    fmd_case_unusable_and_present, &any_unusable_and_present);
2453 		if (!any_unusable_and_present) {
2454 			cip->ci_state = FMD_CASE_RESOLVED;
2455 
2456 			TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
2457 			    cip->ci_uuid));
2458 			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
2459 			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2460 			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
2461 			    class);
2462 			fmd_dispq_dispatch(fmd.d_disp, e, class);
2463 
2464 			TRACE((FMD_DBG_CASE, "replay sending list.resolved %s",
2465 			    cip->ci_uuid));
2466 			fmd_case_publish(cp, FMD_CASE_RESOLVED);
2467 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2468 			    fmd_asru_log_resolved, NULL);
2469 			cip->ci_flags |= FMD_CF_RES_CMPL;
2470 		} else {
2471 			TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
2472 			    cip->ci_uuid));
2473 			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
2474 			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2475 			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
2476 			    class);
2477 			fmd_dispq_dispatch(fmd.d_disp, e, class);
2478 		}
2479 	} else if (faulty && not_faulty) {
2480 		/*
2481 		 * if some but not all of the suspects are not faulty, replay
2482 		 * the list.updated.
2483 		 */
2484 		TRACE((FMD_DBG_CASE, "replay sending list.updated %s",
2485 		    cip->ci_uuid));
2486 		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
2487 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2488 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
2489 		fmd_dispq_dispatch(fmd.d_disp, e, class);
2490 	}
2491 }
2492 
2493 void
2494 fmd_case_repair_replay()
2495 {
2496 	fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL);
2497 }
2498