xref: /titanic_51/usr/src/cmd/fm/fmd/common/fmd_case.c (revision b8201470142151ac3303d2d0b875fc282299de45)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * FMD Case Subsystem
29  *
30  * Diagnosis engines are expected to group telemetry events related to the
31  * diagnosis of a particular problem on the system into a set of cases.  The
32  * diagnosis engine may have any number of cases open at a given point in time.
33  * Some cases may eventually be *solved* by associating a suspect list of one
34  * or more problems with the case, at which point fmd publishes a list.suspect
35  * event for the case and it becomes visible to administrators and agents.
36  *
37  * Every case is named using a UUID, and is globally visible in the case hash.
38  * Cases are reference-counted, except for the reference from the case hash
39  * itself.  Consumers of case references include modules, which store active
40  * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
41  *
42  * Cases obey the following state machine.  In states UNSOLVED, SOLVED, and
43  * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
44  * or transport) and the case is referenced by the mod_cases list.  Once the
45  * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
46  * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
47  *
48  *			+------------+
49  *	     +----------|  UNSOLVED  |
50  *	     |		+------------+
51  *	     |		      1 |
52  *	     |			|
53  *	     |		+-------v----+
54  *	   2 |		|    SOLVED  |
55  *	     |		+------------+
56  *	     |		    3 |  5 |
57  *	     +------------+   |    |
58  *			  |   |    |
59  *			+-v---v----v-+
60  *			| CLOSE_WAIT |
61  *			+------------+
62  *			  |   |    |
63  *	      +-----------+   |    +------------+
64  *	      |		    4 |			|
65  *	      v		+-----v------+		|
66  *	   discard      |   CLOSED   |	      6	|
67  *			+------------+		|
68  *			      |			|
69  *			      |	   +------------+
70  *			    7 |	   |
71  *			+-----v----v-+
72  *			|  REPAIRED  |
73  *			+------------+
74  *			      |
75  *			    8 |
76  *			+-----v------+
77  *			|  RESOLVED  |
78  *			+------------+
79  *			      |
80  *			      v
81  *			   discard
82  *
83  * The state machine changes are triggered by calls to fmd_case_transition()
84  * from various locations inside of fmd, as described below:
85  *
86  * [1] Called by: fmd_case_solve()
87  *       Actions: FMD_CF_SOLVED flag is set in ci_flags
88  *                conviction policy is applied to suspect list
89  *                suspects convicted are marked faulty (F) in R$
90  *                list.suspect event logged and dispatched
91  *
92  * [2] Called by: fmd_case_close(), fmd_case_uuclose()
93  *       Actions: diagnosis engine fmdo_close() entry point scheduled
94  *                case discarded upon exit from CLOSE_WAIT
95  *
96  * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
97  *       Actions: FMD_CF_ISOLATED flag is set in ci_flags
98  *                suspects convicted (F) are marked unusable (U) in R$
99  *                diagnosis engine fmdo_close() entry point scheduled
100  *                case transitions to CLOSED [4] upon exit from CLOSE_WAIT
101  *
102  * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
103  *       Actions: list.isolated event dispatched
104  *                case deleted from module's list of open cases
105  *
106  * [5] Called by: fmd_case_repair(), fmd_case_update()
107  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
108  *                diagnosis engine fmdo_close() entry point scheduled
109  *                case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
110  *
111  * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
112  *       Actions: suspects convicted are marked non faulty (!F) in R$
113  *                list.repaired or list.updated event dispatched
114  *
115  * [7] Called by: fmd_case_repair(), fmd_case_update()
116  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
117  *                suspects convicted are marked non faulty (!F) in R$
118  *                list.repaired or list.updated event dispatched
119  *
120  * [8] Called by: fmd_case_uuresolve()
121  *       Actions: list.resolved event dispatched
122  *		  case is discarded
123  */
124 
125 #include <sys/fm/protocol.h>
126 #include <uuid/uuid.h>
127 #include <alloca.h>
128 
129 #include <fmd_alloc.h>
130 #include <fmd_module.h>
131 #include <fmd_error.h>
132 #include <fmd_conf.h>
133 #include <fmd_case.h>
134 #include <fmd_string.h>
135 #include <fmd_subr.h>
136 #include <fmd_protocol.h>
137 #include <fmd_event.h>
138 #include <fmd_eventq.h>
139 #include <fmd_dispq.h>
140 #include <fmd_buf.h>
141 #include <fmd_log.h>
142 #include <fmd_asru.h>
143 #include <fmd_fmri.h>
144 #include <fmd_xprt.h>
145 
146 #include <fmd.h>
147 
/*
 * Printable names for the case states.  Each entry is annotated with the
 * FMD_CASE_* state it names; presumably the table is indexed by the numeric
 * state value (the code that indexes it is not in this part of the file --
 * confirm before reordering entries).
 */
static const char *const _fmd_case_snames[] = {
	"UNSOLVED",	/* FMD_CASE_UNSOLVED */
	"SOLVED",	/* FMD_CASE_SOLVED */
	"CLOSE_WAIT",	/* FMD_CASE_CLOSE_WAIT */
	"CLOSED",	/* FMD_CASE_CLOSED */
	"REPAIRED",	/* FMD_CASE_REPAIRED */
	"RESOLVED"	/* FMD_CASE_RESOLVED */
};
156 
157 static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *);
158 
159 fmd_case_hash_t *
160 fmd_case_hash_create(void)
161 {
162 	fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP);
163 
164 	(void) pthread_rwlock_init(&chp->ch_lock, NULL);
165 	chp->ch_hashlen = fmd.d_str_buckets;
166 	chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP);
167 	chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen,
168 	    FMD_SLEEP);
169 	chp->ch_count = 0;
170 
171 	return (chp);
172 }
173 
174 /*
175  * Destroy the case hash.  Unlike most of our hash tables, no active references
176  * are kept by the case hash itself; all references come from other subsystems.
177  * The hash must be destroyed after all modules are unloaded; if anything was
178  * present in the hash it would be by definition a reference count leak.
179  */
180 void
181 fmd_case_hash_destroy(fmd_case_hash_t *chp)
182 {
183 	fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen);
184 	fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen);
185 	fmd_free(chp, sizeof (fmd_case_hash_t));
186 }
187 
188 /*
189  * Take a snapshot of the case hash by placing an additional hold on each
190  * member in an auxiliary array, and then call 'func' for each case.
191  */
192 void
193 fmd_case_hash_apply(fmd_case_hash_t *chp,
194     void (*func)(fmd_case_t *, void *), void *arg)
195 {
196 	fmd_case_impl_t *cp, **cps, **cpp;
197 	uint_t cpc, i;
198 
199 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
200 
201 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
202 	cpc = chp->ch_count;
203 
204 	for (i = 0; i < chp->ch_hashlen; i++) {
205 		for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next)
206 			*cpp++ = fmd_case_tryhold(cp);
207 	}
208 
209 	ASSERT(cpp == cps + cpc);
210 	(void) pthread_rwlock_unlock(&chp->ch_lock);
211 
212 	for (i = 0; i < cpc; i++) {
213 		if (cps[i] != NULL) {
214 			func((fmd_case_t *)cps[i], arg);
215 			fmd_case_rele((fmd_case_t *)cps[i]);
216 		}
217 	}
218 
219 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
220 }
221 
222 static void
223 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
224 {
225 	uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
226 
227 	cip->ci_code_next = chp->ch_code_hash[h];
228 	chp->ch_code_hash[h] = cip;
229 }
230 
231 static void
232 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
233 {
234 	fmd_case_impl_t **pp, *cp;
235 
236 	if (cip->ci_code) {
237 		uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
238 
239 		pp = &chp->ch_code_hash[h];
240 		for (cp = *pp; cp != NULL; cp = cp->ci_code_next) {
241 			if (cp != cip)
242 				pp = &cp->ci_code_next;
243 			else
244 				break;
245 		}
246 		if (cp != NULL) {
247 			*pp = cp->ci_code_next;
248 			cp->ci_code_next = NULL;
249 		}
250 	}
251 }
252 
253 /*
254  * Look up the diagcode for this case and cache it in ci_code.  If no suspects
255  * were defined for this case or if the lookup fails, the event dictionary or
256  * module code is broken, and we set the event code to a precomputed default.
257  */
258 static const char *
259 fmd_case_mkcode(fmd_case_t *cp)
260 {
261 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
262 	fmd_case_susp_t *cis;
263 	fmd_case_hash_t *chp = fmd.d_cases;
264 
265 	char **keys, **keyp;
266 	const char *s;
267 
268 	ASSERT(MUTEX_HELD(&cip->ci_lock));
269 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
270 
271 	/*
272 	 * delete any existing entry from code hash if it is on it
273 	 */
274 	fmd_case_code_hash_delete(chp, cip);
275 
276 	fmd_free(cip->ci_code, cip->ci_codelen);
277 	cip->ci_codelen = cip->ci_mod->mod_codelen;
278 	cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
279 	keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1));
280 
281 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
282 		if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0)
283 			keyp++;
284 	}
285 
286 	*keyp = NULL; /* mark end of keys[] array for libdiagcode */
287 
288 	if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code(
289 	    cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) {
290 		(void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s);
291 		fmd_free(cip->ci_code, cip->ci_codelen);
292 		cip->ci_codelen = strlen(s) + 1;
293 		cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
294 		(void) strcpy(cip->ci_code, s);
295 	}
296 
297 	/*
298 	 * add into hash of solved cases
299 	 */
300 	fmd_case_code_hash_insert(chp, cip);
301 
302 	return (cip->ci_code);
303 }
304 
/*
 * Argument block handed to fmd_case_set_lst(), which is applied to each ASRU
 * link of a case while fmd_case_mkevent() assembles a list.* event.
 */
typedef struct {
	int	*fcl_countp;	/* in/out: number of slots filled so far */
	int	fcl_maxcount;	/* capacity of fcl_ba[] and fcl_nva[] */
	uint8_t *fcl_ba;	/* out: FM_SUSPECT_* status bits per slot */
	nvlist_t **fcl_nva;	/* out: fault event nvlist per slot */
	int	*fcl_msgp;	/* out: B_FALSE if a suspect declines msgs */
} fmd_case_lst_t;
312 
313 static void
314 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg)
315 {
316 	fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg;
317 	boolean_t b;
318 	int state;
319 
320 	if (*entryp->fcl_countp >= entryp->fcl_maxcount)
321 		return;
322 	if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE,
323 	    &b) == 0 && b == B_FALSE)
324 		*entryp->fcl_msgp = B_FALSE;
325 	entryp->fcl_ba[*entryp->fcl_countp] = 0;
326 	state = fmd_asru_al_getstate(alp);
327 	if (state & FMD_ASRU_DEGRADED)
328 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED;
329 	if (state & FMD_ASRU_UNUSABLE)
330 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE;
331 	if (state & FMD_ASRU_FAULTY)
332 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY;
333 	if (!(state & FMD_ASRU_PRESENT))
334 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT;
335 	if (alp->al_reason == FMD_ASRU_REPAIRED)
336 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED;
337 	else if (alp->al_reason == FMD_ASRU_REPLACED)
338 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED;
339 	else if (alp->al_reason == FMD_ASRU_ACQUITTED)
340 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED;
341 	entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event;
342 	(*entryp->fcl_countp)++;
343 }
344 
345 static void
346 fmd_case_faulty(fmd_asru_link_t *alp, void *arg)
347 {
348 	int *faultyp = (int *)arg;
349 
350 	*faultyp |= (alp->al_flags & FMD_ASRU_FAULTY);
351 }
352 
353 static void
354 fmd_case_usable(fmd_asru_link_t *alp, void *arg)
355 {
356 	int *usablep = (int *)arg;
357 
358 	*usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE);
359 }
360 
361 static void
362 fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg)
363 {
364 	int *not_faultyp = (int *)arg;
365 
366 	*not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY);
367 }
368 
369 /*
370  * Have we got any suspects with an asru that are still unusable and present?
371  */
372 static void
373 fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg)
374 {
375 	int *rvalp = (int *)arg;
376 	int state;
377 	nvlist_t *asru;
378 
379 	/*
380 	 * if this a proxy case and this suspect doesn't have an local asru
381 	 * then state is unknown so we must assume it may still be unusable.
382 	 */
383 	if ((alp->al_flags & FMD_ASRU_PROXY) &&
384 	    !(alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU)) {
385 		*rvalp |= B_TRUE;
386 		return;
387 	}
388 
389 	state = fmd_asru_al_getstate(alp);
390 	if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0)
391 		return;
392 	*rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT));
393 }
394 
395 nvlist_t *
396 fmd_case_mkevent(fmd_case_t *cp, const char *class)
397 {
398 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
399 	nvlist_t **nva, *nvl;
400 	uint8_t *ba;
401 	int msg = B_TRUE;
402 	const char *code;
403 	fmd_case_lst_t fcl;
404 	int count = 0;
405 
406 	(void) pthread_mutex_lock(&cip->ci_lock);
407 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
408 
409 	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
410 	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
411 
412 	/*
413 	 * For each suspect associated with the case, store its fault event
414 	 * nvlist in 'nva'.  We also look to see if any of the suspect faults
415 	 * have asked not to be messaged.  If any of them have made such a
416 	 * request, propagate that attribute to the composite list.* event.
417 	 * Finally, store each suspect's faulty status into the bitmap 'ba'.
418 	 */
419 	fcl.fcl_countp = &count;
420 	fcl.fcl_maxcount = cip->ci_nsuspects;
421 	fcl.fcl_msgp = &msg;
422 	fcl.fcl_ba = ba;
423 	fcl.fcl_nva = nva;
424 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
425 
426 	if (cip->ci_code == NULL)
427 		(void) fmd_case_mkcode(cp);
428 	/*
429 	 * For repair and updated event, we lookup diagcode from dict using key
430 	 * "list.repaired" or "list.updated" or "list.resolved".
431 	 */
432 	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
433 		(void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code);
434 	else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
435 		(void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code);
436 	else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0)
437 		(void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code);
438 	else
439 		code = cip->ci_code;
440 
441 	if (msg == B_FALSE)
442 		cip->ci_flags |= FMD_CF_INVISIBLE;
443 
444 	/*
445 	 * Use the ci_diag_de if one has been saved (eg for an injected fault).
446 	 * Otherwise use the authority for the current module.
447 	 */
448 	nvl = fmd_protocol_list(class, cip->ci_diag_de == NULL ?
449 	    cip->ci_mod->mod_fmri : cip->ci_diag_de, cip->ci_uuid, code, count,
450 	    nva, ba, msg, &cip->ci_tv);
451 
452 	(void) pthread_mutex_unlock(&cip->ci_lock);
453 	return (nvl);
454 }
455 
456 static boolean_t
457 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem)
458 {
459 	nvlist_t *new_rsrc;
460 	nvlist_t *rsrc;
461 	char *new_name = NULL;
462 	char *name = NULL;
463 	ssize_t new_namelen;
464 	ssize_t namelen;
465 	int fmri_present = 1;
466 	int new_fmri_present = 1;
467 	int match = B_FALSE;
468 	fmd_topo_t *ftp = fmd_topo_hold();
469 
470 	if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0)
471 		fmri_present = 0;
472 	else {
473 		if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1)
474 			goto done;
475 		name = fmd_alloc(namelen + 1, FMD_SLEEP);
476 		if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1)
477 			goto done;
478 	}
479 	if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0)
480 		new_fmri_present = 0;
481 	else {
482 		if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1)
483 			goto done;
484 		new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP);
485 		if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1)
486 			goto done;
487 	}
488 	match = (fmri_present == new_fmri_present &&
489 	    (fmri_present == 0 ||
490 	    topo_fmri_strcmp(ftp->ft_hdl, name, new_name)));
491 done:
492 	if (name != NULL)
493 		fmd_free(name, namelen + 1);
494 	if (new_name != NULL)
495 		fmd_free(new_name, new_namelen + 1);
496 	fmd_topo_rele(ftp);
497 	return (match);
498 }
499 
500 static int
501 fmd_case_match_suspect(fmd_case_susp_t *cis, fmd_case_susp_t *xcis)
502 {
503 	char *class, *new_class;
504 
505 	if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_ASRU))
506 		return (0);
507 	if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl,
508 	    FM_FAULT_RESOURCE))
509 		return (0);
510 	if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_FRU))
511 		return (0);
512 	(void) nvlist_lookup_string(xcis->cis_nvl, FM_CLASS, &class);
513 	(void) nvlist_lookup_string(cis->cis_nvl, FM_CLASS, &new_class);
514 	return (strcmp(class, new_class) == 0);
515 }
516 
517 /*
518  * see if an identical suspect list already exists in the cache
519  */
520 static int
521 fmd_case_check_for_dups(fmd_case_t *cp)
522 {
523 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp, *xcip;
524 	fmd_case_hash_t *chp = fmd.d_cases;
525 	fmd_case_susp_t *xcis, *cis;
526 	int match = 0, match_susp;
527 	uint_t h;
528 
529 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
530 
531 	/*
532 	 * Find all cases with this code
533 	 */
534 	h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
535 	for (xcip = chp->ch_code_hash[h]; xcip != NULL;
536 	    xcip = xcip->ci_code_next) {
537 		/*
538 		 * only look for any cases (apart from this one)
539 		 * whose code and number of suspects match
540 		 */
541 		if (xcip == cip || fmd_case_tryhold(xcip) == NULL)
542 			continue;
543 		if (strcmp(xcip->ci_code, cip->ci_code) != 0 ||
544 		    xcip->ci_nsuspects != cip->ci_nsuspects) {
545 			fmd_case_rele((fmd_case_t *)xcip);
546 			continue;
547 		}
548 
549 		/*
550 		 * For each suspect in one list, check if there
551 		 * is an identical suspect in the other list
552 		 */
553 		match = 1;
554 		for (xcis = xcip->ci_suspects; xcis != NULL;
555 		    xcis = xcis->cis_next) {
556 			match_susp = 0;
557 			for (cis = cip->ci_suspects; cis != NULL;
558 			    cis = cis->cis_next) {
559 				if (fmd_case_match_suspect(cis, xcis) == 1) {
560 					match_susp = 1;
561 					break;
562 				}
563 			}
564 			if (match_susp == 0) {
565 				match = 0;
566 				break;
567 			}
568 		}
569 		fmd_case_rele((fmd_case_t *)xcip);
570 		if (match) {
571 			(void) pthread_rwlock_unlock(&chp->ch_lock);
572 			return (1);
573 		}
574 	}
575 	(void) pthread_rwlock_unlock(&chp->ch_lock);
576 	return (0);
577 }
578 
579 /*
580  * Convict suspects in a case by applying a conviction policy and updating the
581  * resource cache prior to emitting the list.suspect event for the given case.
582  * At present, our policy is very simple: convict every suspect in the case.
583  * In the future, this policy can be extended and made configurable to permit:
584  *
585  * - convicting the suspect with the highest FIT rate
586  * - convicting the suspect with the cheapest FRU
587  * - convicting the suspect with the FRU that is in a depot's inventory
588  * - convicting the suspect with the longest lifetime
589  *
 * and so forth.  A word to the wise: this problem is significantly harder than
591  * it seems at first glance.  Future work should heed the following advice:
592  *
593  * Hacking the policy into C code here is a very bad idea.  The policy needs to
594  * be decided upon very carefully and fundamentally encodes knowledge of what
595  * suspect list combinations can be emitted by what diagnosis engines.  As such
596  * fmd's code is the wrong location, because that would require fmd itself to
597  * be updated for every diagnosis engine change, defeating the entire design.
598  * The FMA Event Registry knows the suspect list combinations: policy inputs
599  * can be derived from it and used to produce per-module policy configuration.
600  *
601  * If the policy needs to be dynamic and not statically fixed at either fmd
602  * startup or module load time, any implementation of dynamic policy retrieval
603  * must employ some kind of caching mechanism or be part of a built-in module.
604  * The fmd_case_convict() function is called with locks held inside of fmd and
605  * is not a place where unbounded blocking on some inter-process or inter-
606  * system communication to another service (e.g. another daemon) can occur.
607  */
608 static int
609 fmd_case_convict(fmd_case_t *cp)
610 {
611 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
612 	fmd_asru_hash_t *ahp = fmd.d_asrus;
613 
614 	fmd_case_susp_t *cis;
615 	fmd_asru_link_t *alp;
616 
617 	(void) pthread_mutex_lock(&cip->ci_lock);
618 	if (cip->ci_code == NULL)
619 		(void) fmd_case_mkcode(cp);
620 	else if (cip->ci_precanned)
621 		fmd_case_code_hash_insert(fmd.d_cases, cip);
622 	if (fmd_case_check_for_dups(cp) == 1) {
623 		(void) pthread_mutex_unlock(&cip->ci_lock);
624 		return (1);
625 	}
626 
627 	/*
628 	 * no suspect list already exists  - allocate new cache entries
629 	 */
630 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
631 		if ((alp = fmd_asru_hash_create_entry(ahp,
632 		    cp, cis->cis_nvl)) == NULL) {
633 			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
634 			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
635 			continue;
636 		}
637 		alp->al_flags |= FMD_ASRU_PRESENT;
638 		alp->al_asru->asru_flags |= FMD_ASRU_PRESENT;
639 		(void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0);
640 		(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
641 	}
642 
643 	(void) pthread_mutex_unlock(&cip->ci_lock);
644 	return (0);
645 }
646 
647 void
648 fmd_case_publish(fmd_case_t *cp, uint_t state)
649 {
650 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
651 	fmd_event_t *e;
652 	nvlist_t *nvl;
653 	char *class;
654 
655 	if (state == FMD_CASE_CURRENT)
656 		state = cip->ci_state; /* use current state */
657 
658 	switch (state) {
659 	case FMD_CASE_SOLVED:
660 		(void) pthread_mutex_lock(&cip->ci_lock);
661 
662 		/*
663 		 * If we already have a code, then case is already solved.
664 		 */
665 		if (cip->ci_precanned == 0 && cip->ci_xprt == NULL &&
666 		    cip->ci_code != NULL) {
667 			(void) pthread_mutex_unlock(&cip->ci_lock);
668 			break;
669 		}
670 
671 		if (cip->ci_tv_valid == 0) {
672 			fmd_time_gettimeofday(&cip->ci_tv);
673 			cip->ci_tv_valid = 1;
674 		}
675 		(void) pthread_mutex_unlock(&cip->ci_lock);
676 
677 		if (fmd_case_convict(cp) == 1) { /* dupclose */
678 			cip->ci_flags &= ~FMD_CF_SOLVED;
679 			fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0);
680 			break;
681 		}
682 		if (cip->ci_xprt != NULL) {
683 			/*
684 			 * For proxy, save some information about the transport
685 			 * in the resource cache.
686 			 */
687 			int count = 0;
688 			fmd_asru_set_on_proxy_t fasp;
689 			fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)cip->ci_xprt;
690 
691 			fasp.fasp_countp = &count;
692 			fasp.fasp_maxcount = cip->ci_nsuspects;
693 			fasp.fasp_proxy_asru = cip->ci_proxy_asru;
694 			fasp.fasp_proxy_external = xip->xi_flags &
695 			    FMD_XPRT_EXTERNAL;
696 			fasp.fasp_proxy_rdonly = ((xip->xi_flags &
697 			    FMD_XPRT_RDWR) == FMD_XPRT_RDONLY);
698 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
699 			    fmd_asru_set_on_proxy, &fasp);
700 		}
701 		nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
702 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
703 
704 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
705 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
706 		fmd_log_append(fmd.d_fltlog, e, cp);
707 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
708 		fmd_dispq_dispatch(fmd.d_disp, e, class);
709 
710 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
711 		cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
712 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
713 
714 		break;
715 
716 	case FMD_CASE_CLOSE_WAIT:
717 		fmd_case_hold(cp);
718 		e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
719 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
720 
721 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
722 		cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
723 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
724 
725 		break;
726 
727 	case FMD_CASE_CLOSED:
728 		nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
729 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
730 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
731 		fmd_dispq_dispatch(fmd.d_disp, e, class);
732 		break;
733 
734 	case FMD_CASE_REPAIRED:
735 		nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
736 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
737 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
738 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
739 		fmd_log_append(fmd.d_fltlog, e, cp);
740 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
741 		fmd_dispq_dispatch(fmd.d_disp, e, class);
742 		break;
743 
744 	case FMD_CASE_RESOLVED:
745 		nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS);
746 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
747 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
748 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
749 		fmd_log_append(fmd.d_fltlog, e, cp);
750 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
751 		fmd_dispq_dispatch(fmd.d_disp, e, class);
752 		break;
753 	}
754 }
755 
756 fmd_case_t *
757 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
758 {
759 	fmd_case_impl_t *cip;
760 	uint_t h;
761 
762 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
763 	h = fmd_strhash(uuid) % chp->ch_hashlen;
764 
765 	for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
766 		if (strcmp(cip->ci_uuid, uuid) == 0)
767 			break;
768 	}
769 
770 	/*
771 	 * If deleting bit is set, treat the case as if it doesn't exist.
772 	 */
773 	if (cip != NULL)
774 		cip = fmd_case_tryhold(cip);
775 
776 	if (cip == NULL)
777 		(void) fmd_set_errno(EFMD_CASE_INVAL);
778 
779 	(void) pthread_rwlock_unlock(&chp->ch_lock);
780 	return ((fmd_case_t *)cip);
781 }
782 
783 static fmd_case_impl_t *
784 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
785 {
786 	fmd_case_impl_t *eip;
787 	uint_t h;
788 
789 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
790 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
791 
792 	for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
793 		if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 &&
794 		    fmd_case_tryhold(eip) != NULL) {
795 			(void) pthread_rwlock_unlock(&chp->ch_lock);
796 			return (eip); /* uuid already present */
797 		}
798 	}
799 
800 	cip->ci_next = chp->ch_hash[h];
801 	chp->ch_hash[h] = cip;
802 
803 	chp->ch_count++;
804 	ASSERT(chp->ch_count != 0);
805 
806 	(void) pthread_rwlock_unlock(&chp->ch_lock);
807 	return (cip);
808 }
809 
810 static void
811 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
812 {
813 	fmd_case_impl_t *cp, **pp;
814 	uint_t h;
815 
816 	ASSERT(MUTEX_HELD(&cip->ci_lock));
817 
818 	cip->ci_flags |= FMD_CF_DELETING;
819 	(void) pthread_mutex_unlock(&cip->ci_lock);
820 
821 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
822 
823 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
824 	pp = &chp->ch_hash[h];
825 
826 	for (cp = *pp; cp != NULL; cp = cp->ci_next) {
827 		if (cp != cip)
828 			pp = &cp->ci_next;
829 		else
830 			break;
831 	}
832 
833 	if (cp == NULL) {
834 		fmd_panic("case %p (%s) not found on hash chain %u\n",
835 		    (void *)cip, cip->ci_uuid, h);
836 	}
837 
838 	*pp = cp->ci_next;
839 	cp->ci_next = NULL;
840 
841 	/*
842 	 * delete from code hash if it is on it
843 	 */
844 	fmd_case_code_hash_delete(chp, cip);
845 
846 	ASSERT(chp->ch_count != 0);
847 	chp->ch_count--;
848 
849 	(void) pthread_rwlock_unlock(&chp->ch_lock);
850 
851 	(void) pthread_mutex_lock(&cip->ci_lock);
852 	ASSERT(cip->ci_flags & FMD_CF_DELETING);
853 }
854 
855 fmd_case_t *
856 fmd_case_create(fmd_module_t *mp, void *data)
857 {
858 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
859 	fmd_case_impl_t *eip = NULL;
860 	uuid_t uuid;
861 
862 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
863 	fmd_buf_hash_create(&cip->ci_bufs);
864 
865 	fmd_module_hold(mp);
866 	cip->ci_mod = mp;
867 	cip->ci_refs = 1;
868 	cip->ci_state = FMD_CASE_UNSOLVED;
869 	cip->ci_flags = FMD_CF_DIRTY;
870 	cip->ci_data = data;
871 
872 	/*
873 	 * Calling libuuid: get a clue.  The library interfaces cleverly do not
874 	 * define any constant for the length of an unparse string, and do not
875 	 * permit the caller to specify a buffer length for safety.  The spec
876 	 * says it will be 36 bytes, but we make it tunable just in case.
877 	 */
878 	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
879 	cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);
880 
881 	/*
882 	 * We expect this loop to execute only once, but code it defensively
883 	 * against the possibility of libuuid bugs.  Keep generating uuids and
884 	 * attempting to do a hash insert until we get a unique one.
885 	 */
886 	do {
887 		if (eip != NULL)
888 			fmd_case_rele((fmd_case_t *)eip);
889 		uuid_generate(uuid);
890 		uuid_unparse(uuid, cip->ci_uuid);
891 	} while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);
892 
893 	ASSERT(fmd_module_locked(mp));
894 	fmd_list_append(&mp->mod_cases, cip);
895 	fmd_module_setcdirty(mp);
896 
897 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
898 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
899 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
900 
901 	return ((fmd_case_t *)cip);
902 }
903 
904 static void
905 fmd_case_destroy_suspects(fmd_case_impl_t *cip)
906 {
907 	fmd_case_susp_t *cis, *ncis;
908 
909 	ASSERT(MUTEX_HELD(&cip->ci_lock));
910 
911 	if (cip->ci_proxy_asru)
912 		fmd_free(cip->ci_proxy_asru, sizeof (uint8_t) *
913 		    cip->ci_nsuspects);
914 	if (cip->ci_diag_de)
915 		nvlist_free(cip->ci_diag_de);
916 	if (cip->ci_diag_asru)
917 		fmd_free(cip->ci_diag_asru, sizeof (uint8_t) *
918 		    cip->ci_nsuspects);
919 
920 	for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
921 		ncis = cis->cis_next;
922 		nvlist_free(cis->cis_nvl);
923 		fmd_free(cis, sizeof (fmd_case_susp_t));
924 	}
925 
926 	cip->ci_suspects = NULL;
927 	cip->ci_nsuspects = 0;
928 }
929 
/*
 * Recreate a case for module 'mp' using a caller-supplied UUID, state and
 * (possibly NULL) diagnosis code rather than generating a fresh UUID.  'xp'
 * is recorded as the case's transport (ci_xprt).  The requested state must be
 * below FMD_CASE_RESOLVED, and the caller must hold the module lock on 'mp'
 * (both are asserted).
 *
 * Returns:
 *   - the newly created case if the UUID was not already in the case hash;
 *   - the existing case, unchanged apart from code/state fix-ups, if 'mp' is
 *     the root module (fmd.d_rmod);
 *   - the existing case, now re-owned by 'mp', if it was an orphan owned by
 *     the root module and no transport was specified;
 *   - NULL if the UUID belongs to a non-orphan case or 'xp' is non-NULL.
 */
fmd_case_t *
fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
    uint_t state, const char *uuid, const char *code)
{
	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
	fmd_case_impl_t *eip;

	ASSERT(state < FMD_CASE_RESOLVED);

	(void) pthread_mutex_init(&cip->ci_lock, NULL);
	fmd_buf_hash_create(&cip->ci_bufs);

	fmd_module_hold(mp);
	cip->ci_mod = mp;
	cip->ci_xprt = xp;
	cip->ci_refs = 1;
	cip->ci_state = state;
	cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
	cip->ci_uuidlen = strlen(cip->ci_uuid);
	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
	/* ci_codelen counts the terminating NUL; it is 0 when there is no code */
	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;

	/* Any state beyond CLOSE_WAIT implies the case had been solved */
	if (state > FMD_CASE_CLOSE_WAIT)
		cip->ci_flags |= FMD_CF_SOLVED;

	/*
	 * Insert the case into the global case hash.  If the specified UUID is
	 * already present, check to see if it is an orphan: if so, reclaim it;
	 * otherwise if it is owned by a different module then return NULL.
	 */
	if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
		/*
		 * Discard the case we just built.  fmd_case_destroy() requires
		 * ci_lock held and a zero reference count; B_FALSE tells it
		 * not to remove the case from the case hash.
		 */
		(void) pthread_mutex_lock(&cip->ci_lock);
		cip->ci_refs--; /* decrement to zero */
		fmd_case_destroy((fmd_case_t *)cip, B_FALSE);

		cip = eip; /* switch 'cip' to the existing case */
		(void) pthread_mutex_lock(&cip->ci_lock);

		/*
		 * If the ASRU cache is trying to recreate an orphan, then just
		 * return the existing case that we found without changing it.
		 */
		if (mp == fmd.d_rmod) {
			/*
			 * In case the case has already been created from
			 * a checkpoint file we need to set up code now.
			 */
			if (cip->ci_state < FMD_CASE_CLOSED) {
				if (code != NULL && cip->ci_code == NULL) {
					cip->ci_code = fmd_strdup(code,
					    FMD_SLEEP);
					cip->ci_codelen = cip->ci_code ?
					    strlen(cip->ci_code) + 1 : 0;
					fmd_case_code_hash_insert(fmd.d_cases,
					    cip);
				}
			}

			/*
			 * When recreating an orphan case, state passed in may
			 * either be CLOSED (faulty) or REPAIRED (!faulty). If
			 * any suspects are still CLOSED (faulty) then the
			 * overall state needs to be CLOSED.
			 */
			if (cip->ci_state == FMD_CASE_REPAIRED &&
			    state == FMD_CASE_CLOSED)
				cip->ci_state = FMD_CASE_CLOSED;
			(void) pthread_mutex_unlock(&cip->ci_lock);
			/*
			 * Drop one reference on the existing case; presumably
			 * this is a hold taken by fmd_case_hash_insert() when
			 * it returned the existing entry -- TODO confirm.
			 */
			fmd_case_rele((fmd_case_t *)cip);
			return ((fmd_case_t *)cip);
		}

		/*
		 * If the existing case isn't an orphan or is being proxied,
		 * then we have a UUID conflict: return failure to the caller.
		 */
		if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele((fmd_case_t *)cip);
			return (NULL);
		}

		/*
		 * If the new module is reclaiming an orphaned case, remove
		 * the case from the root module, switch ci_mod, and then fall
		 * through to adding the case to the new owner module 'mp'.
		 */
		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);

		/* Swap the module hold from the root module over to 'mp' */
		fmd_module_rele(cip->ci_mod);
		cip->ci_mod = mp;
		fmd_module_hold(mp);

		/*
		 * It's possible that fmd crashed or was restarted during a
		 * previous solve operation between the asru cache being created
		 * and the ckpt file being updated to SOLVED. Thus when the DE
		 * recreates the case here from the checkpoint file, the state
		 * will be UNSOLVED and yet we are having to reclaim because
		 * the case was in the asru cache. If this happens, revert the
		 * case back to the UNSOLVED state and let the DE solve it again
		 */
		if (state == FMD_CASE_UNSOLVED) {
			fmd_asru_hash_delete_case(fmd.d_asrus,
			    (fmd_case_t *)cip);
			fmd_case_destroy_suspects(cip);
			fmd_case_code_hash_delete(fmd.d_cases, cip);
			fmd_free(cip->ci_code, cip->ci_codelen);
			cip->ci_code = NULL;
			cip->ci_codelen = 0;
			cip->ci_tv_valid = 0;
		}

		cip->ci_state = state;

		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_rele((fmd_case_t *)cip);
	} else {
		/*
		 * add into hash of solved cases
		 */
		if (cip->ci_code)
			fmd_case_code_hash_insert(fmd.d_cases, cip);
	}

	/* New or reclaimed: link the case onto its owner and count it as open */
	ASSERT(fmd_module_locked(mp));
	fmd_list_append(&mp->mod_cases, cip);

	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

	return ((fmd_case_t *)cip);
}
1066 
1067 void
1068 fmd_case_destroy(fmd_case_t *cp, int visible)
1069 {
1070 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1071 	fmd_case_item_t *cit, *ncit;
1072 
1073 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1074 	ASSERT(cip->ci_refs == 0);
1075 
1076 	if (visible) {
1077 		TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
1078 		fmd_case_hash_delete(fmd.d_cases, cip);
1079 	}
1080 
1081 	for (cit = cip->ci_items; cit != NULL; cit = ncit) {
1082 		ncit = cit->cit_next;
1083 		fmd_event_rele(cit->cit_event);
1084 		fmd_free(cit, sizeof (fmd_case_item_t));
1085 	}
1086 
1087 	fmd_case_destroy_suspects(cip);
1088 
1089 	if (cip->ci_principal != NULL)
1090 		fmd_event_rele(cip->ci_principal);
1091 
1092 	fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
1093 	fmd_free(cip->ci_code, cip->ci_codelen);
1094 	(void) fmd_buf_hash_destroy(&cip->ci_bufs);
1095 
1096 	fmd_module_rele(cip->ci_mod);
1097 	fmd_free(cip, sizeof (fmd_case_impl_t));
1098 }
1099 
1100 void
1101 fmd_case_hold(fmd_case_t *cp)
1102 {
1103 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1104 
1105 	(void) pthread_mutex_lock(&cip->ci_lock);
1106 	fmd_case_hold_locked(cp);
1107 	(void) pthread_mutex_unlock(&cip->ci_lock);
1108 }
1109 
1110 void
1111 fmd_case_hold_locked(fmd_case_t *cp)
1112 {
1113 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1114 
1115 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1116 	if (cip->ci_flags & FMD_CF_DELETING)
1117 		fmd_panic("attempt to hold a deleting case %p (%s)\n",
1118 		    (void *)cip, cip->ci_uuid);
1119 	cip->ci_refs++;
1120 	ASSERT(cip->ci_refs != 0);
1121 }
1122 
1123 static fmd_case_impl_t *
1124 fmd_case_tryhold(fmd_case_impl_t *cip)
1125 {
1126 	/*
1127 	 * If the case's "deleting" bit is unset, hold and return case,
1128 	 * otherwise, return NULL.
1129 	 */
1130 	(void) pthread_mutex_lock(&cip->ci_lock);
1131 	if (cip->ci_flags & FMD_CF_DELETING) {
1132 		(void) pthread_mutex_unlock(&cip->ci_lock);
1133 		cip = NULL;
1134 	} else {
1135 		fmd_case_hold_locked((fmd_case_t *)cip);
1136 		(void) pthread_mutex_unlock(&cip->ci_lock);
1137 	}
1138 	return (cip);
1139 }
1140 
1141 void
1142 fmd_case_rele(fmd_case_t *cp)
1143 {
1144 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1145 
1146 	(void) pthread_mutex_lock(&cip->ci_lock);
1147 	ASSERT(cip->ci_refs != 0);
1148 
1149 	if (--cip->ci_refs == 0)
1150 		fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
1151 	else
1152 		(void) pthread_mutex_unlock(&cip->ci_lock);
1153 }
1154 
1155 void
1156 fmd_case_rele_locked(fmd_case_t *cp)
1157 {
1158 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1159 
1160 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1161 	--cip->ci_refs;
1162 	ASSERT(cip->ci_refs != 0);
1163 }
1164 
1165 int
1166 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep)
1167 {
1168 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1169 	fmd_case_item_t *cit;
1170 	fmd_event_t *oep;
1171 	uint_t state;
1172 	int new;
1173 
1174 	fmd_event_hold(ep);
1175 	(void) pthread_mutex_lock(&cip->ci_lock);
1176 
1177 	if (cip->ci_flags & FMD_CF_SOLVED)
1178 		state = FMD_EVS_DIAGNOSED;
1179 	else
1180 		state = FMD_EVS_ACCEPTED;
1181 
1182 	oep = cip->ci_principal;
1183 	cip->ci_principal = ep;
1184 
1185 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1186 		if (cit->cit_event == ep)
1187 			break;
1188 	}
1189 
1190 	cip->ci_flags |= FMD_CF_DIRTY;
1191 	new = cit == NULL && ep != oep;
1192 
1193 	(void) pthread_mutex_unlock(&cip->ci_lock);
1194 
1195 	fmd_module_setcdirty(cip->ci_mod);
1196 	fmd_event_transition(ep, state);
1197 
1198 	if (oep != NULL)
1199 		fmd_event_rele(oep);
1200 
1201 	return (new);
1202 }
1203 
1204 int
1205 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep)
1206 {
1207 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1208 	fmd_case_item_t *cit;
1209 	uint_t state;
1210 	int new;
1211 
1212 	(void) pthread_mutex_lock(&cip->ci_lock);
1213 
1214 	if (cip->ci_flags & FMD_CF_SOLVED)
1215 		state = FMD_EVS_DIAGNOSED;
1216 	else
1217 		state = FMD_EVS_ACCEPTED;
1218 
1219 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1220 		if (cit->cit_event == ep)
1221 			break;
1222 	}
1223 
1224 	new = cit == NULL && ep != cip->ci_principal;
1225 
1226 	/*
1227 	 * If the event is already in the case or the case is already solved,
1228 	 * there is no reason to save it: just transition it appropriately.
1229 	 */
1230 	if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
1231 		(void) pthread_mutex_unlock(&cip->ci_lock);
1232 		fmd_event_transition(ep, state);
1233 		return (new);
1234 	}
1235 
1236 	cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
1237 	fmd_event_hold(ep);
1238 
1239 	cit->cit_next = cip->ci_items;
1240 	cit->cit_event = ep;
1241 
1242 	cip->ci_items = cit;
1243 	cip->ci_nitems++;
1244 
1245 	cip->ci_flags |= FMD_CF_DIRTY;
1246 	(void) pthread_mutex_unlock(&cip->ci_lock);
1247 
1248 	fmd_module_setcdirty(cip->ci_mod);
1249 	fmd_event_transition(ep, state);
1250 
1251 	return (new);
1252 }
1253 
1254 void
1255 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
1256 {
1257 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1258 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1259 
1260 	(void) pthread_mutex_lock(&cip->ci_lock);
1261 	ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT);
1262 	cip->ci_flags |= FMD_CF_DIRTY;
1263 
1264 	cis->cis_next = cip->ci_suspects;
1265 	cis->cis_nvl = nvl;
1266 
1267 	cip->ci_suspects = cis;
1268 	cip->ci_nsuspects++;
1269 
1270 	(void) pthread_mutex_unlock(&cip->ci_lock);
1271 	if (cip->ci_xprt == NULL)
1272 		fmd_module_setcdirty(cip->ci_mod);
1273 }
1274 
1275 void
1276 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
1277 {
1278 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1279 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1280 	boolean_t b;
1281 
1282 	(void) pthread_mutex_lock(&cip->ci_lock);
1283 
1284 	cis->cis_next = cip->ci_suspects;
1285 	cis->cis_nvl = nvl;
1286 
1287 	if (nvlist_lookup_boolean_value(nvl,
1288 	    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
1289 		cip->ci_flags |= FMD_CF_INVISIBLE;
1290 
1291 	cip->ci_suspects = cis;
1292 	cip->ci_nsuspects++;
1293 
1294 	(void) pthread_mutex_unlock(&cip->ci_lock);
1295 }
1296 
1297 void
1298 fmd_case_reset_suspects(fmd_case_t *cp)
1299 {
1300 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1301 
1302 	(void) pthread_mutex_lock(&cip->ci_lock);
1303 	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
1304 
1305 	fmd_case_destroy_suspects(cip);
1306 	cip->ci_flags |= FMD_CF_DIRTY;
1307 
1308 	(void) pthread_mutex_unlock(&cip->ci_lock);
1309 	fmd_module_setcdirty(cip->ci_mod);
1310 }
1311 
/*
 * Callback for fmd_asru_hash_apply_by_case(): set FMD_ASRU_UNUSABLE on the
 * given ASRU link.  The 'arg' parameter is not used and the result of
 * fmd_asru_setflags() is deliberately ignored.
 */
/*ARGSUSED*/
static void
fmd_case_unusable(fmd_asru_link_t *alp, void *arg)
{
	(void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE);
}
1318 
/*
 * Grab ci_lock and update the case state and set the dirty bit.  Then perform
 * whatever actions and emit whatever events are appropriate for the state.
 * Refer to the topmost block comment explaining the state machine for details.
 *
 * 'state' is the requested new state (at most FMD_CASE_RESOLVED) and 'flags'
 * are FMD_CF_* bits to OR into ci_flags.  The flags are applied even if the
 * case is already at or beyond 'state', in which case nothing else happens.
 * If the case ends up RESOLVED, one reference on the case is released before
 * returning, so the caller needs its own hold to keep using 'cp' afterwards.
 */
void
fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit;
	fmd_event_t *e;
	int resolved = 0;
	int any_unusable_and_present = 0;

	ASSERT(state <= FMD_CASE_RESOLVED);
	(void) pthread_mutex_lock(&cip->ci_lock);

	/*
	 * A case that is not solved, and is not being marked solved by this
	 * call, cannot be isolated, repaired or resolved: drop those flags.
	 */
	if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED))
		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED | FMD_CF_RESOLVED);

	cip->ci_flags |= flags;

	if (cip->ci_state >= state) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return; /* already in specified state */
	}

	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));

	cip->ci_state = state;
	cip->ci_flags |= FMD_CF_DIRTY;

	/* Proxied cases and cases owned by the root module are skipped here */
	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
		fmd_module_setcdirty(cip->ci_mod);

	switch (state) {
	case FMD_CASE_SOLVED:
		/* Every event attached to the case is now diagnosed */
		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);

		if (cip->ci_principal != NULL) {
			fmd_event_transition(cip->ci_principal,
			    FMD_EVS_DIAGNOSED);
		}
		break;

	case FMD_CASE_CLOSE_WAIT:
		/*
		 * If the case was never solved, do not change ASRUs.
		 * If the case was never fmd_case_closed, do not change ASRUs.
		 * If the case was repaired, do not change ASRUs.
		 */
		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
		    FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED))
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_case_unusable, NULL);

		/*
		 * If an orphaned case transitions to CLOSE_WAIT, the owning
		 * module is no longer loaded: continue on to CASE_CLOSED.
		 */
		if (fmd_case_orphaned(cp))
			state = cip->ci_state = FMD_CASE_CLOSED;
		break;

	case FMD_CASE_REPAIRED:
		ASSERT(cip->ci_xprt != NULL || fmd_case_orphaned(cp));

		/*
		 * If we've been requested to transition straight on to the
		 * RESOLVED state (which can happen with fault proxying where a
		 * list.resolved or a uuresolved is received from the other
		 * side), or if all suspects are already either usable or not
		 * present then transition straight to RESOLVED state,
		 * publishing both the list.repaired and list.resolved. For a
		 * proxy, if we discover here that all suspects are already
		 * either usable or not present, notify the diag side instead
		 * using fmd_xprt_uuresolved().
		 */
		if (flags & FMD_CF_RESOLVED) {
			/*
			 * NOTE(review): the proxy branch deletes from
			 * mod_cases without taking the module lock; presumably
			 * the caller already holds it -- confirm.
			 */
			if (cip->ci_xprt != NULL) {
				fmd_list_delete(&cip->ci_mod->mod_cases, cip);
			} else {
				fmd_module_lock(cip->ci_mod);
				fmd_list_delete(&cip->ci_mod->mod_cases, cip);
				fmd_module_unlock(cip->ci_mod);
			}
		} else {
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_case_unusable_and_present,
			    &any_unusable_and_present);
			if (any_unusable_and_present)
				break;	/* stay in REPAIRED */
			if (cip->ci_xprt != NULL) {
				fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid);
				break;	/* stay in REPAIRED */
			}
			fmd_module_lock(cip->ci_mod);
			fmd_list_delete(&cip->ci_mod->mod_cases, cip);
			fmd_module_unlock(cip->ci_mod);
		}

		/*
		 * Going straight on to RESOLVED: publish for the REPAIRED
		 * state now ('state' still holds REPAIRED here), with ci_lock
		 * dropped around the call.  The RESOLVED publish happens
		 * after the switch.
		 */
		cip->ci_state = FMD_CASE_RESOLVED;
		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_publish(cp, state);
		TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
		    _fmd_case_snames[FMD_CASE_REPAIRED],
		    _fmd_case_snames[FMD_CASE_RESOLVED]));
		state = FMD_CASE_RESOLVED;
		resolved = 1;
		(void) pthread_mutex_lock(&cip->ci_lock);
		break;

	case FMD_CASE_RESOLVED:
		/*
		 * For a proxy, no need to check that all suspects are already
		 * either usable or not present - this request has come from
		 * the diagnosing side which makes the final decision on this.
		 */
		if (cip->ci_xprt != NULL) {
			fmd_list_delete(&cip->ci_mod->mod_cases, cip);
			resolved = 1;
			break;
		}

		ASSERT(fmd_case_orphaned(cp));

		/*
		 * If all suspects are already either usable or not present then
		 * carry on, publish list.resolved and discard the case.
		 */
		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
		    fmd_case_unusable_and_present, &any_unusable_and_present);
		if (any_unusable_and_present) {
			/*
			 * NOTE(review): ci_state was already set to RESOLVED
			 * (and FMD_CF_DIRTY set) above and is not reverted on
			 * this early return -- confirm this is intended.
			 */
			(void) pthread_mutex_unlock(&cip->ci_lock);
			return;
		}

		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);
		resolved = 1;
		break;
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);

	/*
	 * If the module has initialized, then publish the appropriate event
	 * for the new case state.  If not, we are being called from the
	 * checkpoint code during module load, in which case the module's
	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
	 * may not be open yet, which will prevent us from computing the event
	 * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
	 * event in our queue: this won't be processed until _fmd_init is done.
	 */
	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
		fmd_case_publish(cp, state);
	else {
		/* Hold taken on behalf of the queued PUBLISH event */
		fmd_case_hold(cp);
		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
	}

	if (resolved) {
		/*
		 * If we transitioned to RESOLVED, adjust the reference count to
		 * reflect our removal from fmd.d_rmod->mod_cases above.  If the
		 * caller has not placed an additional hold on the case, it
		 * will now be freed.
		 */
		(void) pthread_mutex_lock(&cip->ci_lock);
		fmd_asru_hash_delete_case(fmd.d_asrus, cp);
		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_rele(cp);
	}
}
1497 
1498 /*
1499  * Transition the specified case to *at least* the specified state by first
1500  * re-validating the suspect list using the resource cache.  This function is
1501  * employed by the checkpoint code when restoring a saved, solved case to see
1502  * if the state of the case has effectively changed while fmd was not running
1503  * or the module was not loaded.
1504  */
1505 void
1506 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
1507 {
1508 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1509 
1510 	int usable = 0;		/* are any suspects usable? */
1511 
1512 	ASSERT(state >= FMD_CASE_SOLVED);
1513 	(void) pthread_mutex_lock(&cip->ci_lock);
1514 
1515 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable);
1516 
1517 	(void) pthread_mutex_unlock(&cip->ci_lock);
1518 
1519 	if (!usable) {
1520 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
1521 		flags |= FMD_CF_ISOLATED;
1522 	}
1523 
1524 	fmd_case_transition(cp, state, flags);
1525 }
1526 
1527 void
1528 fmd_case_setdirty(fmd_case_t *cp)
1529 {
1530 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1531 
1532 	(void) pthread_mutex_lock(&cip->ci_lock);
1533 	cip->ci_flags |= FMD_CF_DIRTY;
1534 	(void) pthread_mutex_unlock(&cip->ci_lock);
1535 
1536 	fmd_module_setcdirty(cip->ci_mod);
1537 }
1538 
1539 void
1540 fmd_case_clrdirty(fmd_case_t *cp)
1541 {
1542 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1543 
1544 	(void) pthread_mutex_lock(&cip->ci_lock);
1545 	cip->ci_flags &= ~FMD_CF_DIRTY;
1546 	(void) pthread_mutex_unlock(&cip->ci_lock);
1547 }
1548 
1549 void
1550 fmd_case_commit(fmd_case_t *cp)
1551 {
1552 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1553 	fmd_case_item_t *cit;
1554 
1555 	(void) pthread_mutex_lock(&cip->ci_lock);
1556 
1557 	if (cip->ci_flags & FMD_CF_DIRTY) {
1558 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
1559 			fmd_event_commit(cit->cit_event);
1560 
1561 		if (cip->ci_principal != NULL)
1562 			fmd_event_commit(cip->ci_principal);
1563 
1564 		fmd_buf_hash_commit(&cip->ci_bufs);
1565 		cip->ci_flags &= ~FMD_CF_DIRTY;
1566 	}
1567 
1568 	(void) pthread_mutex_unlock(&cip->ci_lock);
1569 }
1570 
1571 /*
1572  * On proxy side, send back repair/acquit/etc request to diagnosing side
1573  */
1574 void
1575 fmd_case_xprt_updated(fmd_case_t *cp)
1576 {
1577 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1578 	nvlist_t **nva;
1579 	uint8_t *ba;
1580 	int msg = B_TRUE;
1581 	int count = 0;
1582 	fmd_case_lst_t fcl;
1583 
1584 	ASSERT(cip->ci_xprt != NULL);
1585 	(void) pthread_mutex_lock(&cip->ci_lock);
1586 	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
1587 	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
1588 	fcl.fcl_countp = &count;
1589 	fcl.fcl_maxcount = cip->ci_nsuspects;
1590 	fcl.fcl_msgp = &msg;
1591 	fcl.fcl_ba = ba;
1592 	fcl.fcl_nva = nva;
1593 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
1594 	(void) pthread_mutex_unlock(&cip->ci_lock);
1595 	fmd_xprt_updated(cip->ci_xprt, cip->ci_uuid, ba, cip->ci_proxy_asru,
1596 	    count);
1597 }
1598 
1599 /*
1600  * fmd_case_update_status() can be called on either the proxy side when a
1601  * list.suspect is received, or on the diagnosing side when an update request
1602  * is received from the proxy. It updates the status in the resource cache.
1603  */
1604 void
1605 fmd_case_update_status(fmd_case_t *cp, uint8_t *statusp, uint8_t *proxy_asrup,
1606     uint8_t *diag_asrup)
1607 {
1608 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1609 	int count = 0;
1610 	fmd_asru_update_status_t faus;
1611 
1612 	/*
1613 	 * update status of resource cache entries
1614 	 */
1615 	faus.faus_countp = &count;
1616 	faus.faus_maxcount = cip->ci_nsuspects;
1617 	faus.faus_ba = statusp;
1618 	faus.faus_proxy_asru = proxy_asrup;
1619 	faus.faus_diag_asru = diag_asrup;
1620 	faus.faus_is_proxy = (cip->ci_xprt != NULL);
1621 	(void) pthread_mutex_lock(&cip->ci_lock);
1622 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_update_status,
1623 	    &faus);
1624 	(void) pthread_mutex_unlock(&cip->ci_lock);
1625 }
1626 
1627 /*
1628  * Called on either the proxy side or the diag side when a repair has taken
1629  * place on the other side but this side may know the asru "contains"
1630  * relationships.
1631  */
1632 void
1633 fmd_case_update_containees(fmd_case_t *cp)
1634 {
1635 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1636 
1637 	(void) pthread_mutex_lock(&cip->ci_lock);
1638 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1639 	    fmd_asru_update_containees, NULL);
1640 	(void) pthread_mutex_unlock(&cip->ci_lock);
1641 }
1642 
1643 /*
1644  * fmd_case_close_status() is called on diagnosing side when proxy side
1645  * has had a uuclose. It updates the status in the resource cache.
1646  */
1647 void
1648 fmd_case_close_status(fmd_case_t *cp)
1649 {
1650 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1651 	int count = 0;
1652 	fmd_asru_close_status_t facs;
1653 
1654 	/*
1655 	 * update status of resource cache entries
1656 	 */
1657 	facs.facs_countp = &count;
1658 	facs.facs_maxcount = cip->ci_nsuspects;
1659 	(void) pthread_mutex_lock(&cip->ci_lock);
1660 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_close_status,
1661 	    &facs);
1662 	(void) pthread_mutex_unlock(&cip->ci_lock);
1663 }
1664 
1665 /*
1666  * Indicate that the case may need to change state because one or more of the
1667  * ASRUs named as a suspect has changed state.  We examine all the suspects
1668  * and if none are still faulty, we initiate a case close transition.
1669  */
1670 void
1671 fmd_case_update(fmd_case_t *cp)
1672 {
1673 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1674 	uint_t cstate;
1675 	int faulty = 0;
1676 
1677 	(void) pthread_mutex_lock(&cip->ci_lock);
1678 	cstate = cip->ci_state;
1679 
1680 	if (cip->ci_state < FMD_CASE_SOLVED) {
1681 		(void) pthread_mutex_unlock(&cip->ci_lock);
1682 		return; /* update is not appropriate */
1683 	}
1684 
1685 	if (cip->ci_flags & FMD_CF_REPAIRED) {
1686 		(void) pthread_mutex_unlock(&cip->ci_lock);
1687 		return; /* already repaired */
1688 	}
1689 
1690 	TRACE((FMD_DBG_CASE, "case update %s", cip->ci_uuid));
1691 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
1692 	(void) pthread_mutex_unlock(&cip->ci_lock);
1693 
1694 	if (faulty) {
1695 		nvlist_t *nvl;
1696 		fmd_event_t *e;
1697 		char *class;
1698 
1699 		TRACE((FMD_DBG_CASE, "sending list.updated %s", cip->ci_uuid));
1700 		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
1701 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1702 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1703 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1704 		fmd_log_append(fmd.d_fltlog, e, cp);
1705 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
1706 		fmd_dispq_dispatch(fmd.d_disp, e, class);
1707 		return; /* one or more suspects are still marked faulty */
1708 	}
1709 
1710 	if (cstate == FMD_CASE_CLOSED)
1711 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1712 	else
1713 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1714 }
1715 
/*
 * Delete a closed case from the module's case list once the fmdo_close() entry
 * point has run to completion.  If the case is owned by a transport module,
 * tell the transport to proxy a case close on the other end of the transport.
 * Transition to the appropriate next state based on ci_flags.  This
 * function represents the end of CLOSE_WAIT and transitions the case to either
 * CLOSED or REPAIRED or discards it entirely because it was never solved;
 * refer to the topmost block comment explaining the state machine for details.
 *
 * The caller must hold the lock of the module that currently owns the case
 * (asserted).  On return the case is owned by the root module (fmd.d_rmod)
 * and one reference on it has been released.
 */
void
fmd_case_delete(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_modstat_t *msp;
	size_t buftotal;

	TRACE((FMD_DBG_CASE, "case delete %s", cip->ci_uuid));
	ASSERT(fmd_module_locked(cip->ci_mod));
	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
	/* The return value is subtracted from the module's ms_buftotal below */
	buftotal = fmd_buf_hash_destroy(&cip->ci_bufs);

	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
	msp = cip->ci_mod->mod_stats;

	ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0);
	msp->ms_caseopen.fmds_value.ui64--;

	ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal);
	msp->ms_buftotal.fmds_value.ui64 -= buftotal;

	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

	if (cip->ci_xprt == NULL)
		fmd_module_setcdirty(cip->ci_mod);

	/* Hand ownership of the case over to the root module */
	fmd_module_rele(cip->ci_mod);
	cip->ci_mod = fmd.d_rmod;
	fmd_module_hold(cip->ci_mod);

	/*
	 * If the case has been solved, then retain it
	 * on the root module's case list at least until we're transitioned.
	 * Otherwise free the case with our final fmd_case_rele() below.
	 */
	if (cip->ci_flags & FMD_CF_SOLVED) {
		fmd_module_lock(cip->ci_mod);
		fmd_list_append(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);
		/* Hold on behalf of the root module's case list */
		fmd_case_hold(cp);
	}

	/*
	 * Transition onwards to REPAIRED or CLOSED as originally requested.
	 * Note that for proxy case if we're transitioning to CLOSED it means
	 * the case was isolated locally, so call fmd_xprt_uuclose() to notify
	 * the diagnosing side. No need to notify the diagnosing side if we are
	 * transitioning to REPAIRED as we only do this when requested to do
	 * so by the diagnosing side anyway.
	 */
	if (cip->ci_flags & FMD_CF_REPAIRED)
		fmd_case_transition(cp, FMD_CASE_REPAIRED, 0);
	else if (cip->ci_flags & FMD_CF_ISOLATED) {
		fmd_case_transition(cp, FMD_CASE_CLOSED, 0);
		if (cip->ci_xprt != NULL)
			fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid);
	}

	/* Release the reference that went with the original module's list */
	fmd_case_rele(cp);
}
1785 
1786 void
1787 fmd_case_discard(fmd_case_t *cp, boolean_t delete_from_asru_cache)
1788 {
1789 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1790 
1791 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1792 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--;
1793 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1794 
1795 	ASSERT(fmd_module_locked(cip->ci_mod));
1796 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1797 	if (delete_from_asru_cache) {
1798 		(void) pthread_mutex_lock(&cip->ci_lock);
1799 		fmd_asru_hash_delete_case(fmd.d_asrus, cp);
1800 		(void) pthread_mutex_unlock(&cip->ci_lock);
1801 	}
1802 	fmd_case_rele(cp);
1803 }
1804 
1805 /*
1806  * Indicate that the problem corresponding to a case has been repaired by
1807  * clearing the faulty bit on each ASRU named as a suspect.  If the case hasn't
1808  * already been closed, this function initiates the transition to CLOSE_WAIT.
1809  * The caller must have the case held from fmd_case_hash_lookup(), so we can
1810  * grab and drop ci_lock without the case being able to be freed in between.
1811  */
1812 int
1813 fmd_case_repair(fmd_case_t *cp)
1814 {
1815 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1816 	uint_t cstate;
1817 	fmd_asru_rep_arg_t fara;
1818 
1819 	(void) pthread_mutex_lock(&cip->ci_lock);
1820 	cstate = cip->ci_state;
1821 
1822 	if (cstate < FMD_CASE_SOLVED) {
1823 		(void) pthread_mutex_unlock(&cip->ci_lock);
1824 		return (fmd_set_errno(EFMD_CASE_STATE));
1825 	}
1826 
1827 	if (cip->ci_flags & FMD_CF_REPAIRED) {
1828 		(void) pthread_mutex_unlock(&cip->ci_lock);
1829 		return (0); /* already repaired */
1830 	}
1831 
1832 	TRACE((FMD_DBG_CASE, "case repair %s", cip->ci_uuid));
1833 	fara.fara_reason = FMD_ASRU_REPAIRED;
1834 	fara.fara_bywhat = FARA_BY_CASE;
1835 	fara.fara_rval = NULL;
1836 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara);
1837 	(void) pthread_mutex_unlock(&cip->ci_lock);
1838 
1839 	/*
1840 	 * if this is a proxied case, send the repair across the transport.
1841 	 * The remote side will then do the repair and send a list.repaired back
1842 	 * again such that we can finally repair the case on this side.
1843 	 */
1844 	if (cip->ci_xprt != NULL) {
1845 		fmd_case_xprt_updated(cp);
1846 		return (0);
1847 	}
1848 
1849 	if (cstate == FMD_CASE_CLOSED)
1850 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1851 	else
1852 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1853 
1854 	return (0);
1855 }
1856 
1857 int
1858 fmd_case_acquit(fmd_case_t *cp)
1859 {
1860 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1861 	uint_t cstate;
1862 	fmd_asru_rep_arg_t fara;
1863 
1864 	(void) pthread_mutex_lock(&cip->ci_lock);
1865 	cstate = cip->ci_state;
1866 
1867 	if (cstate < FMD_CASE_SOLVED) {
1868 		(void) pthread_mutex_unlock(&cip->ci_lock);
1869 		return (fmd_set_errno(EFMD_CASE_STATE));
1870 	}
1871 
1872 	if (cip->ci_flags & FMD_CF_REPAIRED) {
1873 		(void) pthread_mutex_unlock(&cip->ci_lock);
1874 		return (0); /* already repaired */
1875 	}
1876 
1877 	TRACE((FMD_DBG_CASE, "case acquit %s", cip->ci_uuid));
1878 	fara.fara_reason = FMD_ASRU_ACQUITTED;
1879 	fara.fara_bywhat = FARA_BY_CASE;
1880 	fara.fara_rval = NULL;
1881 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara);
1882 	(void) pthread_mutex_unlock(&cip->ci_lock);
1883 
1884 	/*
1885 	 * if this is a proxied case, send the repair across the transport.
1886 	 * The remote side will then do the repair and send a list.repaired back
1887 	 * again such that we can finally repair the case on this side.
1888 	 */
1889 	if (cip->ci_xprt != NULL) {
1890 		fmd_case_xprt_updated(cp);
1891 		return (0);
1892 	}
1893 
1894 	if (cstate == FMD_CASE_CLOSED)
1895 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1896 	else
1897 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1898 
1899 	return (0);
1900 }
1901 
1902 int
1903 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep)
1904 {
1905 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1906 	fmd_case_item_t *cit;
1907 	uint_t state;
1908 	int rv = 0;
1909 
1910 	(void) pthread_mutex_lock(&cip->ci_lock);
1911 
1912 	if (cip->ci_state >= FMD_CASE_SOLVED)
1913 		state = FMD_EVS_DIAGNOSED;
1914 	else
1915 		state = FMD_EVS_ACCEPTED;
1916 
1917 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1918 		if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0)
1919 			break;
1920 	}
1921 
1922 	if (rv == 0 && cip->ci_principal != NULL)
1923 		rv = fmd_event_equal(ep, cip->ci_principal);
1924 
1925 	(void) pthread_mutex_unlock(&cip->ci_lock);
1926 
1927 	if (rv != 0)
1928 		fmd_event_transition(ep, state);
1929 
1930 	return (rv);
1931 }
1932 
1933 int
1934 fmd_case_orphaned(fmd_case_t *cp)
1935 {
1936 	return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod);
1937 }
1938 
1939 void
1940 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec)
1941 {
1942 	((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec;
1943 	((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec;
1944 	((fmd_case_impl_t *)cp)->ci_tv_valid = 1;
1945 }
1946 
1947 void
1948 fmd_case_set_de_fmri(fmd_case_t *cp, nvlist_t *nvl)
1949 {
1950 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1951 
1952 	if (cip->ci_diag_de)
1953 		nvlist_free(cip->ci_diag_de);
1954 	cip->ci_diag_de = nvl;
1955 }
1956 
1957 void
1958 fmd_case_setcode(fmd_case_t *cp, char *code)
1959 {
1960 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1961 
1962 	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
1963 	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;
1964 }
1965 
1966 /*ARGSUSED*/
1967 void
1968 fmd_case_repair_replay_case(fmd_case_t *cp, void *arg)
1969 {
1970 	int not_faulty = 0;
1971 	int faulty = 0;
1972 	nvlist_t *nvl;
1973 	fmd_event_t *e;
1974 	char *class;
1975 	int any_unusable_and_present = 0;
1976 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1977 
1978 	if (cip->ci_state < FMD_CASE_SOLVED || cip->ci_xprt != NULL)
1979 		return;
1980 
1981 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
1982 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty,
1983 	    &not_faulty);
1984 
1985 	if (cip->ci_state >= FMD_CASE_REPAIRED && !faulty) {
1986 		/*
1987 		 * If none of the suspects is faulty, replay the list.repaired.
1988 		 * If all suspects are already either usable or not present then
1989 		 * also transition straight to RESOLVED state.
1990 		 */
1991 		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1992 		    fmd_case_unusable_and_present, &any_unusable_and_present);
1993 		if (!any_unusable_and_present) {
1994 			fmd_module_lock(cip->ci_mod);
1995 			fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1996 			fmd_module_unlock(cip->ci_mod);
1997 			cip->ci_state = FMD_CASE_RESOLVED;
1998 
1999 			TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
2000 			    cip->ci_uuid));
2001 			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
2002 			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2003 			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
2004 			    class);
2005 			fmd_dispq_dispatch(fmd.d_disp, e, class);
2006 
2007 			TRACE((FMD_DBG_CASE, "replay sending list.resolved %s",
2008 			    cip->ci_uuid));
2009 			fmd_case_publish(cp, FMD_CASE_RESOLVED);
2010 			(void) pthread_mutex_lock(&cip->ci_lock);
2011 			fmd_asru_hash_delete_case(fmd.d_asrus, cp);
2012 			(void) pthread_mutex_unlock(&cip->ci_lock);
2013 			fmd_case_rele(cp);
2014 		} else {
2015 			TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
2016 			    cip->ci_uuid));
2017 			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
2018 			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2019 			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
2020 			    class);
2021 			fmd_dispq_dispatch(fmd.d_disp, e, class);
2022 		}
2023 	} else if (faulty && not_faulty) {
2024 		/*
2025 		 * if some but not all of the suspects are not faulty, replay
2026 		 * the list.updated.
2027 		 */
2028 		TRACE((FMD_DBG_CASE, "replay sending list.updated %s",
2029 		    cip->ci_uuid));
2030 		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
2031 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2032 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
2033 		fmd_dispq_dispatch(fmd.d_disp, e, class);
2034 	}
2035 }
2036 
2037 void
2038 fmd_case_repair_replay()
2039 {
2040 	fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL);
2041 }
2042