xref: /titanic_51/usr/src/cmd/fm/fmd/common/fmd_case.c (revision 87c5f7b3eef6309c168257f261ac6ace4581d234)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * FMD Case Subsystem
31  *
32  * Diagnosis engines are expected to group telemetry events related to the
33  * diagnosis of a particular problem on the system into a set of cases.  The
34  * diagnosis engine may have any number of cases open at a given point in time.
35  * Some cases may eventually be *solved* by associating a suspect list of one
36  * or more problems with the case, at which point fmd publishes a list.suspect
37  * event for the case and it becomes visible to administrators and agents.
38  *
39  * Every case is named using a UUID, and is globally visible in the case hash.
40  * Cases are reference-counted, except for the reference from the case hash
41  * itself.  Consumers of case references include modules, which store active
42  * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
43  *
44  * Cases obey the following state machine.  In states UNSOLVED, SOLVED, and
45  * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
46  * or transport) and the case is referenced by the mod_cases list.  Once the
47  * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
48  * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
49  *
50  *			+------------+
51  *	     +----------|  UNSOLVED  |
52  *	     |		+------------+
53  *	   1 |	             4 |
54  *           |                 |
55  *	+----v---+ /-2->+------v-----+	  3	+--------+
56  *      | SOLVED |<     | CLOSE_WAIT |--------->| CLOSED |
57  *	+--------+ \-5->+------------+		+--------+
58  *	                       |                    |
59  *                           6 |                    | 7
60  *      		+------v-----+              |
61  *	                |  REPAIRED  |<-------------+
62  *			+------------+
63  *
64  * The state machine changes are triggered by calls to fmd_case_transition()
65  * from various locations inside of fmd, as described below:
66  *
67  * [1] Called by: fmd_case_solve()
68  *       Actions: FMD_CF_SOLVED flag is set in ci_flags
69  *                conviction policy is applied to suspect list
70  *                suspects convicted are marked faulty (F) in R$
71  *                list.suspect event logged and dispatched
72  *
73  * [2] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
74  *       Actions: FMD_CF_ISOLATED flag is set in ci_flags
75  *                suspects convicted (F) are marked unusable (U) in R$
76  *                diagnosis engine fmdo_close() entry point scheduled
77  *                case transitions to CLOSED [3] upon exit from CLOSE_WAIT
78  *
79  * [3] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
80  *       Actions: list.isolated event dispatched
81  *                case deleted from module's list of open cases
82  *
83  * [4] Called by: fmd_case_close(), fmd_case_uuclose()
84  *       Actions: diagnosis engine fmdo_close() entry point scheduled
85  *                case is subsequently discarded by fmd_case_delete()
86  *
87  * [5] Called by: fmd_case_repair(), fmd_case_update()
88  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
89  *                diagnosis engine fmdo_close() entry point scheduled
90  *                case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
91  *
92  * [6] Called by: fmd_case_repair(), fmd_case_update()
93  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
94  *                suspects convicted are marked non faulty (!F) in R$
95  *                list.repaired event dispatched
96  *
97  * [7] Called by: fmd_case_repair(), fmd_case_update()
98  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
99  *                suspects convicted are marked non faulty (!F) in R$
100  *                list.repaired event dispatched
101  */
102 
103 #include <sys/fm/protocol.h>
104 #include <uuid/uuid.h>
105 #include <alloca.h>
106 
107 #include <fmd_alloc.h>
108 #include <fmd_module.h>
109 #include <fmd_error.h>
110 #include <fmd_conf.h>
111 #include <fmd_case.h>
112 #include <fmd_string.h>
113 #include <fmd_subr.h>
114 #include <fmd_protocol.h>
115 #include <fmd_event.h>
116 #include <fmd_eventq.h>
117 #include <fmd_dispq.h>
118 #include <fmd_buf.h>
119 #include <fmd_log.h>
120 #include <fmd_asru.h>
121 #include <fmd_fmri.h>
122 #include <fmd_xprt.h>
123 
124 #include <fmd.h>
125 
/*
 * Printable names for the case states.  The comment on each entry names the
 * FMD_CASE_* constant the string corresponds to.
 * NOTE(review): presumably indexed directly by the FMD_CASE_* state value;
 * confirm the constants are 0..4 in this order before relying on that.
 */
static const char *const _fmd_case_snames[] = {
	"UNSOLVED",	/* FMD_CASE_UNSOLVED */
	"SOLVED",	/* FMD_CASE_SOLVED */
	"CLOSE_WAIT",	/* FMD_CASE_CLOSE_WAIT */
	"CLOSED",	/* FMD_CASE_CLOSED */
	"REPAIRED"	/* FMD_CASE_REPAIRED */
};
133 
134 extern volatile uint32_t fmd_asru_fake_not_present;
135 
136 static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *);
137 
138 fmd_case_hash_t *
139 fmd_case_hash_create(void)
140 {
141 	fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP);
142 
143 	(void) pthread_rwlock_init(&chp->ch_lock, NULL);
144 	chp->ch_hashlen = fmd.d_str_buckets;
145 	chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP);
146 	chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen,
147 	    FMD_SLEEP);
148 	chp->ch_count = 0;
149 
150 	return (chp);
151 }
152 
153 /*
154  * Destroy the case hash.  Unlike most of our hash tables, no active references
155  * are kept by the case hash itself; all references come from other subsystems.
156  * The hash must be destroyed after all modules are unloaded; if anything was
157  * present in the hash it would be by definition a reference count leak.
158  */
159 void
160 fmd_case_hash_destroy(fmd_case_hash_t *chp)
161 {
162 	fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen);
163 	fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen);
164 	fmd_free(chp, sizeof (fmd_case_hash_t));
165 }
166 
167 /*
168  * Take a snapshot of the case hash by placing an additional hold on each
169  * member in an auxiliary array, and then call 'func' for each case.
170  */
171 void
172 fmd_case_hash_apply(fmd_case_hash_t *chp,
173     void (*func)(fmd_case_t *, void *), void *arg)
174 {
175 	fmd_case_impl_t *cp, **cps, **cpp;
176 	uint_t cpc, i;
177 
178 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
179 
180 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
181 	cpc = chp->ch_count;
182 
183 	for (i = 0; i < chp->ch_hashlen; i++) {
184 		for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) {
185 			if (fmd_case_tryhold(cp) != NULL)
186 				*cpp++ = cp;
187 		}
188 	}
189 
190 	ASSERT(cpp == cps + cpc);
191 	(void) pthread_rwlock_unlock(&chp->ch_lock);
192 
193 	for (i = 0; i < cpc; i++) {
194 		func((fmd_case_t *)cps[i], arg);
195 		fmd_case_rele((fmd_case_t *)cps[i]);
196 	}
197 
198 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
199 }
200 
201 static void
202 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
203 {
204 	uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
205 
206 	cip->ci_code_next = chp->ch_code_hash[h];
207 	chp->ch_code_hash[h] = cip;
208 }
209 
210 static void
211 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
212 {
213 	fmd_case_impl_t **pp, *cp;
214 
215 	if (cip->ci_code) {
216 		uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
217 
218 		pp = &chp->ch_code_hash[h];
219 		for (cp = *pp; cp != NULL; cp = cp->ci_code_next) {
220 			if (cp != cip)
221 				pp = &cp->ci_code_next;
222 			else
223 				break;
224 		}
225 		if (cp != NULL) {
226 			*pp = cp->ci_code_next;
227 			cp->ci_code_next = NULL;
228 		}
229 	}
230 }
231 
232 /*
233  * Look up the diagcode for this case and cache it in ci_code.  If no suspects
234  * were defined for this case or if the lookup fails, the event dictionary or
235  * module code is broken, and we set the event code to a precomputed default.
236  */
237 static const char *
238 fmd_case_mkcode(fmd_case_t *cp)
239 {
240 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
241 	fmd_case_susp_t *cis;
242 	fmd_case_hash_t *chp = fmd.d_cases;
243 
244 	char **keys, **keyp;
245 	const char *s;
246 
247 	ASSERT(MUTEX_HELD(&cip->ci_lock));
248 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
249 
250 	/*
251 	 * delete any existing entry from code hash if it is on it
252 	 */
253 	fmd_case_code_hash_delete(chp, cip);
254 
255 	fmd_free(cip->ci_code, cip->ci_codelen);
256 	cip->ci_codelen = cip->ci_mod->mod_codelen;
257 	cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
258 	keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1));
259 
260 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
261 		if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0)
262 			keyp++;
263 	}
264 
265 	*keyp = NULL; /* mark end of keys[] array for libdiagcode */
266 
267 	if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code(
268 	    cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) {
269 		(void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s);
270 		fmd_free(cip->ci_code, cip->ci_codelen);
271 		cip->ci_codelen = strlen(s) + 1;
272 		cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
273 		(void) strcpy(cip->ci_code, s);
274 	}
275 
276 	/*
277 	 * add into hash of solved cases
278 	 */
279 	fmd_case_code_hash_insert(chp, cip);
280 
281 	return (cip->ci_code);
282 }
283 
/*
 * Argument bundle passed (as 'arg') to fmd_case_set_lst() when it is applied
 * to each ASRU link of a case.  All members point at storage owned by the
 * caller; fmd_case_set_lst() fills one slot of fcl_ba and fcl_nva per call.
 */
typedef struct {
	int	*fcl_countp;	/* index of next free slot; bumped per link */
	uint8_t *fcl_ba;	/* per-link FM_SUSPECT_* status bits */
	nvlist_t **fcl_nva;	/* per-link fault event (al_event) */
	int	*fcl_msgp;	/* set to B_FALSE if any link opts out */
} fmd_case_lst_t;
290 
291 static void
292 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg)
293 {
294 	fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg;
295 	boolean_t b;
296 	int state;
297 
298 	if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE,
299 	    &b) == 0 && b == B_FALSE)
300 		*entryp->fcl_msgp = B_FALSE;
301 	entryp->fcl_ba[*entryp->fcl_countp] = 0;
302 	state = fmd_asru_al_getstate(alp);
303 	if (state & FMD_ASRU_UNUSABLE)
304 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE;
305 	if (state & FMD_ASRU_FAULTY)
306 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY;
307 	if (!(state & FMD_ASRU_PRESENT))
308 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT;
309 	entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event;
310 	(*entryp->fcl_countp)++;
311 }
312 
313 static void
314 fmd_case_faulty(fmd_asru_link_t *alp, void *arg)
315 {
316 	int *faultyp = (int *)arg;
317 
318 	*faultyp |= (alp->al_flags & FMD_ASRU_FAULTY);
319 }
320 
321 static void
322 fmd_case_usable(fmd_asru_link_t *alp, void *arg)
323 {
324 	int *usablep = (int *)arg;
325 
326 	*usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE);
327 }
328 
329 nvlist_t *
330 fmd_case_mkevent(fmd_case_t *cp, const char *class)
331 {
332 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
333 	nvlist_t **nva, *nvl;
334 	uint8_t *ba;
335 	int msg = B_TRUE;
336 	const char *code;
337 	fmd_case_lst_t fcl;
338 	int count = 0;
339 
340 	(void) pthread_mutex_lock(&cip->ci_lock);
341 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
342 
343 	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
344 	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
345 
346 	/*
347 	 * For each suspect associated with the case, store its fault event
348 	 * nvlist in 'nva'.  We also look to see if any of the suspect faults
349 	 * have asked not to be messaged.  If any of them have made such a
350 	 * request, propagate that attribute to the composite list.* event.
351 	 * Finally, store each suspect's faulty status into the bitmap 'ba'.
352 	 */
353 	fcl.fcl_countp = &count;
354 	fcl.fcl_msgp = &msg;
355 	fcl.fcl_ba = ba;
356 	fcl.fcl_nva = nva;
357 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
358 
359 	if (cip->ci_code == NULL)
360 		(void) fmd_case_mkcode(cp);
361 	/*
362 	 * For repair event, we lookup diagcode from dict using key
363 	 * "list.repaired".
364 	 */
365 	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
366 		(void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code);
367 	else
368 		code = cip->ci_code;
369 
370 	if (msg == B_FALSE)
371 		cip->ci_flags |= FMD_CF_INVISIBLE;
372 
373 	nvl = fmd_protocol_list(class, cip->ci_mod->mod_fmri, cip->ci_uuid,
374 	    code, count, nva, ba, msg, &cip->ci_tv);
375 
376 	(void) pthread_mutex_unlock(&cip->ci_lock);
377 	return (nvl);
378 }
379 
380 static boolean_t
381 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem)
382 {
383 	nvlist_t *new_rsrc;
384 	nvlist_t *rsrc;
385 	char *new_name = NULL;
386 	char *name = NULL;
387 	ssize_t new_namelen;
388 	ssize_t namelen;
389 	int fmri_present = 1;
390 	int new_fmri_present = 1;
391 	int match = B_FALSE;
392 	fmd_topo_t *ftp = fmd_topo_hold();
393 
394 	if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0)
395 		fmri_present = 0;
396 	else {
397 		if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1)
398 			goto done;
399 		name = fmd_alloc(namelen + 1, FMD_SLEEP);
400 		if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1)
401 			goto done;
402 	}
403 	if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0)
404 		new_fmri_present = 0;
405 	else {
406 		if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1)
407 			goto done;
408 		new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP);
409 		if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1)
410 			goto done;
411 	}
412 	match = (fmri_present == new_fmri_present &&
413 	    (fmri_present == 0 ||
414 	    topo_fmri_strcmp(ftp->ft_hdl, name, new_name)));
415 done:
416 	if (name != NULL)
417 		fmd_free(name, namelen + 1);
418 	if (new_name != NULL)
419 		fmd_free(new_name, new_namelen + 1);
420 	fmd_topo_rele(ftp);
421 	return (match);
422 }
423 
424 static int
425 fmd_case_match_suspect(fmd_case_susp_t *cis, fmd_case_susp_t *xcis)
426 {
427 	char *class, *new_class;
428 
429 	if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_ASRU))
430 		return (0);
431 	if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl,
432 	    FM_FAULT_RESOURCE))
433 		return (0);
434 	if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_FRU))
435 		return (0);
436 	(void) nvlist_lookup_string(xcis->cis_nvl, FM_CLASS, &class);
437 	(void) nvlist_lookup_string(cis->cis_nvl, FM_CLASS, &new_class);
438 	return (strcmp(class, new_class) == 0);
439 }
440 
441 /*
442  * see if an identical suspect list already exists in the cache
443  */
444 static int
445 fmd_case_check_for_dups(fmd_case_t *cp)
446 {
447 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp, *xcip;
448 	fmd_case_hash_t *chp = fmd.d_cases;
449 	fmd_case_susp_t *xcis, *cis;
450 	int match = 0, match_susp;
451 	uint_t h;
452 
453 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
454 
455 	/*
456 	 * Find all cases with this code
457 	 */
458 	h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
459 	for (xcip = chp->ch_code_hash[h]; xcip != NULL;
460 	    xcip = xcip->ci_code_next) {
461 		/*
462 		 * only look for any cases (apart from this one)
463 		 * whose code and number of suspects match
464 		 */
465 		if (xcip == cip || fmd_case_tryhold(xcip) == NULL)
466 			continue;
467 		if (strcmp(xcip->ci_code, cip->ci_code) != 0 ||
468 		    xcip->ci_nsuspects != cip->ci_nsuspects) {
469 			fmd_case_rele((fmd_case_t *)xcip);
470 			continue;
471 		}
472 
473 		/*
474 		 * For each suspect in one list, check if there
475 		 * is an identical suspect in the other list
476 		 */
477 		match = 1;
478 		for (xcis = xcip->ci_suspects; xcis != NULL;
479 		    xcis = xcis->cis_next) {
480 			match_susp = 0;
481 			for (cis = cip->ci_suspects; cis != NULL;
482 			    cis = cis->cis_next) {
483 				if (fmd_case_match_suspect(cis, xcis) == 1) {
484 					match_susp = 1;
485 					break;
486 				}
487 			}
488 			if (match_susp == 0) {
489 				match = 0;
490 				break;
491 			}
492 		}
493 		fmd_case_rele((fmd_case_t *)xcip);
494 		if (match) {
495 			(void) pthread_rwlock_unlock(&chp->ch_lock);
496 			return (1);
497 		}
498 	}
499 	(void) pthread_rwlock_unlock(&chp->ch_lock);
500 	return (0);
501 }
502 
503 /*
504  * Convict suspects in a case by applying a conviction policy and updating the
505  * resource cache prior to emitting the list.suspect event for the given case.
506  * At present, our policy is very simple: convict every suspect in the case.
507  * In the future, this policy can be extended and made configurable to permit:
508  *
509  * - convicting the suspect with the highest FIT rate
510  * - convicting the suspect with the cheapest FRU
511  * - convicting the suspect with the FRU that is in a depot's inventory
512  * - convicting the suspect with the longest lifetime
513  *
 * and so forth.  A word to the wise: this problem is significantly harder than
515  * it seems at first glance.  Future work should heed the following advice:
516  *
517  * Hacking the policy into C code here is a very bad idea.  The policy needs to
518  * be decided upon very carefully and fundamentally encodes knowledge of what
519  * suspect list combinations can be emitted by what diagnosis engines.  As such
520  * fmd's code is the wrong location, because that would require fmd itself to
521  * be updated for every diagnosis engine change, defeating the entire design.
522  * The FMA Event Registry knows the suspect list combinations: policy inputs
523  * can be derived from it and used to produce per-module policy configuration.
524  *
525  * If the policy needs to be dynamic and not statically fixed at either fmd
526  * startup or module load time, any implementation of dynamic policy retrieval
527  * must employ some kind of caching mechanism or be part of a built-in module.
528  * The fmd_case_convict() function is called with locks held inside of fmd and
529  * is not a place where unbounded blocking on some inter-process or inter-
530  * system communication to another service (e.g. another daemon) can occur.
531  */
532 static int
533 fmd_case_convict(fmd_case_t *cp)
534 {
535 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
536 	fmd_asru_hash_t *ahp = fmd.d_asrus;
537 
538 	fmd_case_susp_t *cis;
539 	fmd_asru_link_t *alp;
540 
541 	(void) pthread_mutex_lock(&cip->ci_lock);
542 	(void) fmd_case_mkcode(cp);
543 	if (fmd_case_check_for_dups(cp) == 1) {
544 		(void) pthread_mutex_unlock(&cip->ci_lock);
545 		return (1);
546 	}
547 
548 	/*
549 	 * no suspect list already exists  - allocate new cache entries
550 	 */
551 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
552 		if ((alp = fmd_asru_hash_create_entry(ahp,
553 		    cp, cis->cis_nvl)) == NULL) {
554 			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
555 			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
556 			continue;
557 		}
558 		(void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE);
559 		(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
560 	}
561 
562 	(void) pthread_mutex_unlock(&cip->ci_lock);
563 	return (0);
564 }
565 
566 void
567 fmd_case_publish(fmd_case_t *cp, uint_t state)
568 {
569 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
570 	fmd_event_t *e;
571 	nvlist_t *nvl;
572 	char *class;
573 
574 	if (state == FMD_CASE_CURRENT)
575 		state = cip->ci_state; /* use current state */
576 
577 	switch (state) {
578 	case FMD_CASE_SOLVED:
579 		(void) pthread_mutex_lock(&cip->ci_lock);
580 		if (cip->ci_tv_valid == 0) {
581 			fmd_time_gettimeofday(&cip->ci_tv);
582 			cip->ci_tv_valid = 1;
583 		}
584 		(void) pthread_mutex_unlock(&cip->ci_lock);
585 
586 		if (fmd_case_convict(cp) == 1) { /* dupclose */
587 			cip->ci_flags &= ~FMD_CF_SOLVED;
588 			fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0);
589 			break;
590 		}
591 		nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
592 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
593 
594 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
595 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
596 		fmd_log_append(fmd.d_fltlog, e, cp);
597 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
598 		fmd_dispq_dispatch(fmd.d_disp, e, class);
599 
600 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
601 		cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
602 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
603 
604 		break;
605 
606 	case FMD_CASE_CLOSE_WAIT:
607 		fmd_case_hold(cp);
608 		e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
609 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
610 
611 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
612 		cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
613 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
614 
615 		break;
616 
617 	case FMD_CASE_CLOSED:
618 		nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
619 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
620 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
621 		fmd_dispq_dispatch(fmd.d_disp, e, class);
622 		break;
623 
624 	case FMD_CASE_REPAIRED:
625 		nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
626 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
627 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
628 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
629 		fmd_log_append(fmd.d_fltlog, e, cp);
630 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
631 		fmd_dispq_dispatch(fmd.d_disp, e, class);
632 		break;
633 	}
634 }
635 
636 fmd_case_t *
637 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
638 {
639 	fmd_case_impl_t *cip;
640 	uint_t h;
641 
642 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
643 	h = fmd_strhash(uuid) % chp->ch_hashlen;
644 
645 	for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
646 		if (strcmp(cip->ci_uuid, uuid) == 0)
647 			break;
648 	}
649 
650 	/*
651 	 * If deleting bit is set, treat the case as if it doesn't exist.
652 	 */
653 	if (cip != NULL)
654 		cip = fmd_case_tryhold(cip);
655 
656 	if (cip == NULL)
657 		(void) fmd_set_errno(EFMD_CASE_INVAL);
658 
659 	(void) pthread_rwlock_unlock(&chp->ch_lock);
660 	return ((fmd_case_t *)cip);
661 }
662 
663 static fmd_case_impl_t *
664 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
665 {
666 	fmd_case_impl_t *eip;
667 	uint_t h;
668 
669 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
670 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
671 
672 	for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
673 		if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 &&
674 		    fmd_case_tryhold(eip) != NULL) {
675 			(void) pthread_rwlock_unlock(&chp->ch_lock);
676 			return (eip); /* uuid already present */
677 		}
678 	}
679 
680 	cip->ci_next = chp->ch_hash[h];
681 	chp->ch_hash[h] = cip;
682 
683 	chp->ch_count++;
684 	ASSERT(chp->ch_count != 0);
685 
686 	(void) pthread_rwlock_unlock(&chp->ch_lock);
687 	return (cip);
688 }
689 
690 static void
691 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
692 {
693 	fmd_case_impl_t *cp, **pp;
694 	uint_t h;
695 
696 	ASSERT(MUTEX_HELD(&cip->ci_lock));
697 
698 	cip->ci_flags |= FMD_CF_DELETING;
699 	(void) pthread_mutex_unlock(&cip->ci_lock);
700 
701 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
702 
703 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
704 	pp = &chp->ch_hash[h];
705 
706 	for (cp = *pp; cp != NULL; cp = cp->ci_next) {
707 		if (cp != cip)
708 			pp = &cp->ci_next;
709 		else
710 			break;
711 	}
712 
713 	if (cp == NULL) {
714 		fmd_panic("case %p (%s) not found on hash chain %u\n",
715 		    (void *)cip, cip->ci_uuid, h);
716 	}
717 
718 	*pp = cp->ci_next;
719 	cp->ci_next = NULL;
720 
721 	/*
722 	 * delete from code hash if it is on it
723 	 */
724 	fmd_case_code_hash_delete(chp, cip);
725 
726 	ASSERT(chp->ch_count != 0);
727 	chp->ch_count--;
728 
729 	(void) pthread_rwlock_unlock(&chp->ch_lock);
730 
731 	(void) pthread_mutex_lock(&cip->ci_lock);
732 	ASSERT(cip->ci_flags & FMD_CF_DELETING);
733 }
734 
735 fmd_case_t *
736 fmd_case_create(fmd_module_t *mp, void *data)
737 {
738 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
739 	fmd_case_impl_t *eip = NULL;
740 	uuid_t uuid;
741 
742 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
743 	fmd_buf_hash_create(&cip->ci_bufs);
744 
745 	fmd_module_hold(mp);
746 	cip->ci_mod = mp;
747 	cip->ci_refs = 1;
748 	cip->ci_state = FMD_CASE_UNSOLVED;
749 	cip->ci_flags = FMD_CF_DIRTY;
750 	cip->ci_data = data;
751 
752 	/*
753 	 * Calling libuuid: get a clue.  The library interfaces cleverly do not
754 	 * define any constant for the length of an unparse string, and do not
755 	 * permit the caller to specify a buffer length for safety.  The spec
756 	 * says it will be 36 bytes, but we make it tunable just in case.
757 	 */
758 	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
759 	cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);
760 
761 	/*
762 	 * We expect this loop to execute only once, but code it defensively
763 	 * against the possibility of libuuid bugs.  Keep generating uuids and
764 	 * attempting to do a hash insert until we get a unique one.
765 	 */
766 	do {
767 		if (eip != NULL)
768 			fmd_case_rele((fmd_case_t *)eip);
769 		uuid_generate(uuid);
770 		uuid_unparse(uuid, cip->ci_uuid);
771 	} while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);
772 
773 	ASSERT(fmd_module_locked(mp));
774 	fmd_list_append(&mp->mod_cases, cip);
775 	fmd_module_setcdirty(mp);
776 
777 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
778 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
779 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
780 
781 	return ((fmd_case_t *)cip);
782 }
783 
784 static void
785 fmd_case_destroy_suspects(fmd_case_impl_t *cip)
786 {
787 	fmd_case_susp_t *cis, *ncis;
788 
789 	ASSERT(MUTEX_HELD(&cip->ci_lock));
790 
791 	for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
792 		ncis = cis->cis_next;
793 		nvlist_free(cis->cis_nvl);
794 		fmd_free(cis, sizeof (fmd_case_susp_t));
795 	}
796 
797 	cip->ci_suspects = NULL;
798 	cip->ci_nsuspects = 0;
799 }
800 
/*
 * Recreate a case with a known UUID, state and (optionally) diagcode on
 * behalf of module 'mp', with 'xp' recorded as the case's transport.
 *
 * If the UUID is not yet in the case hash, a new case is created, filed on
 * the code hash when a code was supplied, and added to mp's case list.
 *
 * If a live case with that UUID already exists, the newly built case is
 * discarded and one of the following happens:
 *  - mp is the root module (fmd.d_rmod): the existing case is returned
 *    unchanged;
 *  - the existing case is not owned by the root module, or xp is non-NULL:
 *    this is a UUID conflict and NULL is returned;
 *  - otherwise the existing (orphaned) case is taken away from the root
 *    module, its suspects are discarded, its state is set to 'state', and it
 *    is added to mp's case list.
 *
 * 'state' must be less than FMD_CASE_REPAIRED.  The module 'mp' must be
 * locked by the caller on the paths that append to its case list.
 */
fmd_case_t *
fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
    uint_t state, const char *uuid, const char *code)
{
	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
	fmd_case_impl_t *eip;

	ASSERT(state < FMD_CASE_REPAIRED);

	(void) pthread_mutex_init(&cip->ci_lock, NULL);
	fmd_buf_hash_create(&cip->ci_bufs);

	fmd_module_hold(mp);
	cip->ci_mod = mp;
	cip->ci_xprt = xp;
	cip->ci_refs = 1;
	cip->ci_state = state;
	cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
	cip->ci_uuidlen = strlen(cip->ci_uuid);
	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
	/* ci_codelen counts the terminating NUL; zero means "no code" */
	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;

	if (state > FMD_CASE_CLOSE_WAIT)
		cip->ci_flags |= FMD_CF_SOLVED;

	/*
	 * Insert the case into the global case hash.  If the specified UUID is
	 * already present, check to see if it is an orphan: if so, reclaim it;
	 * otherwise if it is owned by a different module then return NULL.
	 */
	if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
		/*
		 * The new case never became visible in the hash, so it is
		 * torn down directly: fmd_case_destroy() requires ci_lock
		 * held and a zero reference count, and with B_FALSE it does
		 * not attempt a hash delete.
		 */
		(void) pthread_mutex_lock(&cip->ci_lock);
		cip->ci_refs--; /* decrement to zero */
		fmd_case_destroy((fmd_case_t *)cip, B_FALSE);

		cip = eip; /* switch 'cip' to the existing case */
		(void) pthread_mutex_lock(&cip->ci_lock);

		/*
		 * If the ASRU cache is trying to recreate an orphan, then just
		 * return the existing case that we found without changing it.
		 */
		if (mp == fmd.d_rmod) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			/* drop the hold taken by fmd_case_hash_insert() */
			fmd_case_rele((fmd_case_t *)cip);
			return ((fmd_case_t *)cip);
		}

		/*
		 * If the existing case isn't an orphan or is being proxied,
		 * then we have a UUID conflict: return failure to the caller.
		 */
		if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele((fmd_case_t *)cip);
			return (NULL);
		}

		/*
		 * If the new module is reclaiming an orphaned case, remove
		 * the case from the root module, switch ci_mod, and then fall
		 * through to adding the case to the new owner module 'mp'.
		 */
		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);

		/* swap the case's module reference from the root module to mp */
		fmd_module_rele(cip->ci_mod);
		cip->ci_mod = mp;
		fmd_module_hold(mp);

		fmd_case_destroy_suspects(cip);
		cip->ci_state = state;

		(void) pthread_mutex_unlock(&cip->ci_lock);
		/* drop the hold taken by fmd_case_hash_insert() */
		fmd_case_rele((fmd_case_t *)cip);
	} else {
		/*
		 * add into hash of solved cases
		 */
		if (cip->ci_code)
			fmd_case_code_hash_insert(fmd.d_cases, cip);
	}

	ASSERT(fmd_module_locked(mp));
	fmd_list_append(&mp->mod_cases, cip);

	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

	return ((fmd_case_t *)cip);
}
894 
895 void
896 fmd_case_destroy(fmd_case_t *cp, int visible)
897 {
898 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
899 	fmd_case_item_t *cit, *ncit;
900 
901 	ASSERT(MUTEX_HELD(&cip->ci_lock));
902 	ASSERT(cip->ci_refs == 0);
903 
904 	if (visible) {
905 		TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
906 		fmd_case_hash_delete(fmd.d_cases, cip);
907 	}
908 
909 	for (cit = cip->ci_items; cit != NULL; cit = ncit) {
910 		ncit = cit->cit_next;
911 		fmd_event_rele(cit->cit_event);
912 		fmd_free(cit, sizeof (fmd_case_item_t));
913 	}
914 
915 	fmd_case_destroy_suspects(cip);
916 
917 	if (cip->ci_principal != NULL)
918 		fmd_event_rele(cip->ci_principal);
919 
920 	fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
921 	fmd_free(cip->ci_code, cip->ci_codelen);
922 	(void) fmd_buf_hash_destroy(&cip->ci_bufs);
923 
924 	fmd_module_rele(cip->ci_mod);
925 	fmd_free(cip, sizeof (fmd_case_impl_t));
926 }
927 
928 void
929 fmd_case_hold(fmd_case_t *cp)
930 {
931 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
932 
933 	(void) pthread_mutex_lock(&cip->ci_lock);
934 	fmd_case_hold_locked(cp);
935 	(void) pthread_mutex_unlock(&cip->ci_lock);
936 }
937 
938 void
939 fmd_case_hold_locked(fmd_case_t *cp)
940 {
941 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
942 
943 	ASSERT(MUTEX_HELD(&cip->ci_lock));
944 	if (cip->ci_flags & FMD_CF_DELETING)
945 		fmd_panic("attempt to hold a deleting case %p (%s)\n",
946 		    (void *)cip, cip->ci_uuid);
947 	cip->ci_refs++;
948 	ASSERT(cip->ci_refs != 0);
949 }
950 
951 static fmd_case_impl_t *
952 fmd_case_tryhold(fmd_case_impl_t *cip)
953 {
954 	/*
955 	 * If the case's "deleting" bit is unset, hold and return case,
956 	 * otherwise, return NULL.
957 	 */
958 	(void) pthread_mutex_lock(&cip->ci_lock);
959 	if (cip->ci_flags & FMD_CF_DELETING) {
960 		(void) pthread_mutex_unlock(&cip->ci_lock);
961 		cip = NULL;
962 	} else {
963 		fmd_case_hold_locked((fmd_case_t *)cip);
964 		(void) pthread_mutex_unlock(&cip->ci_lock);
965 	}
966 	return (cip);
967 }
968 
969 void
970 fmd_case_rele(fmd_case_t *cp)
971 {
972 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
973 
974 	(void) pthread_mutex_lock(&cip->ci_lock);
975 	ASSERT(cip->ci_refs != 0);
976 
977 	if (--cip->ci_refs == 0)
978 		fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
979 	else
980 		(void) pthread_mutex_unlock(&cip->ci_lock);
981 }
982 
983 void
984 fmd_case_rele_locked(fmd_case_t *cp)
985 {
986 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
987 
988 	ASSERT(MUTEX_HELD(&cip->ci_lock));
989 	--cip->ci_refs;
990 	ASSERT(cip->ci_refs != 0);
991 }
992 
993 int
994 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep)
995 {
996 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
997 	fmd_case_item_t *cit;
998 	fmd_event_t *oep;
999 	uint_t state;
1000 	int new;
1001 
1002 	fmd_event_hold(ep);
1003 	(void) pthread_mutex_lock(&cip->ci_lock);
1004 
1005 	if (cip->ci_flags & FMD_CF_SOLVED)
1006 		state = FMD_EVS_DIAGNOSED;
1007 	else
1008 		state = FMD_EVS_ACCEPTED;
1009 
1010 	oep = cip->ci_principal;
1011 	cip->ci_principal = ep;
1012 
1013 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1014 		if (cit->cit_event == ep)
1015 			break;
1016 	}
1017 
1018 	cip->ci_flags |= FMD_CF_DIRTY;
1019 	new = cit == NULL && ep != oep;
1020 
1021 	(void) pthread_mutex_unlock(&cip->ci_lock);
1022 
1023 	fmd_module_setcdirty(cip->ci_mod);
1024 	fmd_event_transition(ep, state);
1025 
1026 	if (oep != NULL)
1027 		fmd_event_rele(oep);
1028 
1029 	return (new);
1030 }
1031 
1032 int
1033 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep)
1034 {
1035 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1036 	fmd_case_item_t *cit;
1037 	uint_t state;
1038 	int new;
1039 
1040 	(void) pthread_mutex_lock(&cip->ci_lock);
1041 
1042 	if (cip->ci_flags & FMD_CF_SOLVED)
1043 		state = FMD_EVS_DIAGNOSED;
1044 	else
1045 		state = FMD_EVS_ACCEPTED;
1046 
1047 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1048 		if (cit->cit_event == ep)
1049 			break;
1050 	}
1051 
1052 	new = cit == NULL && ep != cip->ci_principal;
1053 
1054 	/*
1055 	 * If the event is already in the case or the case is already solved,
1056 	 * there is no reason to save it: just transition it appropriately.
1057 	 */
1058 	if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
1059 		(void) pthread_mutex_unlock(&cip->ci_lock);
1060 		fmd_event_transition(ep, state);
1061 		return (new);
1062 	}
1063 
1064 	cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
1065 	fmd_event_hold(ep);
1066 
1067 	cit->cit_next = cip->ci_items;
1068 	cit->cit_event = ep;
1069 
1070 	cip->ci_items = cit;
1071 	cip->ci_nitems++;
1072 
1073 	cip->ci_flags |= FMD_CF_DIRTY;
1074 	(void) pthread_mutex_unlock(&cip->ci_lock);
1075 
1076 	fmd_module_setcdirty(cip->ci_mod);
1077 	fmd_event_transition(ep, state);
1078 
1079 	return (new);
1080 }
1081 
1082 void
1083 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
1084 {
1085 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1086 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1087 
1088 	(void) pthread_mutex_lock(&cip->ci_lock);
1089 	ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT);
1090 	cip->ci_flags |= FMD_CF_DIRTY;
1091 
1092 	cis->cis_next = cip->ci_suspects;
1093 	cis->cis_nvl = nvl;
1094 
1095 	cip->ci_suspects = cis;
1096 	cip->ci_nsuspects++;
1097 
1098 	(void) pthread_mutex_unlock(&cip->ci_lock);
1099 	fmd_module_setcdirty(cip->ci_mod);
1100 }
1101 
1102 void
1103 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
1104 {
1105 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1106 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1107 	boolean_t b;
1108 
1109 	(void) pthread_mutex_lock(&cip->ci_lock);
1110 	ASSERT(cip->ci_state == FMD_CASE_CLOSED);
1111 	ASSERT(cip->ci_mod == fmd.d_rmod);
1112 
1113 	cis->cis_next = cip->ci_suspects;
1114 	cis->cis_nvl = nvl;
1115 
1116 	if (nvlist_lookup_boolean_value(nvl,
1117 	    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
1118 		cip->ci_flags |= FMD_CF_INVISIBLE;
1119 
1120 	cip->ci_suspects = cis;
1121 	cip->ci_nsuspects++;
1122 
1123 	(void) pthread_mutex_unlock(&cip->ci_lock);
1124 }
1125 
1126 void
1127 fmd_case_reset_suspects(fmd_case_t *cp)
1128 {
1129 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1130 
1131 	(void) pthread_mutex_lock(&cip->ci_lock);
1132 	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
1133 
1134 	fmd_case_destroy_suspects(cip);
1135 	cip->ci_flags |= FMD_CF_DIRTY;
1136 
1137 	(void) pthread_mutex_unlock(&cip->ci_lock);
1138 	fmd_module_setcdirty(cip->ci_mod);
1139 }
1140 
1141 /*ARGSUSED*/
1142 static void
1143 fmd_case_unusable(fmd_asru_link_t *alp, void *arg)
1144 {
1145 	(void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE);
1146 }
1147 
1148 /*
1149  * Grab ci_lock and update the case state and set the dirty bit.  Then perform
1150  * whatever actions and emit whatever events are appropriate for the state.
1151  * Refer to the topmost block comment explaining the state machine for details.
1152  */
1153 void
1154 fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
1155 {
1156 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1157 	fmd_case_item_t *cit;
1158 	fmd_event_t *e;
1159 
1160 	ASSERT(state <= FMD_CASE_REPAIRED);
1161 	(void) pthread_mutex_lock(&cip->ci_lock);
1162 
1163 	if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED))
1164 		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED);
1165 
1166 	cip->ci_flags |= flags;
1167 
1168 	if (cip->ci_state >= state) {
1169 		(void) pthread_mutex_unlock(&cip->ci_lock);
1170 		return; /* already in specified state */
1171 	}
1172 
1173 	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
1174 	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));
1175 
1176 	cip->ci_state = state;
1177 	cip->ci_flags |= FMD_CF_DIRTY;
1178 
1179 	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
1180 		fmd_module_setcdirty(cip->ci_mod);
1181 
1182 	switch (state) {
1183 	case FMD_CASE_SOLVED:
1184 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
1185 			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);
1186 
1187 		if (cip->ci_principal != NULL) {
1188 			fmd_event_transition(cip->ci_principal,
1189 			    FMD_EVS_DIAGNOSED);
1190 		}
1191 		break;
1192 
1193 	case FMD_CASE_CLOSE_WAIT:
1194 		/*
1195 		 * If the case was never solved, do not change ASRUs.
1196 		 * If the case was never fmd_case_closed, do not change ASRUs.
1197 		 * If the case was repaired, do not change ASRUs.
1198 		 */
1199 		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
1200 		    FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED))
1201 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1202 			    fmd_case_unusable, NULL);
1203 
1204 		/*
1205 		 * If an orphaned case transitions to CLOSE_WAIT, the owning
1206 		 * module is no longer loaded: continue on to CASE_CLOSED.
1207 		 */
1208 		if (fmd_case_orphaned(cp))
1209 			state = cip->ci_state = FMD_CASE_CLOSED;
1210 		break;
1211 
1212 	case FMD_CASE_REPAIRED:
1213 		ASSERT(fmd_case_orphaned(cp));
1214 		fmd_module_lock(cip->ci_mod);
1215 		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1216 		fmd_module_unlock(cip->ci_mod);
1217 		break;
1218 	}
1219 
1220 	(void) pthread_mutex_unlock(&cip->ci_lock);
1221 
1222 	/*
1223 	 * If the module has initialized, then publish the appropriate event
1224 	 * for the new case state.  If not, we are being called from the
1225 	 * checkpoint code during module load, in which case the module's
1226 	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
1227 	 * may not be open yet, which will prevent us from computing the event
1228 	 * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
1229 	 * event in our queue: this won't be processed until _fmd_init is done.
1230 	 */
1231 	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
1232 		fmd_case_publish(cp, state);
1233 	else {
1234 		fmd_case_hold(cp);
1235 		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
1236 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
1237 	}
1238 
1239 	/*
1240 	 * If we transitioned to REPAIRED, adjust the reference count to
1241 	 * reflect our removal from fmd.d_rmod->mod_cases.  If the caller has
1242 	 * not placed an additional hold on the case, it will now be freed.
1243 	 */
1244 	if (state == FMD_CASE_REPAIRED) {
1245 		(void) pthread_mutex_lock(&cip->ci_lock);
1246 		fmd_asru_hash_delete_case(fmd.d_asrus, cp);
1247 		(void) pthread_mutex_unlock(&cip->ci_lock);
1248 		fmd_case_rele(cp);
1249 	}
1250 }
1251 
1252 /*
1253  * Transition the specified case to *at least* the specified state by first
1254  * re-validating the suspect list using the resource cache.  This function is
1255  * employed by the checkpoint code when restoring a saved, solved case to see
1256  * if the state of the case has effectively changed while fmd was not running
1257  * or the module was not loaded.  If none of the suspects are present anymore,
1258  * advance the state to REPAIRED.  If none are usable, advance to CLOSE_WAIT.
1259  */
1260 void
1261 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
1262 {
1263 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1264 
1265 	int faulty = 0;		/* are any suspects faulty? */
1266 	int usable = 0;		/* are any suspects usable? */
1267 
1268 	ASSERT(state >= FMD_CASE_SOLVED);
1269 	(void) pthread_mutex_lock(&cip->ci_lock);
1270 
1271 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
1272 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable);
1273 
1274 	(void) pthread_mutex_unlock(&cip->ci_lock);
1275 
1276 	/*
1277 	 * If none of the suspects were faulty, it implies they were either
1278 	 * repaired already or not present and the rsrc.age time has expired.
1279 	 * We can move the state on to repaired.
1280 	 */
1281 	if (!faulty) {
1282 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
1283 		flags |= FMD_CF_REPAIRED;
1284 	} else if (!usable) {
1285 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
1286 		flags |= FMD_CF_ISOLATED;
1287 	}
1288 
1289 	fmd_case_transition(cp, state, flags);
1290 }
1291 
1292 void
1293 fmd_case_setdirty(fmd_case_t *cp)
1294 {
1295 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1296 
1297 	(void) pthread_mutex_lock(&cip->ci_lock);
1298 	cip->ci_flags |= FMD_CF_DIRTY;
1299 	(void) pthread_mutex_unlock(&cip->ci_lock);
1300 
1301 	fmd_module_setcdirty(cip->ci_mod);
1302 }
1303 
1304 void
1305 fmd_case_clrdirty(fmd_case_t *cp)
1306 {
1307 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1308 
1309 	(void) pthread_mutex_lock(&cip->ci_lock);
1310 	cip->ci_flags &= ~FMD_CF_DIRTY;
1311 	(void) pthread_mutex_unlock(&cip->ci_lock);
1312 }
1313 
1314 void
1315 fmd_case_commit(fmd_case_t *cp)
1316 {
1317 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1318 	fmd_case_item_t *cit;
1319 
1320 	(void) pthread_mutex_lock(&cip->ci_lock);
1321 
1322 	if (cip->ci_flags & FMD_CF_DIRTY) {
1323 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
1324 			fmd_event_commit(cit->cit_event);
1325 
1326 		if (cip->ci_principal != NULL)
1327 			fmd_event_commit(cip->ci_principal);
1328 
1329 		fmd_buf_hash_commit(&cip->ci_bufs);
1330 		cip->ci_flags &= ~FMD_CF_DIRTY;
1331 	}
1332 
1333 	(void) pthread_mutex_unlock(&cip->ci_lock);
1334 }
1335 
1336 /*
1337  * Indicate that the case may need to change state because one or more of the
1338  * ASRUs named as a suspect has changed state.  We examine all the suspects
1339  * and if none are still faulty, we initiate a case close transition.
1340  */
1341 void
1342 fmd_case_update(fmd_case_t *cp)
1343 {
1344 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1345 	uint_t cstate;
1346 	int faulty = 0;
1347 
1348 	(void) pthread_mutex_lock(&cip->ci_lock);
1349 	cstate = cip->ci_state;
1350 
1351 	if (cip->ci_xprt != NULL || cip->ci_state < FMD_CASE_SOLVED) {
1352 		(void) pthread_mutex_unlock(&cip->ci_lock);
1353 		return; /* update is not appropriate */
1354 	}
1355 
1356 	if (cip->ci_flags & FMD_CF_REPAIRED) {
1357 		(void) pthread_mutex_unlock(&cip->ci_lock);
1358 		return; /* already repaired */
1359 	}
1360 
1361 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
1362 	(void) pthread_mutex_unlock(&cip->ci_lock);
1363 
1364 	if (faulty)
1365 		return; /* one or more suspects are still marked faulty */
1366 
1367 	if (cstate == FMD_CASE_CLOSED)
1368 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1369 	else
1370 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1371 }
1372 
1373 /*
1374  * Delete a closed case from the module's case list once the fmdo_close() entry
1375  * point has run to completion.  If the case is owned by a transport module,
1376  * tell the transport to proxy a case close on the other end of the transport.
1377  * If not, transition to the appropriate next state based on ci_flags.  This
1378  * function represents the end of CLOSE_WAIT and transitions the case to either
1379  * CLOSED or REPAIRED or discards it entirely because it was never solved;
1380  * refer to the topmost block comment explaining the state machine for details.
1381  */
1382 void
1383 fmd_case_delete(fmd_case_t *cp)
1384 {
1385 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1386 	fmd_modstat_t *msp;
1387 	size_t buftotal;
1388 
1389 	ASSERT(fmd_module_locked(cip->ci_mod));
1390 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1391 	buftotal = fmd_buf_hash_destroy(&cip->ci_bufs);
1392 
1393 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1394 	msp = cip->ci_mod->mod_stats;
1395 
1396 	ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0);
1397 	msp->ms_caseopen.fmds_value.ui64--;
1398 
1399 	ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal);
1400 	msp->ms_buftotal.fmds_value.ui64 -= buftotal;
1401 
1402 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1403 
1404 	if (cip->ci_xprt == NULL)
1405 		fmd_module_setcdirty(cip->ci_mod);
1406 
1407 	fmd_module_rele(cip->ci_mod);
1408 	cip->ci_mod = fmd.d_rmod;
1409 	fmd_module_hold(cip->ci_mod);
1410 
1411 	/*
1412 	 * If the case is not proxied and it has been solved, then retain it
1413 	 * on the root module's case list at least until we're transitioned.
1414 	 * Otherwise free the case with our final fmd_case_rele() below.
1415 	 */
1416 	if (cip->ci_xprt == NULL && (cip->ci_flags & FMD_CF_SOLVED)) {
1417 		fmd_module_lock(cip->ci_mod);
1418 		fmd_list_append(&cip->ci_mod->mod_cases, cip);
1419 		fmd_module_unlock(cip->ci_mod);
1420 		fmd_case_hold(cp);
1421 	}
1422 
1423 	/*
1424 	 * If a proxied case finishes CLOSE_WAIT, then it can be discarded
1425 	 * rather than orphaned because by definition it can have no entries
1426 	 * in the resource cache of the current fault manager.
1427 	 */
1428 	if (cip->ci_xprt != NULL)
1429 		fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid);
1430 	else if (cip->ci_flags & FMD_CF_REPAIRED)
1431 		fmd_case_transition(cp, FMD_CASE_REPAIRED, 0);
1432 	else if (cip->ci_flags & FMD_CF_ISOLATED)
1433 		fmd_case_transition(cp, FMD_CASE_CLOSED, 0);
1434 
1435 	fmd_case_rele(cp);
1436 }
1437 
1438 void
1439 fmd_case_discard(fmd_case_t *cp)
1440 {
1441 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1442 
1443 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1444 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--;
1445 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1446 
1447 	ASSERT(fmd_module_locked(cip->ci_mod));
1448 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1449 	fmd_case_rele(cp);
1450 }
1451 
1452 /*
1453  * Indicate that the problem corresponding to a case has been repaired by
1454  * clearing the faulty bit on each ASRU named as a suspect.  If the case hasn't
1455  * already been closed, this function initiates the transition to CLOSE_WAIT.
1456  * The caller must have the case held from fmd_case_hash_lookup(), so we can
1457  * grab and drop ci_lock without the case being able to be freed in between.
1458  */
1459 int
1460 fmd_case_repair(fmd_case_t *cp)
1461 {
1462 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1463 	uint_t cstate;
1464 
1465 	(void) pthread_mutex_lock(&cip->ci_lock);
1466 	cstate = cip->ci_state;
1467 
1468 	if (cip->ci_xprt != NULL) {
1469 		(void) pthread_mutex_unlock(&cip->ci_lock);
1470 		return (fmd_set_errno(EFMD_CASE_OWNER));
1471 	}
1472 
1473 	if (cstate < FMD_CASE_SOLVED) {
1474 		(void) pthread_mutex_unlock(&cip->ci_lock);
1475 		return (fmd_set_errno(EFMD_CASE_STATE));
1476 	}
1477 
1478 	if (cip->ci_flags & FMD_CF_REPAIRED) {
1479 		(void) pthread_mutex_unlock(&cip->ci_lock);
1480 		return (0); /* already repaired */
1481 	}
1482 
1483 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repair, NULL);
1484 	(void) pthread_mutex_unlock(&cip->ci_lock);
1485 
1486 	if (cstate == FMD_CASE_CLOSED)
1487 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1488 	else
1489 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1490 
1491 	return (0);
1492 }
1493 
1494 int
1495 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep)
1496 {
1497 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1498 	fmd_case_item_t *cit;
1499 	uint_t state;
1500 	int rv = 0;
1501 
1502 	(void) pthread_mutex_lock(&cip->ci_lock);
1503 
1504 	if (cip->ci_state >= FMD_CASE_SOLVED)
1505 		state = FMD_EVS_DIAGNOSED;
1506 	else
1507 		state = FMD_EVS_ACCEPTED;
1508 
1509 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1510 		if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0)
1511 			break;
1512 	}
1513 
1514 	if (rv == 0 && cip->ci_principal != NULL)
1515 		rv = fmd_event_equal(ep, cip->ci_principal);
1516 
1517 	(void) pthread_mutex_unlock(&cip->ci_lock);
1518 
1519 	if (rv != 0)
1520 		fmd_event_transition(ep, state);
1521 
1522 	return (rv);
1523 }
1524 
1525 int
1526 fmd_case_orphaned(fmd_case_t *cp)
1527 {
1528 	return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod);
1529 }
1530 
1531 void
1532 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec)
1533 {
1534 	((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec;
1535 	((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec;
1536 	((fmd_case_impl_t *)cp)->ci_tv_valid = 1;
1537 }
1538