xref: /titanic_50/usr/src/cmd/fm/fmd/common/fmd_case.c (revision b249c65cf0a7400e86a36ddab5c3fce085809859)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * FMD Case Subsystem
31  *
32  * Diagnosis engines are expected to group telemetry events related to the
33  * diagnosis of a particular problem on the system into a set of cases.  The
34  * diagnosis engine may have any number of cases open at a given point in time.
35  * Some cases may eventually be *solved* by associating a suspect list of one
36  * or more problems with the case, at which point fmd publishes a list.suspect
37  * event for the case and it becomes visible to administrators and agents.
38  *
39  * Every case is named using a UUID, and is globally visible in the case hash.
40  * Cases are reference-counted, except for the reference from the case hash
41  * itself.  Consumers of case references include modules, which store active
42  * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
43  *
44  * Cases obey the following state machine.  In states UNSOLVED, SOLVED, and
45  * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
46  * or transport) and the case is referenced by the mod_cases list.  Once the
47  * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
48  * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
49  *
50  *			+------------+
51  *	     +----------|  UNSOLVED  |
52  *	     |		+------------+
53  *	   1 |	             4 |
54  *           |                 |
55  *	+----v---+ /-2->+------v-----+	  3	+--------+
56  *      | SOLVED |<     | CLOSE_WAIT |--------->| CLOSED |
57  *	+--------+ \-5->+------------+		+--------+
58  *	                       |                    |
59  *                           6 |                    | 7
60  *      		+------v-----+              |
61  *	                |  REPAIRED  |<-------------+
62  *			+------------+
63  *
64  * The state machine changes are triggered by calls to fmd_case_transition()
65  * from various locations inside of fmd, as described below:
66  *
67  * [1] Called by: fmd_case_solve()
68  *       Actions: FMD_CF_SOLVED flag is set in ci_flags
69  *                conviction policy is applied to suspect list
70  *                suspects convicted are marked faulty (F) in R$
71  *                list.suspect event logged and dispatched
72  *
73  * [2] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
74  *       Actions: FMD_CF_ISOLATED flag is set in ci_flags
75  *                suspects convicted (F) are marked unusable (U) in R$
76  *                diagnosis engine fmdo_close() entry point scheduled
77  *                case transitions to CLOSED [3] upon exit from CLOSE_WAIT
78  *
79  * [3] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
80  *       Actions: list.isolated event dispatched
81  *                case deleted from module's list of open cases
82  *
83  * [4] Called by: fmd_case_close(), fmd_case_uuclose()
84  *       Actions: diagnosis engine fmdo_close() entry point scheduled
85  *                case is subsequently discarded by fmd_case_delete()
86  *
87  * [5] Called by: fmd_case_repair(), fmd_case_update()
88  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
89  *                diagnosis engine fmdo_close() entry point scheduled
90  *                case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
91  *
92  * [6] Called by: fmd_case_repair(), fmd_case_update()
93  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
94  *                suspects convicted are marked non faulty (!F) in R$
95  *                list.repaired event dispatched
96  *
97  * [7] Called by: fmd_case_repair(), fmd_case_update()
98  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
99  *                suspects convicted are marked non faulty (!F) in R$
100  *                list.repaired event dispatched
101  */
102 
103 #include <sys/fm/protocol.h>
104 #include <uuid/uuid.h>
105 #include <alloca.h>
106 
107 #include <fmd_alloc.h>
108 #include <fmd_module.h>
109 #include <fmd_error.h>
110 #include <fmd_conf.h>
111 #include <fmd_case.h>
112 #include <fmd_string.h>
113 #include <fmd_subr.h>
114 #include <fmd_protocol.h>
115 #include <fmd_event.h>
116 #include <fmd_eventq.h>
117 #include <fmd_dispq.h>
118 #include <fmd_buf.h>
119 #include <fmd_log.h>
120 #include <fmd_asru.h>
121 #include <fmd_fmri.h>
122 #include <fmd_xprt.h>
123 
124 #include <fmd.h>
125 
/*
 * Printable names for the case states.  Each entry is annotated with the
 * FMD_CASE_* state it names; presumably the table is indexed by that state
 * value -- TODO confirm against the FMD_CASE_* definitions in fmd_case.h.
 */
static const char *const _fmd_case_snames[] = {
	"UNSOLVED",	/* FMD_CASE_UNSOLVED */
	"SOLVED",	/* FMD_CASE_SOLVED */
	"CLOSE_WAIT",	/* FMD_CASE_CLOSE_WAIT */
	"CLOSED",	/* FMD_CASE_CLOSED */
	"REPAIRED"	/* FMD_CASE_REPAIRED */
};
133 
134 extern volatile uint32_t fmd_asru_fake_not_present;
135 
136 static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *);
137 
138 fmd_case_hash_t *
139 fmd_case_hash_create(void)
140 {
141 	fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP);
142 
143 	(void) pthread_rwlock_init(&chp->ch_lock, NULL);
144 	chp->ch_hashlen = fmd.d_str_buckets;
145 	chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP);
146 	chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen,
147 	    FMD_SLEEP);
148 	chp->ch_count = 0;
149 
150 	return (chp);
151 }
152 
153 /*
154  * Destroy the case hash.  Unlike most of our hash tables, no active references
155  * are kept by the case hash itself; all references come from other subsystems.
156  * The hash must be destroyed after all modules are unloaded; if anything was
157  * present in the hash it would be by definition a reference count leak.
158  */
159 void
160 fmd_case_hash_destroy(fmd_case_hash_t *chp)
161 {
162 	fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen);
163 	fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen);
164 	fmd_free(chp, sizeof (fmd_case_hash_t));
165 }
166 
167 /*
168  * Take a snapshot of the case hash by placing an additional hold on each
169  * member in an auxiliary array, and then call 'func' for each case.
170  */
171 void
172 fmd_case_hash_apply(fmd_case_hash_t *chp,
173     void (*func)(fmd_case_t *, void *), void *arg)
174 {
175 	fmd_case_impl_t *cp, **cps, **cpp;
176 	uint_t cpc, i;
177 
178 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
179 
180 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
181 	cpc = chp->ch_count;
182 
183 	for (i = 0; i < chp->ch_hashlen; i++) {
184 		for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) {
185 			fmd_case_hold((fmd_case_t *)cp);
186 			*cpp++ = cp;
187 		}
188 	}
189 
190 	ASSERT(cpp == cps + cpc);
191 	(void) pthread_rwlock_unlock(&chp->ch_lock);
192 
193 	for (i = 0; i < cpc; i++) {
194 		func((fmd_case_t *)cps[i], arg);
195 		fmd_case_rele((fmd_case_t *)cps[i]);
196 	}
197 
198 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
199 }
200 
201 static void
202 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
203 {
204 	uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
205 
206 	cip->ci_code_next = chp->ch_code_hash[h];
207 	chp->ch_code_hash[h] = cip;
208 }
209 
210 static void
211 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
212 {
213 	fmd_case_impl_t **pp, *cp;
214 
215 	if (cip->ci_code) {
216 		uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
217 
218 		pp = &chp->ch_code_hash[h];
219 		for (cp = *pp; cp != NULL; cp = cp->ci_code_next) {
220 			if (cp != cip)
221 				pp = &cp->ci_code_next;
222 			else
223 				break;
224 		}
225 		if (cp != NULL) {
226 			*pp = cp->ci_code_next;
227 			cp->ci_code_next = NULL;
228 		}
229 	}
230 }
231 
232 /*
233  * Look up the diagcode for this case and cache it in ci_code.  If no suspects
234  * were defined for this case or if the lookup fails, the event dictionary or
235  * module code is broken, and we set the event code to a precomputed default.
236  */
237 static const char *
238 fmd_case_mkcode(fmd_case_t *cp)
239 {
240 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
241 	fmd_case_susp_t *cis;
242 	fmd_case_hash_t *chp = fmd.d_cases;
243 
244 	char **keys, **keyp;
245 	const char *s;
246 
247 	ASSERT(MUTEX_HELD(&cip->ci_lock));
248 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
249 
250 	/*
251 	 * delete any existing entry from code hash if it is on it
252 	 */
253 	fmd_case_code_hash_delete(chp, cip);
254 
255 	fmd_free(cip->ci_code, cip->ci_codelen);
256 	cip->ci_codelen = cip->ci_mod->mod_codelen;
257 	cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
258 	keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1));
259 
260 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
261 		if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0)
262 			keyp++;
263 	}
264 
265 	*keyp = NULL; /* mark end of keys[] array for libdiagcode */
266 
267 	if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code(
268 	    cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) {
269 		(void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s);
270 		fmd_free(cip->ci_code, cip->ci_codelen);
271 		cip->ci_codelen = strlen(s) + 1;
272 		cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
273 		(void) strcpy(cip->ci_code, s);
274 	}
275 
276 	/*
277 	 * add into hash of solved cases
278 	 */
279 	fmd_case_code_hash_insert(chp, cip);
280 
281 	return (cip->ci_code);
282 }
283 
/*
 * Argument block handed to fmd_case_set_lst() while the suspect arrays for a
 * list.* event are being assembled by fmd_case_mkevent().
 */
typedef struct {
	int	*fcl_countp;	/* entries stored so far; index of next slot */
	uint8_t *fcl_ba;	/* per-entry FM_SUSPECT_* status bits */
	nvlist_t **fcl_nva;	/* per-entry fault event nvlists */
	int	*fcl_msgp;	/* set to B_FALSE if any entry declines msgs */
} fmd_case_lst_t;
290 
291 static void
292 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg)
293 {
294 	fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg;
295 	boolean_t b;
296 	int state;
297 
298 	if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE,
299 	    &b) == 0 && b == B_FALSE)
300 		*entryp->fcl_msgp = B_FALSE;
301 	entryp->fcl_ba[*entryp->fcl_countp] = 0;
302 	state = fmd_asru_al_getstate(alp);
303 	if (state & FMD_ASRU_UNUSABLE)
304 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE;
305 	if (state & FMD_ASRU_FAULTY)
306 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY;
307 	if (!(state & FMD_ASRU_PRESENT))
308 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT;
309 	entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event;
310 	(*entryp->fcl_countp)++;
311 }
312 
313 static void
314 fmd_case_faulty(fmd_asru_link_t *alp, void *arg)
315 {
316 	int *faultyp = (int *)arg;
317 
318 	*faultyp |= (alp->al_flags & FMD_ASRU_FAULTY);
319 }
320 
321 static void
322 fmd_case_usable(fmd_asru_link_t *alp, void *arg)
323 {
324 	int *usablep = (int *)arg;
325 
326 	*usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE);
327 }
328 
329 nvlist_t *
330 fmd_case_mkevent(fmd_case_t *cp, const char *class)
331 {
332 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
333 	nvlist_t **nva, *nvl;
334 	uint8_t *ba;
335 	int msg = B_TRUE;
336 	const char *code;
337 	fmd_case_lst_t fcl;
338 	int count = 0;
339 
340 	(void) pthread_mutex_lock(&cip->ci_lock);
341 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
342 
343 	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
344 	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
345 
346 	/*
347 	 * For each suspect associated with the case, store its fault event
348 	 * nvlist in 'nva'.  We also look to see if any of the suspect faults
349 	 * have asked not to be messaged.  If any of them have made such a
350 	 * request, propagate that attribute to the composite list.* event.
351 	 * Finally, store each suspect's faulty status into the bitmap 'ba'.
352 	 */
353 	fcl.fcl_countp = &count;
354 	fcl.fcl_msgp = &msg;
355 	fcl.fcl_ba = ba;
356 	fcl.fcl_nva = nva;
357 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
358 
359 	if (cip->ci_code == NULL)
360 		(void) fmd_case_mkcode(cp);
361 	/*
362 	 * For repair event, we lookup diagcode from dict using key
363 	 * "list.repaired".
364 	 */
365 	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
366 		(void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code);
367 	else
368 		code = cip->ci_code;
369 
370 	if (msg == B_FALSE)
371 		cip->ci_flags |= FMD_CF_INVISIBLE;
372 
373 	nvl = fmd_protocol_list(class, cip->ci_mod->mod_fmri, cip->ci_uuid,
374 	    code, count, nva, ba, msg, &cip->ci_tv);
375 
376 	(void) pthread_mutex_unlock(&cip->ci_lock);
377 	return (nvl);
378 }
379 
380 static boolean_t
381 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem)
382 {
383 	nvlist_t *new_rsrc;
384 	nvlist_t *rsrc;
385 	char *new_name = NULL;
386 	char *name = NULL;
387 	ssize_t new_namelen;
388 	ssize_t namelen;
389 	int fmri_present = 1;
390 	int new_fmri_present = 1;
391 	int match = B_FALSE;
392 	fmd_topo_t *ftp = fmd_topo_hold();
393 
394 	if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0)
395 		fmri_present = 0;
396 	else {
397 		if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1)
398 			goto done;
399 		name = fmd_alloc(namelen + 1, FMD_SLEEP);
400 		if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1)
401 			goto done;
402 	}
403 	if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0)
404 		new_fmri_present = 0;
405 	else {
406 		if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1)
407 			goto done;
408 		new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP);
409 		if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1)
410 			goto done;
411 	}
412 	match = (fmri_present == new_fmri_present &&
413 	    (fmri_present == 0 ||
414 	    topo_fmri_strcmp(ftp->ft_hdl, name, new_name)));
415 done:
416 	if (name != NULL)
417 		fmd_free(name, namelen + 1);
418 	if (new_name != NULL)
419 		fmd_free(new_name, new_namelen + 1);
420 	fmd_topo_rele(ftp);
421 	return (match);
422 }
423 
424 static int
425 fmd_case_match_suspect(fmd_case_susp_t *cis, fmd_case_susp_t *xcis)
426 {
427 	char *class, *new_class;
428 
429 	if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_ASRU))
430 		return (0);
431 	if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl,
432 	    FM_FAULT_RESOURCE))
433 		return (0);
434 	if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_FRU))
435 		return (0);
436 	(void) nvlist_lookup_string(xcis->cis_nvl, FM_CLASS, &class);
437 	(void) nvlist_lookup_string(cis->cis_nvl, FM_CLASS, &new_class);
438 	return (strcmp(class, new_class) == 0);
439 }
440 
441 /*
442  * see if an identical suspect list already exists in the cache
443  */
444 static int
445 fmd_case_check_for_dups(fmd_case_t *cp)
446 {
447 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp, *xcip;
448 	fmd_case_hash_t *chp = fmd.d_cases;
449 	fmd_case_susp_t *xcis, *cis;
450 	int match = 0, match_susp;
451 	uint_t h;
452 
453 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
454 
455 	/*
456 	 * Find all cases with this code
457 	 */
458 	h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
459 	for (xcip = chp->ch_code_hash[h]; xcip != NULL;
460 	    xcip = xcip->ci_code_next) {
461 		/*
462 		 * only look for any cases (apart from this one)
463 		 * whose code and number of suspects match
464 		 */
465 		if (xcip == cip || strcmp(xcip->ci_code, cip->ci_code) != 0 ||
466 		    xcip->ci_nsuspects != cip->ci_nsuspects)
467 			continue;
468 
469 		/*
470 		 * For each suspect in one list, check if there
471 		 * is an identical suspect in the other list
472 		 */
473 		match = 1;
474 		fmd_case_hold((fmd_case_t *)xcip);
475 		for (xcis = xcip->ci_suspects; xcis != NULL;
476 		    xcis = xcis->cis_next) {
477 			match_susp = 0;
478 			for (cis = cip->ci_suspects; cis != NULL;
479 			    cis = cis->cis_next) {
480 				if (fmd_case_match_suspect(cis, xcis) == 1) {
481 					match_susp = 1;
482 					break;
483 				}
484 			}
485 			if (match_susp == 0) {
486 				match = 0;
487 				break;
488 			}
489 		}
490 		fmd_case_rele((fmd_case_t *)xcip);
491 		if (match) {
492 			(void) pthread_rwlock_unlock(&chp->ch_lock);
493 			return (1);
494 		}
495 	}
496 	(void) pthread_rwlock_unlock(&chp->ch_lock);
497 	return (0);
498 }
499 
500 /*
501  * Convict suspects in a case by applying a conviction policy and updating the
502  * resource cache prior to emitting the list.suspect event for the given case.
503  * At present, our policy is very simple: convict every suspect in the case.
504  * In the future, this policy can be extended and made configurable to permit:
505  *
506  * - convicting the suspect with the highest FIT rate
507  * - convicting the suspect with the cheapest FRU
508  * - convicting the suspect with the FRU that is in a depot's inventory
509  * - convicting the suspect with the longest lifetime
510  *
511  * and so forth.  A word to the wise: this problem is significantly harder that
512  * it seems at first glance.  Future work should heed the following advice:
513  *
514  * Hacking the policy into C code here is a very bad idea.  The policy needs to
515  * be decided upon very carefully and fundamentally encodes knowledge of what
516  * suspect list combinations can be emitted by what diagnosis engines.  As such
517  * fmd's code is the wrong location, because that would require fmd itself to
518  * be updated for every diagnosis engine change, defeating the entire design.
519  * The FMA Event Registry knows the suspect list combinations: policy inputs
520  * can be derived from it and used to produce per-module policy configuration.
521  *
522  * If the policy needs to be dynamic and not statically fixed at either fmd
523  * startup or module load time, any implementation of dynamic policy retrieval
524  * must employ some kind of caching mechanism or be part of a built-in module.
525  * The fmd_case_convict() function is called with locks held inside of fmd and
526  * is not a place where unbounded blocking on some inter-process or inter-
527  * system communication to another service (e.g. another daemon) can occur.
528  */
529 static int
530 fmd_case_convict(fmd_case_t *cp)
531 {
532 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
533 	fmd_asru_hash_t *ahp = fmd.d_asrus;
534 
535 	fmd_case_susp_t *cis;
536 	fmd_asru_link_t *alp;
537 
538 	(void) pthread_mutex_lock(&cip->ci_lock);
539 	(void) fmd_case_mkcode(cp);
540 	if (fmd_case_check_for_dups(cp) == 1) {
541 		(void) pthread_mutex_unlock(&cip->ci_lock);
542 		return (1);
543 	}
544 
545 	/*
546 	 * no suspect list already exists  - allocate new cache entries
547 	 */
548 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
549 		if ((alp = fmd_asru_hash_create_entry(ahp,
550 		    cp, cis->cis_nvl)) == NULL) {
551 			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
552 			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
553 			continue;
554 		}
555 		(void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE);
556 		(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
557 	}
558 
559 	(void) pthread_mutex_unlock(&cip->ci_lock);
560 	return (0);
561 }
562 
563 void
564 fmd_case_publish(fmd_case_t *cp, uint_t state)
565 {
566 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
567 	fmd_event_t *e;
568 	nvlist_t *nvl;
569 	char *class;
570 
571 	if (state == FMD_CASE_CURRENT)
572 		state = cip->ci_state; /* use current state */
573 
574 	switch (state) {
575 	case FMD_CASE_SOLVED:
576 		(void) pthread_mutex_lock(&cip->ci_lock);
577 		if (cip->ci_tv_valid == 0) {
578 			fmd_time_gettimeofday(&cip->ci_tv);
579 			cip->ci_tv_valid = 1;
580 		}
581 		(void) pthread_mutex_unlock(&cip->ci_lock);
582 
583 		if (fmd_case_convict(cp) == 1) { /* dupclose */
584 			cip->ci_flags &= ~FMD_CF_SOLVED;
585 			fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0);
586 			break;
587 		}
588 		nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
589 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
590 
591 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
592 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
593 		fmd_log_append(fmd.d_fltlog, e, cp);
594 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
595 		fmd_dispq_dispatch(fmd.d_disp, e, class);
596 
597 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
598 		cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
599 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
600 
601 		break;
602 
603 	case FMD_CASE_CLOSE_WAIT:
604 		fmd_case_hold(cp);
605 		e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
606 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
607 
608 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
609 		cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
610 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
611 
612 		break;
613 
614 	case FMD_CASE_CLOSED:
615 		nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
616 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
617 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
618 		fmd_dispq_dispatch(fmd.d_disp, e, class);
619 		break;
620 
621 	case FMD_CASE_REPAIRED:
622 		nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
623 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
624 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
625 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
626 		fmd_log_append(fmd.d_fltlog, e, cp);
627 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
628 		fmd_dispq_dispatch(fmd.d_disp, e, class);
629 		break;
630 	}
631 }
632 
633 fmd_case_t *
634 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
635 {
636 	fmd_case_impl_t *cip;
637 	uint_t h;
638 
639 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
640 	h = fmd_strhash(uuid) % chp->ch_hashlen;
641 
642 	for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
643 		if (strcmp(cip->ci_uuid, uuid) == 0)
644 			break;
645 	}
646 
647 	/*
648 	 * If deleting bit is set, treat the case as if it doesn't exist.
649 	 */
650 	if (cip != NULL)
651 		cip = fmd_case_tryhold(cip);
652 
653 	if (cip == NULL)
654 		(void) fmd_set_errno(EFMD_CASE_INVAL);
655 
656 	(void) pthread_rwlock_unlock(&chp->ch_lock);
657 	return ((fmd_case_t *)cip);
658 }
659 
660 static fmd_case_impl_t *
661 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
662 {
663 	fmd_case_impl_t *eip;
664 	uint_t h;
665 
666 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
667 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
668 
669 	for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
670 		if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 &&
671 		    fmd_case_tryhold(eip) != NULL) {
672 			(void) pthread_rwlock_unlock(&chp->ch_lock);
673 			return (eip); /* uuid already present */
674 		}
675 	}
676 
677 	cip->ci_next = chp->ch_hash[h];
678 	chp->ch_hash[h] = cip;
679 
680 	chp->ch_count++;
681 	ASSERT(chp->ch_count != 0);
682 
683 	(void) pthread_rwlock_unlock(&chp->ch_lock);
684 	return (cip);
685 }
686 
687 static void
688 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
689 {
690 	fmd_case_impl_t *cp, **pp;
691 	uint_t h;
692 
693 	ASSERT(MUTEX_HELD(&cip->ci_lock));
694 
695 	cip->ci_flags |= FMD_CF_DELETING;
696 	(void) pthread_mutex_unlock(&cip->ci_lock);
697 
698 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
699 
700 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
701 	pp = &chp->ch_hash[h];
702 
703 	for (cp = *pp; cp != NULL; cp = cp->ci_next) {
704 		if (cp != cip)
705 			pp = &cp->ci_next;
706 		else
707 			break;
708 	}
709 
710 	if (cp == NULL) {
711 		fmd_panic("case %p (%s) not found on hash chain %u\n",
712 		    (void *)cip, cip->ci_uuid, h);
713 	}
714 
715 	*pp = cp->ci_next;
716 	cp->ci_next = NULL;
717 
718 	/*
719 	 * delete from code hash if it is on it
720 	 */
721 	fmd_case_code_hash_delete(chp, cip);
722 
723 	ASSERT(chp->ch_count != 0);
724 	chp->ch_count--;
725 
726 	(void) pthread_rwlock_unlock(&chp->ch_lock);
727 
728 	(void) pthread_mutex_lock(&cip->ci_lock);
729 	ASSERT(cip->ci_flags & FMD_CF_DELETING);
730 }
731 
732 fmd_case_t *
733 fmd_case_create(fmd_module_t *mp, void *data)
734 {
735 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
736 	fmd_case_impl_t *eip = NULL;
737 	uuid_t uuid;
738 
739 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
740 	fmd_buf_hash_create(&cip->ci_bufs);
741 
742 	fmd_module_hold(mp);
743 	cip->ci_mod = mp;
744 	cip->ci_refs = 1;
745 	cip->ci_state = FMD_CASE_UNSOLVED;
746 	cip->ci_flags = FMD_CF_DIRTY;
747 	cip->ci_data = data;
748 
749 	/*
750 	 * Calling libuuid: get a clue.  The library interfaces cleverly do not
751 	 * define any constant for the length of an unparse string, and do not
752 	 * permit the caller to specify a buffer length for safety.  The spec
753 	 * says it will be 36 bytes, but we make it tunable just in case.
754 	 */
755 	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
756 	cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);
757 
758 	/*
759 	 * We expect this loop to execute only once, but code it defensively
760 	 * against the possibility of libuuid bugs.  Keep generating uuids and
761 	 * attempting to do a hash insert until we get a unique one.
762 	 */
763 	do {
764 		if (eip != NULL)
765 			fmd_case_rele((fmd_case_t *)eip);
766 		uuid_generate(uuid);
767 		uuid_unparse(uuid, cip->ci_uuid);
768 	} while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);
769 
770 	ASSERT(fmd_module_locked(mp));
771 	fmd_list_append(&mp->mod_cases, cip);
772 	fmd_module_setcdirty(mp);
773 
774 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
775 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
776 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
777 
778 	return ((fmd_case_t *)cip);
779 }
780 
781 static void
782 fmd_case_destroy_suspects(fmd_case_impl_t *cip)
783 {
784 	fmd_case_susp_t *cis, *ncis;
785 
786 	ASSERT(MUTEX_HELD(&cip->ci_lock));
787 
788 	for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
789 		ncis = cis->cis_next;
790 		nvlist_free(cis->cis_nvl);
791 		fmd_free(cis, sizeof (fmd_case_susp_t));
792 	}
793 
794 	cip->ci_suspects = NULL;
795 	cip->ci_nsuspects = 0;
796 }
797 
798 fmd_case_t *
799 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
800     uint_t state, const char *uuid, const char *code)
801 {
802 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
803 	fmd_case_impl_t *eip;
804 
805 	ASSERT(state < FMD_CASE_REPAIRED);
806 
807 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
808 	fmd_buf_hash_create(&cip->ci_bufs);
809 
810 	fmd_module_hold(mp);
811 	cip->ci_mod = mp;
812 	cip->ci_xprt = xp;
813 	cip->ci_refs = 1;
814 	cip->ci_state = state;
815 	cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
816 	cip->ci_uuidlen = strlen(cip->ci_uuid);
817 	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
818 	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;
819 
820 	if (state > FMD_CASE_CLOSE_WAIT)
821 		cip->ci_flags |= FMD_CF_SOLVED;
822 
823 	/*
824 	 * Insert the case into the global case hash.  If the specified UUID is
825 	 * already present, check to see if it is an orphan: if so, reclaim it;
826 	 * otherwise if it is owned by a different module then return NULL.
827 	 */
828 	if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
829 		(void) pthread_mutex_lock(&cip->ci_lock);
830 		cip->ci_refs--; /* decrement to zero */
831 		fmd_case_destroy((fmd_case_t *)cip, B_FALSE);
832 
833 		cip = eip; /* switch 'cip' to the existing case */
834 		(void) pthread_mutex_lock(&cip->ci_lock);
835 
836 		/*
837 		 * If the ASRU cache is trying to recreate an orphan, then just
838 		 * return the existing case that we found without changing it.
839 		 */
840 		if (mp == fmd.d_rmod) {
841 			(void) pthread_mutex_unlock(&cip->ci_lock);
842 			fmd_case_rele((fmd_case_t *)cip);
843 			return ((fmd_case_t *)cip);
844 		}
845 
846 		/*
847 		 * If the existing case isn't an orphan or is being proxied,
848 		 * then we have a UUID conflict: return failure to the caller.
849 		 */
850 		if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
851 			(void) pthread_mutex_unlock(&cip->ci_lock);
852 			fmd_case_rele((fmd_case_t *)cip);
853 			return (NULL);
854 		}
855 
856 		/*
857 		 * If the new module is reclaiming an orphaned case, remove
858 		 * the case from the root module, switch ci_mod, and then fall
859 		 * through to adding the case to the new owner module 'mp'.
860 		 */
861 		fmd_module_lock(cip->ci_mod);
862 		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
863 		fmd_module_unlock(cip->ci_mod);
864 
865 		fmd_module_rele(cip->ci_mod);
866 		cip->ci_mod = mp;
867 		fmd_module_hold(mp);
868 
869 		fmd_case_destroy_suspects(cip);
870 		cip->ci_state = state;
871 
872 		(void) pthread_mutex_unlock(&cip->ci_lock);
873 		fmd_case_rele((fmd_case_t *)cip);
874 	} else {
875 		/*
876 		 * add into hash of solved cases
877 		 */
878 		if (cip->ci_code)
879 			fmd_case_code_hash_insert(fmd.d_cases, cip);
880 	}
881 
882 	ASSERT(fmd_module_locked(mp));
883 	fmd_list_append(&mp->mod_cases, cip);
884 
885 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
886 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
887 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
888 
889 	return ((fmd_case_t *)cip);
890 }
891 
892 void
893 fmd_case_destroy(fmd_case_t *cp, int visible)
894 {
895 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
896 	fmd_case_item_t *cit, *ncit;
897 
898 	ASSERT(MUTEX_HELD(&cip->ci_lock));
899 	ASSERT(cip->ci_refs == 0);
900 
901 	if (visible) {
902 		TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
903 		fmd_case_hash_delete(fmd.d_cases, cip);
904 	}
905 
906 	for (cit = cip->ci_items; cit != NULL; cit = ncit) {
907 		ncit = cit->cit_next;
908 		fmd_event_rele(cit->cit_event);
909 		fmd_free(cit, sizeof (fmd_case_item_t));
910 	}
911 
912 	fmd_case_destroy_suspects(cip);
913 
914 	if (cip->ci_principal != NULL)
915 		fmd_event_rele(cip->ci_principal);
916 
917 	fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
918 	fmd_free(cip->ci_code, cip->ci_codelen);
919 	(void) fmd_buf_hash_destroy(&cip->ci_bufs);
920 
921 	fmd_module_rele(cip->ci_mod);
922 	fmd_free(cip, sizeof (fmd_case_impl_t));
923 }
924 
925 void
926 fmd_case_hold(fmd_case_t *cp)
927 {
928 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
929 
930 	(void) pthread_mutex_lock(&cip->ci_lock);
931 	fmd_case_hold_locked(cp);
932 	(void) pthread_mutex_unlock(&cip->ci_lock);
933 }
934 
935 void
936 fmd_case_hold_locked(fmd_case_t *cp)
937 {
938 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
939 
940 	ASSERT(MUTEX_HELD(&cip->ci_lock));
941 	if (cip->ci_flags & FMD_CF_DELETING)
942 		fmd_panic("attempt to hold a deleting case %p (%s)\n",
943 		    (void *)cip, cip->ci_uuid);
944 	cip->ci_refs++;
945 	ASSERT(cip->ci_refs != 0);
946 }
947 
948 static fmd_case_impl_t *
949 fmd_case_tryhold(fmd_case_impl_t *cip)
950 {
951 	/*
952 	 * If the case's "deleting" bit is unset, hold and return case,
953 	 * otherwise, return NULL.
954 	 */
955 	(void) pthread_mutex_lock(&cip->ci_lock);
956 	if (cip->ci_flags & FMD_CF_DELETING) {
957 		(void) pthread_mutex_unlock(&cip->ci_lock);
958 		cip = NULL;
959 	} else {
960 		fmd_case_hold_locked((fmd_case_t *)cip);
961 		(void) pthread_mutex_unlock(&cip->ci_lock);
962 	}
963 	return (cip);
964 }
965 
966 void
967 fmd_case_rele(fmd_case_t *cp)
968 {
969 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
970 
971 	(void) pthread_mutex_lock(&cip->ci_lock);
972 	ASSERT(cip->ci_refs != 0);
973 
974 	if (--cip->ci_refs == 0)
975 		fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
976 	else
977 		(void) pthread_mutex_unlock(&cip->ci_lock);
978 }
979 
980 void
981 fmd_case_rele_locked(fmd_case_t *cp)
982 {
983 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
984 
985 	ASSERT(MUTEX_HELD(&cip->ci_lock));
986 	--cip->ci_refs;
987 	ASSERT(cip->ci_refs != 0);
988 }
989 
990 int
991 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep)
992 {
993 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
994 	fmd_case_item_t *cit;
995 	fmd_event_t *oep;
996 	uint_t state;
997 	int new;
998 
999 	fmd_event_hold(ep);
1000 	(void) pthread_mutex_lock(&cip->ci_lock);
1001 
1002 	if (cip->ci_flags & FMD_CF_SOLVED)
1003 		state = FMD_EVS_DIAGNOSED;
1004 	else
1005 		state = FMD_EVS_ACCEPTED;
1006 
1007 	oep = cip->ci_principal;
1008 	cip->ci_principal = ep;
1009 
1010 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1011 		if (cit->cit_event == ep)
1012 			break;
1013 	}
1014 
1015 	cip->ci_flags |= FMD_CF_DIRTY;
1016 	new = cit == NULL && ep != oep;
1017 
1018 	(void) pthread_mutex_unlock(&cip->ci_lock);
1019 
1020 	fmd_module_setcdirty(cip->ci_mod);
1021 	fmd_event_transition(ep, state);
1022 
1023 	if (oep != NULL)
1024 		fmd_event_rele(oep);
1025 
1026 	return (new);
1027 }
1028 
1029 int
1030 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep)
1031 {
1032 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1033 	fmd_case_item_t *cit;
1034 	uint_t state;
1035 	int new;
1036 
1037 	(void) pthread_mutex_lock(&cip->ci_lock);
1038 
1039 	if (cip->ci_flags & FMD_CF_SOLVED)
1040 		state = FMD_EVS_DIAGNOSED;
1041 	else
1042 		state = FMD_EVS_ACCEPTED;
1043 
1044 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1045 		if (cit->cit_event == ep)
1046 			break;
1047 	}
1048 
1049 	new = cit == NULL && ep != cip->ci_principal;
1050 
1051 	/*
1052 	 * If the event is already in the case or the case is already solved,
1053 	 * there is no reason to save it: just transition it appropriately.
1054 	 */
1055 	if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
1056 		(void) pthread_mutex_unlock(&cip->ci_lock);
1057 		fmd_event_transition(ep, state);
1058 		return (new);
1059 	}
1060 
1061 	cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
1062 	fmd_event_hold(ep);
1063 
1064 	cit->cit_next = cip->ci_items;
1065 	cit->cit_event = ep;
1066 
1067 	cip->ci_items = cit;
1068 	cip->ci_nitems++;
1069 
1070 	cip->ci_flags |= FMD_CF_DIRTY;
1071 	(void) pthread_mutex_unlock(&cip->ci_lock);
1072 
1073 	fmd_module_setcdirty(cip->ci_mod);
1074 	fmd_event_transition(ep, state);
1075 
1076 	return (new);
1077 }
1078 
1079 void
1080 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
1081 {
1082 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1083 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1084 
1085 	(void) pthread_mutex_lock(&cip->ci_lock);
1086 	ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT);
1087 	cip->ci_flags |= FMD_CF_DIRTY;
1088 
1089 	cis->cis_next = cip->ci_suspects;
1090 	cis->cis_nvl = nvl;
1091 
1092 	cip->ci_suspects = cis;
1093 	cip->ci_nsuspects++;
1094 
1095 	(void) pthread_mutex_unlock(&cip->ci_lock);
1096 	fmd_module_setcdirty(cip->ci_mod);
1097 }
1098 
1099 void
1100 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
1101 {
1102 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1103 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1104 	boolean_t b;
1105 
1106 	(void) pthread_mutex_lock(&cip->ci_lock);
1107 	ASSERT(cip->ci_state == FMD_CASE_CLOSED);
1108 	ASSERT(cip->ci_mod == fmd.d_rmod);
1109 
1110 	cis->cis_next = cip->ci_suspects;
1111 	cis->cis_nvl = nvl;
1112 
1113 	if (nvlist_lookup_boolean_value(nvl,
1114 	    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
1115 		cip->ci_flags |= FMD_CF_INVISIBLE;
1116 
1117 	cip->ci_suspects = cis;
1118 	cip->ci_nsuspects++;
1119 
1120 	(void) pthread_mutex_unlock(&cip->ci_lock);
1121 }
1122 
1123 void
1124 fmd_case_reset_suspects(fmd_case_t *cp)
1125 {
1126 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1127 
1128 	(void) pthread_mutex_lock(&cip->ci_lock);
1129 	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
1130 
1131 	fmd_case_destroy_suspects(cip);
1132 	cip->ci_flags |= FMD_CF_DIRTY;
1133 
1134 	(void) pthread_mutex_unlock(&cip->ci_lock);
1135 	fmd_module_setcdirty(cip->ci_mod);
1136 }
1137 
1138 /*ARGSUSED*/
1139 static void
1140 fmd_case_unusable(fmd_asru_link_t *alp, void *arg)
1141 {
1142 	(void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE);
1143 }
1144 
1145 /*
1146  * Grab ci_lock and update the case state and set the dirty bit.  Then perform
1147  * whatever actions and emit whatever events are appropriate for the state.
1148  * Refer to the topmost block comment explaining the state machine for details.
1149  */
1150 void
1151 fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
1152 {
1153 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1154 	fmd_case_item_t *cit;
1155 	fmd_event_t *e;
1156 
1157 	ASSERT(state <= FMD_CASE_REPAIRED);
1158 	(void) pthread_mutex_lock(&cip->ci_lock);
1159 
1160 	if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED))
1161 		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED);
1162 
1163 	cip->ci_flags |= flags;
1164 
1165 	if (cip->ci_state >= state) {
1166 		(void) pthread_mutex_unlock(&cip->ci_lock);
1167 		return; /* already in specified state */
1168 	}
1169 
1170 	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
1171 	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));
1172 
1173 	cip->ci_state = state;
1174 	cip->ci_flags |= FMD_CF_DIRTY;
1175 
1176 	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
1177 		fmd_module_setcdirty(cip->ci_mod);
1178 
1179 	switch (state) {
1180 	case FMD_CASE_SOLVED:
1181 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
1182 			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);
1183 
1184 		if (cip->ci_principal != NULL) {
1185 			fmd_event_transition(cip->ci_principal,
1186 			    FMD_EVS_DIAGNOSED);
1187 		}
1188 		break;
1189 
1190 	case FMD_CASE_CLOSE_WAIT:
1191 		/*
1192 		 * If the case was never solved, do not change ASRUs.
1193 		 * If the case was never fmd_case_closed, do not change ASRUs.
1194 		 * If the case was repaired, do not change ASRUs.
1195 		 */
1196 		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
1197 		    FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED))
1198 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1199 			    fmd_case_unusable, NULL);
1200 
1201 		/*
1202 		 * If an orphaned case transitions to CLOSE_WAIT, the owning
1203 		 * module is no longer loaded: continue on to CASE_CLOSED.
1204 		 */
1205 		if (fmd_case_orphaned(cp))
1206 			state = cip->ci_state = FMD_CASE_CLOSED;
1207 		break;
1208 
1209 	case FMD_CASE_REPAIRED:
1210 		ASSERT(fmd_case_orphaned(cp));
1211 		fmd_module_lock(cip->ci_mod);
1212 		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1213 		fmd_module_unlock(cip->ci_mod);
1214 		break;
1215 	}
1216 
1217 	(void) pthread_mutex_unlock(&cip->ci_lock);
1218 
1219 	/*
1220 	 * If the module has initialized, then publish the appropriate event
1221 	 * for the new case state.  If not, we are being called from the
1222 	 * checkpoint code during module load, in which case the module's
1223 	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
1224 	 * may not be open yet, which will prevent us from computing the event
1225 	 * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
1226 	 * event in our queue: this won't be processed until _fmd_init is done.
1227 	 */
1228 	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
1229 		fmd_case_publish(cp, state);
1230 	else {
1231 		fmd_case_hold(cp);
1232 		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
1233 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
1234 	}
1235 
1236 	/*
1237 	 * If we transitioned to REPAIRED, adjust the reference count to
1238 	 * reflect our removal from fmd.d_rmod->mod_cases.  If the caller has
1239 	 * not placed an additional hold on the case, it will now be freed.
1240 	 */
1241 	if (state == FMD_CASE_REPAIRED) {
1242 		(void) pthread_mutex_lock(&cip->ci_lock);
1243 		fmd_asru_hash_delete_case(fmd.d_asrus, cp);
1244 		(void) pthread_mutex_unlock(&cip->ci_lock);
1245 		fmd_case_rele(cp);
1246 	}
1247 }
1248 
1249 /*
1250  * Transition the specified case to *at least* the specified state by first
1251  * re-validating the suspect list using the resource cache.  This function is
1252  * employed by the checkpoint code when restoring a saved, solved case to see
1253  * if the state of the case has effectively changed while fmd was not running
1254  * or the module was not loaded.  If none of the suspects are present anymore,
1255  * advance the state to REPAIRED.  If none are usable, advance to CLOSE_WAIT.
1256  */
1257 void
1258 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
1259 {
1260 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1261 
1262 	int faulty = 0;		/* are any suspects faulty? */
1263 	int usable = 0;		/* are any suspects usable? */
1264 
1265 	ASSERT(state >= FMD_CASE_SOLVED);
1266 	(void) pthread_mutex_lock(&cip->ci_lock);
1267 
1268 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
1269 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable);
1270 
1271 	(void) pthread_mutex_unlock(&cip->ci_lock);
1272 
1273 	/*
1274 	 * If none of the suspects were faulty, it implies they were either
1275 	 * repaired already or not present and the rsrc.age time has expired.
1276 	 * We can move the state on to repaired.
1277 	 */
1278 	if (!faulty) {
1279 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
1280 		flags |= FMD_CF_REPAIRED;
1281 	} else if (!usable) {
1282 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
1283 		flags |= FMD_CF_ISOLATED;
1284 	}
1285 
1286 	fmd_case_transition(cp, state, flags);
1287 }
1288 
1289 void
1290 fmd_case_setdirty(fmd_case_t *cp)
1291 {
1292 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1293 
1294 	(void) pthread_mutex_lock(&cip->ci_lock);
1295 	cip->ci_flags |= FMD_CF_DIRTY;
1296 	(void) pthread_mutex_unlock(&cip->ci_lock);
1297 
1298 	fmd_module_setcdirty(cip->ci_mod);
1299 }
1300 
1301 void
1302 fmd_case_clrdirty(fmd_case_t *cp)
1303 {
1304 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1305 
1306 	(void) pthread_mutex_lock(&cip->ci_lock);
1307 	cip->ci_flags &= ~FMD_CF_DIRTY;
1308 	(void) pthread_mutex_unlock(&cip->ci_lock);
1309 }
1310 
1311 void
1312 fmd_case_commit(fmd_case_t *cp)
1313 {
1314 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1315 	fmd_case_item_t *cit;
1316 
1317 	(void) pthread_mutex_lock(&cip->ci_lock);
1318 
1319 	if (cip->ci_flags & FMD_CF_DIRTY) {
1320 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
1321 			fmd_event_commit(cit->cit_event);
1322 
1323 		if (cip->ci_principal != NULL)
1324 			fmd_event_commit(cip->ci_principal);
1325 
1326 		fmd_buf_hash_commit(&cip->ci_bufs);
1327 		cip->ci_flags &= ~FMD_CF_DIRTY;
1328 	}
1329 
1330 	(void) pthread_mutex_unlock(&cip->ci_lock);
1331 }
1332 
1333 /*
1334  * Indicate that the case may need to change state because one or more of the
1335  * ASRUs named as a suspect has changed state.  We examine all the suspects
1336  * and if none are still faulty, we initiate a case close transition.
1337  */
1338 void
1339 fmd_case_update(fmd_case_t *cp)
1340 {
1341 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1342 	uint_t cstate;
1343 	int faulty = 0;
1344 
1345 	(void) pthread_mutex_lock(&cip->ci_lock);
1346 	cstate = cip->ci_state;
1347 
1348 	if (cip->ci_xprt != NULL || cip->ci_state < FMD_CASE_SOLVED) {
1349 		(void) pthread_mutex_unlock(&cip->ci_lock);
1350 		return; /* update is not appropriate */
1351 	}
1352 
1353 	if (cip->ci_flags & FMD_CF_REPAIRED) {
1354 		(void) pthread_mutex_unlock(&cip->ci_lock);
1355 		return; /* already repaired */
1356 	}
1357 
1358 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
1359 	(void) pthread_mutex_unlock(&cip->ci_lock);
1360 
1361 	if (faulty)
1362 		return; /* one or more suspects are still marked faulty */
1363 
1364 	if (cstate == FMD_CASE_CLOSED)
1365 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1366 	else
1367 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1368 }
1369 
1370 /*
1371  * Delete a closed case from the module's case list once the fmdo_close() entry
1372  * point has run to completion.  If the case is owned by a transport module,
1373  * tell the transport to proxy a case close on the other end of the transport.
1374  * If not, transition to the appropriate next state based on ci_flags.  This
1375  * function represents the end of CLOSE_WAIT and transitions the case to either
1376  * CLOSED or REPAIRED or discards it entirely because it was never solved;
1377  * refer to the topmost block comment explaining the state machine for details.
1378  */
1379 void
1380 fmd_case_delete(fmd_case_t *cp)
1381 {
1382 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1383 	fmd_modstat_t *msp;
1384 	size_t buftotal;
1385 
1386 	ASSERT(fmd_module_locked(cip->ci_mod));
1387 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1388 	buftotal = fmd_buf_hash_destroy(&cip->ci_bufs);
1389 
1390 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1391 	msp = cip->ci_mod->mod_stats;
1392 
1393 	ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0);
1394 	msp->ms_caseopen.fmds_value.ui64--;
1395 
1396 	ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal);
1397 	msp->ms_buftotal.fmds_value.ui64 -= buftotal;
1398 
1399 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1400 
1401 	if (cip->ci_xprt == NULL)
1402 		fmd_module_setcdirty(cip->ci_mod);
1403 
1404 	fmd_module_rele(cip->ci_mod);
1405 	cip->ci_mod = fmd.d_rmod;
1406 	fmd_module_hold(cip->ci_mod);
1407 
1408 	/*
1409 	 * If the case is not proxied and it has been solved, then retain it
1410 	 * on the root module's case list at least until we're transitioned.
1411 	 * Otherwise free the case with our final fmd_case_rele() below.
1412 	 */
1413 	if (cip->ci_xprt == NULL && (cip->ci_flags & FMD_CF_SOLVED)) {
1414 		fmd_module_lock(cip->ci_mod);
1415 		fmd_list_append(&cip->ci_mod->mod_cases, cip);
1416 		fmd_module_unlock(cip->ci_mod);
1417 		fmd_case_hold(cp);
1418 	}
1419 
1420 	/*
1421 	 * If a proxied case finishes CLOSE_WAIT, then it can be discarded
1422 	 * rather than orphaned because by definition it can have no entries
1423 	 * in the resource cache of the current fault manager.
1424 	 */
1425 	if (cip->ci_xprt != NULL)
1426 		fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid);
1427 	else if (cip->ci_flags & FMD_CF_REPAIRED)
1428 		fmd_case_transition(cp, FMD_CASE_REPAIRED, 0);
1429 	else if (cip->ci_flags & FMD_CF_ISOLATED)
1430 		fmd_case_transition(cp, FMD_CASE_CLOSED, 0);
1431 
1432 	fmd_case_rele(cp);
1433 }
1434 
1435 void
1436 fmd_case_discard(fmd_case_t *cp)
1437 {
1438 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1439 
1440 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1441 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--;
1442 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1443 
1444 	ASSERT(fmd_module_locked(cip->ci_mod));
1445 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1446 	fmd_case_rele(cp);
1447 }
1448 
1449 /*
1450  * Indicate that the problem corresponding to a case has been repaired by
1451  * clearing the faulty bit on each ASRU named as a suspect.  If the case hasn't
1452  * already been closed, this function initiates the transition to CLOSE_WAIT.
1453  * The caller must have the case held from fmd_case_hash_lookup(), so we can
1454  * grab and drop ci_lock without the case being able to be freed in between.
1455  */
1456 int
1457 fmd_case_repair(fmd_case_t *cp)
1458 {
1459 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1460 	uint_t cstate;
1461 
1462 	(void) pthread_mutex_lock(&cip->ci_lock);
1463 	cstate = cip->ci_state;
1464 
1465 	if (cip->ci_xprt != NULL) {
1466 		(void) pthread_mutex_unlock(&cip->ci_lock);
1467 		return (fmd_set_errno(EFMD_CASE_OWNER));
1468 	}
1469 
1470 	if (cstate < FMD_CASE_SOLVED) {
1471 		(void) pthread_mutex_unlock(&cip->ci_lock);
1472 		return (fmd_set_errno(EFMD_CASE_STATE));
1473 	}
1474 
1475 	if (cip->ci_flags & FMD_CF_REPAIRED) {
1476 		(void) pthread_mutex_unlock(&cip->ci_lock);
1477 		return (0); /* already repaired */
1478 	}
1479 
1480 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repair, NULL);
1481 	(void) pthread_mutex_unlock(&cip->ci_lock);
1482 
1483 	if (cstate == FMD_CASE_CLOSED)
1484 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1485 	else
1486 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1487 
1488 	return (0);
1489 }
1490 
1491 int
1492 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep)
1493 {
1494 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1495 	fmd_case_item_t *cit;
1496 	uint_t state;
1497 	int rv = 0;
1498 
1499 	(void) pthread_mutex_lock(&cip->ci_lock);
1500 
1501 	if (cip->ci_state >= FMD_CASE_SOLVED)
1502 		state = FMD_EVS_DIAGNOSED;
1503 	else
1504 		state = FMD_EVS_ACCEPTED;
1505 
1506 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1507 		if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0)
1508 			break;
1509 	}
1510 
1511 	if (rv == 0 && cip->ci_principal != NULL)
1512 		rv = fmd_event_equal(ep, cip->ci_principal);
1513 
1514 	(void) pthread_mutex_unlock(&cip->ci_lock);
1515 
1516 	if (rv != 0)
1517 		fmd_event_transition(ep, state);
1518 
1519 	return (rv);
1520 }
1521 
1522 int
1523 fmd_case_orphaned(fmd_case_t *cp)
1524 {
1525 	return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod);
1526 }
1527 
1528 void
1529 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec)
1530 {
1531 	((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec;
1532 	((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec;
1533 	((fmd_case_impl_t *)cp)->ci_tv_valid = 1;
1534 }
1535