xref: /titanic_50/usr/src/cmd/fm/fmd/common/fmd_case.c (revision 55d1b5d7069300479d864cdc34c9d3e0a759ccb5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * FMD Case Subsystem
31  *
32  * Diagnosis engines are expected to group telemetry events related to the
33  * diagnosis of a particular problem on the system into a set of cases.  The
34  * diagnosis engine may have any number of cases open at a given point in time.
35  * Some cases may eventually be *solved* by associating a suspect list of one
36  * or more problems with the case, at which point fmd publishes a list.suspect
37  * event for the case and it becomes visible to administrators and agents.
38  *
39  * Every case is named using a UUID, and is globally visible in the case hash.
40  * Cases are reference-counted, except for the reference from the case hash
41  * itself.  Consumers of case references include modules, which store active
42  * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
43  *
44  * Cases obey the following state machine.  In states UNSOLVED, SOLVED, and
45  * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
46  * or transport) and the case is referenced by the mod_cases list.  Once the
47  * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
48  * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
49  *
50  *			+------------+
51  *	     +----------|  UNSOLVED  |
52  *	     |		+------------+
53  *	   1 |	             4 |
54  *           |                 |
55  *	+----v---+ /-2->+------v-----+	  3	+--------+
56  *      | SOLVED |<     | CLOSE_WAIT |--------->| CLOSED |
57  *	+--------+ \-5->+------------+		+--------+
58  *	                       |                    |
59  *                           6 |                    | 7
60  *      		+------v-----+              |
61  *	                |  REPAIRED  |<-------------+
62  *			+------------+
63  *
64  * The state machine changes are triggered by calls to fmd_case_transition()
65  * from various locations inside of fmd, as described below:
66  *
67  * [1] Called by: fmd_case_solve()
68  *       Actions: FMD_CF_SOLVED flag is set in ci_flags
69  *                conviction policy is applied to suspect list
70  *                suspects convicted are marked faulty (F) in R$
71  *                list.suspect event logged and dispatched
72  *
73  * [2] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
74  *       Actions: FMD_CF_ISOLATED flag is set in ci_flags
75  *                suspects convicted (F) are marked unusable (U) in R$
76  *                diagnosis engine fmdo_close() entry point scheduled
77  *                case transitions to CLOSED [3] upon exit from CLOSE_WAIT
78  *
79  * [3] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
80  *       Actions: list.isolated event dispatched
81  *                case deleted from module's list of open cases
82  *
83  * [4] Called by: fmd_case_close(), fmd_case_uuclose()
84  *       Actions: diagnosis engine fmdo_close() entry point scheduled
85  *                case is subsequently discarded by fmd_case_delete()
86  *
87  * [5] Called by: fmd_case_repair(), fmd_case_update()
 *       Actions: FMD_CF_REPAIRED flag is set in ci_flags
89  *                diagnosis engine fmdo_close() entry point scheduled
90  *                case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
91  *
92  * [6] Called by: fmd_case_repair(), fmd_case_update()
 *       Actions: FMD_CF_REPAIRED flag is set in ci_flags
94  *                suspects convicted are marked non faulty (!F) in R$
95  *                list.repaired event dispatched
96  *
97  * [7] Called by: fmd_case_repair(), fmd_case_update()
 *       Actions: FMD_CF_REPAIRED flag is set in ci_flags
99  *                suspects convicted are marked non faulty (!F) in R$
100  *                list.repaired event dispatched
101  */
102 
103 #include <sys/fm/protocol.h>
104 #include <uuid/uuid.h>
105 #include <alloca.h>
106 
107 #include <fmd_alloc.h>
108 #include <fmd_module.h>
109 #include <fmd_error.h>
110 #include <fmd_conf.h>
111 #include <fmd_case.h>
112 #include <fmd_string.h>
113 #include <fmd_subr.h>
114 #include <fmd_protocol.h>
115 #include <fmd_event.h>
116 #include <fmd_eventq.h>
117 #include <fmd_dispq.h>
118 #include <fmd_buf.h>
119 #include <fmd_log.h>
120 #include <fmd_asru.h>
121 #include <fmd_fmri.h>
122 #include <fmd_xprt.h>
123 
124 #include <fmd.h>
125 
/*
 * Printable names for the case states.  The table is indexed by case state
 * (see the TRACE call in fmd_case_transition()), so the entries must stay in
 * the same order as the FMD_CASE_* values noted beside them.
 */
static const char *const _fmd_case_snames[] = {
	"UNSOLVED",	/* FMD_CASE_UNSOLVED */
	"SOLVED",	/* FMD_CASE_SOLVED */
	"CLOSE_WAIT",	/* FMD_CASE_CLOSE_WAIT */
	"CLOSED",	/* FMD_CASE_CLOSED */
	"REPAIRED"	/* FMD_CASE_REPAIRED */
};
133 
134 extern volatile uint32_t fmd_asru_fake_not_present;
135 
136 fmd_case_hash_t *
137 fmd_case_hash_create(void)
138 {
139 	fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP);
140 
141 	(void) pthread_rwlock_init(&chp->ch_lock, NULL);
142 	chp->ch_hashlen = fmd.d_str_buckets;
143 	chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP);
144 	chp->ch_count = 0;
145 
146 	return (chp);
147 }
148 
149 /*
150  * Destroy the case hash.  Unlike most of our hash tables, no active references
151  * are kept by the case hash itself; all references come from other subsystems.
152  * The hash must be destroyed after all modules are unloaded; if anything was
153  * present in the hash it would be by definition a reference count leak.
154  */
155 void
156 fmd_case_hash_destroy(fmd_case_hash_t *chp)
157 {
158 	fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen);
159 	fmd_free(chp, sizeof (fmd_case_hash_t));
160 }
161 
162 /*
163  * Take a snapshot of the case hash by placing an additional hold on each
164  * member in an auxiliary array, and then call 'func' for each case.
165  */
166 void
167 fmd_case_hash_apply(fmd_case_hash_t *chp,
168     void (*func)(fmd_case_t *, void *), void *arg)
169 {
170 	fmd_case_impl_t *cp, **cps, **cpp;
171 	uint_t cpc, i;
172 
173 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
174 
175 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
176 	cpc = chp->ch_count;
177 
178 	for (i = 0; i < chp->ch_hashlen; i++) {
179 		for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) {
180 			fmd_case_hold((fmd_case_t *)cp);
181 			*cpp++ = cp;
182 		}
183 	}
184 
185 	ASSERT(cpp == cps + cpc);
186 	(void) pthread_rwlock_unlock(&chp->ch_lock);
187 
188 	for (i = 0; i < cpc; i++) {
189 		func((fmd_case_t *)cps[i], arg);
190 		fmd_case_rele((fmd_case_t *)cps[i]);
191 	}
192 
193 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
194 }
195 
196 /*
197  * Look up the diagcode for this case and cache it in ci_code.  If no suspects
198  * were defined for this case or if the lookup fails, the event dictionary or
199  * module code is broken, and we set the event code to a precomputed default.
200  */
201 static const char *
202 fmd_case_mkcode(fmd_case_t *cp)
203 {
204 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
205 	fmd_case_susp_t *cis;
206 
207 	char **keys, **keyp;
208 	const char *s;
209 
210 	ASSERT(MUTEX_HELD(&cip->ci_lock));
211 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
212 
213 	fmd_free(cip->ci_code, cip->ci_codelen);
214 	cip->ci_codelen = cip->ci_mod->mod_codelen;
215 	cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
216 	keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1));
217 
218 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
219 		if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0)
220 			keyp++;
221 	}
222 
223 	*keyp = NULL; /* mark end of keys[] array for libdiagcode */
224 
225 	if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code(
226 	    cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) {
227 		(void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s);
228 		fmd_free(cip->ci_code, cip->ci_codelen);
229 		cip->ci_codelen = strlen(s) + 1;
230 		cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
231 		(void) strcpy(cip->ci_code, s);
232 	}
233 
234 	return (cip->ci_code);
235 }
236 
237 nvlist_t *
238 fmd_case_mkevent(fmd_case_t *cp, const char *class)
239 {
240 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
241 	fmd_case_susp_t *cis;
242 
243 	fmd_asru_hash_t *ahp = fmd.d_asrus;
244 	fmd_asru_t *asru;
245 
246 	nvlist_t **nva, **nvp, *nvl, *fmri;
247 	uint8_t *ba, *bp;
248 
249 	int msg = B_TRUE;
250 	boolean_t b;
251 
252 	(void) pthread_mutex_lock(&cip->ci_lock);
253 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
254 
255 	nva = nvp = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
256 	ba = bp = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
257 
258 	/*
259 	 * For each suspect associated with the case, store its fault event
260 	 * nvlist in 'nva'.  We also look to see if any of the suspect faults
261 	 * have asked not to be messaged.  If any of them have made such a
262 	 * request, propagate that attribute to the composite list.* event.
263 	 * Finally, store each suspect's faulty status into the bitmap 'ba'.
264 	 */
265 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
266 		if (nvlist_lookup_boolean_value(cis->cis_nvl,
267 		    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
268 			msg = B_FALSE;
269 
270 		if (nvlist_lookup_nvlist(cis->cis_nvl,
271 		    FM_FAULT_ASRU, &fmri) == 0 && (asru =
272 		    fmd_asru_hash_lookup_nvl(ahp, fmri, FMD_B_FALSE)) != NULL) {
273 			*bp = 0;
274 			if (fmd_asru_fake_not_present ||
275 			    !fmd_fmri_present(asru->asru_fmri))
276 				*bp |= FM_SUSPECT_NOT_PRESENT;
277 			if (fmd_asru_fake_not_present ||
278 			    fmd_fmri_unusable(asru->asru_fmri))
279 				*bp |= FM_SUSPECT_UNUSABLE;
280 			if (asru->asru_flags & FMD_ASRU_FAULTY)
281 				*bp |= FM_SUSPECT_FAULTY;
282 			bp++;
283 			fmd_asru_hash_release(ahp, asru);
284 		} else
285 			*bp++ = 0;
286 
287 		*nvp++ = cis->cis_nvl;
288 	}
289 
290 	if (cip->ci_code == NULL)
291 		(void) fmd_case_mkcode(cp);
292 
293 	if (msg == B_FALSE)
294 		cip->ci_flags |= FMD_CF_INVISIBLE;
295 
296 	nvl = fmd_protocol_list(class, cip->ci_mod->mod_fmri, cip->ci_uuid,
297 	    cip->ci_code, cip->ci_nsuspects, nva, ba, msg, &cip->ci_tv);
298 
299 	(void) pthread_mutex_unlock(&cip->ci_lock);
300 	return (nvl);
301 }
302 
303 /*
304  * Convict suspects in a case by applying a conviction policy and updating the
305  * resource cache prior to emitting the list.suspect event for the given case.
306  * At present, our policy is very simple: convict every suspect in the case.
307  * In the future, this policy can be extended and made configurable to permit:
308  *
309  * - convicting the suspect with the highest FIT rate
310  * - convicting the suspect with the cheapest FRU
311  * - convicting the suspect with the FRU that is in a depot's inventory
312  * - convicting the suspect with the longest lifetime
313  *
 * and so forth.  A word to the wise: this problem is significantly harder than
315  * it seems at first glance.  Future work should heed the following advice:
316  *
317  * Hacking the policy into C code here is a very bad idea.  The policy needs to
318  * be decided upon very carefully and fundamentally encodes knowledge of what
319  * suspect list combinations can be emitted by what diagnosis engines.  As such
320  * fmd's code is the wrong location, because that would require fmd itself to
321  * be updated for every diagnosis engine change, defeating the entire design.
322  * The FMA Event Registry knows the suspect list combinations: policy inputs
323  * can be derived from it and used to produce per-module policy configuration.
324  *
325  * If the policy needs to be dynamic and not statically fixed at either fmd
326  * startup or module load time, any implementation of dynamic policy retrieval
327  * must employ some kind of caching mechanism or be part of a built-in module.
328  * The fmd_case_convict() function is called with locks held inside of fmd and
329  * is not a place where unbounded blocking on some inter-process or inter-
330  * system communication to another service (e.g. another daemon) can occur.
331  */
332 static void
333 fmd_case_convict(fmd_case_t *cp)
334 {
335 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
336 	fmd_asru_hash_t *ahp = fmd.d_asrus;
337 
338 	fmd_case_susp_t *cis;
339 	fmd_asru_t *asru;
340 	nvlist_t *fmri;
341 
342 	(void) pthread_mutex_lock(&cip->ci_lock);
343 	(void) fmd_case_mkcode(cp);
344 
345 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
346 		if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU, &fmri))
347 			continue; /* no ASRU provided by diagnosis engine */
348 
349 		if ((asru = fmd_asru_hash_lookup_nvl(ahp,
350 		    fmri, FMD_B_TRUE)) == NULL) {
351 			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
352 			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
353 			continue;
354 		}
355 
356 		(void) fmd_asru_clrflags(asru,
357 		    FMD_ASRU_UNUSABLE, cp, cis->cis_nvl);
358 		(void) fmd_asru_setflags(asru,
359 		    FMD_ASRU_FAULTY, cp, cis->cis_nvl);
360 
361 		fmd_asru_hash_release(ahp, asru);
362 	}
363 
364 	(void) pthread_mutex_unlock(&cip->ci_lock);
365 }
366 
367 void
368 fmd_case_publish(fmd_case_t *cp, uint_t state)
369 {
370 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
371 	fmd_event_t *e;
372 	nvlist_t *nvl;
373 	char *class;
374 
375 	if (state == FMD_CASE_CURRENT)
376 		state = cip->ci_state; /* use current state */
377 
378 	switch (state) {
379 	case FMD_CASE_SOLVED:
380 		(void) pthread_mutex_lock(&cip->ci_lock);
381 		if (cip->ci_tv_valid == 0) {
382 			fmd_time_gettimeofday(&cip->ci_tv);
383 			cip->ci_tv_valid = 1;
384 		}
385 		(void) pthread_mutex_unlock(&cip->ci_lock);
386 		fmd_case_convict(cp);
387 		nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
388 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
389 
390 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
391 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
392 		fmd_log_append(fmd.d_fltlog, e, cp);
393 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
394 		fmd_dispq_dispatch(fmd.d_disp, e, class);
395 
396 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
397 		cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
398 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
399 
400 		break;
401 
402 	case FMD_CASE_CLOSE_WAIT:
403 		fmd_case_hold(cp);
404 		e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
405 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
406 
407 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
408 		cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
409 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
410 
411 		break;
412 
413 	case FMD_CASE_CLOSED:
414 		nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
415 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
416 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
417 		fmd_dispq_dispatch(fmd.d_disp, e, class);
418 		break;
419 
420 	case FMD_CASE_REPAIRED:
421 		nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
422 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
423 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
424 		fmd_dispq_dispatch(fmd.d_disp, e, class);
425 		break;
426 	}
427 }
428 
429 fmd_case_t *
430 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
431 {
432 	fmd_case_impl_t *cip;
433 	uint_t h;
434 
435 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
436 	h = fmd_strhash(uuid) % chp->ch_hashlen;
437 
438 	for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
439 		if (strcmp(cip->ci_uuid, uuid) == 0)
440 			break;
441 	}
442 
443 	if (cip != NULL)
444 		fmd_case_hold((fmd_case_t *)cip);
445 	else
446 		(void) fmd_set_errno(EFMD_CASE_INVAL);
447 
448 	(void) pthread_rwlock_unlock(&chp->ch_lock);
449 	return ((fmd_case_t *)cip);
450 }
451 
452 static fmd_case_impl_t *
453 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
454 {
455 	fmd_case_impl_t *eip;
456 	uint_t h;
457 
458 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
459 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
460 
461 	for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
462 		if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0) {
463 			fmd_case_hold((fmd_case_t *)eip);
464 			(void) pthread_rwlock_unlock(&chp->ch_lock);
465 			return (eip); /* uuid already present */
466 		}
467 	}
468 
469 	cip->ci_next = chp->ch_hash[h];
470 	chp->ch_hash[h] = cip;
471 
472 	chp->ch_count++;
473 	ASSERT(chp->ch_count != 0);
474 
475 	(void) pthread_rwlock_unlock(&chp->ch_lock);
476 	return (cip);
477 }
478 
479 static void
480 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
481 {
482 	fmd_case_impl_t *cp, **pp;
483 	uint_t h;
484 
485 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
486 
487 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
488 	pp = &chp->ch_hash[h];
489 
490 	for (cp = *pp; cp != NULL; cp = cp->ci_next) {
491 		if (cp != cip)
492 			pp = &cp->ci_next;
493 		else
494 			break;
495 	}
496 
497 	if (cp == NULL) {
498 		fmd_panic("case %p (%s) not found on hash chain %u\n",
499 		    (void *)cip, cip->ci_uuid, h);
500 	}
501 
502 	*pp = cp->ci_next;
503 	cp->ci_next = NULL;
504 
505 	ASSERT(chp->ch_count != 0);
506 	chp->ch_count--;
507 
508 	(void) pthread_rwlock_unlock(&chp->ch_lock);
509 }
510 
511 fmd_case_t *
512 fmd_case_create(fmd_module_t *mp, void *data)
513 {
514 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
515 	fmd_case_impl_t *eip = NULL;
516 	uuid_t uuid;
517 
518 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
519 	fmd_buf_hash_create(&cip->ci_bufs);
520 
521 	fmd_module_hold(mp);
522 	cip->ci_mod = mp;
523 	cip->ci_refs = 1;
524 	cip->ci_state = FMD_CASE_UNSOLVED;
525 	cip->ci_flags = FMD_CF_DIRTY;
526 	cip->ci_data = data;
527 
528 	/*
529 	 * Calling libuuid: get a clue.  The library interfaces cleverly do not
530 	 * define any constant for the length of an unparse string, and do not
531 	 * permit the caller to specify a buffer length for safety.  The spec
532 	 * says it will be 36 bytes, but we make it tunable just in case.
533 	 */
534 	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
535 	cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);
536 
537 	/*
538 	 * We expect this loop to execute only once, but code it defensively
539 	 * against the possibility of libuuid bugs.  Keep generating uuids and
540 	 * attempting to do a hash insert until we get a unique one.
541 	 */
542 	do {
543 		if (eip != NULL)
544 			fmd_case_rele((fmd_case_t *)eip);
545 		uuid_generate(uuid);
546 		uuid_unparse(uuid, cip->ci_uuid);
547 	} while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);
548 
549 	ASSERT(fmd_module_locked(mp));
550 	fmd_list_append(&mp->mod_cases, cip);
551 	fmd_module_setcdirty(mp);
552 
553 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
554 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
555 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
556 
557 	return ((fmd_case_t *)cip);
558 }
559 
560 static void
561 fmd_case_destroy_suspects(fmd_case_impl_t *cip)
562 {
563 	fmd_case_susp_t *cis, *ncis;
564 
565 	ASSERT(MUTEX_HELD(&cip->ci_lock));
566 
567 	for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
568 		ncis = cis->cis_next;
569 		nvlist_free(cis->cis_nvl);
570 		fmd_free(cis, sizeof (fmd_case_susp_t));
571 	}
572 
573 	cip->ci_suspects = NULL;
574 	cip->ci_nsuspects = 0;
575 }
576 
/*
 * Re-create a case using a caller-supplied UUID and diagnosis code instead of
 * generating a new UUID as fmd_case_create() does.
 *
 *   mp    - module that is to own the case; fmd_module_locked(mp) is asserted
 *           before the case is appended to mp->mod_cases
 *   xp    - stored in ci_xprt of the new case; a non-NULL value also prevents
 *           an existing case with the same UUID from being reclaimed
 *   state - initial case state; asserted to be less than FMD_CASE_REPAIRED
 *   uuid  - UUID string, duplicated into the case
 *   code  - diagnosis code string, duplicated into the case; ci_codelen is
 *           set to zero if the duplicate is NULL
 *
 * If the UUID is not yet in the case hash, the new case is inserted, appended
 * to mp->mod_cases, counted in ms_caseopen, and returned.
 *
 * If the UUID is already present, the newly built case is destroyed and the
 * outcome depends on the existing case:
 *
 *   - mp is the root module (fmd.d_rmod): the existing case is returned
 *     unchanged and is not appended to any list here.
 *   - the existing case is not owned by the root module, or xp is non-NULL:
 *     NULL is returned (UUID conflict).
 *   - otherwise the existing orphan is moved from the root module to 'mp', its
 *     suspect list is discarded, its state is set to 'state', and it is
 *     appended to mp->mod_cases and returned.
 *
 * In every collision path the hold placed by fmd_case_hash_insert() on the
 * existing case is released before returning.
 */
fmd_case_t *
fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
    uint_t state, const char *uuid, const char *code)
{
	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
	fmd_case_impl_t *eip;

	ASSERT(state < FMD_CASE_REPAIRED);

	(void) pthread_mutex_init(&cip->ci_lock, NULL);
	fmd_buf_hash_create(&cip->ci_bufs);

	fmd_module_hold(mp);
	cip->ci_mod = mp;
	cip->ci_xprt = xp;
	cip->ci_refs = 1;
	cip->ci_state = state;
	cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
	/* fmd_case_destroy() frees ci_uuidlen + 1 bytes, matching the strdup */
	cip->ci_uuidlen = strlen(cip->ci_uuid);
	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;

	/* only states beyond CLOSE_WAIT are flagged as solved here */
	if (state > FMD_CASE_CLOSE_WAIT)
		cip->ci_flags |= FMD_CF_SOLVED;

	/*
	 * Insert the case into the global case hash.  If the specified UUID is
	 * already present, check to see if it is an orphan: if so, reclaim it;
	 * otherwise if it is owned by a different module then return NULL.
	 */
	if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
		/*
		 * Discard the case we just built.  fmd_case_destroy() asserts
		 * that ci_lock is held and ci_refs is zero; B_FALSE tells it
		 * not to remove the case from the hash, since it was never
		 * inserted.
		 */
		(void) pthread_mutex_lock(&cip->ci_lock);
		cip->ci_refs--; /* decrement to zero */
		fmd_case_destroy((fmd_case_t *)cip, B_FALSE);

		cip = eip; /* switch 'cip' to the existing case */
		(void) pthread_mutex_lock(&cip->ci_lock);

		/*
		 * If the ASRU cache is trying to recreate an orphan, then just
		 * return the existing case that we found without changing it.
		 */
		if (mp == fmd.d_rmod) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			/* drop the hold taken by fmd_case_hash_insert() */
			fmd_case_rele((fmd_case_t *)cip);
			return ((fmd_case_t *)cip);
		}

		/*
		 * If the existing case isn't an orphan or is being proxied,
		 * then we have a UUID conflict: return failure to the caller.
		 */
		if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele((fmd_case_t *)cip);
			return (NULL);
		}

		/*
		 * If the new module is reclaiming an orphaned case, remove
		 * the case from the root module, switch ci_mod, and then fall
		 * through to adding the case to the new owner module 'mp'.
		 */
		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);

		/* trade the case's hold on the root module for one on 'mp' */
		fmd_module_rele(cip->ci_mod);
		cip->ci_mod = mp;
		fmd_module_hold(mp);

		fmd_case_destroy_suspects(cip);
		cip->ci_state = state;

		(void) pthread_mutex_unlock(&cip->ci_lock);
		/* drop the hold taken by fmd_case_hash_insert() */
		fmd_case_rele((fmd_case_t *)cip);
	}

	ASSERT(fmd_module_locked(mp));
	fmd_list_append(&mp->mod_cases, cip);

	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

	return ((fmd_case_t *)cip);
}
664 
665 void
666 fmd_case_destroy(fmd_case_t *cp, int visible)
667 {
668 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
669 	fmd_case_item_t *cit, *ncit;
670 
671 	ASSERT(MUTEX_HELD(&cip->ci_lock));
672 	ASSERT(cip->ci_refs == 0);
673 
674 	if (visible) {
675 		TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
676 		fmd_case_hash_delete(fmd.d_cases, cip);
677 	}
678 
679 	for (cit = cip->ci_items; cit != NULL; cit = ncit) {
680 		ncit = cit->cit_next;
681 		fmd_event_rele(cit->cit_event);
682 		fmd_free(cit, sizeof (fmd_case_item_t));
683 	}
684 
685 	fmd_case_destroy_suspects(cip);
686 
687 	if (cip->ci_principal != NULL)
688 		fmd_event_rele(cip->ci_principal);
689 
690 	fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
691 	fmd_free(cip->ci_code, cip->ci_codelen);
692 	(void) fmd_buf_hash_destroy(&cip->ci_bufs);
693 
694 	fmd_module_rele(cip->ci_mod);
695 	fmd_free(cip, sizeof (fmd_case_impl_t));
696 }
697 
698 void
699 fmd_case_hold(fmd_case_t *cp)
700 {
701 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
702 
703 	(void) pthread_mutex_lock(&cip->ci_lock);
704 	cip->ci_refs++;
705 	ASSERT(cip->ci_refs != 0);
706 	(void) pthread_mutex_unlock(&cip->ci_lock);
707 }
708 
709 void
710 fmd_case_hold_locked(fmd_case_t *cp)
711 {
712 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
713 
714 	ASSERT(MUTEX_HELD(&cip->ci_lock));
715 	cip->ci_refs++;
716 	ASSERT(cip->ci_refs != 0);
717 }
718 
719 void
720 fmd_case_rele(fmd_case_t *cp)
721 {
722 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
723 
724 	(void) pthread_mutex_lock(&cip->ci_lock);
725 	ASSERT(cip->ci_refs != 0);
726 
727 	if (--cip->ci_refs == 0)
728 		fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
729 	else
730 		(void) pthread_mutex_unlock(&cip->ci_lock);
731 }
732 
733 int
734 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep)
735 {
736 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
737 	fmd_case_item_t *cit;
738 	fmd_event_t *oep;
739 	uint_t state;
740 	int new;
741 
742 	fmd_event_hold(ep);
743 	(void) pthread_mutex_lock(&cip->ci_lock);
744 
745 	if (cip->ci_flags & FMD_CF_SOLVED)
746 		state = FMD_EVS_DIAGNOSED;
747 	else
748 		state = FMD_EVS_ACCEPTED;
749 
750 	oep = cip->ci_principal;
751 	cip->ci_principal = ep;
752 
753 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
754 		if (cit->cit_event == ep)
755 			break;
756 	}
757 
758 	cip->ci_flags |= FMD_CF_DIRTY;
759 	new = cit == NULL && ep != oep;
760 
761 	(void) pthread_mutex_unlock(&cip->ci_lock);
762 
763 	fmd_module_setcdirty(cip->ci_mod);
764 	fmd_event_transition(ep, state);
765 
766 	if (oep != NULL)
767 		fmd_event_rele(oep);
768 
769 	return (new);
770 }
771 
772 int
773 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep)
774 {
775 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
776 	fmd_case_item_t *cit;
777 	uint_t state;
778 	int new;
779 
780 	(void) pthread_mutex_lock(&cip->ci_lock);
781 
782 	if (cip->ci_flags & FMD_CF_SOLVED)
783 		state = FMD_EVS_DIAGNOSED;
784 	else
785 		state = FMD_EVS_ACCEPTED;
786 
787 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
788 		if (cit->cit_event == ep)
789 			break;
790 	}
791 
792 	new = cit == NULL && ep != cip->ci_principal;
793 
794 	/*
795 	 * If the event is already in the case or the case is already solved,
796 	 * there is no reason to save it: just transition it appropriately.
797 	 */
798 	if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
799 		(void) pthread_mutex_unlock(&cip->ci_lock);
800 		fmd_event_transition(ep, state);
801 		return (new);
802 	}
803 
804 	cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
805 	fmd_event_hold(ep);
806 
807 	cit->cit_next = cip->ci_items;
808 	cit->cit_event = ep;
809 
810 	cip->ci_items = cit;
811 	cip->ci_nitems++;
812 
813 	cip->ci_flags |= FMD_CF_DIRTY;
814 	(void) pthread_mutex_unlock(&cip->ci_lock);
815 
816 	fmd_module_setcdirty(cip->ci_mod);
817 	fmd_event_transition(ep, state);
818 
819 	return (new);
820 }
821 
822 void
823 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
824 {
825 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
826 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
827 
828 	(void) pthread_mutex_lock(&cip->ci_lock);
829 	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
830 	cip->ci_flags |= FMD_CF_DIRTY;
831 
832 	cis->cis_next = cip->ci_suspects;
833 	cis->cis_nvl = nvl;
834 
835 	cip->ci_suspects = cis;
836 	cip->ci_nsuspects++;
837 
838 	(void) pthread_mutex_unlock(&cip->ci_lock);
839 	fmd_module_setcdirty(cip->ci_mod);
840 }
841 
842 void
843 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
844 {
845 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
846 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
847 	boolean_t b;
848 
849 	(void) pthread_mutex_lock(&cip->ci_lock);
850 	ASSERT(cip->ci_state == FMD_CASE_CLOSED);
851 	ASSERT(cip->ci_mod == fmd.d_rmod);
852 
853 	cis->cis_next = cip->ci_suspects;
854 	cis->cis_nvl = nvl;
855 
856 	if (nvlist_lookup_boolean_value(nvl,
857 	    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
858 		cip->ci_flags |= FMD_CF_INVISIBLE;
859 
860 	cip->ci_suspects = cis;
861 	cip->ci_nsuspects++;
862 
863 	(void) pthread_mutex_unlock(&cip->ci_lock);
864 }
865 
866 void
867 fmd_case_reset_suspects(fmd_case_t *cp)
868 {
869 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
870 
871 	(void) pthread_mutex_lock(&cip->ci_lock);
872 	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
873 
874 	fmd_case_destroy_suspects(cip);
875 	cip->ci_flags |= FMD_CF_DIRTY;
876 
877 	(void) pthread_mutex_unlock(&cip->ci_lock);
878 	fmd_module_setcdirty(cip->ci_mod);
879 }
880 
/*
 * Grab ci_lock and update the case state and set the dirty bit.  Then perform
 * whatever actions and emit whatever events are appropriate for the state.
 * Refer to the topmost block comment explaining the state machine for details.
 *
 *   cp    - case to transition
 *   state - requested new state; asserted to be <= FMD_CASE_REPAIRED
 *   flags - FMD_CF_* flags to OR into ci_flags.  FMD_CF_ISOLATED and
 *           FMD_CF_REPAIRED are dropped from 'flags' when neither the case
 *           nor 'flags' itself has FMD_CF_SOLVED set.
 *
 * The flags are merged even when no state change occurs.  The state only ever
 * moves forward: if the case is already at or beyond 'state', the function
 * returns after merging the flags and nothing is published.
 *
 * ci_lock is dropped before anything is published.  If the transition is to
 * REPAIRED, one reference on the case is released before returning, so the
 * case may be freed unless the caller holds its own reference.
 */
void
fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	fmd_case_susp_t *cis;
	fmd_case_item_t *cit;
	fmd_asru_t *asru;
	fmd_event_t *e;
	nvlist_t *nvl;

	ASSERT(state <= FMD_CASE_REPAIRED);
	(void) pthread_mutex_lock(&cip->ci_lock);

	/* an unsolved case cannot be marked isolated or repaired */
	if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED))
		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED);

	cip->ci_flags |= flags;

	if (cip->ci_state >= state) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return; /* already in specified state */
	}

	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));

	cip->ci_state = state;
	cip->ci_flags |= FMD_CF_DIRTY;

	/* the module is not marked dirty for proxied or root-module cases */
	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
		fmd_module_setcdirty(cip->ci_mod);

	switch (state) {
	case FMD_CASE_SOLVED:
		/* every event saved in the case, and the principal, is now */
		/* considered diagnosed */
		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);

		if (cip->ci_principal != NULL) {
			fmd_event_transition(cip->ci_principal,
			    FMD_EVS_DIAGNOSED);
		}
		break;

	case FMD_CASE_CLOSE_WAIT:
		/*
		 * If the case was never solved, do not change ASRUs.
		 * If the case was never fmd_case_closed, do not change ASRUs.
		 * If the case was repaired, do not change ASRUs.
		 */
		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
		    FMD_CF_REPAIRED)) != (FMD_CF_SOLVED | FMD_CF_ISOLATED))
			goto close_wait_finish;

		/*
		 * For each fault event in the suspect list, attempt to look up
		 * the corresponding ASRU in the ASRU dictionary.  If the ASRU
		 * is found there and is marked faulty, we now mark it unusable
		 * and record the case meta-data and fault event with the ASRU.
		 */
		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
			if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU,
			    &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl(
			    fmd.d_asrus, nvl, FMD_B_FALSE)) != NULL) {
				(void) fmd_asru_setflags(asru,
				    FMD_ASRU_UNUSABLE, cp, cis->cis_nvl);
				fmd_asru_hash_release(fmd.d_asrus, asru);
			}
		}

	close_wait_finish:
		/*
		 * If an orphaned case transitions to CLOSE_WAIT, the owning
		 * module is no longer loaded: continue on to CASE_CLOSED.
		 * Note that the local 'state' is updated as well, so the
		 * publish step below acts on CLOSED rather than CLOSE_WAIT.
		 */
		if (fmd_case_orphaned(cp))
			state = cip->ci_state = FMD_CASE_CLOSED;
		break;

	case FMD_CASE_REPAIRED:
		/* only orphaned cases may be repaired; unlink from owner */
		ASSERT(fmd_case_orphaned(cp));
		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);
		break;
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);

	/*
	 * If the module has initialized, then publish the appropriate event
	 * for the new case state.  If not, we are being called from the
	 * checkpoint code during module load, in which case the module's
	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
	 * may not be open yet, which will prevent us from computing the event
	 * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
	 * event in our queue: this won't be processed until _fmd_init is done.
	 */
	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
		fmd_case_publish(cp, state);
	else {
		/* the queued PUBLISH event carries its own hold on the case */
		fmd_case_hold(cp);
		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
	}

	/*
	 * If we transitioned to REPAIRED, adjust the reference count to
	 * reflect our removal from fmd.d_rmod->mod_cases.  If the caller has
	 * not placed an additional hold on the case, it will now be freed.
	 */
	if (state == FMD_CASE_REPAIRED)
		fmd_case_rele(cp);
}
1000 
1001 /*
1002  * Transition the specified case to *at least* the specified state by first
1003  * re-validating the suspect list using the resource cache.  This function is
1004  * employed by the checkpoint code when restoring a saved, solved case to see
1005  * if the state of the case has effectively changed while fmd was not running
1006  * or the module was not loaded.  If none of the suspects are present anymore,
1007  * advance the state to REPAIRED.  If none are usable, advance to CLOSE_WAIT.
1008  */
1009 void
1010 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
1011 {
1012 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1013 	fmd_case_susp_t *cis;
1014 	fmd_asru_t *asru;
1015 	nvlist_t *nvl;
1016 
1017 	int faulty = 0;		/* are any suspects faulty? */
1018 	int usable = 0;		/* are any suspects usable? */
1019 
1020 	ASSERT(state >= FMD_CASE_SOLVED);
1021 	(void) pthread_mutex_lock(&cip->ci_lock);
1022 
1023 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
1024 		if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU,
1025 		    &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl(
1026 		    fmd.d_asrus, nvl, FMD_B_TRUE)) != NULL) {
1027 
1028 			if (asru->asru_flags & FMD_ASRU_FAULTY)
1029 				faulty++;
1030 
1031 			if (fmd_asru_fake_not_present == 0 &&
1032 			    fmd_fmri_unusable(asru->asru_fmri) <= 0)
1033 				usable++;
1034 
1035 			fmd_asru_hash_release(fmd.d_asrus, asru);
1036 		}
1037 	}
1038 
1039 	(void) pthread_mutex_unlock(&cip->ci_lock);
1040 
1041 	/*
1042 	 * If none of the suspects were faulty, it implies they were either
1043 	 * repaired already or not present and the rsrc.age time has expired.
1044 	 * We can move the state on to repaired.
1045 	 */
1046 	if (!faulty) {
1047 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
1048 		flags |= FMD_CF_REPAIRED;
1049 	} else if (!usable) {
1050 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
1051 		flags |= FMD_CF_ISOLATED;
1052 	}
1053 
1054 	fmd_case_transition(cp, state, flags);
1055 }
1056 
1057 void
1058 fmd_case_setdirty(fmd_case_t *cp)
1059 {
1060 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1061 
1062 	(void) pthread_mutex_lock(&cip->ci_lock);
1063 	cip->ci_flags |= FMD_CF_DIRTY;
1064 	(void) pthread_mutex_unlock(&cip->ci_lock);
1065 
1066 	fmd_module_setcdirty(cip->ci_mod);
1067 }
1068 
1069 void
1070 fmd_case_clrdirty(fmd_case_t *cp)
1071 {
1072 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1073 
1074 	(void) pthread_mutex_lock(&cip->ci_lock);
1075 	cip->ci_flags &= ~FMD_CF_DIRTY;
1076 	(void) pthread_mutex_unlock(&cip->ci_lock);
1077 }
1078 
1079 void
1080 fmd_case_commit(fmd_case_t *cp)
1081 {
1082 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1083 	fmd_case_item_t *cit;
1084 
1085 	(void) pthread_mutex_lock(&cip->ci_lock);
1086 
1087 	if (cip->ci_flags & FMD_CF_DIRTY) {
1088 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
1089 			fmd_event_commit(cit->cit_event);
1090 
1091 		if (cip->ci_principal != NULL)
1092 			fmd_event_commit(cip->ci_principal);
1093 
1094 		fmd_buf_hash_commit(&cip->ci_bufs);
1095 		cip->ci_flags &= ~FMD_CF_DIRTY;
1096 	}
1097 
1098 	(void) pthread_mutex_unlock(&cip->ci_lock);
1099 }
1100 
1101 /*
1102  * Indicate that the case may need to change state because one or more of the
1103  * ASRUs named as a suspect has changed state.  We examine all the suspects
1104  * and if none are still faulty, we initiate a case close transition.
1105  */
1106 void
1107 fmd_case_update(fmd_case_t *cp)
1108 {
1109 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1110 	fmd_case_susp_t *cis;
1111 	fmd_asru_t *asru;
1112 	nvlist_t *nvl;
1113 
1114 	int astate = 0;
1115 	uint_t cstate;
1116 
1117 	(void) pthread_mutex_lock(&cip->ci_lock);
1118 	cstate = cip->ci_state;
1119 
1120 	if ((cip->ci_flags & FMD_CF_REPAIRING) ||
1121 	    cip->ci_xprt != NULL || cip->ci_state < FMD_CASE_SOLVED) {
1122 		(void) pthread_mutex_unlock(&cip->ci_lock);
1123 		return; /* update is not appropriate */
1124 	}
1125 
1126 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
1127 		if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU,
1128 		    &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl(
1129 		    fmd.d_asrus, nvl, FMD_B_FALSE)) != NULL) {
1130 			astate |= (asru->asru_flags & FMD_ASRU_STATE);
1131 			fmd_asru_hash_release(fmd.d_asrus, asru);
1132 		}
1133 	}
1134 
1135 	(void) pthread_mutex_unlock(&cip->ci_lock);
1136 
1137 	if (astate & FMD_ASRU_FAULTY)
1138 		return; /* one or more suspects are still marked faulty */
1139 
1140 	if (cstate == FMD_CASE_CLOSED)
1141 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1142 	else
1143 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1144 }
1145 
1146 /*
1147  * Delete a closed case from the module's case list once the fmdo_close() entry
1148  * point has run to completion.  If the case is owned by a transport module,
1149  * tell the transport to proxy a case close on the other end of the transport.
1150  * If not, transition to the appropriate next state based on ci_flags.  This
1151  * function represents the end of CLOSE_WAIT and transitions the case to either
1152  * CLOSED or REPAIRED or discards it entirely because it was never solved;
1153  * refer to the topmost block comment explaining the state machine for details.
1154  */
1155 void
1156 fmd_case_delete(fmd_case_t *cp)
1157 {
1158 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1159 	fmd_modstat_t *msp;
1160 	size_t buftotal;
1161 
1162 	ASSERT(fmd_module_locked(cip->ci_mod));
1163 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1164 	buftotal = fmd_buf_hash_destroy(&cip->ci_bufs);
1165 
1166 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1167 	msp = cip->ci_mod->mod_stats;
1168 
1169 	ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0);
1170 	msp->ms_caseopen.fmds_value.ui64--;
1171 
1172 	ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal);
1173 	msp->ms_buftotal.fmds_value.ui64 -= buftotal;
1174 
1175 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1176 
1177 	if (cip->ci_xprt == NULL)
1178 		fmd_module_setcdirty(cip->ci_mod);
1179 
1180 	fmd_module_rele(cip->ci_mod);
1181 	cip->ci_mod = fmd.d_rmod;
1182 	fmd_module_hold(cip->ci_mod);
1183 
1184 	/*
1185 	 * If the case is not proxied and it has been solved, then retain it
1186 	 * on the root module's case list at least until we're transitioned.
1187 	 * Otherwise free the case with our final fmd_case_rele() below.
1188 	 */
1189 	if (cip->ci_xprt == NULL && (cip->ci_flags & FMD_CF_SOLVED)) {
1190 		fmd_module_lock(cip->ci_mod);
1191 		fmd_list_append(&cip->ci_mod->mod_cases, cip);
1192 		fmd_module_unlock(cip->ci_mod);
1193 		fmd_case_hold(cp);
1194 	}
1195 
1196 	/*
1197 	 * If a proxied case finishes CLOSE_WAIT, then it can be discarded
1198 	 * rather than orphaned because by definition it can have no entries
1199 	 * in the resource cache of the current fault manager.
1200 	 */
1201 	if (cip->ci_xprt != NULL)
1202 		fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid);
1203 	else if (cip->ci_flags & FMD_CF_REPAIRED)
1204 		fmd_case_transition(cp, FMD_CASE_REPAIRED, 0);
1205 	else if (cip->ci_flags & FMD_CF_ISOLATED)
1206 		fmd_case_transition(cp, FMD_CASE_CLOSED, 0);
1207 
1208 	fmd_case_rele(cp);
1209 }
1210 
1211 void
1212 fmd_case_discard(fmd_case_t *cp)
1213 {
1214 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1215 
1216 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1217 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--;
1218 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1219 
1220 	ASSERT(fmd_module_locked(cip->ci_mod));
1221 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1222 	fmd_case_rele(cp);
1223 }
1224 
1225 static void
1226 fmd_case_repair_containee(fmd_asru_t *ee, void *er)
1227 {
1228 	if ((ee->asru_flags & FMD_ASRU_FAULTY) &&
1229 	    fmd_fmri_contains(er, ee->asru_fmri) > 0)
1230 		(void) fmd_asru_clrflags(ee, FMD_ASRU_FAULTY, NULL, NULL);
1231 }
1232 
1233 /*
1234  * Indicate that the problem corresponding to a case has been repaired by
1235  * clearing the faulty bit on each ASRU named as a suspect.  If the case hasn't
1236  * already been closed, this function initiates the transition to CLOSE_WAIT.
1237  * The caller must have the case held from fmd_case_hash_lookup(), so we can
1238  * grab and drop ci_lock without the case being able to be freed in between.
1239  */
1240 int
1241 fmd_case_repair(fmd_case_t *cp)
1242 {
1243 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1244 	fmd_case_susp_t *cis;
1245 	nvlist_t *nvl;
1246 	uint_t cstate;
1247 
1248 	fmd_asru_hash_t *ahp = fmd.d_asrus;
1249 	fmd_asru_t **aa;
1250 	uint_t i, an;
1251 
1252 	(void) pthread_mutex_lock(&cip->ci_lock);
1253 	cstate = cip->ci_state;
1254 
1255 	if (cip->ci_xprt != NULL) {
1256 		(void) pthread_mutex_unlock(&cip->ci_lock);
1257 		return (fmd_set_errno(EFMD_CASE_OWNER));
1258 	}
1259 
1260 	if (cstate < FMD_CASE_SOLVED || (cip->ci_flags & FMD_CF_REPAIRING)) {
1261 		(void) pthread_mutex_unlock(&cip->ci_lock);
1262 		return (fmd_set_errno(EFMD_CASE_STATE));
1263 	}
1264 
1265 	/*
1266 	 * Take a snapshot of any ASRUs referenced by the case that are present
1267 	 * in the resource cache.  Then drop ci_lock and clear the faulty bit
1268 	 * on each ASRU (we can't call fmd_asru_clrflags() with ci_lock held).
1269 	 */
1270 	an = cip->ci_nsuspects;
1271 	aa = alloca(sizeof (fmd_asru_t *) * an);
1272 	bzero(aa, sizeof (fmd_asru_t *) * an);
1273 
1274 	for (i = 0, cis = cip->ci_suspects;
1275 	    cis != NULL; cis = cis->cis_next, i++) {
1276 		if (nvlist_lookup_nvlist(cis->cis_nvl,
1277 		    FM_FAULT_ASRU, &nvl) == 0)
1278 			aa[i] = fmd_asru_hash_lookup_nvl(ahp, nvl, FMD_B_FALSE);
1279 	}
1280 
1281 	cip->ci_flags |= FMD_CF_REPAIRING;
1282 	(void) pthread_mutex_unlock(&cip->ci_lock);
1283 
1284 	/*
1285 	 * For each suspect ASRU, if the case associated with this ASRU matches
1286 	 * case 'cp', close all ASRUs contained by 'ap' and clear FAULTY.  Note
1287 	 * that at present, we're assuming that when a given resource FMRI R1
1288 	 * contains another R2, that any faults are related by a common
1289 	 * diagnosis engine.  This is true in our current architecture, but may
1290 	 * not always be true, at which point we'll need more cleverness here.
1291 	 */
1292 	for (i = 0; i < an; i++) {
1293 		if (aa[i] == NULL)
1294 			continue; /* no asru was found */
1295 
1296 		if (aa[i]->asru_case == cp) {
1297 			fmd_asru_hash_apply(fmd.d_asrus,
1298 			    fmd_case_repair_containee, aa[i]->asru_fmri);
1299 			(void) fmd_asru_clrflags(aa[i],
1300 			    FMD_ASRU_FAULTY, NULL, NULL);
1301 		}
1302 
1303 		fmd_asru_hash_release(ahp, aa[i]);
1304 	}
1305 
1306 	(void) pthread_mutex_lock(&cip->ci_lock);
1307 	cip->ci_flags &= ~FMD_CF_REPAIRING;
1308 	(void) pthread_mutex_unlock(&cip->ci_lock);
1309 
1310 	if (cstate == FMD_CASE_CLOSED)
1311 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1312 	else
1313 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1314 
1315 	return (0);
1316 }
1317 
1318 int
1319 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep)
1320 {
1321 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1322 	fmd_case_item_t *cit;
1323 	uint_t state;
1324 	int rv = 0;
1325 
1326 	(void) pthread_mutex_lock(&cip->ci_lock);
1327 
1328 	if (cip->ci_state >= FMD_CASE_SOLVED)
1329 		state = FMD_EVS_DIAGNOSED;
1330 	else
1331 		state = FMD_EVS_ACCEPTED;
1332 
1333 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1334 		if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0)
1335 			break;
1336 	}
1337 
1338 	if (rv == 0 && cip->ci_principal != NULL)
1339 		rv = fmd_event_equal(ep, cip->ci_principal);
1340 
1341 	(void) pthread_mutex_unlock(&cip->ci_lock);
1342 
1343 	if (rv != 0)
1344 		fmd_event_transition(ep, state);
1345 
1346 	return (rv);
1347 }
1348 
1349 int
1350 fmd_case_orphaned(fmd_case_t *cp)
1351 {
1352 	return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod);
1353 }
1354 
1355 void
1356 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec)
1357 {
1358 	((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec;
1359 	((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec;
1360 	((fmd_case_impl_t *)cp)->ci_tv_valid = 1;
1361 }
1362