xref: /titanic_51/usr/src/cmd/fm/fmd/common/fmd_case.c (revision ccbf80fa3b6bf6b986dca9037e5ad9d6c9f9fa65)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * FMD Case Subsystem
31  *
32  * Diagnosis engines are expected to group telemetry events related to the
33  * diagnosis of a particular problem on the system into a set of cases.  The
34  * diagnosis engine may have any number of cases open at a given point in time.
35  * Some cases may eventually be *solved* by associating a suspect list of one
36  * or more problems with the case, at which point fmd publishes a list.suspect
37  * event for the case and it becomes visible to administrators and agents.
38  *
39  * Every case is named using a UUID, and is globally visible in the case hash.
40  * Cases are reference-counted, except for the reference from the case hash
41  * itself.  Consumers of case references include modules, which store active
42  * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
43  *
44  * Cases obey the following state machine.  In states UNSOLVED, SOLVED, and
45  * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
46  * or transport) and the case is referenced by the mod_cases list.  Once the
47  * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
48  * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
49  *
50  *			+------------+
51  *	     +----------|  UNSOLVED  |
52  *	     |		+------------+
53  *	   1 |	             4 |
54  *           |                 |
55  *	+----v---+ /-2->+------v-----+	  3	+--------+
56  *      | SOLVED |<     | CLOSE_WAIT |--------->| CLOSED |
57  *	+--------+ \-5->+------------+		+--------+
58  *	                       |                    |
59  *                           6 |                    | 7
60  *      		+------v-----+              |
61  *	                |  REPAIRED  |<-------------+
62  *			+------------+
63  *
64  * The state machine changes are triggered by calls to fmd_case_transition()
65  * from various locations inside of fmd, as described below:
66  *
67  * [1] Called by: fmd_case_solve()
68  *       Actions: FMD_CF_SOLVED flag is set in ci_flags
69  *                conviction policy is applied to suspect list
70  *                suspects convicted are marked faulty (F) in R$
71  *                list.suspect event logged and dispatched
72  *
73  * [2] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
74  *       Actions: FMD_CF_ISOLATED flag is set in ci_flags
75  *                suspects convicted (F) are marked unusable (U) in R$
76  *                diagnosis engine fmdo_close() entry point scheduled
77  *                case transitions to CLOSED [3] upon exit from CLOSE_WAIT
78  *
79  * [3] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
80  *       Actions: list.isolated event dispatched
81  *                case deleted from module's list of open cases
82  *
83  * [4] Called by: fmd_case_close(), fmd_case_uuclose()
84  *       Actions: diagnosis engine fmdo_close() entry point scheduled
85  *                case is subsequently discarded by fmd_case_delete()
86  *
87  * [5] Called by: fmd_case_repair(), fmd_case_update()
88  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
89  *                diagnosis engine fmdo_close() entry point scheduled
90  *                case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
91  *
92  * [6] Called by: fmd_case_repair(), fmd_case_update()
93  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
94  *                suspects convicted are marked non faulty (!F) in R$
95  *                list.repaired event dispatched
96  *
97  * [7] Called by: fmd_case_repair(), fmd_case_update()
98  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
99  *                suspects convicted are marked non faulty (!F) in R$
100  *                list.repaired event dispatched
101  */
102 
103 #include <sys/fm/protocol.h>
104 #include <uuid/uuid.h>
105 #include <alloca.h>
106 
107 #include <fmd_alloc.h>
108 #include <fmd_module.h>
109 #include <fmd_error.h>
110 #include <fmd_conf.h>
111 #include <fmd_case.h>
112 #include <fmd_string.h>
113 #include <fmd_subr.h>
114 #include <fmd_protocol.h>
115 #include <fmd_event.h>
116 #include <fmd_eventq.h>
117 #include <fmd_dispq.h>
118 #include <fmd_buf.h>
119 #include <fmd_log.h>
120 #include <fmd_asru.h>
121 #include <fmd_fmri.h>
122 #include <fmd_xprt.h>
123 
124 #include <fmd.h>
125 
/*
 * Printable names for the case states, listed in the order of the FMD_CASE_*
 * constant noted beside each entry.  fmd_case_transition() indexes this table
 * by state value when it traces a state change.
 */
static const char *const _fmd_case_snames[] = {
	"UNSOLVED",	/* FMD_CASE_UNSOLVED */
	"SOLVED",	/* FMD_CASE_SOLVED */
	"CLOSE_WAIT",	/* FMD_CASE_CLOSE_WAIT */
	"CLOSED",	/* FMD_CASE_CLOSED */
	"REPAIRED"	/* FMD_CASE_REPAIRED */
};
133 
134 fmd_case_hash_t *
135 fmd_case_hash_create(void)
136 {
137 	fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP);
138 
139 	(void) pthread_rwlock_init(&chp->ch_lock, NULL);
140 	chp->ch_hashlen = fmd.d_str_buckets;
141 	chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP);
142 	chp->ch_count = 0;
143 
144 	return (chp);
145 }
146 
147 /*
148  * Destroy the case hash.  Unlike most of our hash tables, no active references
149  * are kept by the case hash itself; all references come from other subsystems.
150  * The hash must be destroyed after all modules are unloaded; if anything was
151  * present in the hash it would be by definition a reference count leak.
152  */
153 void
154 fmd_case_hash_destroy(fmd_case_hash_t *chp)
155 {
156 	fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen);
157 	fmd_free(chp, sizeof (fmd_case_hash_t));
158 }
159 
160 /*
161  * Take a snapshot of the case hash by placing an additional hold on each
162  * member in an auxiliary array, and then call 'func' for each case.
163  */
164 void
165 fmd_case_hash_apply(fmd_case_hash_t *chp,
166     void (*func)(fmd_case_t *, void *), void *arg)
167 {
168 	fmd_case_impl_t *cp, **cps, **cpp;
169 	uint_t cpc, i;
170 
171 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
172 
173 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
174 	cpc = chp->ch_count;
175 
176 	for (i = 0; i < chp->ch_hashlen; i++) {
177 		for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) {
178 			fmd_case_hold((fmd_case_t *)cp);
179 			*cpp++ = cp;
180 		}
181 	}
182 
183 	ASSERT(cpp == cps + cpc);
184 	(void) pthread_rwlock_unlock(&chp->ch_lock);
185 
186 	for (i = 0; i < cpc; i++) {
187 		func((fmd_case_t *)cps[i], arg);
188 		fmd_case_rele((fmd_case_t *)cps[i]);
189 	}
190 
191 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
192 }
193 
194 /*
195  * Look up the diagcode for this case and cache it in ci_code.  If no suspects
196  * were defined for this case or if the lookup fails, the event dictionary or
197  * module code is broken, and we set the event code to a precomputed default.
198  */
199 static const char *
200 fmd_case_mkcode(fmd_case_t *cp)
201 {
202 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
203 	fmd_case_susp_t *cis;
204 
205 	char **keys, **keyp;
206 	const char *s;
207 
208 	ASSERT(MUTEX_HELD(&cip->ci_lock));
209 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
210 
211 	fmd_free(cip->ci_code, cip->ci_codelen);
212 	cip->ci_codelen = cip->ci_mod->mod_codelen;
213 	cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
214 	keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1));
215 
216 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
217 		if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0)
218 			keyp++;
219 	}
220 
221 	*keyp = NULL; /* mark end of keys[] array for libdiagcode */
222 
223 	if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code(
224 	    cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) {
225 		(void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s);
226 		fmd_free(cip->ci_code, cip->ci_codelen);
227 		cip->ci_codelen = strlen(s) + 1;
228 		cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
229 		(void) strcpy(cip->ci_code, s);
230 	}
231 
232 	return (cip->ci_code);
233 }
234 
235 nvlist_t *
236 fmd_case_mkevent(fmd_case_t *cp, const char *class)
237 {
238 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
239 	fmd_case_susp_t *cis;
240 
241 	fmd_asru_hash_t *ahp = fmd.d_asrus;
242 	fmd_asru_t *asru;
243 
244 	nvlist_t **nva, **nvp, *nvl, *fmri;
245 	uint8_t *ba, *bp;
246 
247 	int msg = B_TRUE;
248 	boolean_t b;
249 
250 	(void) pthread_mutex_lock(&cip->ci_lock);
251 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
252 
253 	nva = nvp = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
254 	ba = bp = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
255 
256 	/*
257 	 * For each suspect associated with the case, store its fault event
258 	 * nvlist in 'nva'.  We also look to see if any of the suspect faults
259 	 * have asked not to be messaged.  If any of them have made such a
260 	 * request, propagate that attribute to the composite list.* event.
261 	 * Finally, store each suspect's faulty status into the bitmap 'ba'.
262 	 */
263 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
264 		if (nvlist_lookup_boolean_value(cis->cis_nvl,
265 		    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
266 			msg = B_FALSE;
267 
268 		if (nvlist_lookup_nvlist(cis->cis_nvl,
269 		    FM_FAULT_ASRU, &fmri) == 0 && (asru =
270 		    fmd_asru_hash_lookup_nvl(ahp, fmri, FMD_B_FALSE)) != NULL) {
271 			*bp++ = (asru->asru_flags & FMD_ASRU_FAULTY) != 0;
272 			fmd_asru_hash_release(ahp, asru);
273 		} else
274 			*bp++ = 0;
275 
276 		*nvp++ = cis->cis_nvl;
277 	}
278 
279 	if (cip->ci_code == NULL)
280 		(void) fmd_case_mkcode(cp);
281 
282 	nvl = fmd_protocol_list(class, cip->ci_mod->mod_fmri,
283 	    cip->ci_uuid, cip->ci_code, cip->ci_nsuspects, nva, ba, msg);
284 
285 	(void) pthread_mutex_unlock(&cip->ci_lock);
286 	return (nvl);
287 }
288 
289 /*
290  * Convict suspects in a case by applying a conviction policy and updating the
291  * resource cache prior to emitting the list.suspect event for the given case.
292  * At present, our policy is very simple: convict every suspect in the case.
293  * In the future, this policy can be extended and made configurable to permit:
294  *
295  * - convicting the suspect with the highest FIT rate
296  * - convicting the suspect with the cheapest FRU
297  * - convicting the suspect with the FRU that is in a depot's inventory
298  * - convicting the suspect with the longest lifetime
299  *
300  * and so forth.  A word to the wise: this problem is significantly harder than
301  * it seems at first glance.  Future work should heed the following advice:
302  *
303  * Hacking the policy into C code here is a very bad idea.  The policy needs to
304  * be decided upon very carefully and fundamentally encodes knowledge of what
305  * suspect list combinations can be emitted by what diagnosis engines.  As such
306  * fmd's code is the wrong location, because that would require fmd itself to
307  * be updated for every diagnosis engine change, defeating the entire design.
308  * The FMA Event Registry knows the suspect list combinations: policy inputs
309  * can be derived from it and used to produce per-module policy configuration.
310  *
311  * If the policy needs to be dynamic and not statically fixed at either fmd
312  * startup or module load time, any implementation of dynamic policy retrieval
313  * must employ some kind of caching mechanism or be part of a built-in module.
314  * The fmd_case_convict() function is called with locks held inside of fmd and
315  * is not a place where unbounded blocking on some inter-process or inter-
316  * system communication to another service (e.g. another daemon) can occur.
317  */
318 static void
319 fmd_case_convict(fmd_case_t *cp)
320 {
321 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
322 	fmd_asru_hash_t *ahp = fmd.d_asrus;
323 
324 	fmd_case_susp_t *cis;
325 	fmd_asru_t *asru;
326 	nvlist_t *fmri;
327 
328 	(void) pthread_mutex_lock(&cip->ci_lock);
329 	(void) fmd_case_mkcode(cp);
330 
331 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
332 		if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU, &fmri))
333 			continue; /* no ASRU provided by diagnosis engine */
334 
335 		if ((asru = fmd_asru_hash_lookup_nvl(ahp,
336 		    fmri, FMD_B_TRUE)) == NULL) {
337 			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
338 			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
339 			continue;
340 		}
341 
342 		(void) fmd_asru_clrflags(asru,
343 		    FMD_ASRU_UNUSABLE, cp, cis->cis_nvl);
344 		(void) fmd_asru_setflags(asru,
345 		    FMD_ASRU_FAULTY, cp, cis->cis_nvl);
346 
347 		fmd_asru_hash_release(ahp, asru);
348 	}
349 
350 	(void) pthread_mutex_unlock(&cip->ci_lock);
351 }
352 
353 void
354 fmd_case_publish(fmd_case_t *cp, uint_t state)
355 {
356 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
357 	fmd_event_t *e;
358 	nvlist_t *nvl;
359 	char *class;
360 
361 	if (state == FMD_CASE_CURRENT)
362 		state = cip->ci_state; /* use current state */
363 
364 	switch (state) {
365 	case FMD_CASE_SOLVED:
366 		fmd_case_convict(cp);
367 		nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
368 		(void) pthread_mutex_lock(&cip->ci_lock);
369 		if (cip->ci_diag == NULL)
370 			(void) nvlist_xdup(nvl, &cip->ci_diag, &fmd.d_nva);
371 		(void) pthread_mutex_unlock(&cip->ci_lock);
372 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
373 
374 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
375 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
376 		fmd_log_append(fmd.d_fltlog, e, cp);
377 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
378 		fmd_dispq_dispatch(fmd.d_disp, e, class);
379 
380 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
381 		cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
382 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
383 
384 		break;
385 
386 	case FMD_CASE_CLOSE_WAIT:
387 		fmd_case_hold(cp);
388 		e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
389 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
390 
391 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
392 		cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
393 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
394 
395 		break;
396 
397 	case FMD_CASE_CLOSED:
398 		nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
399 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
400 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
401 		fmd_dispq_dispatch(fmd.d_disp, e, class);
402 		break;
403 
404 	case FMD_CASE_REPAIRED:
405 		nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
406 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
407 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
408 		fmd_dispq_dispatch(fmd.d_disp, e, class);
409 		break;
410 	}
411 }
412 
413 fmd_case_t *
414 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
415 {
416 	fmd_case_impl_t *cip;
417 	uint_t h;
418 
419 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
420 	h = fmd_strhash(uuid) % chp->ch_hashlen;
421 
422 	for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
423 		if (strcmp(cip->ci_uuid, uuid) == 0)
424 			break;
425 	}
426 
427 	if (cip != NULL)
428 		fmd_case_hold((fmd_case_t *)cip);
429 	else
430 		(void) fmd_set_errno(EFMD_CASE_INVAL);
431 
432 	(void) pthread_rwlock_unlock(&chp->ch_lock);
433 	return ((fmd_case_t *)cip);
434 }
435 
436 static fmd_case_impl_t *
437 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
438 {
439 	fmd_case_impl_t *eip;
440 	uint_t h;
441 
442 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
443 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
444 
445 	for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
446 		if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0) {
447 			fmd_case_hold((fmd_case_t *)eip);
448 			(void) pthread_rwlock_unlock(&chp->ch_lock);
449 			return (eip); /* uuid already present */
450 		}
451 	}
452 
453 	cip->ci_next = chp->ch_hash[h];
454 	chp->ch_hash[h] = cip;
455 
456 	chp->ch_count++;
457 	ASSERT(chp->ch_count != 0);
458 
459 	(void) pthread_rwlock_unlock(&chp->ch_lock);
460 	return (cip);
461 }
462 
463 static void
464 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
465 {
466 	fmd_case_impl_t *cp, **pp;
467 	uint_t h;
468 
469 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
470 
471 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
472 	pp = &chp->ch_hash[h];
473 
474 	for (cp = *pp; cp != NULL; cp = cp->ci_next) {
475 		if (cp != cip)
476 			pp = &cp->ci_next;
477 		else
478 			break;
479 	}
480 
481 	if (cp == NULL) {
482 		fmd_panic("case %p (%s) not found on hash chain %u\n",
483 		    (void *)cip, cip->ci_uuid, h);
484 	}
485 
486 	*pp = cp->ci_next;
487 	cp->ci_next = NULL;
488 
489 	ASSERT(chp->ch_count != 0);
490 	chp->ch_count--;
491 
492 	(void) pthread_rwlock_unlock(&chp->ch_lock);
493 }
494 
495 fmd_case_t *
496 fmd_case_create(fmd_module_t *mp, void *data)
497 {
498 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
499 	fmd_case_impl_t *eip = NULL;
500 	uuid_t uuid;
501 
502 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
503 	fmd_buf_hash_create(&cip->ci_bufs);
504 
505 	fmd_module_hold(mp);
506 	cip->ci_mod = mp;
507 	cip->ci_refs = 1;
508 	cip->ci_state = FMD_CASE_UNSOLVED;
509 	cip->ci_flags = FMD_CF_DIRTY;
510 	cip->ci_data = data;
511 
512 	/*
513 	 * Calling libuuid: get a clue.  The library interfaces cleverly do not
514 	 * define any constant for the length of an unparse string, and do not
515 	 * permit the caller to specify a buffer length for safety.  The spec
516 	 * says it will be 36 bytes, but we make it tunable just in case.
517 	 */
518 	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
519 	cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);
520 
521 	/*
522 	 * We expect this loop to execute only once, but code it defensively
523 	 * against the possibility of libuuid bugs.  Keep generating uuids and
524 	 * attempting to do a hash insert until we get a unique one.
525 	 */
526 	do {
527 		if (eip != NULL)
528 			fmd_case_rele((fmd_case_t *)eip);
529 		uuid_generate(uuid);
530 		uuid_unparse(uuid, cip->ci_uuid);
531 	} while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);
532 
533 	ASSERT(fmd_module_locked(mp));
534 	fmd_list_append(&mp->mod_cases, cip);
535 	fmd_module_setcdirty(mp);
536 
537 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
538 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
539 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
540 
541 	return ((fmd_case_t *)cip);
542 }
543 
544 static void
545 fmd_case_destroy_suspects(fmd_case_impl_t *cip)
546 {
547 	fmd_case_susp_t *cis, *ncis;
548 
549 	ASSERT(MUTEX_HELD(&cip->ci_lock));
550 
551 	for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
552 		ncis = cis->cis_next;
553 		nvlist_free(cis->cis_nvl);
554 		fmd_free(cis, sizeof (fmd_case_susp_t));
555 	}
556 
557 	cip->ci_suspects = NULL;
558 	cip->ci_nsuspects = 0;
559 }
560 
/*
 * Re-create a case with a known UUID, state, and (optionally NULL) diagcode
 * on behalf of module 'mp', optionally associated with transport 'xp'.
 *
 * A new case object is built and offered to the global case hash.  If the
 * UUID is not yet present, the new case is appended to mp->mod_cases, counted
 * in the module's open-case statistic, and returned.  If the UUID is already
 * present, the new object is discarded and one of the following happens:
 *
 * - mp is the root module: the existing case is returned unchanged (it is not
 *   re-appended to any list and no statistic is bumped);
 * - the existing case is not owned by the root module, or xp is non-NULL:
 *   NULL is returned to indicate a UUID conflict;
 * - otherwise the existing (orphaned) case is moved from the root module to
 *   mp, its suspect list is discarded, its state is set to 'state', and it is
 *   returned after being appended to mp->mod_cases.
 *
 * The caller must hold the module lock for 'mp' on the paths that append to
 * mp->mod_cases (asserted below).
 */
fmd_case_t *
fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
    uint_t state, const char *uuid, const char *code)
{
	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
	fmd_case_impl_t *eip;

	ASSERT(state < FMD_CASE_REPAIRED);

	(void) pthread_mutex_init(&cip->ci_lock, NULL);
	fmd_buf_hash_create(&cip->ci_bufs);

	fmd_module_hold(mp);
	cip->ci_mod = mp;
	cip->ci_xprt = xp;
	cip->ci_refs = 1;
	cip->ci_state = state;
	cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
	cip->ci_uuidlen = strlen(cip->ci_uuid);
	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
	/* ci_codelen counts the terminating NUL; zero if no code was given */
	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;

	/* States beyond CLOSE_WAIT are treated as having been solved. */
	if (state > FMD_CASE_CLOSE_WAIT)
		cip->ci_flags |= FMD_CF_SOLVED;

	/*
	 * Insert the case into the global case hash.  If the specified UUID is
	 * already present, check to see if it is an orphan: if so, reclaim it;
	 * otherwise if it is owned by a different module then return NULL.
	 */
	if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
		/*
		 * The new object never became visible in the hash, so tear it
		 * down directly: fmd_case_destroy() expects ci_lock held and
		 * a zero reference count, and B_FALSE skips the hash delete.
		 */
		(void) pthread_mutex_lock(&cip->ci_lock);
		cip->ci_refs--; /* decrement to zero */
		fmd_case_destroy((fmd_case_t *)cip, B_FALSE);

		cip = eip; /* switch 'cip' to the existing case */
		(void) pthread_mutex_lock(&cip->ci_lock);

		/*
		 * If the ASRU cache is trying to recreate an orphan, then just
		 * return the existing case that we found without changing it.
		 * The fmd_case_rele() drops the hold that
		 * fmd_case_hash_insert() placed on the existing case.
		 */
		if (mp == fmd.d_rmod) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele((fmd_case_t *)cip);
			return ((fmd_case_t *)cip);
		}

		/*
		 * If the existing case isn't an orphan or is being proxied,
		 * then we have a UUID conflict: return failure to the caller.
		 */
		if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele((fmd_case_t *)cip);
			return (NULL);
		}

		/*
		 * If the new module is reclaiming an orphaned case, remove
		 * the case from the root module, switch ci_mod, and then fall
		 * through to adding the case to the new owner module 'mp'.
		 */
		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);

		/* Swap the case's module hold from the root module to 'mp'. */
		fmd_module_rele(cip->ci_mod);
		cip->ci_mod = mp;
		fmd_module_hold(mp);

		fmd_case_destroy_suspects(cip);
		cip->ci_state = state;

		(void) pthread_mutex_unlock(&cip->ci_lock);
		/* Drop the hold taken by fmd_case_hash_insert(). */
		fmd_case_rele((fmd_case_t *)cip);
	}

	ASSERT(fmd_module_locked(mp));
	fmd_list_append(&mp->mod_cases, cip);

	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

	return ((fmd_case_t *)cip);
}
648 
649 void
650 fmd_case_destroy(fmd_case_t *cp, int visible)
651 {
652 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
653 	fmd_case_item_t *cit, *ncit;
654 
655 	ASSERT(MUTEX_HELD(&cip->ci_lock));
656 	ASSERT(cip->ci_refs == 0);
657 
658 	if (visible) {
659 		TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
660 		fmd_case_hash_delete(fmd.d_cases, cip);
661 	}
662 
663 	for (cit = cip->ci_items; cit != NULL; cit = ncit) {
664 		ncit = cit->cit_next;
665 		fmd_event_rele(cit->cit_event);
666 		fmd_free(cit, sizeof (fmd_case_item_t));
667 	}
668 
669 	fmd_case_destroy_suspects(cip);
670 
671 	if (cip->ci_principal != NULL)
672 		fmd_event_rele(cip->ci_principal);
673 
674 	fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
675 	fmd_free(cip->ci_code, cip->ci_codelen);
676 	(void) fmd_buf_hash_destroy(&cip->ci_bufs);
677 
678 	if (cip->ci_diag != NULL)
679 		nvlist_free(cip->ci_diag);
680 
681 	fmd_module_rele(cip->ci_mod);
682 	fmd_free(cip, sizeof (fmd_case_impl_t));
683 }
684 
685 void
686 fmd_case_hold(fmd_case_t *cp)
687 {
688 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
689 
690 	(void) pthread_mutex_lock(&cip->ci_lock);
691 	cip->ci_refs++;
692 	ASSERT(cip->ci_refs != 0);
693 	(void) pthread_mutex_unlock(&cip->ci_lock);
694 }
695 
696 void
697 fmd_case_hold_locked(fmd_case_t *cp)
698 {
699 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
700 
701 	ASSERT(MUTEX_HELD(&cip->ci_lock));
702 	cip->ci_refs++;
703 	ASSERT(cip->ci_refs != 0);
704 }
705 
706 void
707 fmd_case_rele(fmd_case_t *cp)
708 {
709 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
710 
711 	(void) pthread_mutex_lock(&cip->ci_lock);
712 	ASSERT(cip->ci_refs != 0);
713 
714 	if (--cip->ci_refs == 0)
715 		fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
716 	else
717 		(void) pthread_mutex_unlock(&cip->ci_lock);
718 }
719 
720 int
721 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep)
722 {
723 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
724 	fmd_case_item_t *cit;
725 	fmd_event_t *oep;
726 	uint_t state;
727 	int new;
728 
729 	fmd_event_hold(ep);
730 	(void) pthread_mutex_lock(&cip->ci_lock);
731 
732 	if (cip->ci_flags & FMD_CF_SOLVED)
733 		state = FMD_EVS_DIAGNOSED;
734 	else
735 		state = FMD_EVS_ACCEPTED;
736 
737 	oep = cip->ci_principal;
738 	cip->ci_principal = ep;
739 
740 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
741 		if (cit->cit_event == ep)
742 			break;
743 	}
744 
745 	cip->ci_flags |= FMD_CF_DIRTY;
746 	new = cit == NULL && ep != oep;
747 
748 	(void) pthread_mutex_unlock(&cip->ci_lock);
749 
750 	fmd_module_setcdirty(cip->ci_mod);
751 	fmd_event_transition(ep, state);
752 
753 	if (oep != NULL)
754 		fmd_event_rele(oep);
755 
756 	return (new);
757 }
758 
759 int
760 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep)
761 {
762 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
763 	fmd_case_item_t *cit;
764 	uint_t state;
765 	int new;
766 
767 	(void) pthread_mutex_lock(&cip->ci_lock);
768 
769 	if (cip->ci_flags & FMD_CF_SOLVED)
770 		state = FMD_EVS_DIAGNOSED;
771 	else
772 		state = FMD_EVS_ACCEPTED;
773 
774 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
775 		if (cit->cit_event == ep)
776 			break;
777 	}
778 
779 	new = cit == NULL && ep != cip->ci_principal;
780 
781 	/*
782 	 * If the event is already in the case or the case is already solved,
783 	 * there is no reason to save it: just transition it appropriately.
784 	 */
785 	if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
786 		(void) pthread_mutex_unlock(&cip->ci_lock);
787 		fmd_event_transition(ep, state);
788 		return (new);
789 	}
790 
791 	cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
792 	fmd_event_hold(ep);
793 
794 	cit->cit_next = cip->ci_items;
795 	cit->cit_event = ep;
796 
797 	cip->ci_items = cit;
798 	cip->ci_nitems++;
799 
800 	cip->ci_flags |= FMD_CF_DIRTY;
801 	(void) pthread_mutex_unlock(&cip->ci_lock);
802 
803 	fmd_module_setcdirty(cip->ci_mod);
804 	fmd_event_transition(ep, state);
805 
806 	return (new);
807 }
808 
809 void
810 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
811 {
812 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
813 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
814 
815 	(void) pthread_mutex_lock(&cip->ci_lock);
816 	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
817 	cip->ci_flags |= FMD_CF_DIRTY;
818 
819 	cis->cis_next = cip->ci_suspects;
820 	cis->cis_nvl = nvl;
821 
822 	cip->ci_suspects = cis;
823 	cip->ci_nsuspects++;
824 
825 	(void) pthread_mutex_unlock(&cip->ci_lock);
826 	fmd_module_setcdirty(cip->ci_mod);
827 }
828 
829 void
830 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
831 {
832 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
833 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
834 
835 	(void) pthread_mutex_lock(&cip->ci_lock);
836 	ASSERT(cip->ci_state == FMD_CASE_CLOSED);
837 	ASSERT(cip->ci_mod == fmd.d_rmod);
838 
839 	cis->cis_next = cip->ci_suspects;
840 	cis->cis_nvl = nvl;
841 
842 	cip->ci_suspects = cis;
843 	cip->ci_nsuspects++;
844 
845 	(void) pthread_mutex_unlock(&cip->ci_lock);
846 }
847 
848 void
849 fmd_case_reset_suspects(fmd_case_t *cp)
850 {
851 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
852 
853 	(void) pthread_mutex_lock(&cip->ci_lock);
854 	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
855 
856 	fmd_case_destroy_suspects(cip);
857 	cip->ci_flags |= FMD_CF_DIRTY;
858 
859 	(void) pthread_mutex_unlock(&cip->ci_lock);
860 	fmd_module_setcdirty(cip->ci_mod);
861 }
862 
/*
 * Grab ci_lock and update the case state and set the dirty bit.  Then perform
 * whatever actions and emit whatever events are appropriate for the state.
 * Refer to the topmost block comment explaining the state machine for details.
 *
 * 'state' is the requested new state and 'flags' are FMD_CF_* bits to OR into
 * ci_flags.  The flags are applied even when no state change results.  The
 * state only ever advances: if the case is already at or beyond 'state', the
 * function returns after updating the flags.  On a transition to REPAIRED the
 * case is removed from its module's mod_cases list and one reference is
 * released before returning, so the caller needs its own hold if it intends
 * to keep using 'cp'.
 */
void
fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	fmd_case_susp_t *cis;
	fmd_case_item_t *cit;
	fmd_asru_t *asru;
	fmd_event_t *e;
	nvlist_t *nvl;

	ASSERT(state <= FMD_CASE_REPAIRED);
	(void) pthread_mutex_lock(&cip->ci_lock);

	/* ISOLATED and REPAIRED are only recorded for a solved case. */
	if (!(cip->ci_flags & FMD_CF_SOLVED))
		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED);

	cip->ci_flags |= flags;

	if (cip->ci_state >= state) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return; /* already in specified state */
	}

	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));

	cip->ci_state = state;
	cip->ci_flags |= FMD_CF_DIRTY;

	/* Only non-proxied cases owned by a non-root module dirty the module */
	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
		fmd_module_setcdirty(cip->ci_mod);

	switch (state) {
	case FMD_CASE_SOLVED:
		/* Every event attached to the case is now diagnosed. */
		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);

		if (cip->ci_principal != NULL) {
			fmd_event_transition(cip->ci_principal,
			    FMD_EVS_DIAGNOSED);
		}
		break;

	case FMD_CASE_CLOSE_WAIT:
		/*
		 * If the case was never solved, do not change ASRUs.
		 * If the case was never fmd_case_closed, do not change ASRUs.
		 * If the case was repaired, do not change ASRUs.
		 */
		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
		    FMD_CF_REPAIRED)) != (FMD_CF_SOLVED | FMD_CF_ISOLATED))
			goto close_wait_finish;

		/*
		 * For each fault event in the suspect list, attempt to look up
		 * the corresponding ASRU in the ASRU dictionary.  If the ASRU
		 * is found there and is marked faulty, we now mark it unusable
		 * and record the case meta-data and fault event with the ASRU.
		 */
		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
			if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU,
			    &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl(
			    fmd.d_asrus, nvl, FMD_B_FALSE)) != NULL) {
				(void) fmd_asru_setflags(asru,
				    FMD_ASRU_UNUSABLE, cp, cis->cis_nvl);
				fmd_asru_hash_release(fmd.d_asrus, asru);
			}
		}

	close_wait_finish:
		/*
		 * If an orphaned case transitions to CLOSE_WAIT, the owning
		 * module is no longer loaded: continue on to CASE_CLOSED.
		 * Note that the local 'state' is updated too, so the publish
		 * step below acts on CLOSED rather than CLOSE_WAIT.
		 */
		if (fmd_case_orphaned(cp))
			state = cip->ci_state = FMD_CASE_CLOSED;
		break;

	case FMD_CASE_REPAIRED:
		/* Unlink from the module's case list; rele happens at the end */
		ASSERT(fmd_case_orphaned(cp));
		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);
		break;
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);

	/*
	 * If the module has initialized, then publish the appropriate event
	 * for the new case state.  If not, we are being called from the
	 * checkpoint code during module load, in which case the module's
	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
	 * may not be open yet, which will prevent us from computing the event
	 * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
	 * event in our queue: this won't be processed until _fmd_init is done.
	 */
	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
		fmd_case_publish(cp, state);
	else {
		/* The queued PUBLISH event carries its own hold on the case. */
		fmd_case_hold(cp);
		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
	}

	/*
	 * If we transitioned to REPAIRED, adjust the reference count to
	 * reflect our removal from fmd.d_rmod->mod_cases.  If the caller has
	 * not placed an additional hold on the case, it will now be freed.
	 */
	if (state == FMD_CASE_REPAIRED)
		fmd_case_rele(cp);
}
982 
983 /*
984  * Transition the specified case to *at least* the specified state by first
985  * re-validating the suspect list using the resource cache.  This function is
986  * employed by the checkpoint code when restoring a saved, solved case to see
987  * if the state of the case has effectively changed while fmd was not running
988  * or the module was not loaded.  If none of the suspects are present anymore,
989  * advance the state to REPAIRED.  If none are usable, advance to CLOSE_WAIT.
990  */
991 void
992 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
993 {
994 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
995 	fmd_case_susp_t *cis;
996 	fmd_asru_t *asru;
997 	nvlist_t *nvl;
998 
999 	int present = 0;	/* are any suspects present? */
1000 	int usable = 0;		/* are any suspects usable? */
1001 
1002 	ASSERT(state >= FMD_CASE_SOLVED);
1003 	(void) pthread_mutex_lock(&cip->ci_lock);
1004 
1005 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
1006 		if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU,
1007 		    &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl(
1008 		    fmd.d_asrus, nvl, FMD_B_TRUE)) != NULL) {
1009 
1010 			if ((asru->asru_flags & FMD_ASRU_INTERNAL) ||
1011 			    fmd_fmri_present(asru->asru_fmri) > 0)
1012 				present++;
1013 
1014 			if (fmd_fmri_unusable(asru->asru_fmri) <= 0)
1015 				usable++;
1016 
1017 			fmd_asru_hash_release(fmd.d_asrus, asru);
1018 		}
1019 	}
1020 
1021 	(void) pthread_mutex_unlock(&cip->ci_lock);
1022 
1023 	if (!present) {
1024 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
1025 		flags |= FMD_CF_REPAIRED;
1026 	} else if (!usable) {
1027 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
1028 		flags |= FMD_CF_ISOLATED;
1029 	}
1030 
1031 	fmd_case_transition(cp, state, flags);
1032 }
1033 
1034 void
1035 fmd_case_setdirty(fmd_case_t *cp)
1036 {
1037 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1038 
1039 	(void) pthread_mutex_lock(&cip->ci_lock);
1040 	cip->ci_flags |= FMD_CF_DIRTY;
1041 	(void) pthread_mutex_unlock(&cip->ci_lock);
1042 
1043 	fmd_module_setcdirty(cip->ci_mod);
1044 }
1045 
1046 void
1047 fmd_case_clrdirty(fmd_case_t *cp)
1048 {
1049 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1050 
1051 	(void) pthread_mutex_lock(&cip->ci_lock);
1052 	cip->ci_flags &= ~FMD_CF_DIRTY;
1053 	(void) pthread_mutex_unlock(&cip->ci_lock);
1054 }
1055 
1056 void
1057 fmd_case_commit(fmd_case_t *cp)
1058 {
1059 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1060 	fmd_case_item_t *cit;
1061 
1062 	(void) pthread_mutex_lock(&cip->ci_lock);
1063 
1064 	if (cip->ci_flags & FMD_CF_DIRTY) {
1065 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
1066 			fmd_event_commit(cit->cit_event);
1067 
1068 		if (cip->ci_principal != NULL)
1069 			fmd_event_commit(cip->ci_principal);
1070 
1071 		fmd_buf_hash_commit(&cip->ci_bufs);
1072 		cip->ci_flags &= ~FMD_CF_DIRTY;
1073 	}
1074 
1075 	(void) pthread_mutex_unlock(&cip->ci_lock);
1076 }
1077 
1078 /*
1079  * Indicate that the case may need to change state because one or more of the
1080  * ASRUs named as a suspect has changed state.  We examine all the suspects
1081  * and if none are still faulty, we initiate a case close transition.
1082  */
1083 void
1084 fmd_case_update(fmd_case_t *cp)
1085 {
1086 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1087 	fmd_case_susp_t *cis;
1088 	fmd_asru_t *asru;
1089 	nvlist_t *nvl;
1090 
1091 	int astate = 0;
1092 	uint_t cstate;
1093 
1094 	(void) pthread_mutex_lock(&cip->ci_lock);
1095 	cstate = cip->ci_state;
1096 
1097 	if ((cip->ci_flags & FMD_CF_REPAIRING) ||
1098 	    cip->ci_xprt != NULL || cip->ci_state < FMD_CASE_SOLVED) {
1099 		(void) pthread_mutex_unlock(&cip->ci_lock);
1100 		return; /* update is not appropriate */
1101 	}
1102 
1103 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
1104 		if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU,
1105 		    &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl(
1106 		    fmd.d_asrus, nvl, FMD_B_FALSE)) != NULL) {
1107 			astate |= fmd_asru_getstate(asru);
1108 			fmd_asru_hash_release(fmd.d_asrus, asru);
1109 		}
1110 	}
1111 
1112 	(void) pthread_mutex_unlock(&cip->ci_lock);
1113 
1114 	if (astate & FMD_ASRU_FAULTY)
1115 		return; /* one or more suspects are still marked faulty */
1116 
1117 	if (cstate == FMD_CASE_CLOSED)
1118 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1119 	else
1120 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1121 }
1122 
1123 /*
1124  * Delete a closed case from the module's case list once the fmdo_close() entry
1125  * point has run to completion.  If the case is owned by a transport module,
1126  * tell the transport to proxy a case close on the other end of the transport.
1127  * If not, transition to the appropriate next state based on ci_flags.  This
1128  * function represents the end of CLOSE_WAIT and transitions the case to either
1129  * CLOSED or REPAIRED or discards it entirely because it was never solved;
1130  * refer to the topmost block comment explaining the state machine for details.
1131  */
1132 void
1133 fmd_case_delete(fmd_case_t *cp)
1134 {
1135 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1136 	fmd_modstat_t *msp;
1137 	size_t buftotal;
1138 
1139 	ASSERT(fmd_module_locked(cip->ci_mod));
1140 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1141 	buftotal = fmd_buf_hash_destroy(&cip->ci_bufs);
1142 
1143 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1144 	msp = cip->ci_mod->mod_stats;
1145 
1146 	ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0);
1147 	msp->ms_caseopen.fmds_value.ui64--;
1148 
1149 	ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal);
1150 	msp->ms_buftotal.fmds_value.ui64 -= buftotal;
1151 
1152 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1153 
1154 	if (cip->ci_xprt == NULL)
1155 		fmd_module_setcdirty(cip->ci_mod);
1156 
1157 	fmd_module_rele(cip->ci_mod);
1158 	cip->ci_mod = fmd.d_rmod;
1159 	fmd_module_hold(cip->ci_mod);
1160 
1161 	/*
1162 	 * If the case is not proxied and it has been solved, then retain it
1163 	 * on the root module's case list at least until we're transitioned.
1164 	 * Otherwise free the case with our final fmd_case_rele() below.
1165 	 */
1166 	if (cip->ci_xprt == NULL && (cip->ci_flags & FMD_CF_SOLVED)) {
1167 		fmd_module_lock(cip->ci_mod);
1168 		fmd_list_append(&cip->ci_mod->mod_cases, cip);
1169 		fmd_module_unlock(cip->ci_mod);
1170 		fmd_case_hold(cp);
1171 	}
1172 
1173 	/*
1174 	 * If a proxied case finishes CLOSE_WAIT, then it can be discarded
1175 	 * rather than orphaned because by definition it can have no entries
1176 	 * in the resource cache of the current fault manager.
1177 	 */
1178 	if (cip->ci_xprt != NULL)
1179 		fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid);
1180 	else if (cip->ci_flags & FMD_CF_REPAIRED)
1181 		fmd_case_transition(cp, FMD_CASE_REPAIRED, 0);
1182 	else if (cip->ci_flags & FMD_CF_ISOLATED)
1183 		fmd_case_transition(cp, FMD_CASE_CLOSED, 0);
1184 
1185 	fmd_case_rele(cp);
1186 }
1187 
1188 void
1189 fmd_case_discard(fmd_case_t *cp)
1190 {
1191 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1192 
1193 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1194 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--;
1195 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1196 
1197 	ASSERT(fmd_module_locked(cip->ci_mod));
1198 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1199 	fmd_case_rele(cp);
1200 }
1201 
1202 static void
1203 fmd_case_repair_containee(fmd_asru_t *ee, void *er)
1204 {
1205 	if ((ee->asru_flags & FMD_ASRU_FAULTY) &&
1206 	    fmd_fmri_contains(er, ee->asru_fmri) > 0)
1207 		(void) fmd_asru_clrflags(ee, FMD_ASRU_FAULTY, NULL, NULL);
1208 }
1209 
1210 /*
1211  * Indicate that the problem corresponding to a case has been repaired by
1212  * clearing the faulty bit on each ASRU named as a suspect.  If the case hasn't
1213  * already been closed, this function initiates the transition to CLOSE_WAIT.
1214  * The caller must have the case held from fmd_case_hash_lookup(), so we can
1215  * grab and drop ci_lock without the case being able to be freed in between.
1216  */
1217 int
1218 fmd_case_repair(fmd_case_t *cp)
1219 {
1220 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1221 	fmd_case_susp_t *cis;
1222 	nvlist_t *nvl;
1223 	uint_t cstate;
1224 
1225 	fmd_asru_hash_t *ahp = fmd.d_asrus;
1226 	fmd_asru_t **aa;
1227 	uint_t i, an;
1228 
1229 	(void) pthread_mutex_lock(&cip->ci_lock);
1230 	cstate = cip->ci_state;
1231 
1232 	if (cip->ci_xprt != NULL) {
1233 		(void) pthread_mutex_unlock(&cip->ci_lock);
1234 		return (fmd_set_errno(EFMD_CASE_OWNER));
1235 	}
1236 
1237 	if (cstate < FMD_CASE_SOLVED || (cip->ci_flags & FMD_CF_REPAIRING)) {
1238 		(void) pthread_mutex_unlock(&cip->ci_lock);
1239 		return (fmd_set_errno(EFMD_CASE_STATE));
1240 	}
1241 
1242 	/*
1243 	 * Take a snapshot of any ASRUs referenced by the case that are present
1244 	 * in the resource cache.  Then drop ci_lock and clear the faulty bit
1245 	 * on each ASRU (we can't call fmd_asru_clrflags() with ci_lock held).
1246 	 */
1247 	an = cip->ci_nsuspects;
1248 	aa = alloca(sizeof (fmd_asru_t *) * an);
1249 	bzero(aa, sizeof (fmd_asru_t *) * an);
1250 
1251 	for (i = 0, cis = cip->ci_suspects;
1252 	    cis != NULL; cis = cis->cis_next, i++) {
1253 		if (nvlist_lookup_nvlist(cis->cis_nvl,
1254 		    FM_FAULT_ASRU, &nvl) == 0)
1255 			aa[i] = fmd_asru_hash_lookup_nvl(ahp, nvl, FMD_B_FALSE);
1256 	}
1257 
1258 	cip->ci_flags |= FMD_CF_REPAIRING;
1259 	(void) pthread_mutex_unlock(&cip->ci_lock);
1260 
1261 	/*
1262 	 * For each suspect ASRU, if the case associated with this ASRU matches
1263 	 * case 'cp', close all ASRUs contained by 'ap' and clear FAULTY.  Note
1264 	 * that at present, we're assuming that when a given resource FMRI R1
1265 	 * contains another R2, that any faults are related by a common
1266 	 * diagnosis engine.  This is true in our current architecture, but may
1267 	 * not always be true, at which point we'll need more cleverness here.
1268 	 */
1269 	for (i = 0; i < an; i++) {
1270 		if (aa[i] == NULL)
1271 			continue; /* no asru was found */
1272 
1273 		if (aa[i]->asru_case == cp) {
1274 			fmd_asru_hash_apply(fmd.d_asrus,
1275 			    fmd_case_repair_containee, aa[i]->asru_fmri);
1276 			(void) fmd_asru_clrflags(aa[i],
1277 			    FMD_ASRU_FAULTY, NULL, NULL);
1278 		}
1279 
1280 		fmd_asru_hash_release(ahp, aa[i]);
1281 	}
1282 
1283 	(void) pthread_mutex_lock(&cip->ci_lock);
1284 	cip->ci_flags &= ~FMD_CF_REPAIRING;
1285 	(void) pthread_mutex_unlock(&cip->ci_lock);
1286 
1287 	if (cstate == FMD_CASE_CLOSED)
1288 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
1289 	else
1290 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
1291 
1292 	return (0);
1293 }
1294 
1295 int
1296 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep)
1297 {
1298 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1299 	fmd_case_item_t *cit;
1300 	uint_t state;
1301 	int rv = 0;
1302 
1303 	(void) pthread_mutex_lock(&cip->ci_lock);
1304 
1305 	if (cip->ci_state >= FMD_CASE_SOLVED)
1306 		state = FMD_EVS_DIAGNOSED;
1307 	else
1308 		state = FMD_EVS_ACCEPTED;
1309 
1310 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1311 		if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0)
1312 			break;
1313 	}
1314 
1315 	if (rv == 0 && cip->ci_principal != NULL)
1316 		rv = fmd_event_equal(ep, cip->ci_principal);
1317 
1318 	(void) pthread_mutex_unlock(&cip->ci_lock);
1319 
1320 	if (rv != 0)
1321 		fmd_event_transition(ep, state);
1322 
1323 	return (rv);
1324 }
1325 
1326 int
1327 fmd_case_orphaned(fmd_case_t *cp)
1328 {
1329 	return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod);
1330 }
1331