xref: /titanic_41/usr/src/cmd/fm/fmd/common/fmd_case.c (revision 6a634c9dca3093f3922e4b7ab826d7bdf17bf78e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * FMD Case Subsystem
28  *
29  * Diagnosis engines are expected to group telemetry events related to the
30  * diagnosis of a particular problem on the system into a set of cases.  The
31  * diagnosis engine may have any number of cases open at a given point in time.
32  * Some cases may eventually be *solved* by associating a suspect list of one
33  * or more problems with the case, at which point fmd publishes a list.suspect
34  * event for the case and it becomes visible to administrators and agents.
35  *
36  * Every case is named using a UUID, and is globally visible in the case hash.
37  * Cases are reference-counted, except for the reference from the case hash
38  * itself.  Consumers of case references include modules, which store active
39  * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
40  *
41  * Cases obey the following state machine.  In states UNSOLVED, SOLVED, and
42  * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
43  * or transport) and the case is referenced by the mod_cases list.  Once the
44  * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
45  * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
46  *
47  *			+------------+
48  *	     +----------|  UNSOLVED  |
49  *	     |		+------------+
50  *	     |		      1 |
51  *	     |			|
52  *	     |		+-------v----+
53  *	   2 |		|    SOLVED  |
54  *	     |		+------------+
55  *	     |		    3 |  5 |
56  *	     +------------+   |    |
57  *			  |   |    |
58  *			+-v---v----v-+
59  *			| CLOSE_WAIT |
60  *			+------------+
61  *			  |   |    |
62  *	      +-----------+   |    +------------+
63  *	      |		    4 |			|
64  *	      v		+-----v------+		|
65  *	   discard      |   CLOSED   |	      6	|
66  *			+------------+		|
67  *			      |			|
68  *			      |	   +------------+
69  *			    7 |	   |
70  *			+-----v----v-+
71  *			|  REPAIRED  |
72  *			+------------+
73  *			      |
74  *			    8 |
75  *			+-----v------+
76  *			|  RESOLVED  |
77  *			+------------+
78  *			      |
79  *			      v
80  *			   discard
81  *
82  * The state machine changes are triggered by calls to fmd_case_transition()
83  * from various locations inside of fmd, as described below:
84  *
85  * [1] Called by: fmd_case_solve()
86  *       Actions: FMD_CF_SOLVED flag is set in ci_flags
87  *                conviction policy is applied to suspect list
88  *                suspects convicted are marked faulty (F) in R$
89  *                list.suspect event logged and dispatched
90  *
91  * [2] Called by: fmd_case_close(), fmd_case_uuclose()
92  *       Actions: diagnosis engine fmdo_close() entry point scheduled
93  *                case discarded upon exit from CLOSE_WAIT
94  *
95  * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
96  *       Actions: FMD_CF_ISOLATED flag is set in ci_flags
97  *                suspects convicted (F) are marked unusable (U) in R$
98  *                diagnosis engine fmdo_close() entry point scheduled
99  *                case transitions to CLOSED [4] upon exit from CLOSE_WAIT
100  *
101  * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
102  *       Actions: list.isolated event dispatched
103  *                case deleted from module's list of open cases
104  *
105  * [5] Called by: fmd_case_repair(), fmd_case_update()
106  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
107  *                diagnosis engine fmdo_close() entry point scheduled
108  *                case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
109  *
110  * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
111  *       Actions: suspects convicted are marked non faulty (!F) in R$
112  *                list.repaired or list.updated event dispatched
113  *
114  * [7] Called by: fmd_case_repair(), fmd_case_update()
115  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
116  *                suspects convicted are marked non faulty (!F) in R$
117  *                list.repaired or list.updated event dispatched
118  *
119  * [8] Called by: fmd_case_uuresolve()
120  *       Actions: list.resolved event dispatched
121  *		  case is discarded
122  */
123 
124 #include <sys/fm/protocol.h>
125 #include <uuid/uuid.h>
126 #include <alloca.h>
127 
128 #include <fmd_alloc.h>
129 #include <fmd_module.h>
130 #include <fmd_error.h>
131 #include <fmd_conf.h>
132 #include <fmd_case.h>
133 #include <fmd_string.h>
134 #include <fmd_subr.h>
135 #include <fmd_protocol.h>
136 #include <fmd_event.h>
137 #include <fmd_eventq.h>
138 #include <fmd_dispq.h>
139 #include <fmd_buf.h>
140 #include <fmd_log.h>
141 #include <fmd_asru.h>
142 #include <fmd_fmri.h>
143 #include <fmd_xprt.h>
144 
145 #include <fmd.h>
146 
/*
 * Printable names for the case states.  Each entry is annotated with the
 * FMD_CASE_* state it is meant to describe, in the order listed.
 * NOTE(review): presumably indexed directly by ci_state; the consumers are
 * not visible in this part of the file, so confirm before reordering.
 */
static const char *const _fmd_case_snames[] = {
	"UNSOLVED",	/* FMD_CASE_UNSOLVED */
	"SOLVED",	/* FMD_CASE_SOLVED */
	"CLOSE_WAIT",	/* FMD_CASE_CLOSE_WAIT */
	"CLOSED",	/* FMD_CASE_CLOSED */
	"REPAIRED",	/* FMD_CASE_REPAIRED */
	"RESOLVED"	/* FMD_CASE_RESOLVED */
};
155 
/*
 * Forward declaration: fmd_case_hash_apply() below calls this routine before
 * its definition appears in the file.
 */
static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *);
157 
158 fmd_case_hash_t *
fmd_case_hash_create(void)159 fmd_case_hash_create(void)
160 {
161 	fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP);
162 
163 	(void) pthread_rwlock_init(&chp->ch_lock, NULL);
164 	chp->ch_hashlen = fmd.d_str_buckets;
165 	chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP);
166 	chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen,
167 	    FMD_SLEEP);
168 	chp->ch_count = 0;
169 
170 	return (chp);
171 }
172 
173 /*
174  * Destroy the case hash.  Unlike most of our hash tables, no active references
175  * are kept by the case hash itself; all references come from other subsystems.
176  * The hash must be destroyed after all modules are unloaded; if anything was
177  * present in the hash it would be by definition a reference count leak.
178  */
179 void
fmd_case_hash_destroy(fmd_case_hash_t * chp)180 fmd_case_hash_destroy(fmd_case_hash_t *chp)
181 {
182 	fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen);
183 	fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen);
184 	fmd_free(chp, sizeof (fmd_case_hash_t));
185 }
186 
187 /*
188  * Take a snapshot of the case hash by placing an additional hold on each
189  * member in an auxiliary array, and then call 'func' for each case.
190  */
191 void
fmd_case_hash_apply(fmd_case_hash_t * chp,void (* func)(fmd_case_t *,void *),void * arg)192 fmd_case_hash_apply(fmd_case_hash_t *chp,
193     void (*func)(fmd_case_t *, void *), void *arg)
194 {
195 	fmd_case_impl_t *cp, **cps, **cpp;
196 	uint_t cpc, i;
197 
198 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
199 
200 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
201 	cpc = chp->ch_count;
202 
203 	for (i = 0; i < chp->ch_hashlen; i++) {
204 		for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next)
205 			*cpp++ = fmd_case_tryhold(cp);
206 	}
207 
208 	ASSERT(cpp == cps + cpc);
209 	(void) pthread_rwlock_unlock(&chp->ch_lock);
210 
211 	for (i = 0; i < cpc; i++) {
212 		if (cps[i] != NULL) {
213 			func((fmd_case_t *)cps[i], arg);
214 			fmd_case_rele((fmd_case_t *)cps[i]);
215 		}
216 	}
217 
218 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
219 }
220 
221 static void
fmd_case_code_hash_insert(fmd_case_hash_t * chp,fmd_case_impl_t * cip)222 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
223 {
224 	uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
225 
226 	cip->ci_code_next = chp->ch_code_hash[h];
227 	chp->ch_code_hash[h] = cip;
228 }
229 
230 static void
fmd_case_code_hash_delete(fmd_case_hash_t * chp,fmd_case_impl_t * cip)231 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
232 {
233 	fmd_case_impl_t **pp, *cp;
234 
235 	if (cip->ci_code) {
236 		uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
237 
238 		pp = &chp->ch_code_hash[h];
239 		for (cp = *pp; cp != NULL; cp = cp->ci_code_next) {
240 			if (cp != cip)
241 				pp = &cp->ci_code_next;
242 			else
243 				break;
244 		}
245 		if (cp != NULL) {
246 			*pp = cp->ci_code_next;
247 			cp->ci_code_next = NULL;
248 		}
249 	}
250 }
251 
252 /*
253  * Look up the diagcode for this case and cache it in ci_code.  If no suspects
254  * were defined for this case or if the lookup fails, the event dictionary or
255  * module code is broken, and we set the event code to a precomputed default.
256  */
257 static const char *
fmd_case_mkcode(fmd_case_t * cp)258 fmd_case_mkcode(fmd_case_t *cp)
259 {
260 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
261 	fmd_case_susp_t *cis;
262 	fmd_case_hash_t *chp = fmd.d_cases;
263 
264 	char **keys, **keyp;
265 	const char *s;
266 
267 	ASSERT(MUTEX_HELD(&cip->ci_lock));
268 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
269 
270 	/*
271 	 * delete any existing entry from code hash if it is on it
272 	 */
273 	fmd_case_code_hash_delete(chp, cip);
274 
275 	fmd_free(cip->ci_code, cip->ci_codelen);
276 	cip->ci_codelen = cip->ci_mod->mod_codelen;
277 	cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
278 	keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1));
279 
280 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
281 		if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0)
282 			keyp++;
283 	}
284 
285 	*keyp = NULL; /* mark end of keys[] array for libdiagcode */
286 
287 	if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code(
288 	    cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) {
289 		(void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s);
290 		fmd_free(cip->ci_code, cip->ci_codelen);
291 		cip->ci_codelen = strlen(s) + 1;
292 		cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
293 		(void) strcpy(cip->ci_code, s);
294 	}
295 
296 	/*
297 	 * add into hash of solved cases
298 	 */
299 	fmd_case_code_hash_insert(chp, cip);
300 
301 	return (cip->ci_code);
302 }
303 
/*
 * Argument block handed to fmd_case_set_lst() while walking the suspects of a
 * case.  fmd_case_mkevent() fills it in and owns the storage it points to.
 */
typedef struct {
	int	*fcl_countp;	/* number of entries filled in so far */
	int	fcl_maxcount;	/* capacity of fcl_ba[] and fcl_nva[] */
	uint8_t *fcl_ba;	/* per-suspect FM_SUSPECT_* status bits */
	nvlist_t **fcl_nva;	/* per-suspect fault event nvlists */
	int	*fcl_msgp;	/* set to B_FALSE if a suspect opts out */
} fmd_case_lst_t;
311 
312 static void
fmd_case_set_lst(fmd_asru_link_t * alp,void * arg)313 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg)
314 {
315 	fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg;
316 	boolean_t b;
317 	int state;
318 
319 	if (*entryp->fcl_countp >= entryp->fcl_maxcount)
320 		return;
321 	if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE,
322 	    &b) == 0 && b == B_FALSE)
323 		*entryp->fcl_msgp = B_FALSE;
324 	entryp->fcl_ba[*entryp->fcl_countp] = 0;
325 	state = fmd_asru_al_getstate(alp);
326 	if (state & FMD_ASRU_DEGRADED)
327 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED;
328 	if (state & FMD_ASRU_UNUSABLE)
329 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE;
330 	if (state & FMD_ASRU_FAULTY)
331 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY;
332 	if (!(state & FMD_ASRU_PRESENT))
333 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT;
334 	if (alp->al_reason == FMD_ASRU_REPAIRED)
335 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED;
336 	else if (alp->al_reason == FMD_ASRU_REPLACED)
337 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED;
338 	else if (alp->al_reason == FMD_ASRU_ACQUITTED)
339 		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED;
340 	entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event;
341 	(*entryp->fcl_countp)++;
342 }
343 
344 static void
fmd_case_faulty(fmd_asru_link_t * alp,void * arg)345 fmd_case_faulty(fmd_asru_link_t *alp, void *arg)
346 {
347 	int *faultyp = (int *)arg;
348 
349 	*faultyp |= (alp->al_flags & FMD_ASRU_FAULTY);
350 }
351 
352 static void
fmd_case_usable(fmd_asru_link_t * alp,void * arg)353 fmd_case_usable(fmd_asru_link_t *alp, void *arg)
354 {
355 	int *usablep = (int *)arg;
356 
357 	*usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE);
358 }
359 
360 static void
fmd_case_not_faulty(fmd_asru_link_t * alp,void * arg)361 fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg)
362 {
363 	int *not_faultyp = (int *)arg;
364 
365 	*not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY);
366 }
367 
368 /*
369  * Have we got any suspects with an asru that are still unusable and present?
370  */
371 static void
fmd_case_unusable_and_present(fmd_asru_link_t * alp,void * arg)372 fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg)
373 {
374 	int *rvalp = (int *)arg;
375 	int state;
376 	nvlist_t *asru;
377 
378 	/*
379 	 * if this a proxy case and this suspect doesn't have an local asru
380 	 * then state is unknown so we must assume it may still be unusable.
381 	 */
382 	if ((alp->al_flags & FMD_ASRU_PROXY) &&
383 	    !(alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU)) {
384 		*rvalp |= B_TRUE;
385 		return;
386 	}
387 
388 	state = fmd_asru_al_getstate(alp);
389 	if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0)
390 		return;
391 	*rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT));
392 }
393 
394 nvlist_t *
fmd_case_mkevent(fmd_case_t * cp,const char * class)395 fmd_case_mkevent(fmd_case_t *cp, const char *class)
396 {
397 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
398 	nvlist_t **nva, *nvl;
399 	uint8_t *ba;
400 	int msg = B_TRUE;
401 	const char *code;
402 	fmd_case_lst_t fcl;
403 	int count = 0;
404 
405 	(void) pthread_mutex_lock(&cip->ci_lock);
406 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
407 
408 	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
409 	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
410 
411 	/*
412 	 * For each suspect associated with the case, store its fault event
413 	 * nvlist in 'nva'.  We also look to see if any of the suspect faults
414 	 * have asked not to be messaged.  If any of them have made such a
415 	 * request, propagate that attribute to the composite list.* event.
416 	 * Finally, store each suspect's faulty status into the bitmap 'ba'.
417 	 */
418 	fcl.fcl_countp = &count;
419 	fcl.fcl_maxcount = cip->ci_nsuspects;
420 	fcl.fcl_msgp = &msg;
421 	fcl.fcl_ba = ba;
422 	fcl.fcl_nva = nva;
423 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
424 
425 	if (cip->ci_code == NULL)
426 		(void) fmd_case_mkcode(cp);
427 	/*
428 	 * For repair and updated event, we lookup diagcode from dict using key
429 	 * "list.repaired" or "list.updated" or "list.resolved".
430 	 */
431 	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
432 		(void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code);
433 	else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
434 		(void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code);
435 	else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0)
436 		(void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code);
437 	else
438 		code = cip->ci_code;
439 
440 	if (msg == B_FALSE)
441 		cip->ci_flags |= FMD_CF_INVISIBLE;
442 
443 	/*
444 	 * Use the ci_diag_de if one has been saved (eg for an injected fault).
445 	 * Otherwise use the authority for the current module.
446 	 */
447 	nvl = fmd_protocol_list(class, cip->ci_diag_de == NULL ?
448 	    cip->ci_mod->mod_fmri : cip->ci_diag_de, cip->ci_uuid, code, count,
449 	    nva, ba, msg, &cip->ci_tv, cip->ci_injected);
450 
451 	(void) pthread_mutex_unlock(&cip->ci_lock);
452 	return (nvl);
453 }
454 
/*
 * Tunables consulted by fmd_case_check_for_dups() when a newly solved case is
 * compared against an existing one.
 */

/* treat an overlap of faulty, non-isolated suspects as a match */
static int fmd_case_match_on_faulty_overlap = 1;
/* treat an overlap of acquitted suspects as a match (if old has no faulty) */
static int fmd_case_match_on_acquit_overlap = 1;
/* allow isolated suspects of the old case to be acquitted automatically */
static int fmd_case_auto_acquit_isolated = 1;
/* when the old case is REPAIRED or later, request adjustment of the new case */
static int fmd_case_auto_acquit_non_acquitted = 1;
/* a new case this close to the old case's last change skips auto-acquit */
static int fmd_case_too_recent = 10; /* time in seconds */
460 
461 static boolean_t
fmd_case_compare_elem(nvlist_t * nvl,nvlist_t * xnvl,const char * elem)462 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem)
463 {
464 	nvlist_t *new_rsrc;
465 	nvlist_t *rsrc;
466 	char *new_name = NULL;
467 	char *name = NULL;
468 	ssize_t new_namelen;
469 	ssize_t namelen;
470 	int fmri_present = 1;
471 	int new_fmri_present = 1;
472 	int match = B_FALSE;
473 	fmd_topo_t *ftp = fmd_topo_hold();
474 
475 	if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0)
476 		fmri_present = 0;
477 	else {
478 		if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1)
479 			goto done;
480 		name = fmd_alloc(namelen + 1, FMD_SLEEP);
481 		if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1)
482 			goto done;
483 	}
484 	if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0)
485 		new_fmri_present = 0;
486 	else {
487 		if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1)
488 			goto done;
489 		new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP);
490 		if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1)
491 			goto done;
492 	}
493 	match = (fmri_present == new_fmri_present &&
494 	    (fmri_present == 0 ||
495 	    topo_fmri_strcmp(ftp->ft_hdl, name, new_name)));
496 done:
497 	if (name != NULL)
498 		fmd_free(name, namelen + 1);
499 	if (new_name != NULL)
500 		fmd_free(new_name, new_namelen + 1);
501 	fmd_topo_rele(ftp);
502 	return (match);
503 }
504 
505 static int
fmd_case_match_suspect(nvlist_t * nvl1,nvlist_t * nvl2)506 fmd_case_match_suspect(nvlist_t *nvl1, nvlist_t *nvl2)
507 {
508 	char *class, *new_class;
509 
510 	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_ASRU))
511 		return (0);
512 	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_RESOURCE))
513 		return (0);
514 	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_FRU))
515 		return (0);
516 	(void) nvlist_lookup_string(nvl2, FM_CLASS, &class);
517 	(void) nvlist_lookup_string(nvl1, FM_CLASS, &new_class);
518 	return (strcmp(class, new_class) == 0);
519 }
520 
/*
 * Argument block for fmd_case_match_suspects(), which is called once for each
 * suspect of an existing ("old") case and compares it with the suspects of a
 * new case.
 */
typedef struct {
	int	*fcms_countp;		/* old suspects processed so far */
	int	fcms_maxcount;		/* capacity of the fcms_old_* arrays */
	fmd_case_impl_t *fcms_cip;	/* the new case */
	uint8_t *fcms_new_susp_state;	/* SUSPECT_STATE_*, per new suspect */
	uint8_t *fcms_old_susp_state;	/* SUSPECT_STATE_*, per old suspect */
	uint8_t *fcms_old_match_state;	/* NO_MATCH flag, per old suspect */
} fcms_t;
/*
 * Classification of a suspect as stored in the *_susp_state arrays by
 * fmd_case_match_suspects().
 */
#define	SUSPECT_STATE_FAULTY				0x1
#define	SUSPECT_STATE_ISOLATED				0x2
#define	SUSPECT_STATE_REMOVED				0x4
#define	SUSPECT_STATE_ACQUITED				0x8
#define	SUSPECT_STATE_REPAIRED				0x10
#define	SUSPECT_STATE_REPLACED				0x20
/*
 * Flag recorded by fmd_case_match_suspects() in fcms_old_match_state, a
 * separate array from the classifications above; note that its value is the
 * same as SUSPECT_STATE_FAULTY.
 */
#define	SUSPECT_STATE_NO_MATCH				0x1
536 
537 /*
538  * This is called for each suspect in the old case. Compare it against each
539  * suspect in the new case, setting fcms_old_susp_state and fcms_new_susp_state
540  * as appropriate. fcms_new_susp_state will left as 0 if the suspect is not
541  * found in the old case.
542  */
543 static void
fmd_case_match_suspects(fmd_asru_link_t * alp,void * arg)544 fmd_case_match_suspects(fmd_asru_link_t *alp, void *arg)
545 {
546 	fcms_t *fcmsp = (fcms_t *)arg;
547 	fmd_case_impl_t *cip = fcmsp->fcms_cip;
548 	fmd_case_susp_t *cis;
549 	int i = 0;
550 	int state = fmd_asru_al_getstate(alp);
551 
552 	if (*fcmsp->fcms_countp >= fcmsp->fcms_maxcount)
553 		return;
554 
555 	if (!(state & FMD_ASRU_PRESENT) || (!(state & FMD_ASRU_FAULTY) &&
556 	    alp->al_reason == FMD_ASRU_REMOVED))
557 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
558 		    SUSPECT_STATE_REMOVED;
559 	else if ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_FAULTY))
560 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
561 		    SUSPECT_STATE_ISOLATED;
562 	else if (state & FMD_ASRU_FAULTY)
563 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
564 		    SUSPECT_STATE_FAULTY;
565 	else if (alp->al_reason == FMD_ASRU_REPLACED)
566 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
567 		    SUSPECT_STATE_REPLACED;
568 	else if (alp->al_reason == FMD_ASRU_ACQUITTED)
569 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
570 		    SUSPECT_STATE_ACQUITED;
571 	else
572 		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
573 		    SUSPECT_STATE_REPAIRED;
574 
575 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next, i++)
576 		if (fmd_case_match_suspect(cis->cis_nvl, alp->al_event) == 1)
577 			break;
578 	if (cis != NULL)
579 		fcmsp->fcms_new_susp_state[i] =
580 		    fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp];
581 	else
582 		fcmsp->fcms_old_match_state[*fcmsp->fcms_countp] |=
583 		    SUSPECT_STATE_NO_MATCH;
584 	(*fcmsp->fcms_countp)++;
585 }
586 
/*
 * Argument block shared by the fmd_case_fault_*() and
 * fmd_case_acquit_no_match() callbacks.
 */
typedef struct {
	int	*fca_do_update;		/* set to 1 when a suspect is changed */
	fmd_case_impl_t *fca_cip;	/* case whose suspects are matched */
} fca_t;
591 
592 /*
593  * Re-fault all acquitted suspects that are still present in the new list.
594  */
595 static void
fmd_case_fault_acquitted_matching(fmd_asru_link_t * alp,void * arg)596 fmd_case_fault_acquitted_matching(fmd_asru_link_t *alp, void *arg)
597 {
598 	fca_t *fcap = (fca_t *)arg;
599 	fmd_case_impl_t *cip = fcap->fca_cip;
600 	fmd_case_susp_t *cis;
601 	int state = fmd_asru_al_getstate(alp);
602 
603 	if (!(state & FMD_ASRU_FAULTY) &&
604 	    alp->al_reason == FMD_ASRU_ACQUITTED) {
605 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
606 			if (fmd_case_match_suspect(cis->cis_nvl,
607 			    alp->al_event) == 1)
608 				break;
609 		if (cis != NULL) {
610 			(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
611 			*fcap->fca_do_update = 1;
612 		}
613 	}
614 }
615 
616 /*
617  * Re-fault all suspects that are still present in the new list.
618  */
619 static void
fmd_case_fault_all_matching(fmd_asru_link_t * alp,void * arg)620 fmd_case_fault_all_matching(fmd_asru_link_t *alp, void *arg)
621 {
622 	fca_t *fcap = (fca_t *)arg;
623 	fmd_case_impl_t *cip = fcap->fca_cip;
624 	fmd_case_susp_t *cis;
625 	int state = fmd_asru_al_getstate(alp);
626 
627 	if (!(state & FMD_ASRU_FAULTY)) {
628 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
629 			if (fmd_case_match_suspect(cis->cis_nvl,
630 			    alp->al_event) == 1)
631 				break;
632 		if (cis != NULL) {
633 			(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
634 			*fcap->fca_do_update = 1;
635 		}
636 	}
637 }
638 
639 /*
640  * Acquit all suspects that are no longer present in the new list.
641  */
642 static void
fmd_case_acquit_no_match(fmd_asru_link_t * alp,void * arg)643 fmd_case_acquit_no_match(fmd_asru_link_t *alp, void *arg)
644 {
645 	fca_t *fcap = (fca_t *)arg;
646 	fmd_case_impl_t *cip = fcap->fca_cip;
647 	fmd_case_susp_t *cis;
648 	int state = fmd_asru_al_getstate(alp);
649 
650 	if (state & FMD_ASRU_FAULTY) {
651 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
652 			if (fmd_case_match_suspect(cis->cis_nvl,
653 			    alp->al_event) == 1)
654 				break;
655 		if (cis == NULL) {
656 			(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
657 			    FMD_ASRU_ACQUITTED);
658 			*fcap->fca_do_update = 1;
659 		}
660 	}
661 }
662 
663 /*
664  * Acquit all isolated suspects.
665  */
666 static void
fmd_case_acquit_isolated(fmd_asru_link_t * alp,void * arg)667 fmd_case_acquit_isolated(fmd_asru_link_t *alp, void *arg)
668 {
669 	int *do_update = (int *)arg;
670 	int state = fmd_asru_al_getstate(alp);
671 
672 	if ((state & FMD_ASRU_PRESENT) && (state & FMD_ASRU_UNUSABLE) &&
673 	    (state & FMD_ASRU_FAULTY)) {
674 		(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
675 		    FMD_ASRU_ACQUITTED);
676 		*do_update = 1;
677 	}
678 }
679 
680 /*
681  * Acquit suspect which matches specified nvlist
682  */
683 static void
fmd_case_acquit_suspect(fmd_asru_link_t * alp,void * arg)684 fmd_case_acquit_suspect(fmd_asru_link_t *alp, void *arg)
685 {
686 	nvlist_t *nvl = (nvlist_t *)arg;
687 	int state = fmd_asru_al_getstate(alp);
688 
689 	if ((state & FMD_ASRU_FAULTY) &&
690 	    fmd_case_match_suspect(nvl, alp->al_event) == 1)
691 		(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
692 		    FMD_ASRU_ACQUITTED);
693 }
694 
/*
 * Argument block for fmd_case_check_for_dups().  The arrays have one entry
 * per suspect of the new case and are owned by the caller.
 */
typedef struct {
	fmd_case_impl_t *fccd_cip;	/* the new case being checked */
	uint8_t *fccd_new_susp_state;	/* accumulated SUSPECT_STATE_* bits */
	/*
	 * NOTE(review): presumably SUSPECT_STATE_NO_MATCH flags for the new
	 * case's suspects, mirroring fcms_old_match_state -- confirm against
	 * fmd_case_convict().
	 */
	uint8_t *fccd_new_match_state;
	int *fccd_discard_new;		/* set to 1: drop the new case */
	int *fccd_adjust_new;		/* set to 1: keep new case, adjust it */
} fccd_t;
702 
703 /*
704  * see if a matching suspect list already exists in the cache
705  */
706 static void
fmd_case_check_for_dups(fmd_case_t * old_cp,void * arg)707 fmd_case_check_for_dups(fmd_case_t *old_cp, void *arg)
708 {
709 	fccd_t *fccdp = (fccd_t *)arg;
710 	fmd_case_impl_t *new_cip = fccdp->fccd_cip;
711 	fmd_case_impl_t *old_cip = (fmd_case_impl_t *)old_cp;
712 	int i, count = 0, do_update = 0, got_isolated_overlap = 0;
713 	int got_faulty_overlap = 0;
714 	int got_acquit_overlap = 0;
715 	boolean_t too_recent;
716 	uint64_t most_recent = 0;
717 	fcms_t fcms;
718 	fca_t fca;
719 	uint8_t *new_susp_state;
720 	uint8_t *old_susp_state;
721 	uint8_t *old_match_state;
722 
723 	new_susp_state = alloca(new_cip->ci_nsuspects * sizeof (uint8_t));
724 	for (i = 0; i < new_cip->ci_nsuspects; i++)
725 		new_susp_state[i] = 0;
726 	old_susp_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
727 	for (i = 0; i < old_cip->ci_nsuspects; i++)
728 		old_susp_state[i] = 0;
729 	old_match_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
730 	for (i = 0; i < old_cip->ci_nsuspects; i++)
731 		old_match_state[i] = 0;
732 
733 	/*
734 	 * Compare with each suspect in the existing case.
735 	 */
736 	fcms.fcms_countp = &count;
737 	fcms.fcms_maxcount = old_cip->ci_nsuspects;
738 	fcms.fcms_cip = new_cip;
739 	fcms.fcms_new_susp_state = new_susp_state;
740 	fcms.fcms_old_susp_state = old_susp_state;
741 	fcms.fcms_old_match_state = old_match_state;
742 	fmd_asru_hash_apply_by_case(fmd.d_asrus, (fmd_case_t *)old_cip,
743 	    fmd_case_match_suspects, &fcms);
744 
745 	/*
746 	 * If we have some faulty, non-isolated suspects that overlap, then most
747 	 * likely it is the suspects that overlap in the suspect lists that are
748 	 * to blame. So we can consider this to be a match.
749 	 */
750 	for (i = 0; i < new_cip->ci_nsuspects; i++)
751 		if (new_susp_state[i] == SUSPECT_STATE_FAULTY)
752 			got_faulty_overlap = 1;
753 	if (got_faulty_overlap && fmd_case_match_on_faulty_overlap)
754 		goto got_match;
755 
756 	/*
757 	 * If we have no faulty, non-isolated suspects in the old case, but we
758 	 * do have some acquitted suspects that overlap, then most likely it is
759 	 * the acquitted suspects that overlap in the suspect lists that are
760 	 * to blame. So we can consider this to be a match.
761 	 */
762 	for (i = 0; i < new_cip->ci_nsuspects; i++)
763 		if (new_susp_state[i] == SUSPECT_STATE_ACQUITED)
764 			got_acquit_overlap = 1;
765 	for (i = 0; i < old_cip->ci_nsuspects; i++)
766 		if (old_susp_state[i] == SUSPECT_STATE_FAULTY)
767 			got_acquit_overlap = 0;
768 	if (got_acquit_overlap && fmd_case_match_on_acquit_overlap)
769 		goto got_match;
770 
771 	/*
772 	 * Check that all suspects in the new list are present in the old list.
773 	 * Return if we find one that isn't.
774 	 */
775 	for (i = 0; i < new_cip->ci_nsuspects; i++)
776 		if (new_susp_state[i] == 0)
777 			return;
778 
779 	/*
780 	 * Check that all suspects in the old list are present in the new list
781 	 * *or* they are isolated or removed/replaced (which would explain why
782 	 * they are not present in the new list). Return if we find one that is
783 	 * faulty and unisolated or repaired or acquitted, and that is not
784 	 * present in the new case.
785 	 */
786 	for (i = 0; i < old_cip->ci_nsuspects; i++)
787 		if (old_match_state[i] == SUSPECT_STATE_NO_MATCH &&
788 		    (old_susp_state[i] == SUSPECT_STATE_FAULTY ||
789 		    old_susp_state[i] == SUSPECT_STATE_ACQUITED ||
790 		    old_susp_state[i] == SUSPECT_STATE_REPAIRED))
791 			return;
792 
793 got_match:
794 	/*
795 	 * If the old case is already in repaired/resolved state, we can't
796 	 * do anything more with it, so keep the new case, but acquit some
797 	 * of the suspects if appropriate.
798 	 */
799 	if (old_cip->ci_state >= FMD_CASE_REPAIRED) {
800 		if (fmd_case_auto_acquit_non_acquitted) {
801 			*fccdp->fccd_adjust_new = 1;
802 			for (i = 0; i < new_cip->ci_nsuspects; i++) {
803 				fccdp->fccd_new_susp_state[i] |=
804 				    new_susp_state[i];
805 				if (new_susp_state[i] == 0)
806 					fccdp->fccd_new_susp_state[i] =
807 					    SUSPECT_STATE_NO_MATCH;
808 			}
809 		}
810 		return;
811 	}
812 
813 	/*
814 	 * Otherwise discard the new case and keep the old, again updating the
815 	 * state of the suspects as appropriate
816 	 */
817 	*fccdp->fccd_discard_new = 1;
818 	fca.fca_cip = new_cip;
819 	fca.fca_do_update = &do_update;
820 
821 	/*
822 	 * See if new case occurred within fmd_case_too_recent seconds of the
823 	 * most recent modification to the old case and if so don't do
824 	 * auto-acquit. This avoids problems if a flood of ereports come in and
825 	 * they don't all get diagnosed before the first case causes some of
826 	 * the devices to be isolated making it appear that an isolated device
827 	 * was in the suspect list.
828 	 */
829 	fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
830 	    fmd_asru_most_recent, &most_recent);
831 	too_recent = (new_cip->ci_tv.tv_sec - most_recent <
832 	    fmd_case_too_recent);
833 
834 	if (got_faulty_overlap) {
835 		/*
836 		 * Acquit any suspects not present in the new list, plus
837 		 * any that are are present but are isolated.
838 		 */
839 		fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
840 		    fmd_case_acquit_no_match, &fca);
841 		if (fmd_case_auto_acquit_isolated && !too_recent)
842 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
843 			    fmd_case_acquit_isolated, &do_update);
844 	} else if (got_acquit_overlap) {
845 		/*
846 		 * Re-fault the acquitted matching suspects and acquit all
847 		 * isolated suspects.
848 		 */
849 		if (fmd_case_auto_acquit_isolated && !too_recent) {
850 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
851 			    fmd_case_fault_acquitted_matching, &fca);
852 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
853 			    fmd_case_acquit_isolated, &do_update);
854 		}
855 	} else if (fmd_case_auto_acquit_isolated) {
856 		/*
857 		 * To get here, there must be no faulty or acquitted suspects,
858 		 * but there must be at least one isolated suspect. Just acquit
859 		 * non-matching isolated suspects. If there are no matching
860 		 * isolated suspects, then re-fault all matching suspects.
861 		 */
862 		for (i = 0; i < new_cip->ci_nsuspects; i++)
863 			if (new_susp_state[i] == SUSPECT_STATE_ISOLATED)
864 				got_isolated_overlap = 1;
865 		if (!got_isolated_overlap)
866 			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
867 			    fmd_case_fault_all_matching, &fca);
868 		fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
869 		    fmd_case_acquit_no_match, &fca);
870 	}
871 
872 	/*
873 	 * If we've updated anything in the old case, call fmd_case_update()
874 	 */
875 	if (do_update)
876 		fmd_case_update(old_cp);
877 }
878 
879 /*
880  * Convict suspects in a case by applying a conviction policy and updating the
881  * resource cache prior to emitting the list.suspect event for the given case.
882  * At present, our policy is very simple: convict every suspect in the case.
883  * In the future, this policy can be extended and made configurable to permit:
884  *
885  * - convicting the suspect with the highest FIT rate
886  * - convicting the suspect with the cheapest FRU
887  * - convicting the suspect with the FRU that is in a depot's inventory
888  * - convicting the suspect with the longest lifetime
889  *
890  * and so forth.  A word to the wise: this problem is significantly harder that
891  * it seems at first glance.  Future work should heed the following advice:
892  *
893  * Hacking the policy into C code here is a very bad idea.  The policy needs to
894  * be decided upon very carefully and fundamentally encodes knowledge of what
895  * suspect list combinations can be emitted by what diagnosis engines.  As such
896  * fmd's code is the wrong location, because that would require fmd itself to
897  * be updated for every diagnosis engine change, defeating the entire design.
898  * The FMA Event Registry knows the suspect list combinations: policy inputs
899  * can be derived from it and used to produce per-module policy configuration.
900  *
901  * If the policy needs to be dynamic and not statically fixed at either fmd
902  * startup or module load time, any implementation of dynamic policy retrieval
903  * must employ some kind of caching mechanism or be part of a built-in module.
904  * The fmd_case_convict() function is called with locks held inside of fmd and
905  * is not a place where unbounded blocking on some inter-process or inter-
906  * system communication to another service (e.g. another daemon) can occur.
907  */
908 static int
fmd_case_convict(fmd_case_t * cp)909 fmd_case_convict(fmd_case_t *cp)
910 {
911 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
912 	fmd_asru_hash_t *ahp = fmd.d_asrus;
913 	int discard_new = 0, i;
914 	fmd_case_susp_t *cis;
915 	fmd_asru_link_t *alp;
916 	uint8_t *new_susp_state;
917 	uint8_t *new_match_state;
918 	int adjust_new = 0;
919 	fccd_t fccd;
920 	fmd_case_impl_t *ncp, **cps, **cpp;
921 	uint_t cpc;
922 	fmd_case_hash_t *chp;
923 
924 	/*
925 	 * First we must see if any matching cases already exist.
926 	 */
927 	new_susp_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
928 	for (i = 0; i < cip->ci_nsuspects; i++)
929 		new_susp_state[i] = 0;
930 	new_match_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
931 	for (i = 0; i < cip->ci_nsuspects; i++)
932 		new_match_state[i] = 0;
933 	fccd.fccd_cip = cip;
934 	fccd.fccd_adjust_new = &adjust_new;
935 	fccd.fccd_new_susp_state = new_susp_state;
936 	fccd.fccd_new_match_state = new_match_state;
937 	fccd.fccd_discard_new = &discard_new;
938 
939 	/*
940 	 * Hold all cases
941 	 */
942 	chp = fmd.d_cases;
943 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
944 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
945 	cpc = chp->ch_count;
946 	for (i = 0; i < chp->ch_hashlen; i++)
947 		for (ncp = chp->ch_hash[i]; ncp != NULL; ncp = ncp->ci_next)
948 			*cpp++ = fmd_case_tryhold(ncp);
949 	ASSERT(cpp == cps + cpc);
950 	(void) pthread_rwlock_unlock(&chp->ch_lock);
951 
952 	/*
953 	 * Run fmd_case_check_for_dups() on all cases except the current one.
954 	 */
955 	for (i = 0; i < cpc; i++) {
956 		if (cps[i] != NULL) {
957 			if (cps[i] != (fmd_case_impl_t *)cp)
958 				fmd_case_check_for_dups((fmd_case_t *)cps[i],
959 				    &fccd);
960 			fmd_case_rele((fmd_case_t *)cps[i]);
961 		}
962 	}
963 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
964 
965 	(void) pthread_mutex_lock(&cip->ci_lock);
966 	if (cip->ci_code == NULL)
967 		(void) fmd_case_mkcode(cp);
968 	else if (cip->ci_precanned)
969 		fmd_case_code_hash_insert(fmd.d_cases, cip);
970 
971 	if (discard_new) {
972 		/*
973 		 * We've found an existing case that is a match and it is not
974 		 * already in repaired or resolved state. So we can close this
975 		 * one as a duplicate.
976 		 */
977 		(void) pthread_mutex_unlock(&cip->ci_lock);
978 		return (1);
979 	}
980 
981 	/*
982 	 * Allocate new cache entries
983 	 */
984 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
985 		if ((alp = fmd_asru_hash_create_entry(ahp,
986 		    cp, cis->cis_nvl)) == NULL) {
987 			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
988 			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
989 			continue;
990 		}
991 		alp->al_flags |= FMD_ASRU_PRESENT;
992 		alp->al_asru->asru_flags |= FMD_ASRU_PRESENT;
993 		(void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0);
994 		(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
995 	}
996 
997 	if (adjust_new) {
998 		int some_suspect = 0, some_not_suspect = 0;
999 
1000 		/*
1001 		 * There is one or more matching case but they are already in
1002 		 * repaired or resolved state. So we need to keep the new
1003 		 * case, but we can adjust it. Repaired/removed/replaced
1004 		 * suspects are unlikely to be to blame (unless there are
1005 		 * actually two separate faults). So if we have a combination of
1006 		 * repaired/replaced/removed suspects and acquitted suspects in
1007 		 * the old lists, then we should acquit in the new list those
1008 		 * that were repaired/replaced/removed in the old.
1009 		 */
1010 		for (i = 0; i < cip->ci_nsuspects; i++) {
1011 			if ((new_susp_state[i] & SUSPECT_STATE_REPLACED) ||
1012 			    (new_susp_state[i] & SUSPECT_STATE_REPAIRED) ||
1013 			    (new_susp_state[i] & SUSPECT_STATE_REMOVED) ||
1014 			    (new_match_state[i] & SUSPECT_STATE_NO_MATCH))
1015 				some_not_suspect = 1;
1016 			else
1017 				some_suspect = 1;
1018 		}
1019 		if (some_suspect && some_not_suspect) {
1020 			for (cis = cip->ci_suspects, i = 0; cis != NULL;
1021 			    cis = cis->cis_next, i++)
1022 				if ((new_susp_state[i] &
1023 				    SUSPECT_STATE_REPLACED) ||
1024 				    (new_susp_state[i] &
1025 				    SUSPECT_STATE_REPAIRED) ||
1026 				    (new_susp_state[i] &
1027 				    SUSPECT_STATE_REMOVED) ||
1028 				    (new_match_state[i] &
1029 				    SUSPECT_STATE_NO_MATCH))
1030 					fmd_asru_hash_apply_by_case(fmd.d_asrus,
1031 					    cp, fmd_case_acquit_suspect,
1032 					    cis->cis_nvl);
1033 		}
1034 	}
1035 
1036 	(void) pthread_mutex_unlock(&cip->ci_lock);
1037 	return (0);
1038 }
1039 
1040 void
fmd_case_publish(fmd_case_t * cp,uint_t state)1041 fmd_case_publish(fmd_case_t *cp, uint_t state)
1042 {
1043 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1044 	fmd_event_t *e;
1045 	nvlist_t *nvl;
1046 	char *class;
1047 
1048 	if (state == FMD_CASE_CURRENT)
1049 		state = cip->ci_state; /* use current state */
1050 
1051 	switch (state) {
1052 	case FMD_CASE_SOLVED:
1053 		(void) pthread_mutex_lock(&cip->ci_lock);
1054 
1055 		/*
1056 		 * If we already have a code, then case is already solved.
1057 		 */
1058 		if (cip->ci_precanned == 0 && cip->ci_xprt == NULL &&
1059 		    cip->ci_code != NULL) {
1060 			(void) pthread_mutex_unlock(&cip->ci_lock);
1061 			break;
1062 		}
1063 
1064 		if (cip->ci_tv_valid == 0) {
1065 			fmd_time_gettimeofday(&cip->ci_tv);
1066 			cip->ci_tv_valid = 1;
1067 		}
1068 		(void) pthread_mutex_unlock(&cip->ci_lock);
1069 
1070 		if (fmd_case_convict(cp) == 1) { /* dupclose */
1071 			cip->ci_flags &= ~FMD_CF_SOLVED;
1072 			fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0);
1073 			break;
1074 		}
1075 		if (cip->ci_xprt != NULL) {
1076 			/*
1077 			 * For proxy, save some information about the transport
1078 			 * in the resource cache.
1079 			 */
1080 			int count = 0;
1081 			fmd_asru_set_on_proxy_t fasp;
1082 			fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)cip->ci_xprt;
1083 
1084 			fasp.fasp_countp = &count;
1085 			fasp.fasp_maxcount = cip->ci_nsuspects;
1086 			fasp.fasp_proxy_asru = cip->ci_proxy_asru;
1087 			fasp.fasp_proxy_external = xip->xi_flags &
1088 			    FMD_XPRT_EXTERNAL;
1089 			fasp.fasp_proxy_rdonly = ((xip->xi_flags &
1090 			    FMD_XPRT_RDWR) == FMD_XPRT_RDONLY);
1091 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1092 			    fmd_asru_set_on_proxy, &fasp);
1093 		}
1094 		nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
1095 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1096 
1097 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1098 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1099 		fmd_log_append(fmd.d_fltlog, e, cp);
1100 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
1101 		fmd_dispq_dispatch(fmd.d_disp, e, class);
1102 
1103 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1104 		cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
1105 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1106 
1107 		break;
1108 
1109 	case FMD_CASE_CLOSE_WAIT:
1110 		fmd_case_hold(cp);
1111 		e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
1112 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
1113 
1114 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1115 		cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
1116 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1117 
1118 		break;
1119 
1120 	case FMD_CASE_CLOSED:
1121 		nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
1122 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1123 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1124 		fmd_dispq_dispatch(fmd.d_disp, e, class);
1125 		break;
1126 
1127 	case FMD_CASE_REPAIRED:
1128 		nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
1129 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1130 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1131 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1132 		fmd_log_append(fmd.d_fltlog, e, cp);
1133 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
1134 		fmd_dispq_dispatch(fmd.d_disp, e, class);
1135 		break;
1136 
1137 	case FMD_CASE_RESOLVED:
1138 		nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS);
1139 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1140 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1141 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1142 		fmd_log_append(fmd.d_fltlog, e, cp);
1143 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
1144 		fmd_dispq_dispatch(fmd.d_disp, e, class);
1145 		break;
1146 	}
1147 }
1148 
1149 fmd_case_t *
fmd_case_hash_lookup(fmd_case_hash_t * chp,const char * uuid)1150 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
1151 {
1152 	fmd_case_impl_t *cip;
1153 	uint_t h;
1154 
1155 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
1156 	h = fmd_strhash(uuid) % chp->ch_hashlen;
1157 
1158 	for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
1159 		if (strcmp(cip->ci_uuid, uuid) == 0)
1160 			break;
1161 	}
1162 
1163 	/*
1164 	 * If deleting bit is set, treat the case as if it doesn't exist.
1165 	 */
1166 	if (cip != NULL)
1167 		cip = fmd_case_tryhold(cip);
1168 
1169 	if (cip == NULL)
1170 		(void) fmd_set_errno(EFMD_CASE_INVAL);
1171 
1172 	(void) pthread_rwlock_unlock(&chp->ch_lock);
1173 	return ((fmd_case_t *)cip);
1174 }
1175 
1176 static fmd_case_impl_t *
fmd_case_hash_insert(fmd_case_hash_t * chp,fmd_case_impl_t * cip)1177 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
1178 {
1179 	fmd_case_impl_t *eip;
1180 	uint_t h;
1181 
1182 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
1183 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
1184 
1185 	for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
1186 		if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 &&
1187 		    fmd_case_tryhold(eip) != NULL) {
1188 			(void) pthread_rwlock_unlock(&chp->ch_lock);
1189 			return (eip); /* uuid already present */
1190 		}
1191 	}
1192 
1193 	cip->ci_next = chp->ch_hash[h];
1194 	chp->ch_hash[h] = cip;
1195 
1196 	chp->ch_count++;
1197 	ASSERT(chp->ch_count != 0);
1198 
1199 	(void) pthread_rwlock_unlock(&chp->ch_lock);
1200 	return (cip);
1201 }
1202 
1203 static void
fmd_case_hash_delete(fmd_case_hash_t * chp,fmd_case_impl_t * cip)1204 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
1205 {
1206 	fmd_case_impl_t *cp, **pp;
1207 	uint_t h;
1208 
1209 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1210 
1211 	cip->ci_flags |= FMD_CF_DELETING;
1212 	(void) pthread_mutex_unlock(&cip->ci_lock);
1213 
1214 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
1215 
1216 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
1217 	pp = &chp->ch_hash[h];
1218 
1219 	for (cp = *pp; cp != NULL; cp = cp->ci_next) {
1220 		if (cp != cip)
1221 			pp = &cp->ci_next;
1222 		else
1223 			break;
1224 	}
1225 
1226 	if (cp == NULL) {
1227 		fmd_panic("case %p (%s) not found on hash chain %u\n",
1228 		    (void *)cip, cip->ci_uuid, h);
1229 	}
1230 
1231 	*pp = cp->ci_next;
1232 	cp->ci_next = NULL;
1233 
1234 	/*
1235 	 * delete from code hash if it is on it
1236 	 */
1237 	fmd_case_code_hash_delete(chp, cip);
1238 
1239 	ASSERT(chp->ch_count != 0);
1240 	chp->ch_count--;
1241 
1242 	(void) pthread_rwlock_unlock(&chp->ch_lock);
1243 
1244 	(void) pthread_mutex_lock(&cip->ci_lock);
1245 	ASSERT(cip->ci_flags & FMD_CF_DELETING);
1246 }
1247 
1248 fmd_case_t *
fmd_case_create(fmd_module_t * mp,const char * uuidstr,void * data)1249 fmd_case_create(fmd_module_t *mp, const char *uuidstr, void *data)
1250 {
1251 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
1252 	fmd_case_impl_t *eip = NULL;
1253 	uuid_t uuid;
1254 
1255 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
1256 	fmd_buf_hash_create(&cip->ci_bufs);
1257 
1258 	fmd_module_hold(mp);
1259 	cip->ci_mod = mp;
1260 	cip->ci_refs = 1;
1261 	cip->ci_state = FMD_CASE_UNSOLVED;
1262 	cip->ci_flags = FMD_CF_DIRTY;
1263 	cip->ci_data = data;
1264 
1265 	/*
1266 	 * Calling libuuid: get a clue.  The library interfaces cleverly do not
1267 	 * define any constant for the length of an unparse string, and do not
1268 	 * permit the caller to specify a buffer length for safety.  The spec
1269 	 * says it will be 36 bytes, but we make it tunable just in case.
1270 	 */
1271 	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
1272 	cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);
1273 
1274 	if (uuidstr == NULL) {
1275 		/*
1276 		 * We expect this loop to execute only once, but code it
1277 		 * defensively against the possibility of libuuid bugs.
1278 		 * Keep generating uuids and attempting to do a hash insert
1279 		 * until we get a unique one.
1280 		 */
1281 		do {
1282 			if (eip != NULL)
1283 				fmd_case_rele((fmd_case_t *)eip);
1284 			uuid_generate(uuid);
1285 			uuid_unparse(uuid, cip->ci_uuid);
1286 		} while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);
1287 	} else {
1288 		/*
1289 		 * If a uuid was specified we must succeed with that uuid,
1290 		 * or return NULL indicating a case with that uuid already
1291 		 * exists.
1292 		 */
1293 		(void) strncpy(cip->ci_uuid, uuidstr, cip->ci_uuidlen + 1);
1294 		if (fmd_case_hash_insert(fmd.d_cases, cip) != cip) {
1295 			fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
1296 			(void) fmd_buf_hash_destroy(&cip->ci_bufs);
1297 			fmd_module_rele(mp);
1298 			pthread_mutex_destroy(&cip->ci_lock);
1299 			fmd_free(cip, sizeof (*cip));
1300 			return (NULL);
1301 		}
1302 	}
1303 
1304 	ASSERT(fmd_module_locked(mp));
1305 	fmd_list_append(&mp->mod_cases, cip);
1306 	fmd_module_setcdirty(mp);
1307 
1308 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1309 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
1310 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1311 
1312 	return ((fmd_case_t *)cip);
1313 }
1314 
1315 static void
fmd_case_destroy_suspects(fmd_case_impl_t * cip)1316 fmd_case_destroy_suspects(fmd_case_impl_t *cip)
1317 {
1318 	fmd_case_susp_t *cis, *ncis;
1319 
1320 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1321 
1322 	if (cip->ci_proxy_asru)
1323 		fmd_free(cip->ci_proxy_asru, sizeof (uint8_t) *
1324 		    cip->ci_nsuspects);
1325 	if (cip->ci_diag_de)
1326 		nvlist_free(cip->ci_diag_de);
1327 	if (cip->ci_diag_asru)
1328 		fmd_free(cip->ci_diag_asru, sizeof (uint8_t) *
1329 		    cip->ci_nsuspects);
1330 
1331 	for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
1332 		ncis = cis->cis_next;
1333 		nvlist_free(cis->cis_nvl);
1334 		fmd_free(cis, sizeof (fmd_case_susp_t));
1335 	}
1336 
1337 	cip->ci_suspects = NULL;
1338 	cip->ci_nsuspects = 0;
1339 }
1340 
1341 fmd_case_t *
fmd_case_recreate(fmd_module_t * mp,fmd_xprt_t * xp,uint_t state,const char * uuid,const char * code)1342 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
1343     uint_t state, const char *uuid, const char *code)
1344 {
1345 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
1346 	fmd_case_impl_t *eip;
1347 
1348 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
1349 	fmd_buf_hash_create(&cip->ci_bufs);
1350 
1351 	fmd_module_hold(mp);
1352 	cip->ci_mod = mp;
1353 	cip->ci_xprt = xp;
1354 	cip->ci_refs = 1;
1355 	cip->ci_state = state;
1356 	cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
1357 	cip->ci_uuidlen = strlen(cip->ci_uuid);
1358 	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
1359 	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;
1360 
1361 	if (state > FMD_CASE_CLOSE_WAIT)
1362 		cip->ci_flags |= FMD_CF_SOLVED;
1363 
1364 	/*
1365 	 * Insert the case into the global case hash.  If the specified UUID is
1366 	 * already present, check to see if it is an orphan: if so, reclaim it;
1367 	 * otherwise if it is owned by a different module then return NULL.
1368 	 */
1369 	if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
1370 		(void) pthread_mutex_lock(&cip->ci_lock);
1371 		cip->ci_refs--; /* decrement to zero */
1372 		fmd_case_destroy((fmd_case_t *)cip, B_FALSE);
1373 
1374 		cip = eip; /* switch 'cip' to the existing case */
1375 		(void) pthread_mutex_lock(&cip->ci_lock);
1376 
1377 		/*
1378 		 * If the ASRU cache is trying to recreate an orphan, then just
1379 		 * return the existing case that we found without changing it.
1380 		 */
1381 		if (mp == fmd.d_rmod) {
1382 			/*
1383 			 * In case the case has already been created from
1384 			 * a checkpoint file we need to set up code now.
1385 			 */
1386 			if (cip->ci_state < FMD_CASE_CLOSED) {
1387 				if (code != NULL && cip->ci_code == NULL) {
1388 					cip->ci_code = fmd_strdup(code,
1389 					    FMD_SLEEP);
1390 					cip->ci_codelen = cip->ci_code ?
1391 					    strlen(cip->ci_code) + 1 : 0;
1392 					fmd_case_code_hash_insert(fmd.d_cases,
1393 					    cip);
1394 				}
1395 			}
1396 
1397 			/*
1398 			 * When recreating an orphan case, state passed in may
1399 			 * be CLOSED (faulty) or REPAIRED/RESOLVED (!faulty). If
1400 			 * any suspects are still CLOSED (faulty) then the
1401 			 * overall state needs to be CLOSED.
1402 			 */
1403 			if ((cip->ci_state == FMD_CASE_REPAIRED ||
1404 			    cip->ci_state == FMD_CASE_RESOLVED) &&
1405 			    state == FMD_CASE_CLOSED)
1406 				cip->ci_state = FMD_CASE_CLOSED;
1407 			(void) pthread_mutex_unlock(&cip->ci_lock);
1408 			fmd_case_rele((fmd_case_t *)cip);
1409 			return ((fmd_case_t *)cip);
1410 		}
1411 
1412 		/*
1413 		 * If the existing case isn't an orphan or is being proxied,
1414 		 * then we have a UUID conflict: return failure to the caller.
1415 		 */
1416 		if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
1417 			(void) pthread_mutex_unlock(&cip->ci_lock);
1418 			fmd_case_rele((fmd_case_t *)cip);
1419 			return (NULL);
1420 		}
1421 
1422 		/*
1423 		 * If the new module is reclaiming an orphaned case, remove
1424 		 * the case from the root module, switch ci_mod, and then fall
1425 		 * through to adding the case to the new owner module 'mp'.
1426 		 */
1427 		fmd_module_lock(cip->ci_mod);
1428 		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1429 		fmd_module_unlock(cip->ci_mod);
1430 
1431 		fmd_module_rele(cip->ci_mod);
1432 		cip->ci_mod = mp;
1433 		fmd_module_hold(mp);
1434 
1435 		/*
1436 		 * It's possible that fmd crashed or was restarted during a
1437 		 * previous solve operation between the asru cache being created
1438 		 * and the ckpt file being updated to SOLVED. Thus when the DE
1439 		 * recreates the case here from the checkpoint file, the state
1440 		 * will be UNSOLVED and yet we are having to reclaim because
1441 		 * the case was in the asru cache. If this happens, revert the
1442 		 * case back to the UNSOLVED state and let the DE solve it again
1443 		 */
1444 		if (state == FMD_CASE_UNSOLVED) {
1445 			fmd_asru_hash_delete_case(fmd.d_asrus,
1446 			    (fmd_case_t *)cip);
1447 			fmd_case_destroy_suspects(cip);
1448 			fmd_case_code_hash_delete(fmd.d_cases, cip);
1449 			fmd_free(cip->ci_code, cip->ci_codelen);
1450 			cip->ci_code = NULL;
1451 			cip->ci_codelen = 0;
1452 			cip->ci_tv_valid = 0;
1453 		}
1454 
1455 		cip->ci_state = state;
1456 
1457 		(void) pthread_mutex_unlock(&cip->ci_lock);
1458 		fmd_case_rele((fmd_case_t *)cip);
1459 	} else {
1460 		/*
1461 		 * add into hash of solved cases
1462 		 */
1463 		if (cip->ci_code)
1464 			fmd_case_code_hash_insert(fmd.d_cases, cip);
1465 	}
1466 
1467 	ASSERT(fmd_module_locked(mp));
1468 	fmd_list_append(&mp->mod_cases, cip);
1469 
1470 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1471 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
1472 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1473 
1474 	return ((fmd_case_t *)cip);
1475 }
1476 
1477 void
fmd_case_destroy(fmd_case_t * cp,int visible)1478 fmd_case_destroy(fmd_case_t *cp, int visible)
1479 {
1480 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1481 	fmd_case_item_t *cit, *ncit;
1482 
1483 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1484 	ASSERT(cip->ci_refs == 0);
1485 
1486 	if (visible) {
1487 		TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
1488 		fmd_case_hash_delete(fmd.d_cases, cip);
1489 	}
1490 
1491 	for (cit = cip->ci_items; cit != NULL; cit = ncit) {
1492 		ncit = cit->cit_next;
1493 		fmd_event_rele(cit->cit_event);
1494 		fmd_free(cit, sizeof (fmd_case_item_t));
1495 	}
1496 
1497 	fmd_case_destroy_suspects(cip);
1498 
1499 	if (cip->ci_principal != NULL)
1500 		fmd_event_rele(cip->ci_principal);
1501 
1502 	fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
1503 	fmd_free(cip->ci_code, cip->ci_codelen);
1504 	(void) fmd_buf_hash_destroy(&cip->ci_bufs);
1505 
1506 	fmd_module_rele(cip->ci_mod);
1507 	fmd_free(cip, sizeof (fmd_case_impl_t));
1508 }
1509 
1510 void
fmd_case_hold(fmd_case_t * cp)1511 fmd_case_hold(fmd_case_t *cp)
1512 {
1513 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1514 
1515 	(void) pthread_mutex_lock(&cip->ci_lock);
1516 	fmd_case_hold_locked(cp);
1517 	(void) pthread_mutex_unlock(&cip->ci_lock);
1518 }
1519 
1520 void
fmd_case_hold_locked(fmd_case_t * cp)1521 fmd_case_hold_locked(fmd_case_t *cp)
1522 {
1523 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1524 
1525 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1526 	if (cip->ci_flags & FMD_CF_DELETING)
1527 		fmd_panic("attempt to hold a deleting case %p (%s)\n",
1528 		    (void *)cip, cip->ci_uuid);
1529 	cip->ci_refs++;
1530 	ASSERT(cip->ci_refs != 0);
1531 }
1532 
1533 static fmd_case_impl_t *
fmd_case_tryhold(fmd_case_impl_t * cip)1534 fmd_case_tryhold(fmd_case_impl_t *cip)
1535 {
1536 	/*
1537 	 * If the case's "deleting" bit is unset, hold and return case,
1538 	 * otherwise, return NULL.
1539 	 */
1540 	(void) pthread_mutex_lock(&cip->ci_lock);
1541 	if (cip->ci_flags & FMD_CF_DELETING) {
1542 		(void) pthread_mutex_unlock(&cip->ci_lock);
1543 		cip = NULL;
1544 	} else {
1545 		fmd_case_hold_locked((fmd_case_t *)cip);
1546 		(void) pthread_mutex_unlock(&cip->ci_lock);
1547 	}
1548 	return (cip);
1549 }
1550 
1551 void
fmd_case_rele(fmd_case_t * cp)1552 fmd_case_rele(fmd_case_t *cp)
1553 {
1554 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1555 
1556 	(void) pthread_mutex_lock(&cip->ci_lock);
1557 	ASSERT(cip->ci_refs != 0);
1558 
1559 	if (--cip->ci_refs == 0)
1560 		fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
1561 	else
1562 		(void) pthread_mutex_unlock(&cip->ci_lock);
1563 }
1564 
1565 void
fmd_case_rele_locked(fmd_case_t * cp)1566 fmd_case_rele_locked(fmd_case_t *cp)
1567 {
1568 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1569 
1570 	ASSERT(MUTEX_HELD(&cip->ci_lock));
1571 	--cip->ci_refs;
1572 	ASSERT(cip->ci_refs != 0);
1573 }
1574 
1575 int
fmd_case_insert_principal(fmd_case_t * cp,fmd_event_t * ep)1576 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep)
1577 {
1578 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1579 	fmd_case_item_t *cit;
1580 	fmd_event_t *oep;
1581 	uint_t state;
1582 	int new;
1583 
1584 	fmd_event_hold(ep);
1585 	(void) pthread_mutex_lock(&cip->ci_lock);
1586 
1587 	if (cip->ci_flags & FMD_CF_SOLVED)
1588 		state = FMD_EVS_DIAGNOSED;
1589 	else
1590 		state = FMD_EVS_ACCEPTED;
1591 
1592 	oep = cip->ci_principal;
1593 	cip->ci_principal = ep;
1594 
1595 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1596 		if (cit->cit_event == ep)
1597 			break;
1598 	}
1599 
1600 	cip->ci_flags |= FMD_CF_DIRTY;
1601 	new = cit == NULL && ep != oep;
1602 
1603 	(void) pthread_mutex_unlock(&cip->ci_lock);
1604 
1605 	fmd_module_setcdirty(cip->ci_mod);
1606 	fmd_event_transition(ep, state);
1607 
1608 	if (oep != NULL)
1609 		fmd_event_rele(oep);
1610 
1611 	return (new);
1612 }
1613 
1614 int
fmd_case_insert_event(fmd_case_t * cp,fmd_event_t * ep)1615 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep)
1616 {
1617 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1618 	fmd_case_item_t *cit;
1619 	uint_t state;
1620 	int new;
1621 	boolean_t injected;
1622 
1623 	(void) pthread_mutex_lock(&cip->ci_lock);
1624 
1625 	if (cip->ci_flags & FMD_CF_SOLVED)
1626 		state = FMD_EVS_DIAGNOSED;
1627 	else
1628 		state = FMD_EVS_ACCEPTED;
1629 
1630 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1631 		if (cit->cit_event == ep)
1632 			break;
1633 	}
1634 
1635 	new = cit == NULL && ep != cip->ci_principal;
1636 
1637 	/*
1638 	 * If the event is already in the case or the case is already solved,
1639 	 * there is no reason to save it: just transition it appropriately.
1640 	 */
1641 	if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
1642 		(void) pthread_mutex_unlock(&cip->ci_lock);
1643 		fmd_event_transition(ep, state);
1644 		return (new);
1645 	}
1646 
1647 	cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
1648 	fmd_event_hold(ep);
1649 
1650 	if (nvlist_lookup_boolean_value(((fmd_event_impl_t *)ep)->ev_nvl,
1651 	    "__injected", &injected) == 0 && injected)
1652 		fmd_case_set_injected(cp);
1653 
1654 	cit->cit_next = cip->ci_items;
1655 	cit->cit_event = ep;
1656 
1657 	cip->ci_items = cit;
1658 	cip->ci_nitems++;
1659 
1660 	cip->ci_flags |= FMD_CF_DIRTY;
1661 	(void) pthread_mutex_unlock(&cip->ci_lock);
1662 
1663 	fmd_module_setcdirty(cip->ci_mod);
1664 	fmd_event_transition(ep, state);
1665 
1666 	return (new);
1667 }
1668 
1669 void
fmd_case_insert_suspect(fmd_case_t * cp,nvlist_t * nvl)1670 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
1671 {
1672 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1673 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1674 
1675 	(void) pthread_mutex_lock(&cip->ci_lock);
1676 	ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT);
1677 	cip->ci_flags |= FMD_CF_DIRTY;
1678 
1679 	cis->cis_next = cip->ci_suspects;
1680 	cis->cis_nvl = nvl;
1681 
1682 	cip->ci_suspects = cis;
1683 	cip->ci_nsuspects++;
1684 
1685 	(void) pthread_mutex_unlock(&cip->ci_lock);
1686 	if (cip->ci_xprt == NULL)
1687 		fmd_module_setcdirty(cip->ci_mod);
1688 }
1689 
1690 void
fmd_case_recreate_suspect(fmd_case_t * cp,nvlist_t * nvl)1691 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
1692 {
1693 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1694 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1695 	boolean_t b;
1696 
1697 	(void) pthread_mutex_lock(&cip->ci_lock);
1698 
1699 	cis->cis_next = cip->ci_suspects;
1700 	cis->cis_nvl = nvl;
1701 
1702 	if (nvlist_lookup_boolean_value(nvl,
1703 	    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
1704 		cip->ci_flags |= FMD_CF_INVISIBLE;
1705 
1706 	cip->ci_suspects = cis;
1707 	cip->ci_nsuspects++;
1708 
1709 	(void) pthread_mutex_unlock(&cip->ci_lock);
1710 }
1711 
1712 void
fmd_case_reset_suspects(fmd_case_t * cp)1713 fmd_case_reset_suspects(fmd_case_t *cp)
1714 {
1715 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1716 
1717 	(void) pthread_mutex_lock(&cip->ci_lock);
1718 	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
1719 
1720 	fmd_case_destroy_suspects(cip);
1721 	cip->ci_flags |= FMD_CF_DIRTY;
1722 
1723 	(void) pthread_mutex_unlock(&cip->ci_lock);
1724 	fmd_module_setcdirty(cip->ci_mod);
1725 }
1726 
1727 /*ARGSUSED*/
1728 static void
fmd_case_unusable(fmd_asru_link_t * alp,void * arg)1729 fmd_case_unusable(fmd_asru_link_t *alp, void *arg)
1730 {
1731 	(void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE);
1732 }
1733 
1734 /*
1735  * Grab ci_lock and update the case state and set the dirty bit.  Then perform
1736  * whatever actions and emit whatever events are appropriate for the state.
1737  * Refer to the topmost block comment explaining the state machine for details.
1738  */
1739 void
fmd_case_transition(fmd_case_t * cp,uint_t state,uint_t flags)1740 fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
1741 {
1742 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1743 	fmd_case_item_t *cit;
1744 	fmd_event_t *e;
1745 	int resolved = 0;
1746 	int any_unusable_and_present = 0;
1747 
1748 	ASSERT(state <= FMD_CASE_RESOLVED);
1749 	(void) pthread_mutex_lock(&cip->ci_lock);
1750 
1751 	if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED))
1752 		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED | FMD_CF_RESOLVED);
1753 
1754 	cip->ci_flags |= flags;
1755 
1756 	if (cip->ci_state >= state) {
1757 		(void) pthread_mutex_unlock(&cip->ci_lock);
1758 		return; /* already in specified state */
1759 	}
1760 
1761 	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
1762 	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));
1763 
1764 	cip->ci_state = state;
1765 	cip->ci_flags |= FMD_CF_DIRTY;
1766 
1767 	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
1768 		fmd_module_setcdirty(cip->ci_mod);
1769 
1770 	switch (state) {
1771 	case FMD_CASE_SOLVED:
1772 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
1773 			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);
1774 
1775 		if (cip->ci_principal != NULL) {
1776 			fmd_event_transition(cip->ci_principal,
1777 			    FMD_EVS_DIAGNOSED);
1778 		}
1779 		break;
1780 
1781 	case FMD_CASE_CLOSE_WAIT:
1782 		/*
1783 		 * If the case was never solved, do not change ASRUs.
1784 		 * If the case was never fmd_case_closed, do not change ASRUs.
1785 		 * If the case was repaired, do not change ASRUs.
1786 		 */
1787 		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
1788 		    FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED))
1789 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1790 			    fmd_case_unusable, NULL);
1791 
1792 		/*
1793 		 * If an orphaned case transitions to CLOSE_WAIT, the owning
1794 		 * module is no longer loaded: continue on to CASE_CLOSED or
1795 		 * CASE_REPAIRED as appropriate.
1796 		 */
1797 		if (fmd_case_orphaned(cp)) {
1798 			if (cip->ci_flags & FMD_CF_REPAIRED) {
1799 				state = cip->ci_state = FMD_CASE_REPAIRED;
1800 				TRACE((FMD_DBG_CASE, "case %s %s->%s",
1801 				    cip->ci_uuid,
1802 				    _fmd_case_snames[FMD_CASE_CLOSE_WAIT],
1803 				    _fmd_case_snames[FMD_CASE_REPAIRED]));
1804 				goto do_repair;
1805 			} else {
1806 				state = cip->ci_state = FMD_CASE_CLOSED;
1807 				TRACE((FMD_DBG_CASE, "case %s %s->%s",
1808 				    cip->ci_uuid,
1809 				    _fmd_case_snames[FMD_CASE_CLOSE_WAIT],
1810 				    _fmd_case_snames[FMD_CASE_CLOSED]));
1811 			}
1812 		}
1813 		break;
1814 
1815 	case FMD_CASE_REPAIRED:
1816 do_repair:
1817 		ASSERT(cip->ci_xprt != NULL || fmd_case_orphaned(cp));
1818 
1819 		/*
1820 		 * If we've been requested to transition straight on to the
1821 		 * RESOLVED state (which can happen with fault proxying where a
1822 		 * list.resolved or a uuresolved is received from the other
1823 		 * side), or if all suspects are already either usable or not
1824 		 * present then transition straight to RESOLVED state,
1825 		 * publishing both the list.repaired and list.resolved. For a
1826 		 * proxy, if we discover here that all suspects are already
1827 		 * either usable or not present, notify the diag side instead
1828 		 * using fmd_xprt_uuresolved().
1829 		 */
1830 		if (flags & FMD_CF_RESOLVED) {
1831 			if (cip->ci_xprt != NULL)
1832 				fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1833 		} else {
1834 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1835 			    fmd_case_unusable_and_present,
1836 			    &any_unusable_and_present);
1837 			if (any_unusable_and_present)
1838 				break;
1839 			if (cip->ci_xprt != NULL) {
1840 				fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid);
1841 				break;
1842 			}
1843 		}
1844 
1845 		cip->ci_state = FMD_CASE_RESOLVED;
1846 		(void) pthread_mutex_unlock(&cip->ci_lock);
1847 		fmd_case_publish(cp, state);
1848 		TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
1849 		    _fmd_case_snames[FMD_CASE_REPAIRED],
1850 		    _fmd_case_snames[FMD_CASE_RESOLVED]));
1851 		state = FMD_CASE_RESOLVED;
1852 		resolved = 1;
1853 		(void) pthread_mutex_lock(&cip->ci_lock);
1854 		break;
1855 
1856 	case FMD_CASE_RESOLVED:
1857 		/*
1858 		 * For a proxy, no need to check that all suspects are already
1859 		 * either usable or not present - this request has come from
1860 		 * the diagnosing side which makes the final decision on this.
1861 		 */
1862 		if (cip->ci_xprt != NULL) {
1863 			fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1864 			resolved = 1;
1865 			break;
1866 		}
1867 
1868 		ASSERT(fmd_case_orphaned(cp));
1869 
1870 		/*
1871 		 * If all suspects are already either usable or not present then
1872 		 * carry on, publish list.resolved and discard the case.
1873 		 */
1874 		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1875 		    fmd_case_unusable_and_present, &any_unusable_and_present);
1876 		if (any_unusable_and_present) {
1877 			(void) pthread_mutex_unlock(&cip->ci_lock);
1878 			return;
1879 		}
1880 
1881 		resolved = 1;
1882 		break;
1883 	}
1884 
1885 	(void) pthread_mutex_unlock(&cip->ci_lock);
1886 
1887 	/*
1888 	 * If the module has initialized, then publish the appropriate event
1889 	 * for the new case state.  If not, we are being called from the
1890 	 * checkpoint code during module load, in which case the module's
1891 	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
1892 	 * may not be open yet, which will prevent us from computing the event
1893 	 * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
1894 	 * event in our queue: this won't be processed until _fmd_init is done.
1895 	 */
1896 	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
1897 		fmd_case_publish(cp, state);
1898 	else {
1899 		fmd_case_hold(cp);
1900 		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
1901 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
1902 	}
1903 
1904 	if (resolved) {
1905 		if (cip->ci_xprt != NULL) {
1906 			/*
1907 			 * If we transitioned to RESOLVED, adjust the reference
1908 			 * count to reflect our removal from
1909 			 * fmd.d_rmod->mod_cases above.  If the caller has not
1910 			 * placed an additional hold on the case, it will now
1911 			 * be freed.
1912 			 */
1913 			(void) pthread_mutex_lock(&cip->ci_lock);
1914 			fmd_asru_hash_delete_case(fmd.d_asrus, cp);
1915 			(void) pthread_mutex_unlock(&cip->ci_lock);
1916 			fmd_case_rele(cp);
1917 		} else {
1918 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1919 			    fmd_asru_log_resolved, NULL);
1920 			(void) pthread_mutex_lock(&cip->ci_lock);
			/* mark as "ready to be discarded" */
1922 			cip->ci_flags |= FMD_CF_RES_CMPL;
1923 			(void) pthread_mutex_unlock(&cip->ci_lock);
1924 		}
1925 	}
1926 }
1927 
1928 /*
1929  * Discard any case if it is in RESOLVED state (and if check_if_aged argument
1930  * is set if all suspects have passed the rsrc.aged time).
1931  */
1932 void
fmd_case_discard_resolved(fmd_case_t * cp,void * arg)1933 fmd_case_discard_resolved(fmd_case_t *cp, void *arg)
1934 {
1935 	int check_if_aged = *(int *)arg;
1936 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1937 
1938 	/*
1939 	 * First check if case has completed transition to resolved.
1940 	 */
1941 	(void) pthread_mutex_lock(&cip->ci_lock);
1942 	if (!(cip->ci_flags & FMD_CF_RES_CMPL)) {
1943 		(void) pthread_mutex_unlock(&cip->ci_lock);
1944 		return;
1945 	}
1946 
1947 	/*
1948 	 * Now if check_is_aged is set, see if all suspects have aged.
1949 	 */
1950 	if (check_if_aged) {
1951 		int aged = 1;
1952 
1953 		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1954 		    fmd_asru_check_if_aged, &aged);
1955 		if (!aged) {
1956 			(void) pthread_mutex_unlock(&cip->ci_lock);
1957 			return;
1958 		}
1959 	}
1960 
1961 	/*
1962 	 * Finally discard the case, clearing FMD_CF_RES_CMPL so we don't
1963 	 * do it twice.
1964 	 */
1965 	fmd_module_lock(cip->ci_mod);
1966 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1967 	fmd_module_unlock(cip->ci_mod);
1968 	fmd_asru_hash_delete_case(fmd.d_asrus, cp);
1969 	cip->ci_flags &= ~FMD_CF_RES_CMPL;
1970 	(void) pthread_mutex_unlock(&cip->ci_lock);
1971 	fmd_case_rele(cp);
1972 }
1973 
1974 /*
1975  * Transition the specified case to *at least* the specified state by first
1976  * re-validating the suspect list using the resource cache.  This function is
1977  * employed by the checkpoint code when restoring a saved, solved case to see
1978  * if the state of the case has effectively changed while fmd was not running
1979  * or the module was not loaded.
1980  */
1981 void
fmd_case_transition_update(fmd_case_t * cp,uint_t state,uint_t flags)1982 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
1983 {
1984 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1985 
1986 	int usable = 0;		/* are any suspects usable? */
1987 
1988 	ASSERT(state >= FMD_CASE_SOLVED);
1989 	(void) pthread_mutex_lock(&cip->ci_lock);
1990 
1991 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable);
1992 
1993 	(void) pthread_mutex_unlock(&cip->ci_lock);
1994 
1995 	if (!usable) {
1996 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
1997 		flags |= FMD_CF_ISOLATED;
1998 	}
1999 
2000 	fmd_case_transition(cp, state, flags);
2001 }
2002 
2003 void
fmd_case_setdirty(fmd_case_t * cp)2004 fmd_case_setdirty(fmd_case_t *cp)
2005 {
2006 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2007 
2008 	(void) pthread_mutex_lock(&cip->ci_lock);
2009 	cip->ci_flags |= FMD_CF_DIRTY;
2010 	(void) pthread_mutex_unlock(&cip->ci_lock);
2011 
2012 	fmd_module_setcdirty(cip->ci_mod);
2013 }
2014 
2015 void
fmd_case_clrdirty(fmd_case_t * cp)2016 fmd_case_clrdirty(fmd_case_t *cp)
2017 {
2018 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2019 
2020 	(void) pthread_mutex_lock(&cip->ci_lock);
2021 	cip->ci_flags &= ~FMD_CF_DIRTY;
2022 	(void) pthread_mutex_unlock(&cip->ci_lock);
2023 }
2024 
2025 void
fmd_case_commit(fmd_case_t * cp)2026 fmd_case_commit(fmd_case_t *cp)
2027 {
2028 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2029 	fmd_case_item_t *cit;
2030 
2031 	(void) pthread_mutex_lock(&cip->ci_lock);
2032 
2033 	if (cip->ci_flags & FMD_CF_DIRTY) {
2034 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
2035 			fmd_event_commit(cit->cit_event);
2036 
2037 		if (cip->ci_principal != NULL)
2038 			fmd_event_commit(cip->ci_principal);
2039 
2040 		fmd_buf_hash_commit(&cip->ci_bufs);
2041 		cip->ci_flags &= ~FMD_CF_DIRTY;
2042 	}
2043 
2044 	(void) pthread_mutex_unlock(&cip->ci_lock);
2045 }
2046 
2047 /*
2048  * On proxy side, send back repair/acquit/etc request to diagnosing side
2049  */
2050 void
fmd_case_xprt_updated(fmd_case_t * cp)2051 fmd_case_xprt_updated(fmd_case_t *cp)
2052 {
2053 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2054 	nvlist_t **nva;
2055 	uint8_t *ba;
2056 	int msg = B_TRUE;
2057 	int count = 0;
2058 	fmd_case_lst_t fcl;
2059 
2060 	ASSERT(cip->ci_xprt != NULL);
2061 	(void) pthread_mutex_lock(&cip->ci_lock);
2062 	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
2063 	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
2064 	fcl.fcl_countp = &count;
2065 	fcl.fcl_maxcount = cip->ci_nsuspects;
2066 	fcl.fcl_msgp = &msg;
2067 	fcl.fcl_ba = ba;
2068 	fcl.fcl_nva = nva;
2069 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
2070 	(void) pthread_mutex_unlock(&cip->ci_lock);
2071 	fmd_xprt_updated(cip->ci_xprt, cip->ci_uuid, ba, cip->ci_proxy_asru,
2072 	    count);
2073 }
2074 
2075 /*
2076  * fmd_case_update_status() can be called on either the proxy side when a
2077  * list.suspect is received, or on the diagnosing side when an update request
2078  * is received from the proxy. It updates the status in the resource cache.
2079  */
2080 void
fmd_case_update_status(fmd_case_t * cp,uint8_t * statusp,uint8_t * proxy_asrup,uint8_t * diag_asrup)2081 fmd_case_update_status(fmd_case_t *cp, uint8_t *statusp, uint8_t *proxy_asrup,
2082     uint8_t *diag_asrup)
2083 {
2084 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2085 	int count = 0;
2086 	fmd_asru_update_status_t faus;
2087 
2088 	/*
2089 	 * update status of resource cache entries
2090 	 */
2091 	faus.faus_countp = &count;
2092 	faus.faus_maxcount = cip->ci_nsuspects;
2093 	faus.faus_ba = statusp;
2094 	faus.faus_proxy_asru = proxy_asrup;
2095 	faus.faus_diag_asru = diag_asrup;
2096 	faus.faus_is_proxy = (cip->ci_xprt != NULL);
2097 	(void) pthread_mutex_lock(&cip->ci_lock);
2098 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_update_status,
2099 	    &faus);
2100 	(void) pthread_mutex_unlock(&cip->ci_lock);
2101 }
2102 
2103 /*
2104  * Called on either the proxy side or the diag side when a repair has taken
2105  * place on the other side but this side may know the asru "contains"
2106  * relationships.
2107  */
2108 void
fmd_case_update_containees(fmd_case_t * cp)2109 fmd_case_update_containees(fmd_case_t *cp)
2110 {
2111 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2112 
2113 	(void) pthread_mutex_lock(&cip->ci_lock);
2114 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2115 	    fmd_asru_update_containees, NULL);
2116 	(void) pthread_mutex_unlock(&cip->ci_lock);
2117 }
2118 
2119 /*
2120  * fmd_case_close_status() is called on diagnosing side when proxy side
2121  * has had a uuclose. It updates the status in the resource cache.
2122  */
2123 void
fmd_case_close_status(fmd_case_t * cp)2124 fmd_case_close_status(fmd_case_t *cp)
2125 {
2126 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2127 	int count = 0;
2128 	fmd_asru_close_status_t facs;
2129 
2130 	/*
2131 	 * update status of resource cache entries
2132 	 */
2133 	facs.facs_countp = &count;
2134 	facs.facs_maxcount = cip->ci_nsuspects;
2135 	(void) pthread_mutex_lock(&cip->ci_lock);
2136 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_close_status,
2137 	    &facs);
2138 	(void) pthread_mutex_unlock(&cip->ci_lock);
2139 }
2140 
2141 /*
2142  * Indicate that the case may need to change state because one or more of the
2143  * ASRUs named as a suspect has changed state.  We examine all the suspects
2144  * and if none are still faulty, we initiate a case close transition.
2145  */
2146 void
fmd_case_update(fmd_case_t * cp)2147 fmd_case_update(fmd_case_t *cp)
2148 {
2149 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2150 	uint_t cstate;
2151 	int faulty = 0;
2152 
2153 	(void) pthread_mutex_lock(&cip->ci_lock);
2154 	cstate = cip->ci_state;
2155 
2156 	if (cip->ci_state < FMD_CASE_SOLVED) {
2157 		(void) pthread_mutex_unlock(&cip->ci_lock);
2158 		return; /* update is not appropriate */
2159 	}
2160 
2161 	if (cip->ci_flags & FMD_CF_REPAIRED) {
2162 		(void) pthread_mutex_unlock(&cip->ci_lock);
2163 		return; /* already repaired */
2164 	}
2165 
2166 	TRACE((FMD_DBG_CASE, "case update %s", cip->ci_uuid));
2167 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
2168 	(void) pthread_mutex_unlock(&cip->ci_lock);
2169 
2170 	if (faulty) {
2171 		nvlist_t *nvl;
2172 		fmd_event_t *e;
2173 		char *class;
2174 
2175 		TRACE((FMD_DBG_CASE, "sending list.updated %s", cip->ci_uuid));
2176 		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
2177 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2178 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
2179 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
2180 		fmd_log_append(fmd.d_fltlog, e, cp);
2181 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
2182 		fmd_dispq_dispatch(fmd.d_disp, e, class);
2183 		return; /* one or more suspects are still marked faulty */
2184 	}
2185 
2186 	if (cstate == FMD_CASE_CLOSED)
2187 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2188 	else
2189 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2190 }
2191 
2192 /*
2193  * Delete a closed case from the module's case list once the fmdo_close() entry
2194  * point has run to completion.  If the case is owned by a transport module,
2195  * tell the transport to proxy a case close on the other end of the transport.
2196  * Transition to the appropriate next state based on ci_flags.  This
2197  * function represents the end of CLOSE_WAIT and transitions the case to either
2198  * CLOSED or REPAIRED or discards it entirely because it was never solved;
2199  * refer to the topmost block comment explaining the state machine for details.
2200  */
2201 void
fmd_case_delete(fmd_case_t * cp)2202 fmd_case_delete(fmd_case_t *cp)
2203 {
2204 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2205 	fmd_modstat_t *msp;
2206 	size_t buftotal;
2207 
2208 	TRACE((FMD_DBG_CASE, "case delete %s", cip->ci_uuid));
2209 	ASSERT(fmd_module_locked(cip->ci_mod));
2210 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
2211 	buftotal = fmd_buf_hash_destroy(&cip->ci_bufs);
2212 
2213 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
2214 	msp = cip->ci_mod->mod_stats;
2215 
2216 	ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0);
2217 	msp->ms_caseopen.fmds_value.ui64--;
2218 
2219 	ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal);
2220 	msp->ms_buftotal.fmds_value.ui64 -= buftotal;
2221 
2222 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
2223 
2224 	if (cip->ci_xprt == NULL)
2225 		fmd_module_setcdirty(cip->ci_mod);
2226 
2227 	fmd_module_rele(cip->ci_mod);
2228 	cip->ci_mod = fmd.d_rmod;
2229 	fmd_module_hold(cip->ci_mod);
2230 
2231 	/*
2232 	 * If the case has been solved, then retain it
2233 	 * on the root module's case list at least until we're transitioned.
2234 	 * Otherwise free the case with our final fmd_case_rele() below.
2235 	 */
2236 	if (cip->ci_flags & FMD_CF_SOLVED) {
2237 		fmd_module_lock(cip->ci_mod);
2238 		fmd_list_append(&cip->ci_mod->mod_cases, cip);
2239 		fmd_module_unlock(cip->ci_mod);
2240 		fmd_case_hold(cp);
2241 	}
2242 
2243 	/*
2244 	 * Transition onwards to REPAIRED or CLOSED as originally requested.
2245 	 * Note that for proxy case if we're transitioning to CLOSED it means
2246 	 * the case was isolated locally, so call fmd_xprt_uuclose() to notify
2247 	 * the diagnosing side. No need to notify the diagnosing side if we are
2248 	 * transitioning to REPAIRED as we only do this when requested to do
2249 	 * so by the diagnosing side anyway.
2250 	 */
2251 	if (cip->ci_flags & FMD_CF_REPAIRED)
2252 		fmd_case_transition(cp, FMD_CASE_REPAIRED, 0);
2253 	else if (cip->ci_flags & FMD_CF_ISOLATED) {
2254 		fmd_case_transition(cp, FMD_CASE_CLOSED, 0);
2255 		if (cip->ci_xprt != NULL)
2256 			fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid);
2257 	}
2258 
2259 	fmd_case_rele(cp);
2260 }
2261 
2262 void
fmd_case_discard(fmd_case_t * cp,boolean_t delete_from_asru_cache)2263 fmd_case_discard(fmd_case_t *cp, boolean_t delete_from_asru_cache)
2264 {
2265 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2266 
2267 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
2268 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--;
2269 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
2270 
2271 	ASSERT(fmd_module_locked(cip->ci_mod));
2272 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
2273 	if (delete_from_asru_cache) {
2274 		(void) pthread_mutex_lock(&cip->ci_lock);
2275 		fmd_asru_hash_delete_case(fmd.d_asrus, cp);
2276 		(void) pthread_mutex_unlock(&cip->ci_lock);
2277 	}
2278 	fmd_case_rele(cp);
2279 }
2280 
2281 /*
2282  * Indicate that the problem corresponding to a case has been repaired by
2283  * clearing the faulty bit on each ASRU named as a suspect.  If the case hasn't
2284  * already been closed, this function initiates the transition to CLOSE_WAIT.
2285  * The caller must have the case held from fmd_case_hash_lookup(), so we can
2286  * grab and drop ci_lock without the case being able to be freed in between.
2287  */
2288 int
fmd_case_repair(fmd_case_t * cp)2289 fmd_case_repair(fmd_case_t *cp)
2290 {
2291 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2292 	uint_t cstate;
2293 	fmd_asru_rep_arg_t fara;
2294 
2295 	(void) pthread_mutex_lock(&cip->ci_lock);
2296 	cstate = cip->ci_state;
2297 
2298 	if (cstate < FMD_CASE_SOLVED) {
2299 		(void) pthread_mutex_unlock(&cip->ci_lock);
2300 		return (fmd_set_errno(EFMD_CASE_STATE));
2301 	}
2302 
2303 	if (cip->ci_flags & FMD_CF_REPAIRED) {
2304 		(void) pthread_mutex_unlock(&cip->ci_lock);
2305 		return (0); /* already repaired */
2306 	}
2307 
2308 	TRACE((FMD_DBG_CASE, "case repair %s", cip->ci_uuid));
2309 	fara.fara_reason = FMD_ASRU_REPAIRED;
2310 	fara.fara_bywhat = FARA_BY_CASE;
2311 	fara.fara_rval = NULL;
2312 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara);
2313 	(void) pthread_mutex_unlock(&cip->ci_lock);
2314 
2315 	/*
2316 	 * if this is a proxied case, send the repair across the transport.
2317 	 * The remote side will then do the repair and send a list.repaired back
2318 	 * again such that we can finally repair the case on this side.
2319 	 */
2320 	if (cip->ci_xprt != NULL) {
2321 		fmd_case_xprt_updated(cp);
2322 		return (0);
2323 	}
2324 
2325 	if (cstate == FMD_CASE_CLOSED)
2326 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2327 	else
2328 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2329 
2330 	return (0);
2331 }
2332 
2333 int
fmd_case_acquit(fmd_case_t * cp)2334 fmd_case_acquit(fmd_case_t *cp)
2335 {
2336 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2337 	uint_t cstate;
2338 	fmd_asru_rep_arg_t fara;
2339 
2340 	(void) pthread_mutex_lock(&cip->ci_lock);
2341 	cstate = cip->ci_state;
2342 
2343 	if (cstate < FMD_CASE_SOLVED) {
2344 		(void) pthread_mutex_unlock(&cip->ci_lock);
2345 		return (fmd_set_errno(EFMD_CASE_STATE));
2346 	}
2347 
2348 	if (cip->ci_flags & FMD_CF_REPAIRED) {
2349 		(void) pthread_mutex_unlock(&cip->ci_lock);
2350 		return (0); /* already repaired */
2351 	}
2352 
2353 	TRACE((FMD_DBG_CASE, "case acquit %s", cip->ci_uuid));
2354 	fara.fara_reason = FMD_ASRU_ACQUITTED;
2355 	fara.fara_bywhat = FARA_BY_CASE;
2356 	fara.fara_rval = NULL;
2357 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara);
2358 	(void) pthread_mutex_unlock(&cip->ci_lock);
2359 
2360 	/*
2361 	 * if this is a proxied case, send the repair across the transport.
2362 	 * The remote side will then do the repair and send a list.repaired back
2363 	 * again such that we can finally repair the case on this side.
2364 	 */
2365 	if (cip->ci_xprt != NULL) {
2366 		fmd_case_xprt_updated(cp);
2367 		return (0);
2368 	}
2369 
2370 	if (cstate == FMD_CASE_CLOSED)
2371 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2372 	else
2373 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2374 
2375 	return (0);
2376 }
2377 
2378 int
fmd_case_contains(fmd_case_t * cp,fmd_event_t * ep)2379 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep)
2380 {
2381 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2382 	fmd_case_item_t *cit;
2383 	uint_t state;
2384 	int rv = 0;
2385 
2386 	(void) pthread_mutex_lock(&cip->ci_lock);
2387 
2388 	if (cip->ci_state >= FMD_CASE_SOLVED)
2389 		state = FMD_EVS_DIAGNOSED;
2390 	else
2391 		state = FMD_EVS_ACCEPTED;
2392 
2393 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
2394 		if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0)
2395 			break;
2396 	}
2397 
2398 	if (rv == 0 && cip->ci_principal != NULL)
2399 		rv = fmd_event_equal(ep, cip->ci_principal);
2400 
2401 	(void) pthread_mutex_unlock(&cip->ci_lock);
2402 
2403 	if (rv != 0)
2404 		fmd_event_transition(ep, state);
2405 
2406 	return (rv);
2407 }
2408 
2409 int
fmd_case_orphaned(fmd_case_t * cp)2410 fmd_case_orphaned(fmd_case_t *cp)
2411 {
2412 	return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod);
2413 }
2414 
2415 void
fmd_case_settime(fmd_case_t * cp,time_t tv_sec,suseconds_t tv_usec)2416 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec)
2417 {
2418 	((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec;
2419 	((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec;
2420 	((fmd_case_impl_t *)cp)->ci_tv_valid = 1;
2421 }
2422 
2423 void
fmd_case_set_injected(fmd_case_t * cp)2424 fmd_case_set_injected(fmd_case_t *cp)
2425 {
2426 	((fmd_case_impl_t *)cp)->ci_injected = 1;
2427 }
2428 
2429 void
fmd_case_set_de_fmri(fmd_case_t * cp,nvlist_t * nvl)2430 fmd_case_set_de_fmri(fmd_case_t *cp, nvlist_t *nvl)
2431 {
2432 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2433 
2434 	if (cip->ci_diag_de)
2435 		nvlist_free(cip->ci_diag_de);
2436 	cip->ci_diag_de = nvl;
2437 }
2438 
2439 void
fmd_case_setcode(fmd_case_t * cp,char * code)2440 fmd_case_setcode(fmd_case_t *cp, char *code)
2441 {
2442 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2443 
2444 	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
2445 	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;
2446 }
2447 
2448 /*ARGSUSED*/
2449 static void
fmd_case_repair_replay_case(fmd_case_t * cp,void * arg)2450 fmd_case_repair_replay_case(fmd_case_t *cp, void *arg)
2451 {
2452 	int not_faulty = 0;
2453 	int faulty = 0;
2454 	nvlist_t *nvl;
2455 	fmd_event_t *e;
2456 	char *class;
2457 	int any_unusable_and_present = 0;
2458 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2459 
2460 	if (cip->ci_state < FMD_CASE_SOLVED || cip->ci_xprt != NULL)
2461 		return;
2462 
2463 	if (cip->ci_state == FMD_CASE_RESOLVED) {
2464 		cip->ci_flags |= FMD_CF_RES_CMPL;
2465 		return;
2466 	}
2467 
2468 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
2469 	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty,
2470 	    &not_faulty);
2471 
2472 	if (cip->ci_state >= FMD_CASE_REPAIRED && !faulty) {
2473 		/*
2474 		 * If none of the suspects is faulty, replay the list.repaired.
2475 		 * If all suspects are already either usable or not present then
2476 		 * also transition straight to RESOLVED state.
2477 		 */
2478 		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2479 		    fmd_case_unusable_and_present, &any_unusable_and_present);
2480 		if (!any_unusable_and_present) {
2481 			cip->ci_state = FMD_CASE_RESOLVED;
2482 
2483 			TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
2484 			    cip->ci_uuid));
2485 			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
2486 			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2487 			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
2488 			    class);
2489 			fmd_dispq_dispatch(fmd.d_disp, e, class);
2490 
2491 			TRACE((FMD_DBG_CASE, "replay sending list.resolved %s",
2492 			    cip->ci_uuid));
2493 			fmd_case_publish(cp, FMD_CASE_RESOLVED);
2494 			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2495 			    fmd_asru_log_resolved, NULL);
2496 			cip->ci_flags |= FMD_CF_RES_CMPL;
2497 		} else {
2498 			TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
2499 			    cip->ci_uuid));
2500 			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
2501 			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2502 			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
2503 			    class);
2504 			fmd_dispq_dispatch(fmd.d_disp, e, class);
2505 		}
2506 	} else if (faulty && not_faulty) {
2507 		/*
2508 		 * if some but not all of the suspects are not faulty, replay
2509 		 * the list.updated.
2510 		 */
2511 		TRACE((FMD_DBG_CASE, "replay sending list.updated %s",
2512 		    cip->ci_uuid));
2513 		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
2514 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2515 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
2516 		fmd_dispq_dispatch(fmd.d_disp, e, class);
2517 	}
2518 }
2519 
2520 void
fmd_case_repair_replay()2521 fmd_case_repair_replay()
2522 {
2523 	fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL);
2524 }
2525