xref: /titanic_44/usr/src/cmd/fm/modules/common/eversholt/fme.c (revision 28cdc3d776761766afeb198769d1b70ed7e0f2e1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  *
26  * fme.c -- fault management exercise module
27  *
28  * this module provides the simulated fault management exercise.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <strings.h>
37 #include <ctype.h>
38 #include <alloca.h>
39 #include <libnvpair.h>
40 #include <sys/fm/protocol.h>
41 #include <fm/fmd_api.h>
42 #include "alloc.h"
43 #include "out.h"
44 #include "stats.h"
45 #include "stable.h"
46 #include "literals.h"
47 #include "lut.h"
48 #include "tree.h"
49 #include "ptree.h"
50 #include "itree.h"
51 #include "ipath.h"
52 #include "fme.h"
53 #include "evnv.h"
54 #include "eval.h"
55 #include "config.h"
56 #include "platform.h"
57 
/* imported from eft.c... */
extern char *Autoclose;
extern int Dupclose;
extern hrtime_t Hesitate;
/* allocator handle handed to nvlist_xpack()/nvlist_xunpack() below */
extern nv_alloc_t Eft_nv_hdl;
extern int Max_fme;
extern fmd_hdl_t *Hdl;

static int Istat_need_save;
void istat_save(void);

/* fme under construction is global so we can free it on module abort */
static struct fme *Nfmep;

/*
 * Reason for the most recent diagnosis failure.  fme_restart() attaches
 * it to the undiagnosable defect it publishes.
 */
static const char *Undiag_reason;

/*
 * Id given to the next FME created by newfme().  fme_restart() bumps it
 * past the id of any FME it restores.
 */
static int Nextid = 0;

static int Open_fme_count = 0;	/* Count of open FMEs */
77 
/*
 * list of fault management exercises underway
 *
 * One struct fme describes a single exercise.  alloc_fme() hands them
 * out zero-filled.  FMElist is the head and EFMElist the tail of the
 * list that fme_ready() appends to; ClosedFMEs is a separate list that
 * fme_fini() also tears down.
 */
static struct fme {
	struct fme *next;		/* next exercise */
	unsigned long long ull;		/* time when fme was created */
	int id;				/* FME id */
	struct cfgdata *cfgdata;	/* full configuration data */
	struct lut *eventtree;		/* propagation tree for this FME */
	/*
	 * The initial error report that created this FME is kept in
	 * two forms.  e0 points to the instance tree node and is used
	 * by fme_eval() as the starting point for the inference
	 * algorithm.  e0r is the event handle FMD passed to us when
	 * the ereport first arrived and is used when setting timers,
	 * which are always relative to the time of this initial
	 * report.
	 */
	struct event *e0;
	fmd_event_t *e0r;

	id_t    timer;			/* for setting an fmd time-out */
	id_t	htid;			/* for setting hesitation timer */

	struct event *ecurrent;		/* ereport under consideration */
	struct event *suspects;		/* current suspect list */
	struct event *psuspects;	/* previous suspect list */
	int nsuspects;			/* count of suspects */
	int nonfault;			/* zero if all suspects T_FAULT */
	int posted_suspects;		/* true if we've posted a diagnosis */
	int hesitated;			/* true if we hesitated */
	/* also the index used to name the next "observed%d" case buffer */
	int uniqobs;			/* number of unique events observed */
	int peek;			/* just peeking, don't track suspects */
	int overflow;			/* true if overflow FME */
	enum fme_state {
		FME_NOTHING = 5000,	/* not evaluated yet */
		FME_WAIT,		/* need to wait for more info */
		FME_CREDIBLE,		/* suspect list is credible */
		FME_DISPROVED,		/* no valid suspects found */
		FME_DEFERRED		/* don't know yet (k-count not met) */
	} state;

	unsigned long long pull;	/* time passed since created */
	unsigned long long wull;	/* wait until this time for re-eval */
	struct event *observations;	/* observation list */
	struct lut *globals;		/* values of global variables */
	/* fmd interfacing */
	fmd_hdl_t *hdl;			/* handle for talking with fmd */
	fmd_case_t *fmcase;		/* what fmd 'case' we associate with */
	/* stats -- created by fme_ready(), deleted by destroy_fme() */
	struct stats *Rcount;
	struct stats *Hcallcount;
	struct stats *Rcallcount;
	struct stats *Ccallcount;
	struct stats *Ecallcount;
	struct stats *Tcallcount;
	struct stats *Marrowcount;
	struct stats *diags;
} *FMElist, *EFMElist, *ClosedFMEs;
135 
/*
 * Cases that fme_restart() could not reconstitute.  New entries are
 * pushed on the front of Undiagablecaselist; fme_fini() frees the list.
 */
static struct case_list {
	fmd_case_t *fmcase;
	struct case_list *next;
} *Undiagablecaselist;
140 
/* forward declarations for static functions used before their definition */
static void fme_eval(struct fme *fmep, fmd_event_t *ffep);
static enum fme_state hypothesise(struct fme *fmep, struct event *ep,
	unsigned long long at_latest_by, unsigned long long *pdelay);
static struct node *eventprop_lookup(struct event *ep, const char *propname);
static struct node *pathstring2epnamenp(char *path);
static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep);
static void restore_suspects(struct fme *fmep);
static void save_suspects(struct fme *fmep);
static void destroy_fme(struct fme *f);
static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
    const char *eventstring, const struct ipath *ipp, nvlist_t *nvl);
static void istat_counter_reset_cb(struct istat_entry *entp,
    struct stats *statp, const struct ipath *ipp);
154 
155 static struct fme *
156 alloc_fme(void)
157 {
158 	struct fme *fmep;
159 
160 	fmep = MALLOC(sizeof (*fmep));
161 	bzero(fmep, sizeof (*fmep));
162 	return (fmep);
163 }
164 
165 /*
166  * fme_ready -- called when all initialization of the FME (except for
167  *	stats) has completed successfully.  Adds the fme to global lists
168  *	and establishes its stats.
169  */
170 static struct fme *
171 fme_ready(struct fme *fmep)
172 {
173 	char nbuf[100];
174 
175 	Nfmep = NULL;	/* don't need to free this on module abort now */
176 
177 	if (EFMElist) {
178 		EFMElist->next = fmep;
179 		EFMElist = fmep;
180 	} else
181 		FMElist = EFMElist = fmep;
182 
183 	(void) sprintf(nbuf, "fme%d.Rcount", fmep->id);
184 	fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
185 	(void) sprintf(nbuf, "fme%d.Hcall", fmep->id);
186 	fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1);
187 	(void) sprintf(nbuf, "fme%d.Rcall", fmep->id);
188 	fmep->Rcallcount = stats_new_counter(nbuf,
189 	    "calls to requirements_test()", 1);
190 	(void) sprintf(nbuf, "fme%d.Ccall", fmep->id);
191 	fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1);
192 	(void) sprintf(nbuf, "fme%d.Ecall", fmep->id);
193 	fmep->Ecallcount =
194 	    stats_new_counter(nbuf, "calls to effects_test()", 1);
195 	(void) sprintf(nbuf, "fme%d.Tcall", fmep->id);
196 	fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
197 	(void) sprintf(nbuf, "fme%d.Marrow", fmep->id);
198 	fmep->Marrowcount = stats_new_counter(nbuf,
199 	    "arrows marked by mark_arrows()", 1);
200 	(void) sprintf(nbuf, "fme%d.diags", fmep->id);
201 	fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);
202 
203 	out(O_ALTFP|O_VERB2, "newfme: config snapshot contains...");
204 	config_print(O_ALTFP|O_VERB2, fmep->cfgdata->cooked);
205 
206 	return (fmep);
207 }
208 
209 static struct fme *
210 newfme(const char *e0class, const struct ipath *e0ipp)
211 {
212 	struct cfgdata *cfgdata;
213 
214 	if ((cfgdata = config_snapshot()) == NULL) {
215 		out(O_ALTFP, "newfme: NULL configuration");
216 		Undiag_reason = UD_NOCONF;
217 		return (NULL);
218 	}
219 
220 	Nfmep = alloc_fme();
221 
222 	Nfmep->id = Nextid++;
223 	Nfmep->cfgdata = cfgdata;
224 	Nfmep->posted_suspects = 0;
225 	Nfmep->uniqobs = 0;
226 	Nfmep->state = FME_NOTHING;
227 	Nfmep->pull = 0ULL;
228 	Nfmep->overflow = 0;
229 
230 	Nfmep->fmcase = NULL;
231 	Nfmep->hdl = NULL;
232 
233 	if ((Nfmep->eventtree = itree_create(cfgdata->cooked)) == NULL) {
234 		out(O_ALTFP, "newfme: NULL instance tree");
235 		Undiag_reason = UD_INSTFAIL;
236 		config_free(cfgdata);
237 		FREE(Nfmep);
238 		Nfmep = NULL;
239 		return (NULL);
240 	}
241 
242 	itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree);
243 
244 	if ((Nfmep->e0 =
245 	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
246 		out(O_ALTFP, "newfme: e0 not in instance tree");
247 		Undiag_reason = UD_BADEVENTI;
248 		itree_free(Nfmep->eventtree);
249 		config_free(cfgdata);
250 		FREE(Nfmep);
251 		Nfmep = NULL;
252 		return (NULL);
253 	}
254 
255 	return (fme_ready(Nfmep));
256 }
257 
258 void
259 fme_fini(void)
260 {
261 	struct fme *sfp, *fp;
262 	struct case_list *ucasep, *nextcasep;
263 
264 	ucasep = Undiagablecaselist;
265 	while (ucasep != NULL) {
266 		nextcasep = ucasep->next;
267 		FREE(ucasep);
268 		ucasep = nextcasep;
269 	}
270 	Undiagablecaselist = NULL;
271 
272 	/* clean up closed fmes */
273 	fp = ClosedFMEs;
274 	while (fp != NULL) {
275 		sfp = fp->next;
276 		destroy_fme(fp);
277 		fp = sfp;
278 	}
279 	ClosedFMEs = NULL;
280 
281 	fp = FMElist;
282 	while (fp != NULL) {
283 		sfp = fp->next;
284 		destroy_fme(fp);
285 		fp = sfp;
286 	}
287 	FMElist = EFMElist = NULL;
288 
289 	/* if we were in the middle of creating an fme, free it now */
290 	if (Nfmep) {
291 		destroy_fme(Nfmep);
292 		Nfmep = NULL;
293 	}
294 }
295 
/*
 * Allocated space for a buffer name.  The longest name built in this
 * file is "observed<N>.nvp": 8 bytes of "observed", 4 bytes of ".nvp"
 * and a terminating NUL leave 7 digits out of 20 bytes, which allows
 * for a ridiculous 9,999,999 unique observations.
 */
#define	OBBUFNMSZ 20
301 
302 /*
303  *  serialize_observation
304  *
305  *  Create a recoverable version of the current observation
306  *  (f->ecurrent).  We keep a serialized version of each unique
307  *  observation in order that we may resume correctly the fme in the
308  *  correct state if eft or fmd crashes and we're restarted.
309  */
310 static void
311 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp)
312 {
313 	size_t pkdlen;
314 	char tmpbuf[OBBUFNMSZ];
315 	char *pkd = NULL;
316 	char *estr;
317 
318 	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs);
319 	estr = ipath2str(cls, ipp);
320 	fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1);
321 	fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr,
322 	    strlen(estr) + 1);
323 	FREE(estr);
324 
325 	if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) {
326 		(void) snprintf(tmpbuf,
327 		    OBBUFNMSZ, "observed%d.nvp", fp->uniqobs);
328 		if (nvlist_xpack(fp->ecurrent->nvp,
329 		    &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0)
330 			out(O_DIE|O_SYS, "pack of observed nvl failed");
331 		fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen);
332 		fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen);
333 		FREE(pkd);
334 	}
335 
336 	fp->uniqobs++;
337 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
338 	    sizeof (fp->uniqobs));
339 }
340 
341 /*
342  *  init_fme_bufs -- We keep several bits of state about an fme for
343  *	use if eft or fmd crashes and we're restarted.
344  */
345 static void
346 init_fme_bufs(struct fme *fp)
347 {
348 	size_t cfglen = fp->cfgdata->nextfree - fp->cfgdata->begin;
349 
350 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_CFGLEN, sizeof (cfglen));
351 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_CFGLEN, (void *)&cfglen,
352 	    sizeof (cfglen));
353 	if (cfglen != 0) {
354 		fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_CFG, cfglen);
355 		fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_CFG,
356 		    fp->cfgdata->begin, cfglen);
357 	}
358 
359 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull));
360 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull,
361 	    sizeof (fp->pull));
362 
363 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id));
364 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id,
365 	    sizeof (fp->id));
366 
367 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs));
368 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
369 	    sizeof (fp->uniqobs));
370 
371 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD,
372 	    sizeof (fp->posted_suspects));
373 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD,
374 	    (void *)&fp->posted_suspects, sizeof (fp->posted_suspects));
375 }
376 
377 static void
378 destroy_fme_bufs(struct fme *fp)
379 {
380 	char tmpbuf[OBBUFNMSZ];
381 	int o;
382 
383 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN);
384 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG);
385 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL);
386 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID);
387 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD);
388 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS);
389 
390 	for (o = 0; o < fp->uniqobs; o++) {
391 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o);
392 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
393 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o);
394 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
395 	}
396 }
397 
398 /*
399  * reconstitute_observations -- convert a case's serialized observations
400  *	back into struct events.  Returns zero if all observations are
401  *	successfully reconstituted.
402  */
403 static int
404 reconstitute_observations(struct fme *fmep)
405 {
406 	struct event *ep;
407 	struct node *epnamenp = NULL;
408 	size_t pkdlen;
409 	char *pkd = NULL;
410 	char *tmpbuf = alloca(OBBUFNMSZ);
411 	char *sepptr;
412 	char *estr;
413 	int ocnt;
414 	int elen;
415 
416 	for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) {
417 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt);
418 		elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
419 		if (elen == 0) {
420 			out(O_ALTFP,
421 			    "reconstitute_observation: no %s buffer found.",
422 			    tmpbuf);
423 			Undiag_reason = UD_MISSINGOBS;
424 			break;
425 		}
426 
427 		estr = MALLOC(elen);
428 		fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
429 		sepptr = strchr(estr, '@');
430 		if (sepptr == NULL) {
431 			out(O_ALTFP,
432 			    "reconstitute_observation: %s: "
433 			    "missing @ separator in %s.",
434 			    tmpbuf, estr);
435 			Undiag_reason = UD_MISSINGPATH;
436 			FREE(estr);
437 			break;
438 		}
439 
440 		*sepptr = '\0';
441 		if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
442 			out(O_ALTFP,
443 			    "reconstitute_observation: %s: "
444 			    "trouble converting path string \"%s\" "
445 			    "to internal representation.",
446 			    tmpbuf, sepptr + 1);
447 			Undiag_reason = UD_MISSINGPATH;
448 			FREE(estr);
449 			break;
450 		}
451 
452 		/* construct the event */
453 		ep = itree_lookup(fmep->eventtree,
454 		    stable(estr), ipath(epnamenp));
455 		if (ep == NULL) {
456 			out(O_ALTFP,
457 			    "reconstitute_observation: %s: "
458 			    "lookup of  \"%s\" in itree failed.",
459 			    tmpbuf, ipath2str(estr, ipath(epnamenp)));
460 			Undiag_reason = UD_BADOBS;
461 			tree_free(epnamenp);
462 			FREE(estr);
463 			break;
464 		}
465 		tree_free(epnamenp);
466 
467 		/*
468 		 * We may or may not have a saved nvlist for the observation
469 		 */
470 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt);
471 		pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
472 		if (pkdlen != 0) {
473 			pkd = MALLOC(pkdlen);
474 			fmd_buf_read(fmep->hdl,
475 			    fmep->fmcase, tmpbuf, pkd, pkdlen);
476 			ASSERT(ep->nvp == NULL);
477 			if (nvlist_xunpack(pkd,
478 			    pkdlen, &ep->nvp, &Eft_nv_hdl) != 0)
479 				out(O_DIE|O_SYS, "pack of observed nvl failed");
480 			FREE(pkd);
481 		}
482 
483 		if (ocnt == 0)
484 			fmep->e0 = ep;
485 
486 		FREE(estr);
487 		fmep->ecurrent = ep;
488 		ep->count++;
489 
490 		/* link it into list of observations seen */
491 		ep->observations = fmep->observations;
492 		fmep->observations = ep;
493 	}
494 
495 	if (ocnt == fmep->uniqobs) {
496 		(void) fme_ready(fmep);
497 		return (0);
498 	}
499 
500 	return (1);
501 }
502 
503 /*
504  * restart_fme -- called during eft initialization.  Reconstitutes
505  *	an in-progress fme.
506  */
507 void
508 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress)
509 {
510 	nvlist_t *defect;
511 	struct case_list *bad;
512 	struct fme *fmep;
513 	struct cfgdata *cfgdata = NULL;
514 	size_t rawsz;
515 
516 	fmep = alloc_fme();
517 	fmep->fmcase = inprogress;
518 	fmep->hdl = hdl;
519 
520 	if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) {
521 		out(O_ALTFP, "restart_fme: No config data");
522 		Undiag_reason = UD_MISSINGINFO;
523 		goto badcase;
524 	}
525 	fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz,
526 	    sizeof (size_t));
527 
528 	if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) {
529 		out(O_ALTFP, "restart_fme: No event zero");
530 		Undiag_reason = UD_MISSINGZERO;
531 		goto badcase;
532 	}
533 
534 	cfgdata = MALLOC(sizeof (struct cfgdata));
535 	cfgdata->cooked = NULL;
536 	cfgdata->devcache = NULL;
537 	cfgdata->cpucache = NULL;
538 	cfgdata->refcnt = 1;
539 
540 	if (rawsz > 0) {
541 		if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) {
542 			out(O_ALTFP, "restart_fme: Config data size mismatch");
543 			Undiag_reason = UD_CFGMISMATCH;
544 			goto badcase;
545 		}
546 		cfgdata->begin = MALLOC(rawsz);
547 		cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz;
548 		fmd_buf_read(hdl,
549 		    inprogress, WOBUF_CFG, cfgdata->begin, rawsz);
550 	} else {
551 		cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL;
552 	}
553 	fmep->cfgdata = cfgdata;
554 
555 	config_cook(cfgdata);
556 	if ((fmep->eventtree = itree_create(cfgdata->cooked)) == NULL) {
557 		/* case not properly saved or irretrievable */
558 		out(O_ALTFP, "restart_fme: NULL instance tree");
559 		Undiag_reason = UD_INSTFAIL;
560 		goto badcase;
561 	}
562 
563 	itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree);
564 
565 	if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) {
566 		out(O_ALTFP, "restart_fme: no saved wait time");
567 		Undiag_reason = UD_MISSINGINFO;
568 		goto badcase;
569 	} else {
570 		fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull,
571 		    sizeof (fmep->pull));
572 	}
573 
574 	if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) {
575 		out(O_ALTFP, "restart_fme: no saved posted status");
576 		Undiag_reason = UD_MISSINGINFO;
577 		goto badcase;
578 	} else {
579 		fmd_buf_read(hdl, inprogress, WOBUF_POSTD,
580 		    (void *)&fmep->posted_suspects,
581 		    sizeof (fmep->posted_suspects));
582 	}
583 
584 	if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) {
585 		out(O_ALTFP, "restart_fme: no saved id");
586 		Undiag_reason = UD_MISSINGINFO;
587 		goto badcase;
588 	} else {
589 		fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id,
590 		    sizeof (fmep->id));
591 	}
592 	if (Nextid <= fmep->id)
593 		Nextid = fmep->id + 1;
594 
595 	if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) {
596 		out(O_ALTFP, "restart_fme: no count of observations");
597 		Undiag_reason = UD_MISSINGINFO;
598 		goto badcase;
599 	} else {
600 		fmd_buf_read(hdl, inprogress, WOBUF_NOBS,
601 		    (void *)&fmep->uniqobs, sizeof (fmep->uniqobs));
602 	}
603 
604 	if (reconstitute_observations(fmep) != 0)
605 		goto badcase;
606 
607 	Open_fme_count++;
608 
609 	/* give the diagnosis algorithm a shot at the new FME state */
610 	fme_eval(fmep, NULL);
611 	return;
612 
613 badcase:
614 	if (fmep->eventtree != NULL)
615 		itree_free(fmep->eventtree);
616 	config_free(cfgdata);
617 	destroy_fme_bufs(fmep);
618 	FREE(fmep);
619 
620 	/*
621 	 * Since we're unable to restart the case, add it to the undiagable
622 	 * list and solve and close it as appropriate.
623 	 */
624 	bad = MALLOC(sizeof (struct case_list));
625 	bad->next = NULL;
626 
627 	if (Undiagablecaselist != NULL)
628 		bad->next = Undiagablecaselist;
629 	Undiagablecaselist = bad;
630 	bad->fmcase = inprogress;
631 
632 	out(O_ALTFP, "[case %s (unable to restart), ",
633 	    fmd_case_uuid(hdl, bad->fmcase));
634 
635 	if (fmd_case_solved(hdl, bad->fmcase)) {
636 		out(O_ALTFP, "already solved, ");
637 	} else {
638 		out(O_ALTFP, "solving, ");
639 		defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
640 		    NULL, NULL, NULL);
641 		if (Undiag_reason != NULL)
642 			(void) nvlist_add_string(defect,
643 			    UNDIAG_REASON, Undiag_reason);
644 		fmd_case_add_suspect(hdl, bad->fmcase, defect);
645 		fmd_case_solve(hdl, bad->fmcase);
646 	}
647 
648 	if (fmd_case_closed(hdl, bad->fmcase)) {
649 		out(O_ALTFP, "already closed ]");
650 	} else {
651 		out(O_ALTFP, "closing ]");
652 		fmd_case_close(hdl, bad->fmcase);
653 	}
654 }
655 
656 /*ARGSUSED*/
657 static void
658 globals_destructor(void *left, void *right, void *arg)
659 {
660 	struct evalue *evp = (struct evalue *)right;
661 	if (evp->t == NODEPTR)
662 		tree_free((struct node *)(uintptr_t)evp->v);
663 	evp->v = NULL;
664 	FREE(evp);
665 }
666 
667 void
668 destroy_fme(struct fme *f)
669 {
670 	stats_delete(f->Rcount);
671 	stats_delete(f->Hcallcount);
672 	stats_delete(f->Rcallcount);
673 	stats_delete(f->Ccallcount);
674 	stats_delete(f->Ecallcount);
675 	stats_delete(f->Tcallcount);
676 	stats_delete(f->Marrowcount);
677 	stats_delete(f->diags);
678 
679 	itree_free(f->eventtree);
680 	config_free(f->cfgdata);
681 	lut_free(f->globals, globals_destructor, NULL);
682 	FREE(f);
683 }
684 
685 static const char *
686 fme_state2str(enum fme_state s)
687 {
688 	switch (s) {
689 	case FME_NOTHING:	return ("NOTHING");
690 	case FME_WAIT:		return ("WAIT");
691 	case FME_CREDIBLE:	return ("CREDIBLE");
692 	case FME_DISPROVED:	return ("DISPROVED");
693 	case FME_DEFERRED:	return ("DEFERRED");
694 	default:		return ("UNKNOWN");
695 	}
696 }
697 
698 static int
699 is_problem(enum nametype t)
700 {
701 	return (t == N_FAULT || t == N_DEFECT || t == N_UPSET);
702 }
703 
704 static int
705 is_fault(enum nametype t)
706 {
707 	return (t == N_FAULT);
708 }
709 
710 static int
711 is_defect(enum nametype t)
712 {
713 	return (t == N_DEFECT);
714 }
715 
716 static int
717 is_upset(enum nametype t)
718 {
719 	return (t == N_UPSET);
720 }
721 
722 static void
723 fme_print(int flags, struct fme *fmep)
724 {
725 	struct event *ep;
726 
727 	out(flags, "Fault Management Exercise %d", fmep->id);
728 	out(flags, "\t       State: %s", fme_state2str(fmep->state));
729 	out(flags|O_NONL, "\t  Start time: ");
730 	ptree_timeval(flags|O_NONL, &fmep->ull);
731 	out(flags, NULL);
732 	if (fmep->wull) {
733 		out(flags|O_NONL, "\t   Wait time: ");
734 		ptree_timeval(flags|O_NONL, &fmep->wull);
735 		out(flags, NULL);
736 	}
737 	out(flags|O_NONL, "\t          E0: ");
738 	if (fmep->e0)
739 		itree_pevent_brief(flags|O_NONL, fmep->e0);
740 	else
741 		out(flags|O_NONL, "NULL");
742 	out(flags, NULL);
743 	out(flags|O_NONL, "\tObservations:");
744 	for (ep = fmep->observations; ep; ep = ep->observations) {
745 		out(flags|O_NONL, " ");
746 		itree_pevent_brief(flags|O_NONL, ep);
747 	}
748 	out(flags, NULL);
749 	out(flags|O_NONL, "\tSuspect list:");
750 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
751 		out(flags|O_NONL, " ");
752 		itree_pevent_brief(flags|O_NONL, ep);
753 	}
754 	out(flags, NULL);
755 	out(flags|O_VERB2, "\t        Tree:");
756 	itree_ptree(flags|O_VERB2, fmep->eventtree);
757 }
758 
759 static struct node *
760 pathstring2epnamenp(char *path)
761 {
762 	char *sep = "/";
763 	struct node *ret;
764 	char *ptr;
765 
766 	if ((ptr = strtok(path, sep)) == NULL)
767 		out(O_DIE, "pathstring2epnamenp: invalid empty class");
768 
769 	ret = tree_iname(stable(ptr), NULL, 0);
770 
771 	while ((ptr = strtok(NULL, sep)) != NULL)
772 		ret = tree_name_append(ret,
773 		    tree_iname(stable(ptr), NULL, 0));
774 
775 	return (ret);
776 }
777 
778 /*
779  * for a given upset sp, increment the corresponding SERD engine.  if the
780  * SERD engine trips, return the ename and ipp of the resulting ereport.
781  * returns true if engine tripped and *enamep and *ippp were filled in.
782  */
783 static int
784 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep,
785     fmd_case_t *fmcase, struct event *sp, const char **enamep,
786     const struct ipath **ippp)
787 {
788 	struct node *serdinst;
789 	char *serdname;
790 	struct node *nid;
791 
792 	ASSERT(sp->t == N_UPSET);
793 	ASSERT(ffep != NULL);
794 
795 	/*
796 	 * obtain instanced SERD engine from the upset sp.  from this
797 	 * derive serdname, the string used to identify the SERD engine.
798 	 */
799 	serdinst = eventprop_lookup(sp, L_engine);
800 
801 	if (serdinst == NULL)
802 		return (NULL);
803 
804 	serdname = ipath2str(serdinst->u.stmt.np->u.event.ename->u.name.s,
805 	    ipath(serdinst->u.stmt.np->u.event.epname));
806 
807 	/* handle serd engine "id" property, if there is one */
808 	if ((nid =
809 	    lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) {
810 		struct evalue *gval;
811 		char suffixbuf[200];
812 		char *suffix;
813 		char *nserdname;
814 		size_t nname;
815 
816 		out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname);
817 		ptree_name_iter(O_ALTFP|O_NONL, nid);
818 
819 		ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t));
820 
821 		if ((gval = lut_lookup(fmep->globals,
822 		    (void *)nid->u.globid.s, NULL)) == NULL) {
823 			out(O_ALTFP, " undefined");
824 		} else if (gval->t == UINT64) {
825 			out(O_ALTFP, " %llu", gval->v);
826 			(void) sprintf(suffixbuf, "%llu", gval->v);
827 			suffix = suffixbuf;
828 		} else {
829 			out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v);
830 			suffix = (char *)(uintptr_t)gval->v;
831 		}
832 
833 		nname = strlen(serdname) + strlen(suffix) + 2;
834 		nserdname = MALLOC(nname);
835 		(void) snprintf(nserdname, nname, "%s:%s", serdname, suffix);
836 		FREE(serdname);
837 		serdname = nserdname;
838 	}
839 
840 	if (!fmd_serd_exists(hdl, serdname)) {
841 		struct node *nN, *nT;
842 
843 		/* no SERD engine yet, so create it */
844 		nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N, NULL);
845 		nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T, NULL);
846 
847 		ASSERT(nN->t == T_NUM);
848 		ASSERT(nT->t == T_TIMEVAL);
849 
850 		fmd_serd_create(hdl, serdname, (uint_t)nN->u.ull,
851 		    (hrtime_t)nT->u.ull);
852 	}
853 
854 
855 	/*
856 	 * increment SERD engine.  if engine fires, reset serd
857 	 * engine and return trip_strcode
858 	 */
859 	if (fmd_serd_record(hdl, serdname, ffep)) {
860 		struct node *tripinst = lut_lookup(serdinst->u.stmt.lutp,
861 		    (void *)L_trip, NULL);
862 
863 		ASSERT(tripinst != NULL);
864 
865 		*enamep = tripinst->u.event.ename->u.name.s;
866 		*ippp = ipath(tripinst->u.event.epname);
867 
868 		fmd_case_add_serd(hdl, fmcase, serdname);
869 		fmd_serd_reset(hdl, serdname);
870 		out(O_ALTFP|O_NONL, "[engine fired: %s, sending: ", serdname);
871 		ipath_print(O_ALTFP|O_NONL, *enamep, *ippp);
872 		out(O_ALTFP, "]");
873 
874 		FREE(serdname);
875 		return (1);
876 	}
877 
878 	FREE(serdname);
879 	return (0);
880 }
881 
882 /*
883  * search a suspect list for upsets.  feed each upset to serd_eval() and
884  * build up tripped[], an array of ereports produced by the firing of
885  * any SERD engines.  then feed each ereport back into
886  * fme_receive_report().
887  *
888  * returns ntrip, the number of these ereports produced.
889  */
890 static int
891 upsets_eval(struct fme *fmep, fmd_event_t *ffep)
892 {
893 	/* we build an array of tripped ereports that we send ourselves */
894 	struct {
895 		const char *ename;
896 		const struct ipath *ipp;
897 	} *tripped;
898 	struct event *sp;
899 	int ntrip, nupset, i;
900 
901 	/*
902 	 * count the number of upsets to determine the upper limit on
903 	 * expected trip ereport strings.  remember that one upset can
904 	 * lead to at most one ereport.
905 	 */
906 	nupset = 0;
907 	for (sp = fmep->suspects; sp; sp = sp->suspects) {
908 		if (sp->t == N_UPSET)
909 			nupset++;
910 	}
911 
912 	if (nupset == 0)
913 		return (0);
914 
915 	/*
916 	 * get to this point if we have upsets and expect some trip
917 	 * ereports
918 	 */
919 	tripped = alloca(sizeof (*tripped) * nupset);
920 	bzero((void *)tripped, sizeof (*tripped) * nupset);
921 
922 	ntrip = 0;
923 	for (sp = fmep->suspects; sp; sp = sp->suspects)
924 		if (sp->t == N_UPSET &&
925 		    serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp,
926 			    &tripped[ntrip].ename, &tripped[ntrip].ipp))
927 			ntrip++;
928 
929 	for (i = 0; i < ntrip; i++)
930 		fme_receive_report(fmep->hdl, ffep,
931 		    tripped[i].ename, tripped[i].ipp, NULL);
932 
933 	return (ntrip);
934 }
935 
936 /*
937  * fme_receive_external_report -- call when an external ereport comes in
938  *
939  * this routine just converts the relevant information from the ereport
940  * into a format used internally and passes it on to fme_receive_report().
941  */
942 void
943 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
944     const char *eventstring)
945 {
946 	struct node *epnamenp = platform_getpath(nvl);
947 	const struct ipath *ipp;
948 
949 	/*
950 	 * XFILE: If we ended up without a path, it's an X-file.
951 	 * For now, use our undiagnosable interface.
952 	 */
953 	if (epnamenp == NULL) {
954 		out(O_ALTFP, "XFILE: Unable to get path from ereport");
955 		Undiag_reason = UD_NOPATH;
956 		publish_undiagnosable(hdl, ffep);
957 		return;
958 	}
959 
960 	ipp = ipath(epnamenp);
961 	tree_free(epnamenp);
962 	fme_receive_report(hdl, ffep, stable(eventstring), ipp, nvl);
963 }
964 
965 /*ARGSUSED*/
966 void
967 fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
968     const char *eventstring)
969 {
970 	char *uuid;
971 	nvlist_t **nva;
972 	uint_t nvc;
973 	const struct ipath *ipp;
974 
975 	if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 ||
976 	    nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
977 	    &nva, &nvc) != 0) {
978 		out(O_ALTFP, "No uuid or fault list for list.repaired event");
979 		return;
980 	}
981 
982 	out(O_ALTFP, "Processing list.repaired from case %s", uuid);
983 
984 	while (nvc-- != 0) {
985 		/*
986 		 * Reset any istat associated with this path.
987 		 */
988 		char *path;
989 
990 		if ((ipp = platform_fault2ipath(*nva++)) == NULL)
991 			continue;
992 
993 		path = ipath2str(NULL, ipp);
994 		out(O_ALTFP, "fme_receive_repair_list: resetting state for %s",
995 		    path);
996 		FREE(path);
997 
998 		lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp);
999 		istat_save();
1000 
1001 		/*
1002 		 * We do not have a list of stat engines in a form that
1003 		 * we can readily clear any associated serd engines.  When we
1004 		 * do, this will be the place to clear them.
1005 		 */
1006 	}
1007 }
1008 
/*
 * forward declaration; being static, mark_arrows() is defined
 * elsewhere in this file.
 */
static int mark_arrows(struct fme *fmep, struct event *ep, int mark,
    unsigned long long at_latest_by, unsigned long long *pdelay, int keep);
1011 
1012 /* ARGSUSED */
1013 static void
1014 clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
1015 {
1016 	struct bubble *bp;
1017 	struct arrowlist *ap;
1018 
1019 	ep->cached_state = 0;
1020 	ep->keep_in_tree = 0;
1021 	for (bp = itree_next_bubble(ep, NULL); bp;
1022 	    bp = itree_next_bubble(ep, bp)) {
1023 		if (bp->t != B_FROM)
1024 			continue;
1025 		bp->mark = 0;
1026 		for (ap = itree_next_arrow(bp, NULL); ap;
1027 		    ap = itree_next_arrow(bp, ap))
1028 			ap->arrowp->mark = 0;
1029 	}
1030 }
1031 
/*
 * fme_receive_report -- route an incoming report to the FME(s) it belongs to
 *
 *	hdl		fmd handle used for all case operations
 *	ffep		fmd event for the report; when NULL no ereport is
 *			attached to any case
 *	eventstring	event name, used with ipp to look the event up
 *	ipp		instanced path of the event
 *	nvl		report payload; it is duplicated with evnv_dupnvl()
 *			before being stored on an event
 *
 * Every non-overflow FME on FMElist whose event tree contains the event is
 * tentatively given the observation and re-hypothesised in "peek" mode.  If
 * the result is anything other than FME_DISPROVED the observation is kept
 * and the FME is re-evaluated; otherwise the tentative observation is backed
 * out again.  When no existing FME accepts the report it is, in order of
 * preference: added to an overflow FME whose case is still open, used to
 * create a new overflow FME (when Max_fme has been reached), or used to
 * start a brand new FME and case.
 */
static void
fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
    const char *eventstring, const struct ipath *ipp, nvlist_t *nvl)
{
	struct event *ep;
	struct fme *fmep = NULL;
	struct fme *ofmep = NULL;	/* open overflow FME, if one is found */
	struct fme *cfmep, *svfmep;
	int matched = 0;		/* number of FMEs that took the report */
	nvlist_t *defect;

	out(O_ALTFP|O_NONL, "fme_receive_report: ");
	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
	out(O_ALTFP|O_STAMP, NULL);

	/* decide which FME it goes to */
	for (fmep = FMElist; fmep; fmep = fmep->next) {
		int prev_verbose;
		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
		enum fme_state state;
		nvlist_t *pre_peek_nvp = NULL;

		/*
		 * Overflow FMEs are never asked to explain a report; just
		 * remember one whose case has not been closed yet.
		 */
		if (fmep->overflow) {
			if (!(fmd_case_closed(fmep->hdl, fmep->fmcase)))
				ofmep = fmep;

			continue;
		}

		/* look up event in event tree for this FME */
		if ((ep = itree_lookup(fmep->eventtree,
		    eventstring, ipp)) == NULL)
			continue;

		/* note observation */
		fmep->ecurrent = ep;
		if (ep->count++ == 0) {
			/* link it into list of observations seen */
			ep->observations = fmep->observations;
			fmep->observations = ep;
			ep->nvp = evnv_dupnvl(nvl);
		} else {
			/*
			 * use new payload values for peek; the old payload
			 * is kept aside so it can be put back on a mismatch
			 */
			pre_peek_nvp = ep->nvp;
			ep->nvp = evnv_dupnvl(nvl);
		}

		/* tell hypothesise() not to mess with suspect list */
		fmep->peek = 1;

		/* don't want this to be verbose (unless Debug is set) */
		prev_verbose = Verbose;
		if (Debug == 0)
			Verbose = 0;

		lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
		state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);

		fmep->peek = 0;

		/* put verbose flag back */
		Verbose = prev_verbose;

		if (state != FME_DISPROVED) {
			/* found an FME that explains the ereport */
			matched++;
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " explained by FME%d]", fmep->id);

			/* the new payload stays, so the saved one can go */
			if (pre_peek_nvp)
				nvlist_free(pre_peek_nvp);

			/* only the first sighting of an event is serialized */
			if (ep->count == 1)
				serialize_observation(fmep, eventstring, ipp);

			if (ffep)
				fmd_case_add_ereport(hdl, fmep->fmcase, ffep);

			stats_counter_bump(fmep->Rcount);

			/* re-eval FME */
			fme_eval(fmep, ffep);
		} else {

			/* not a match, undo noting of observation */
			fmep->ecurrent = NULL;
			if (--ep->count == 0) {
				/*
				 * unlink it from observations; ep was pushed
				 * onto the head of the list above, so popping
				 * the head removes it
				 */
				fmep->observations = ep->observations;
				ep->observations = NULL;
				nvlist_free(ep->nvp);
				ep->nvp = NULL;
			} else {
				/* drop the peeked payload, restore the old */
				nvlist_free(ep->nvp);
				ep->nvp = pre_peek_nvp;
			}
		}
	}

	if (matched)
		return;	/* explained by at least one existing FME */

	/* clean up closed fmes */
	cfmep = ClosedFMEs;
	while (cfmep != NULL) {
		/* fetch the successor before cfmep is destroyed */
		svfmep = cfmep->next;
		destroy_fme(cfmep);
		cfmep = svfmep;
	}
	ClosedFMEs = NULL;

	if (ofmep) {
		/* an overflow FME with an open case exists: park it there */
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " ADDING TO OVERFLOW FME]");
		if (ffep)
			fmd_case_add_ereport(hdl, ofmep->fmcase, ffep);

		return;

	} else if (Max_fme && (Open_fme_count >= Max_fme)) {
		/* a Max_fme of zero disables the limit */
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " MAX OPEN FME REACHED]");
		/* Create overflow fme */
		if ((fmep = newfme(eventstring, ipp)) == NULL) {
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]");
			publish_undiagnosable(hdl, ffep);
			return;
		}

		Open_fme_count++;

		fmep->fmcase = fmd_case_open(hdl, NULL);
		fmep->hdl = hdl;
		init_fme_bufs(fmep);
		fmep->overflow = B_TRUE;

		if (ffep)
			fmd_case_add_ereport(hdl, fmep->fmcase, ffep);

		/*
		 * The overflow case is solved immediately with a single
		 * undiagnosable defect whose reason is UD_MAXFME.
		 */
		defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
		    NULL, NULL, NULL);
		(void) nvlist_add_string(defect, UNDIAG_REASON, UD_MAXFME);
		fmd_case_add_suspect(hdl, fmep->fmcase, defect);
		fmd_case_solve(hdl, fmep->fmcase);
		return;
	}

	/* start a new FME */
	if ((fmep = newfme(eventstring, ipp)) == NULL) {
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " CANNOT DIAGNOSE]");
		publish_undiagnosable(hdl, ffep);
		return;
	}

	Open_fme_count++;

	/* open a case */
	fmep->fmcase = fmd_case_open(hdl, NULL);
	fmep->hdl = hdl;
	init_fme_bufs(fmep);

	out(O_ALTFP|O_NONL, "[");
	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
	out(O_ALTFP, " created FME%d, case %s]", fmep->id,
	    fmd_case_uuid(hdl, fmep->fmcase));

	/* the new FME's e0 is the event that receives this observation */
	ep = fmep->e0;
	ASSERT(ep != NULL);

	/* note observation */
	fmep->ecurrent = ep;
	if (ep->count++ == 0) {
		/* link it into list of observations seen */
		ep->observations = fmep->observations;
		fmep->observations = ep;
		ep->nvp = evnv_dupnvl(nvl);
		serialize_observation(fmep, eventstring, ipp);
	} else {
		/* new payload overrides any previous */
		nvlist_free(ep->nvp);
		ep->nvp = evnv_dupnvl(nvl);
	}

	stats_counter_bump(fmep->Rcount);

	if (ffep) {
		/* the report that opened the FME is its principal ereport */
		fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
		fmd_case_setprincipal(hdl, fmep->fmcase, ffep);
		fmep->e0r = ffep;
	}

	/* give the diagnosis algorithm a shot at the new FME state */
	fme_eval(fmep, ffep);
}
1233 
1234 void
1235 fme_status(int flags)
1236 {
1237 	struct fme *fmep;
1238 
1239 	if (FMElist == NULL) {
1240 		out(flags, "No fault management exercises underway.");
1241 		return;
1242 	}
1243 
1244 	for (fmep = FMElist; fmep; fmep = fmep->next)
1245 		fme_print(flags, fmep);
1246 }
1247 
1248 /*
1249  * "indent" routines used mostly for nicely formatted debug output, but also
1250  * for sanity checking for infinite recursion bugs.
1251  */
1252 
1253 #define	MAX_INDENT 1024
1254 static const char *indent_s[MAX_INDENT];
1255 static int current_indent;
1256 
1257 static void
1258 indent_push(const char *s)
1259 {
1260 	if (current_indent < MAX_INDENT)
1261 		indent_s[current_indent++] = s;
1262 	else
1263 		out(O_DIE, "unexpected recursion depth (%d)", current_indent);
1264 }
1265 
/*
 * indent_set -- discard the whole indent stack and start it over with s as
 *	its only entry.
 */
static void
indent_set(const char *s)
{
	current_indent = 0;
	indent_push(s);
}
1272 
1273 static void
1274 indent_pop(void)
1275 {
1276 	if (current_indent > 0)
1277 		current_indent--;
1278 	else
1279 		out(O_DIE, "recursion underflow");
1280 }
1281 
1282 static void
1283 indent(void)
1284 {
1285 	int i;
1286 	if (!Verbose)
1287 		return;
1288 	for (i = 0; i < current_indent; i++)
1289 		out(O_ALTFP|O_VERB|O_NONL, indent_s[i]);
1290 }
1291 
1292 static int
1293 suspects_changed(struct fme *fmep)
1294 {
1295 	struct event *suspects = fmep->suspects;
1296 	struct event *psuspects = fmep->psuspects;
1297 
1298 	while (suspects != NULL && psuspects != NULL) {
1299 		if (suspects != psuspects)
1300 			return (1);
1301 		suspects = suspects->suspects;
1302 		psuspects = psuspects->psuspects;
1303 	}
1304 
1305 	return (suspects != psuspects);
1306 }
1307 
1308 #define	SLNEW		1
1309 #define	SLCHANGED	2
1310 #define	SLWAIT		3
1311 #define	SLDISPROVED	4
1312 
1313 static void
1314 print_suspects(int circumstance, struct fme *fmep)
1315 {
1316 	struct event *ep;
1317 
1318 	out(O_ALTFP|O_NONL, "[");
1319 	if (circumstance == SLCHANGED) {
1320 		out(O_ALTFP|O_NONL, "FME%d diagnosis changed. state: %s, "
1321 		    "suspect list:", fmep->id, fme_state2str(fmep->state));
1322 	} else if (circumstance == SLWAIT) {
1323 		out(O_ALTFP|O_NONL, "FME%d set wait timer ", fmep->id);
1324 		ptree_timeval(O_ALTFP|O_NONL, &fmep->wull);
1325 	} else if (circumstance == SLDISPROVED) {
1326 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id);
1327 	} else {
1328 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id);
1329 	}
1330 
1331 	if (circumstance == SLWAIT || circumstance == SLDISPROVED) {
1332 		out(O_ALTFP, "]");
1333 		return;
1334 	}
1335 
1336 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
1337 		out(O_ALTFP|O_NONL, " ");
1338 		itree_pevent_brief(O_ALTFP|O_NONL, ep);
1339 	}
1340 	out(O_ALTFP, "]");
1341 }
1342 
1343 static struct node *
1344 eventprop_lookup(struct event *ep, const char *propname)
1345 {
1346 	return (lut_lookup(ep->props, (void *)propname, NULL));
1347 }
1348 
1349 #define	MAXDIGITIDX	23
1350 static char numbuf[MAXDIGITIDX + 1];
1351 
1352 static int
1353 node2uint(struct node *n, uint_t *valp)
1354 {
1355 	struct evalue value;
1356 	struct lut *globals = NULL;
1357 
1358 	if (n == NULL)
1359 		return (1);
1360 
1361 	/*
1362 	 * check value.v since we are being asked to convert an unsigned
1363 	 * long long int to an unsigned int
1364 	 */
1365 	if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) ||
1366 	    value.t != UINT64 || value.v > (1ULL << 32))
1367 		return (1);
1368 
1369 	*valp = (uint_t)value.v;
1370 
1371 	return (0);
1372 }
1373 
1374 static nvlist_t *
1375 node2fmri(struct node *n)
1376 {
1377 	nvlist_t **pa, *f, *p;
1378 	struct node *nc;
1379 	uint_t depth = 0;
1380 	char *numstr, *nullbyte;
1381 	char *failure;
1382 	int err, i;
1383 
1384 	/* XXX do we need to be able to handle a non-T_NAME node? */
1385 	if (n == NULL || n->t != T_NAME)
1386 		return (NULL);
1387 
1388 	for (nc = n; nc != NULL; nc = nc->u.name.next) {
1389 		if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM)
1390 			break;
1391 		depth++;
1392 	}
1393 
1394 	if (nc != NULL) {
1395 		/* We bailed early, something went wrong */
1396 		return (NULL);
1397 	}
1398 
1399 	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
1400 		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
1401 	pa = alloca(depth * sizeof (nvlist_t *));
1402 	for (i = 0; i < depth; i++)
1403 		pa[i] = NULL;
1404 
1405 	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
1406 	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
1407 	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
1408 	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
1409 	if (err != 0) {
1410 		failure = "basic construction of FMRI failed";
1411 		goto boom;
1412 	}
1413 
1414 	numbuf[MAXDIGITIDX] = '\0';
1415 	nullbyte = &numbuf[MAXDIGITIDX];
1416 	i = 0;
1417 
1418 	for (nc = n; nc != NULL; nc = nc->u.name.next) {
1419 		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
1420 		if (err != 0) {
1421 			failure = "alloc of an hc-pair failed";
1422 			goto boom;
1423 		}
1424 		err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s);
1425 		numstr = ulltostr(nc->u.name.child->u.ull, nullbyte);
1426 		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
1427 		if (err != 0) {
1428 			failure = "construction of an hc-pair failed";
1429 			goto boom;
1430 		}
1431 		pa[i++] = p;
1432 	}
1433 
1434 	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
1435 	if (err == 0) {
1436 		for (i = 0; i < depth; i++)
1437 			if (pa[i] != NULL)
1438 				nvlist_free(pa[i]);
1439 		return (f);
1440 	}
1441 	failure = "addition of hc-pair array to FMRI failed";
1442 
1443 boom:
1444 	for (i = 0; i < depth; i++)
1445 		if (pa[i] != NULL)
1446 			nvlist_free(pa[i]);
1447 	nvlist_free(f);
1448 	out(O_DIE, "%s", failure);
1449 	/*NOTREACHED*/
1450 	return (NULL);
1451 }
1452 
/*
 * avg -- integer average of sum over cnt items, rounded to the nearest
 *	whole number (halves round up).
 *
 *	The arithmetic is carried out in unsigned long long so that scaling
 *	sum by ten cannot wrap around.  A cnt of zero yields 0 rather than a
 *	division by zero.
 */
static uint_t
avg(uint_t sum, uint_t cnt)
{
	unsigned long long tenths;

	if (cnt == 0)
		return (0);

	/* average scaled by ten; the last digit decides the rounding */
	tenths = (unsigned long long)sum * 10 / cnt;

	return ((uint_t)(tenths / 10 + ((tenths % 10 >= 5) ? 1 : 0)));
}
1460 
/*
 * percentof -- express part as a percentage of whole, rounded to the
 *	nearest whole percent (halves round up).
 *
 *	The arithmetic is carried out in unsigned long long so that scaling
 *	part by one thousand cannot wrap around.  A whole of zero yields 0
 *	rather than a division by zero.  The result is narrowed to uint8_t,
 *	so it is only meaningful when part does not greatly exceed whole.
 */
static uint8_t
percentof(uint_t part, uint_t whole)
{
	unsigned long long permille;

	if (whole == 0)
		return (0);

	/* tenths of a percent; the last digit decides the rounding */
	permille = (unsigned long long)part * 1000 / whole;

	return ((uint8_t)(permille / 10 + ((permille % 10 >= 5) ? 1 : 0)));
}
1468 
/*
 * struct rsl -- one entry of the resource/suspect list built while trimming
 *	the suspects of an FME.  get_resources() fills in all four members.
 */
struct rsl {
	struct event *suspect;	/* suspect event; NULL marks a removed entry */
	nvlist_t *asru;		/* ASRU nvlist, may be NULL */
	nvlist_t *fru;		/* FRU nvlist, may be NULL */
	nvlist_t *rsrc;		/* resource nvlist; may be NULL or == asru */
};
1475 
1476 /*
1477  *  rslfree -- free internal members of struct rsl not expected to be
1478  *	freed elsewhere.
1479  */
1480 static void
1481 rslfree(struct rsl *freeme)
1482 {
1483 	if (freeme->asru != NULL)
1484 		nvlist_free(freeme->asru);
1485 	if (freeme->fru != NULL)
1486 		nvlist_free(freeme->fru);
1487 	if (freeme->rsrc != NULL && freeme->rsrc != freeme->asru)
1488 		nvlist_free(freeme->rsrc);
1489 }
1490 
1491 /*
1492  *  rslcmp -- compare two rsl structures.  Use the following
1493  *	comparisons to establish cardinality:
1494  *
1495  *	1. Name of the suspect's class. (simple strcmp)
1496  *	2. Name of the suspect's ASRU. (trickier, since nvlist)
1497  *
1498  */
1499 static int
1500 rslcmp(const void *a, const void *b)
1501 {
1502 	struct rsl *r1 = (struct rsl *)a;
1503 	struct rsl *r2 = (struct rsl *)b;
1504 	int rv;
1505 
1506 	rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s,
1507 	    r2->suspect->enode->u.event.ename->u.name.s);
1508 	if (rv != 0)
1509 		return (rv);
1510 
1511 	if (r1->asru == NULL && r2->asru == NULL)
1512 		return (0);
1513 	if (r1->asru == NULL)
1514 		return (-1);
1515 	if (r2->asru == NULL)
1516 		return (1);
1517 	return (evnv_cmpnvl(r1->asru, r2->asru, 0));
1518 }
1519 
1520 /*
1521  *  rsluniq -- given an array of rsl structures, seek out and "remove"
1522  *	any duplicates.  Dups are "remove"d by NULLing the suspect pointer
1523  *	of the array element.  Removal also means updating the number of
1524  *	problems and the number of problems which are not faults.  User
1525  *	provides the first and last element pointers.
1526  */
1527 static void
1528 rsluniq(struct rsl *first, struct rsl *last, int *nprobs, int *nnonf)
1529 {
1530 	struct rsl *cr;
1531 
1532 	if (*nprobs == 1)
1533 		return;
1534 
1535 	/*
1536 	 *  At this point, we only expect duplicate defects.
1537 	 *  Eversholt's diagnosis algorithm prevents duplicate
1538 	 *  suspects, but we rewrite defects in the platform code after
1539 	 *  the diagnosis is made, and that can introduce new
1540 	 *  duplicates.
1541 	 */
1542 	while (first <= last) {
1543 		if (first->suspect == NULL || !is_defect(first->suspect->t)) {
1544 			first++;
1545 			continue;
1546 		}
1547 		cr = first + 1;
1548 		while (cr <= last) {
1549 			if (is_defect(first->suspect->t)) {
1550 				if (rslcmp(first, cr) == 0) {
1551 					cr->suspect = NULL;
1552 					rslfree(cr);
1553 					(*nprobs)--;
1554 					(*nnonf)--;
1555 				}
1556 			}
1557 			/*
1558 			 * assume all defects are in order after our
1559 			 * sort and short circuit here with "else break" ?
1560 			 */
1561 			cr++;
1562 		}
1563 		first++;
1564 	}
1565 }
1566 
1567 /*
1568  * get_resources -- for a given suspect, determine what ASRU, FRU and
1569  *     RSRC nvlists should be advertised in the final suspect list.
1570  */
1571 void
1572 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot)
1573 {
1574 	struct node *asrudef, *frudef;
1575 	nvlist_t *asru, *fru;
1576 	nvlist_t *rsrc = NULL;
1577 	char *pathstr;
1578 
1579 	/*
1580 	 * First find any ASRU and/or FRU defined in the
1581 	 * initial fault tree.
1582 	 */
1583 	asrudef = eventprop_lookup(sp, L_ASRU);
1584 	frudef = eventprop_lookup(sp, L_FRU);
1585 
1586 	/*
1587 	 * Create FMRIs based on those definitions
1588 	 */
1589 	asru = node2fmri(asrudef);
1590 	fru = node2fmri(frudef);
1591 	pathstr = ipath2str(NULL, sp->ipp);
1592 
1593 	/*
1594 	 * Allow for platform translations of the FMRIs
1595 	 */
1596 	platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc,
1597 	    pathstr);
1598 
1599 	FREE(pathstr);
1600 	rsrcs->suspect = sp;
1601 	rsrcs->asru = asru;
1602 	rsrcs->fru = fru;
1603 	rsrcs->rsrc = rsrc;
1604 }
1605 
1606 /*
1607  * trim_suspects -- prior to publishing, we may need to remove some
1608  *    suspects from the list.  If we're auto-closing upsets, we don't
1609  *    want any of those in the published list.  If the ASRUs for multiple
1610  *    defects resolve to the same ASRU (driver) we only want to publish
1611  *    that as a single suspect.
1612  */
1613 static void
1614 trim_suspects(struct fme *fmep, boolean_t no_upsets, struct rsl **begin,
1615     struct rsl **end)
1616 {
1617 	struct event *ep;
1618 	struct rsl *rp;
1619 	int rpcnt;
1620 
1621 	/*
1622 	 * First save the suspects in the psuspects, then copy back
1623 	 * only the ones we wish to retain.  This resets nsuspects to
1624 	 * zero.
1625 	 */
1626 	rpcnt = fmep->nsuspects;
1627 	save_suspects(fmep);
1628 
1629 	/*
1630 	 * allocate an array of resource pointers for the suspects.
1631 	 * We may end up using less than the full allocation, but this
1632 	 * is a very short-lived array.  publish_suspects() will free
1633 	 * this array when it's done using it.
1634 	 */
1635 	rp = *begin = MALLOC(rpcnt * sizeof (struct rsl));
1636 	bzero(rp, rpcnt * sizeof (struct rsl));
1637 
1638 	/* first pass, remove any unwanted upsets and populate our array */
1639 	for (ep = fmep->psuspects; ep; ep = ep->psuspects) {
1640 		if (no_upsets && is_upset(ep->t))
1641 			continue;
1642 		get_resources(ep, rp, fmep->cfgdata->cooked);
1643 		rp++;
1644 		fmep->nsuspects++;
1645 		if (!is_fault(ep->t))
1646 			fmep->nonfault++;
1647 	}
1648 
1649 	/* if all we had was unwanted upsets, we're done */
1650 	if (fmep->nsuspects == 0)
1651 		return;
1652 
1653 	*end = rp - 1;
1654 
1655 	/* sort the array */
1656 	qsort(*begin, fmep->nsuspects, sizeof (struct rsl), rslcmp);
1657 	rsluniq(*begin, *end, &fmep->nsuspects, &fmep->nonfault);
1658 }
1659 
1660 /*
1661  * addpayloadprop -- add a payload prop to a problem
1662  */
1663 static void
1664 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault)
1665 {
1666 	ASSERT(fault != NULL);
1667 	ASSERT(lhs != NULL);
1668 	ASSERT(rhs != NULL);
1669 
1670 	if (rhs->t == UINT64) {
1671 		out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v);
1672 
1673 		if (nvlist_add_uint64(fault, lhs, rhs->v) != 0)
1674 			out(O_DIE,
1675 			    "cannot add payloadprop \"%s\" to fault", lhs);
1676 	} else {
1677 		out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"",
1678 		    lhs, (char *)(uintptr_t)rhs->v);
1679 
1680 		if (nvlist_add_string(fault, lhs, (char *)(uintptr_t)rhs->v) !=
1681 		    0)
1682 			out(O_DIE,
1683 			    "cannot add payloadprop \"%s\" to fault", lhs);
1684 	}
1685 }
1686 
1687 static char *Istatbuf;
1688 static char *Istatbufptr;
1689 static int Istatsz;
1690 
1691 /*
1692  * istataddsize -- calculate size of istat and add it to Istatsz
1693  */
1694 /*ARGSUSED2*/
1695 static void
1696 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg)
1697 {
1698 	int val;
1699 
1700 	ASSERT(lhs != NULL);
1701 	ASSERT(rhs != NULL);
1702 
1703 	if ((val = stats_counter_value(rhs)) == 0)
1704 		return;	/* skip zero-valued stats */
1705 
1706 	/* count up the size of the stat name */
1707 	Istatsz += ipath2strlen(lhs->ename, lhs->ipath);
1708 	Istatsz++;	/* for the trailing NULL byte */
1709 
1710 	/* count up the size of the stat value */
1711 	Istatsz += snprintf(NULL, 0, "%d", val);
1712 	Istatsz++;	/* for the trailing NULL byte */
1713 }
1714 
1715 /*
1716  * istat2str -- serialize an istat, writing result to *Istatbufptr
1717  */
1718 /*ARGSUSED2*/
1719 static void
1720 istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg)
1721 {
1722 	char *str;
1723 	int len;
1724 	int val;
1725 
1726 	ASSERT(lhs != NULL);
1727 	ASSERT(rhs != NULL);
1728 
1729 	if ((val = stats_counter_value(rhs)) == 0)
1730 		return;	/* skip zero-valued stats */
1731 
1732 	/* serialize the stat name */
1733 	str = ipath2str(lhs->ename, lhs->ipath);
1734 	len = strlen(str);
1735 
1736 	ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]);
1737 	(void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr);
1738 	Istatbufptr += len;
1739 	FREE(str);
1740 	*Istatbufptr++ = '\0';
1741 
1742 	/* serialize the stat value */
1743 	Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr,
1744 	    "%d", val);
1745 	*Istatbufptr++ = '\0';
1746 
1747 	ASSERT(Istatbufptr <= &Istatbuf[Istatsz]);
1748 }
1749 
1750 void
1751 istat_save()
1752 {
1753 	if (Istat_need_save == 0)
1754 		return;
1755 
1756 	/* figure out how big the serialzed info is */
1757 	Istatsz = 0;
1758 	lut_walk(Istats, (lut_cb)istataddsize, NULL);
1759 
1760 	if (Istatsz == 0) {
1761 		/* no stats to save */
1762 		fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS);
1763 		return;
1764 	}
1765 
1766 	/* create the serialized buffer */
1767 	Istatbufptr = Istatbuf = MALLOC(Istatsz);
1768 	lut_walk(Istats, (lut_cb)istat2str, NULL);
1769 
1770 	/* clear out current saved stats */
1771 	fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS);
1772 
1773 	/* write out the new version */
1774 	fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz);
1775 	FREE(Istatbuf);
1776 
1777 	Istat_need_save = 0;
1778 }
1779 
1780 int
1781 istat_cmp(struct istat_entry *ent1, struct istat_entry *ent2)
1782 {
1783 	if (ent1->ename != ent2->ename)
1784 		return (ent2->ename - ent1->ename);
1785 	if (ent1->ipath != ent2->ipath)
1786 		return ((char *)ent2->ipath - (char *)ent1->ipath);
1787 
1788 	return (0);
1789 }
1790 
1791 /*
1792  * istat-verify -- verify the component associated with a stat still exists
1793  *
1794  * if the component no longer exists, this routine resets the stat and
1795  * returns 0.  if the component still exists, it returns 1.
1796  */
1797 static int
1798 istat_verify(struct node *snp, struct istat_entry *entp)
1799 {
1800 	struct stats *statp;
1801 	nvlist_t *fmri;
1802 
1803 	fmri = node2fmri(snp->u.event.epname);
1804 	if (platform_path_exists(fmri)) {
1805 		nvlist_free(fmri);
1806 		return (1);
1807 	}
1808 	nvlist_free(fmri);
1809 
1810 	/* component no longer in system.  zero out the associated stats */
1811 	if ((statp = (struct stats *)
1812 	    lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL ||
1813 	    stats_counter_value(statp) == 0)
1814 		return (0);	/* stat is already reset */
1815 
1816 	Istat_need_save = 1;
1817 	stats_counter_reset(statp);
1818 	return (0);
1819 }
1820 
/*
 * istat_bump -- bump, or set, the counter named by a stat node
 *
 * snp must be a T_EVENT node with a non-NULL epname and a single-component
 * class name (all asserted below).  If istat_verify() reports that the
 * component no longer exists, nothing is done.  Otherwise the counter keyed
 * by (class name, ipath) is looked up in Istats and created on first use.
 * It is then set to n when n is non-zero, or incremented by one when n is
 * zero, and Istat_need_save is set so that the next istat_save() writes
 * the change out.
 */
static void
istat_bump(struct node *snp, int n)
{
	struct stats *statp;
	struct istat_entry ent;

	ASSERT(snp != NULL);
	ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t));
	ASSERT(snp->u.event.epname != NULL);

	/* class name should be hoisted into a single stable entry */
	ASSERT(snp->u.event.ename->u.name.next == NULL);
	ent.ename = snp->u.event.ename->u.name.s;
	ent.ipath = ipath(snp->u.event.epname);

	if (!istat_verify(snp, &ent)) {
		/* component no longer exists in system, nothing to do */
		return;
	}

	if ((statp = (struct stats *)
	    lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) {
		/* need to create the counter */
		int cnt = 0;
		struct node *np;
		char *sname;
		char *snamep;
		struct istat_entry *newentp;

		/*
		 * count up the size of the stat name, which has the form
		 * class.name@comp0/comp1; every component is charged one
		 * extra byte for the separator or terminator following it
		 */
		np = snp->u.event.ename;
		while (np != NULL) {
			cnt += strlen(np->u.name.s);
			cnt++;	/* for the '.' or '@' */
			np = np->u.name.next;
		}
		np = snp->u.event.epname;
		while (np != NULL) {
			cnt += snprintf(NULL, 0, "%s%llu",
			    np->u.name.s, np->u.name.child->u.ull);
			cnt++;	/* for the '/' or trailing NUL byte */
			np = np->u.name.next;
		}

		/* build the stat name */
		snamep = sname = alloca(cnt);
		np = snp->u.event.ename;
		while (np != NULL) {
			snamep += snprintf(snamep, &sname[cnt] - snamep,
			    "%s", np->u.name.s);
			np = np->u.name.next;
			if (np)
				*snamep++ = '.';
		}
		*snamep++ = '@';
		np = snp->u.event.epname;
		while (np != NULL) {
			snamep += snprintf(snamep, &sname[cnt] - snamep,
			    "%s%llu", np->u.name.s, np->u.name.child->u.ull);
			np = np->u.name.next;
			if (np)
				*snamep++ = '/';
		}
		*snamep++ = '\0';

		/*
		 * create the new stat & add it to our list; the key is a
		 * heap copy of ent because ent itself lives on the stack
		 */
		newentp = MALLOC(sizeof (*newentp));
		*newentp = ent;
		statp = stats_new_counter(NULL, sname, 0);
		Istats = lut_add(Istats, (void *)newentp, (void *)statp,
		    (lut_cmp)istat_cmp);
	}

	/* if n is non-zero, set that value instead of bumping */
	if (n) {
		stats_counter_reset(statp);
		stats_counter_add(statp, n);
	} else
		stats_counter_bump(statp);
	Istat_need_save = 1;
}
1902 
/*
 * istat_destructor -- lut_free() callback for the Istats table: frees the
 *	istat_entry key (left) and deletes the stats value (right).
 */
/*ARGSUSED*/
static void
istat_destructor(void *left, void *right, void *arg)
{
	struct istat_entry *key = left;
	struct stats *counter = right;

	FREE(key);
	stats_delete(counter);
}
1912 
1913 /*
1914  * Callback used in a walk of the Istats to reset matching stat counters.
1915  */
1916 static void
1917 istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp,
1918     const struct ipath *ipp)
1919 {
1920 	char *path;
1921 
1922 	if (entp->ipath == ipp) {
1923 		path = ipath2str(entp->ename, ipp);
1924 		out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path);
1925 		FREE(path);
1926 		stats_counter_reset(statp);
1927 		Istat_need_save = 1;
1928 	}
1929 }
1930 
/*
 * istat_fini -- tear down the Istats table, handing each key/value pair to
 *	istat_destructor().
 */
void
istat_fini(void)
{
	lut_free(Istats, istat_destructor, NULL);
}
1936 
1937 static void
1938 publish_suspects(struct fme *fmep)
1939 {
1940 	struct event *ep;
1941 	struct rsl *srl = NULL;
1942 	struct rsl *erl;
1943 	struct rsl *rp;
1944 	nvlist_t *fault;
1945 	uint8_t cert;
1946 	uint_t *frs;
1947 	uint_t fravg, frsum, fr;
1948 	uint_t messval;
1949 	struct node *snp;
1950 	int frcnt, fridx;
1951 	boolean_t no_upsets = B_FALSE;
1952 	boolean_t allfaulty = B_TRUE;
1953 
1954 	stats_counter_bump(fmep->diags);
1955 
1956 	/*
1957 	 * The current fmd interfaces don't allow us to solve a case
1958 	 * that's already solved.  If we make a new case, what of the
1959 	 * ereports?  We don't appear to have an interface that allows
1960 	 * us to access the ereports attached to a case (if we wanted
1961 	 * to copy the original case's ereport attachments to the new
1962 	 * case) and it's also a bit unclear if there would be any
1963 	 * problems with having ereports attached to multiple cases
1964 	 * and/or attaching DIAGNOSED ereports to a case.  For now,
1965 	 * we'll just output a message.
1966 	 */
1967 	if (fmep->posted_suspects ||
1968 	    fmd_case_solved(fmep->hdl, fmep->fmcase)) {
1969 		out(O_ALTFP|O_NONL, "Revised diagnosis for case %s: ",
1970 		    fmd_case_uuid(fmep->hdl, fmep->fmcase));
1971 		for (ep = fmep->suspects; ep; ep = ep->suspects) {
1972 			out(O_ALTFP|O_NONL, " ");
1973 			itree_pevent_brief(O_ALTFP|O_NONL, ep);
1974 		}
1975 		out(O_ALTFP, NULL);
1976 		return;
1977 	}
1978 
1979 	/*
1980 	 * If we're auto-closing upsets, we don't want to include them
1981 	 * in any produced suspect lists or certainty accounting.
1982 	 */
1983 	if (Autoclose != NULL)
1984 		if (strcmp(Autoclose, "true") == 0 ||
1985 		    strcmp(Autoclose, "all") == 0 ||
1986 		    strcmp(Autoclose, "upsets") == 0)
1987 			no_upsets = B_TRUE;
1988 
1989 	trim_suspects(fmep, no_upsets, &srl, &erl);
1990 
1991 	/*
1992 	 * If the resulting suspect list has no members, we're
1993 	 * done.  Returning here will simply close the case.
1994 	 */
1995 	if (fmep->nsuspects == 0) {
1996 		out(O_ALTFP,
1997 		    "[FME%d, case %s (all suspects are upsets)]",
1998 		    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
1999 		FREE(srl);
2000 		restore_suspects(fmep);
2001 		return;
2002 	}
2003 
2004 	/*
2005 	 * If the suspect list is all faults, then for a given fault,
2006 	 * say X of N, X's certainty is computed via:
2007 	 *
2008 	 * fitrate(X) / (fitrate(1) + ... + fitrate(N)) * 100
2009 	 *
2010 	 * If none of the suspects are faults, and there are N suspects,
2011 	 * the certainty of a given suspect is 100/N.
2012 	 *
2013 	 * If there are are a mixture of faults and other problems in
2014 	 * the suspect list, we take an average of the faults'
2015 	 * FITrates and treat this average as the FITrate for any
2016 	 * non-faults.  The fitrate of any given suspect is then
2017 	 * computed per the first formula above.
2018 	 */
2019 	if (fmep->nonfault == fmep->nsuspects) {
2020 		/* NO faults in the suspect list */
2021 		cert = percentof(1, fmep->nsuspects);
2022 	} else {
2023 		/* sum the fitrates */
2024 		frs = alloca(fmep->nsuspects * sizeof (uint_t));
2025 		fridx = frcnt = frsum = 0;
2026 
2027 		for (rp = srl; rp <= erl; rp++) {
2028 			struct node *n;
2029 
2030 			if (rp->suspect == NULL)
2031 				continue;
2032 			if (!is_fault(rp->suspect->t)) {
2033 				frs[fridx++] = 0;
2034 				continue;
2035 			}
2036 			n = eventprop_lookup(rp->suspect, L_FITrate);
2037 			if (node2uint(n, &fr) != 0) {
2038 				out(O_DEBUG|O_NONL, "event ");
2039 				ipath_print(O_DEBUG|O_NONL,
2040 				    ep->enode->u.event.ename->u.name.s,
2041 				    ep->ipp);
2042 				out(O_DEBUG, " has no FITrate (using 1)");
2043 				fr = 1;
2044 			} else if (fr == 0) {
2045 				out(O_DEBUG|O_NONL, "event ");
2046 				ipath_print(O_DEBUG|O_NONL,
2047 				    ep->enode->u.event.ename->u.name.s,
2048 				    ep->ipp);
2049 				out(O_DEBUG, " has zero FITrate (using 1)");
2050 				fr = 1;
2051 			}
2052 
2053 			frs[fridx++] = fr;
2054 			frsum += fr;
2055 			frcnt++;
2056 		}
2057 		fravg = avg(frsum, frcnt);
2058 		for (fridx = 0; fridx < fmep->nsuspects; fridx++)
2059 			if (frs[fridx] == 0) {
2060 				frs[fridx] = fravg;
2061 				frsum += fravg;
2062 			}
2063 	}
2064 
2065 	/* Add them in reverse order of our sort, as fmd reverses order */
2066 	for (rp = erl; rp >= srl; rp--) {
2067 		if (rp->suspect == NULL)
2068 			continue;
2069 		if (!is_fault(rp->suspect->t))
2070 			allfaulty = B_FALSE;
2071 		if (fmep->nonfault != fmep->nsuspects)
2072 			cert = percentof(frs[--fridx], frsum);
2073 		fault = fmd_nvl_create_fault(fmep->hdl,
2074 		    rp->suspect->enode->u.event.ename->u.name.s,
2075 		    cert,
2076 		    rp->asru,
2077 		    rp->fru,
2078 		    rp->rsrc);
2079 		if (fault == NULL)
2080 			out(O_DIE, "fault creation failed");
2081 		/* if "message" property exists, add it to the fault */
2082 		if (node2uint(eventprop_lookup(rp->suspect, L_message),
2083 		    &messval) == 0) {
2084 
2085 			out(O_ALTFP,
2086 			    "[FME%d, %s adds message=%d to suspect list]",
2087 			    fmep->id,
2088 			    rp->suspect->enode->u.event.ename->u.name.s,
2089 			    messval);
2090 			if (nvlist_add_boolean_value(fault,
2091 			    FM_SUSPECT_MESSAGE,
2092 			    (messval) ? B_TRUE : B_FALSE) != 0) {
2093 				out(O_DIE, "cannot add no-message to fault");
2094 			}
2095 		}
2096 		/* add any payload properties */
2097 		lut_walk(rp->suspect->payloadprops,
2098 		    (lut_cb)addpayloadprop, (void *)fault);
2099 		fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault);
2100 		rp->suspect->fault = fault;
2101 		rslfree(rp);
2102 
2103 		/*
2104 		 * If "action" property exists, evaluate it;  this must be done
2105 		 * before the dupclose check below since some actions may
2106 		 * modify the asru to be used in fmd_nvl_fmri_faulty.  This
2107 		 * needs to be restructured if any new actions are introduced
2108 		 * that have effects that we do not want to be visible if
2109 		 * we decide not to publish in the dupclose check below.
2110 		 */
2111 		if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) {
2112 			struct evalue evalue;
2113 
2114 			out(O_ALTFP|O_NONL,
2115 			    "[FME%d, %s action ", fmep->id,
2116 			    rp->suspect->enode->u.event.ename->u.name.s);
2117 			ptree_name_iter(O_ALTFP|O_NONL, snp);
2118 			out(O_ALTFP, "]");
2119 			Action_nvl = fault;
2120 			(void) eval_expr(snp, NULL, NULL, NULL, NULL,
2121 			    NULL, 0, &evalue);
2122 		}
2123 
2124 		/*
2125 		 * if "dupclose" tunable is set, check if the asru is
2126 		 * already marked as "faulty".
2127 		 */
2128 		if (Dupclose && allfaulty) {
2129 			nvlist_t *asru;
2130 
2131 			out(O_ALTFP|O_VERB, "FMD%d dupclose check ", fmep->id);
2132 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect);
2133 			out(O_ALTFP|O_VERB|O_NONL, " ");
2134 			if (nvlist_lookup_nvlist(fault,
2135 			    FM_FAULT_ASRU, &asru) != 0) {
2136 				out(O_ALTFP|O_VERB, "NULL asru");
2137 				allfaulty = B_FALSE;
2138 			} else if (fmd_nvl_fmri_faulty(fmep->hdl, asru)) {
2139 				out(O_ALTFP|O_VERB, "faulty");
2140 			} else {
2141 				out(O_ALTFP|O_VERB, "not faulty");
2142 				allfaulty = B_FALSE;
2143 			}
2144 		}
2145 
2146 	}
2147 
2148 	/*
2149 	 * Close the case if all asrus are already known to be faulty and if
2150 	 * Dupclose is enabled.  Otherwise we are going to publish so take
2151 	 * any pre-publication actions.
2152 	 */
2153 	if (Dupclose && allfaulty) {
2154 		out(O_ALTFP, "[dupclose FME%d, case %s]", fmep->id,
2155 		    fmd_case_uuid(fmep->hdl, fmep->fmcase));
2156 		fmd_case_close(fmep->hdl, fmep->fmcase);
2157 	} else {
2158 		for (rp = erl; rp >= srl; rp--) {
2159 			struct event *suspect = rp->suspect;
2160 
2161 			if (suspect == NULL)
2162 				continue;
2163 
2164 			fault = suspect->fault;
2165 
2166 			/* if "count" exists, increment the appropriate stat */
2167 			if ((snp = eventprop_lookup(suspect,
2168 			    L_count)) != NULL) {
2169 				out(O_ALTFP|O_NONL,
2170 				    "[FME%d, %s count ", fmep->id,
2171 				    suspect->enode->u.event.ename->u.name.s);
2172 				ptree_name_iter(O_ALTFP|O_NONL, snp);
2173 				out(O_ALTFP, "]");
2174 				istat_bump(snp, 0);
2175 
2176 			}
2177 		}
2178 		istat_save();	/* write out any istat changes */
2179 
2180 		out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
2181 		    fmd_case_uuid(fmep->hdl, fmep->fmcase));
2182 		fmd_case_solve(fmep->hdl, fmep->fmcase);
2183 	}
2184 
2185 	/*
2186 	 * revert to the original suspect list
2187 	 */
2188 	FREE(srl);
2189 	restore_suspects(fmep);
2190 }
2191 
2192 static void
2193 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep)
2194 {
2195 	struct case_list *newcase;
2196 	nvlist_t *defect;
2197 
2198 	out(O_ALTFP,
2199 	    "[undiagnosable ereport received, "
2200 	    "creating and closing a new case (%s)]",
2201 	    Undiag_reason ? Undiag_reason : "reason not provided");
2202 
2203 	newcase = MALLOC(sizeof (struct case_list));
2204 	newcase->next = NULL;
2205 
2206 	newcase->fmcase = fmd_case_open(hdl, NULL);
2207 	if (Undiagablecaselist != NULL)
2208 		newcase->next = Undiagablecaselist;
2209 	Undiagablecaselist = newcase;
2210 
2211 	if (ffep != NULL)
2212 		fmd_case_add_ereport(hdl, newcase->fmcase, ffep);
2213 
2214 	defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
2215 	    NULL, NULL, NULL);
2216 	if (Undiag_reason != NULL)
2217 		(void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason);
2218 	fmd_case_add_suspect(hdl, newcase->fmcase, defect);
2219 
2220 	fmd_case_solve(hdl, newcase->fmcase);
2221 	fmd_case_close(hdl, newcase->fmcase);
2222 }
2223 
2224 static void
2225 fme_undiagnosable(struct fme *f)
2226 {
2227 	nvlist_t *defect;
2228 
2229 	out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]",
2230 	    f->id, fmd_case_uuid(f->hdl, f->fmcase),
2231 	    Undiag_reason ? Undiag_reason : "undiagnosable");
2232 
2233 	defect = fmd_nvl_create_fault(f->hdl, UNDIAGNOSABLE_DEFECT, 100,
2234 	    NULL, NULL, NULL);
2235 	if (Undiag_reason != NULL)
2236 		(void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason);
2237 	fmd_case_add_suspect(f->hdl, f->fmcase, defect);
2238 	fmd_case_solve(f->hdl, f->fmcase);
2239 	destroy_fme_bufs(f);
2240 	fmd_case_close(f->hdl, f->fmcase);
2241 }
2242 
2243 /*
2244  * fme_close_case
2245  *
2246  *	Find the requested case amongst our fmes and close it.  Free up
2247  *	the related fme.
2248  */
2249 void
2250 fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase)
2251 {
2252 	struct case_list *ucasep, *prevcasep = NULL;
2253 	struct fme *prev = NULL;
2254 	struct fme *fmep;
2255 
2256 	for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) {
2257 		if (fmcase != ucasep->fmcase) {
2258 			prevcasep = ucasep;
2259 			continue;
2260 		}
2261 
2262 		if (prevcasep == NULL)
2263 			Undiagablecaselist = Undiagablecaselist->next;
2264 		else
2265 			prevcasep->next = ucasep->next;
2266 
2267 		FREE(ucasep);
2268 		return;
2269 	}
2270 
2271 	for (fmep = FMElist; fmep; fmep = fmep->next) {
2272 		if (fmep->hdl == hdl && fmep->fmcase == fmcase)
2273 			break;
2274 		prev = fmep;
2275 	}
2276 
2277 	if (fmep == NULL) {
2278 		out(O_WARN, "Eft asked to close unrecognized case [%s].",
2279 		    fmd_case_uuid(hdl, fmcase));
2280 		return;
2281 	}
2282 
2283 	if (EFMElist == fmep)
2284 		EFMElist = prev;
2285 
2286 	if (prev == NULL)
2287 		FMElist = FMElist->next;
2288 	else
2289 		prev->next = fmep->next;
2290 
2291 	fmep->next = NULL;
2292 
2293 	/* Get rid of any timer this fme has set */
2294 	if (fmep->wull != 0)
2295 		fmd_timer_remove(fmep->hdl, fmep->timer);
2296 
2297 	if (ClosedFMEs == NULL) {
2298 		ClosedFMEs = fmep;
2299 	} else {
2300 		fmep->next = ClosedFMEs;
2301 		ClosedFMEs = fmep;
2302 	}
2303 
2304 	Open_fme_count--;
2305 
2306 	/* See if we can close the overflow FME */
2307 	if (Open_fme_count <= Max_fme) {
2308 		for (fmep = FMElist; fmep; fmep = fmep->next) {
2309 			if (fmep->overflow && !(fmd_case_closed(fmep->hdl,
2310 			    fmep->fmcase)))
2311 				break;
2312 		}
2313 
2314 		if (fmep != NULL)
2315 			fmd_case_close(fmep->hdl, fmep->fmcase);
2316 	}
2317 }
2318 
2319 /*
2320  * fme_set_timer()
2321  *	If the time we need to wait for the given FME is less than the
2322  *	current timer, kick that old timer out and establish a new one.
2323  */
2324 static int
2325 fme_set_timer(struct fme *fmep, unsigned long long wull)
2326 {
2327 	out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait ");
2328 	ptree_timeval(O_ALTFP|O_VERB, &wull);
2329 
2330 	if (wull <= fmep->pull) {
2331 		out(O_ALTFP|O_VERB|O_NONL, "already have waited at least ");
2332 		ptree_timeval(O_ALTFP|O_VERB, &fmep->pull);
2333 		out(O_ALTFP|O_VERB, NULL);
2334 		/* we've waited at least wull already, don't need timer */
2335 		return (0);
2336 	}
2337 
2338 	out(O_ALTFP|O_VERB|O_NONL, " currently ");
2339 	if (fmep->wull != 0) {
2340 		out(O_ALTFP|O_VERB|O_NONL, "waiting ");
2341 		ptree_timeval(O_ALTFP|O_VERB, &fmep->wull);
2342 		out(O_ALTFP|O_VERB, NULL);
2343 	} else {
2344 		out(O_ALTFP|O_VERB|O_NONL, "not waiting");
2345 		out(O_ALTFP|O_VERB, NULL);
2346 	}
2347 
2348 	if (fmep->wull != 0)
2349 		if (wull >= fmep->wull)
2350 			/* New timer would fire later than established timer */
2351 			return (0);
2352 
2353 	if (fmep->wull != 0) {
2354 		fmd_timer_remove(fmep->hdl, fmep->timer);
2355 		if (fmep->timer == fmep->htid) {
2356 			out(O_ALTFP,
2357 			    "[stopped hesitating FME%d, case %s]",
2358 			    fmep->id,
2359 			    fmd_case_uuid(fmep->hdl,
2360 			    fmep->fmcase));
2361 			fmep->htid = 0;
2362 		}
2363 	}
2364 
2365 	fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep,
2366 	    fmep->e0r, wull);
2367 	out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer);
2368 	fmep->wull = wull;
2369 	return (1);
2370 }
2371 
2372 void
2373 fme_timer_fired(struct fme *fmep, id_t tid)
2374 {
2375 	struct fme *ffmep = NULL;
2376 
2377 	for (ffmep = FMElist; ffmep; ffmep = ffmep->next)
2378 		if (ffmep == fmep)
2379 			break;
2380 
2381 	if (ffmep == NULL) {
2382 		out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.",
2383 		    (void *)fmep);
2384 		return;
2385 	}
2386 
2387 	out(O_ALTFP, "Timer fired %lx %lx", tid, fmep->htid);
2388 	if (tid != fmep->htid) {
2389 		/*
2390 		 * normal timer (not the hesitation timer)
2391 		 */
2392 		fmep->pull = fmep->wull;
2393 		fmep->wull = 0;
2394 		fmd_buf_write(fmep->hdl, fmep->fmcase,
2395 		    WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull));
2396 		/*
2397 		 * no point in heistating if we've already waited.
2398 		 */
2399 		fmep->hesitated = 1;
2400 	} else {
2401 		fmep->hesitated = 1;
2402 	}
2403 	fme_eval(fmep, fmep->e0r);
2404 }
2405 
2406 /*
2407  * Preserve the fme's suspect list in its psuspects list, NULLing the
2408  * suspects list in the meantime.
2409  */
2410 static void
2411 save_suspects(struct fme *fmep)
2412 {
2413 	struct event *ep;
2414 	struct event *nextep;
2415 
2416 	/* zero out the previous suspect list */
2417 	for (ep = fmep->psuspects; ep; ep = nextep) {
2418 		nextep = ep->psuspects;
2419 		ep->psuspects = NULL;
2420 	}
2421 	fmep->psuspects = NULL;
2422 
2423 	/* zero out the suspect list, copying it to previous suspect list */
2424 	fmep->psuspects = fmep->suspects;
2425 	for (ep = fmep->suspects; ep; ep = nextep) {
2426 		nextep = ep->suspects;
2427 		ep->psuspects = ep->suspects;
2428 		ep->suspects = NULL;
2429 		ep->is_suspect = 0;
2430 	}
2431 	fmep->suspects = NULL;
2432 	fmep->nsuspects = 0;
2433 	fmep->nonfault = 0;
2434 }
2435 
2436 /*
2437  * Retrieve the fme's suspect list from its psuspects list.
2438  */
2439 static void
2440 restore_suspects(struct fme *fmep)
2441 {
2442 	struct event *ep;
2443 	struct event *nextep;
2444 
2445 	fmep->nsuspects = fmep->nonfault = 0;
2446 	fmep->suspects = fmep->psuspects;
2447 	for (ep = fmep->psuspects; ep; ep = nextep) {
2448 		fmep->nsuspects++;
2449 		if (!is_fault(ep->t))
2450 			fmep->nonfault++;
2451 		nextep = ep->psuspects;
2452 		ep->suspects = ep->psuspects;
2453 	}
2454 }
2455 
2456 /*
2457  * this is what we use to call the Emrys prototype code instead of main()
2458  */
2459 static void
2460 fme_eval(struct fme *fmep, fmd_event_t *ffep)
2461 {
2462 	struct event *ep;
2463 	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
2464 
2465 	save_suspects(fmep);
2466 
2467 	out(O_ALTFP|O_VERB, "Evaluate FME %d", fmep->id);
2468 	indent_set("  ");
2469 
2470 	lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
2471 	fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);
2472 
2473 	out(O_ALTFP|O_VERB|O_NONL, "FME%d state: %s, suspect list:", fmep->id,
2474 	    fme_state2str(fmep->state));
2475 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
2476 		out(O_ALTFP|O_VERB|O_NONL, " ");
2477 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2478 	}
2479 	out(O_ALTFP|O_VERB, NULL);
2480 
2481 	if (fmep->posted_suspects) {
2482 		/*
2483 		 * this FME has already posted a diagnosis, so see if
2484 		 * the event changed the diagnosis and print a warning
2485 		 * if it did.
2486 		 *
2487 		 */
2488 		if (suspects_changed(fmep)) {
2489 			print_suspects(SLCHANGED, fmep);
2490 			publish_suspects(fmep);
2491 		}
2492 	} else {
2493 		switch (fmep->state) {
2494 		case FME_CREDIBLE:
2495 			/*
2496 			 * if the suspect list contains any upsets, we
2497 			 * turn off the hesitation logic (by setting
2498 			 * the hesitate flag which normally indicates
2499 			 * we've already done the hesitate logic).
2500 			 * this is done because hesitating with upsets
2501 			 * causes us to explain away additional soft errors
2502 			 * while the upset FME stays open.
2503 			 */
2504 			if (fmep->hesitated == 0) {
2505 				struct event *s;
2506 
2507 				for (s = fmep->suspects; s; s = s->suspects) {
2508 					if (s->t == N_UPSET) {
2509 						fmep->hesitated = 1;
2510 						break;
2511 					}
2512 				}
2513 			}
2514 
2515 			if (Hesitate &&
2516 			    fmep->suspects != NULL &&
2517 			    fmep->suspects->suspects != NULL &&
2518 			    fmep->hesitated == 0) {
2519 				/*
2520 				 * about to publish multi-entry suspect list,
2521 				 * set the hesitation timer if not already set.
2522 				 */
2523 				if (fmep->htid == 0) {
2524 					out(O_ALTFP|O_NONL,
2525 					    "[hesitate FME%d, case %s ",
2526 					    fmep->id,
2527 					    fmd_case_uuid(fmep->hdl,
2528 					    fmep->fmcase));
2529 					ptree_timeval(O_ALTFP|O_NONL,
2530 					    (unsigned long long *)&Hesitate);
2531 					out(O_ALTFP, "]");
2532 					if (fme_set_timer(fmep, Hesitate))
2533 						fmep->htid = fmep->timer;
2534 				} else {
2535 					out(O_ALTFP,
2536 					    "[still hesitating FME%d, case %s]",
2537 					    fmep->id,
2538 					    fmd_case_uuid(fmep->hdl,
2539 					    fmep->fmcase));
2540 				}
2541 			} else {
2542 				print_suspects(SLNEW, fmep);
2543 				(void) upsets_eval(fmep, ffep);
2544 				publish_suspects(fmep);
2545 				fmep->posted_suspects = 1;
2546 				fmd_buf_write(fmep->hdl, fmep->fmcase,
2547 				    WOBUF_POSTD,
2548 				    (void *)&fmep->posted_suspects,
2549 				    sizeof (fmep->posted_suspects));
2550 			}
2551 			break;
2552 
2553 		case FME_WAIT:
2554 			/*
2555 			 * singleton suspect list implies
2556 			 * no point in waiting
2557 			 */
2558 			if (fmep->suspects &&
2559 			    fmep->suspects->suspects == NULL) {
2560 				print_suspects(SLNEW, fmep);
2561 				(void) upsets_eval(fmep, ffep);
2562 				publish_suspects(fmep);
2563 				fmep->posted_suspects = 1;
2564 				fmd_buf_write(fmep->hdl, fmep->fmcase,
2565 				    WOBUF_POSTD,
2566 				    (void *)&fmep->posted_suspects,
2567 				    sizeof (fmep->posted_suspects));
2568 				fmep->state = FME_CREDIBLE;
2569 			} else {
2570 				ASSERT(my_delay > fmep->ull);
2571 				(void) fme_set_timer(fmep, my_delay);
2572 				print_suspects(SLWAIT, fmep);
2573 			}
2574 			break;
2575 
2576 		case FME_DISPROVED:
2577 			print_suspects(SLDISPROVED, fmep);
2578 			Undiag_reason = UD_UNSOLVD;
2579 			fme_undiagnosable(fmep);
2580 			break;
2581 		}
2582 	}
2583 
2584 	if (fmep->posted_suspects == 1 && Autoclose != NULL) {
2585 		int doclose = 0;
2586 
2587 		if (strcmp(Autoclose, "true") == 0 ||
2588 		    strcmp(Autoclose, "all") == 0)
2589 			doclose = 1;
2590 
2591 		if (strcmp(Autoclose, "upsets") == 0) {
2592 			doclose = 1;
2593 			for (ep = fmep->suspects; ep; ep = ep->suspects) {
2594 				if (ep->t != N_UPSET) {
2595 					doclose = 0;
2596 					break;
2597 				}
2598 			}
2599 		}
2600 
2601 		if (doclose) {
2602 			out(O_ALTFP, "[closing FME%d, case %s (autoclose)]",
2603 			    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
2604 
2605 			destroy_fme_bufs(fmep);
2606 			fmd_case_close(fmep->hdl, fmep->fmcase);
2607 		}
2608 	}
2609 	itree_prune(fmep->eventtree);
2610 }
2611 
2612 static void indent(void);
2613 static int triggered(struct fme *fmep, struct event *ep, int mark);
2614 static enum fme_state effects_test(struct fme *fmep,
2615     struct event *fault_event, unsigned long long at_latest_by,
2616     unsigned long long *pdelay);
2617 static enum fme_state requirements_test(struct fme *fmep, struct event *ep,
2618     unsigned long long at_latest_by, unsigned long long *pdelay);
2619 static enum fme_state causes_test(struct fme *fmep, struct event *ep,
2620     unsigned long long at_latest_by, unsigned long long *pdelay);
2621 
2622 static int
2623 checkconstraints(struct fme *fmep, struct arrow *arrowp)
2624 {
2625 	struct constraintlist *ctp;
2626 	struct evalue value;
2627 
2628 	if (arrowp->forever_false) {
2629 		char *sep = "";
2630 		indent();
2631 		out(O_ALTFP|O_VERB|O_NONL, "  Forever false constraint: ");
2632 		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
2633 			out(O_ALTFP|O_VERB|O_NONL, sep);
2634 			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
2635 			sep = ", ";
2636 		}
2637 		out(O_ALTFP|O_VERB, NULL);
2638 		return (0);
2639 	}
2640 
2641 	for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
2642 		if (eval_expr(ctp->cnode, NULL, NULL,
2643 		    &fmep->globals, fmep->cfgdata->cooked,
2644 		    arrowp, 0, &value)) {
2645 			/* evaluation successful */
2646 			if (value.t == UNDEFINED || value.v == 0) {
2647 				/* known false */
2648 				arrowp->forever_false = 1;
2649 				indent();
2650 				out(O_ALTFP|O_VERB|O_NONL,
2651 				    "  False constraint: ");
2652 				ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
2653 				out(O_ALTFP|O_VERB, NULL);
2654 				return (0);
2655 			}
2656 		} else {
2657 			/* evaluation unsuccessful -- unknown value */
2658 			indent();
2659 			out(O_ALTFP|O_VERB|O_NONL,
2660 			    "  Deferred constraint: ");
2661 			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
2662 			out(O_ALTFP|O_VERB, NULL);
2663 			return (2);
2664 		}
2665 	}
2666 	/* known true */
2667 	return (1);
2668 }
2669 
2670 static int
2671 triggered(struct fme *fmep, struct event *ep, int mark)
2672 {
2673 	struct bubble *bp;
2674 	struct arrowlist *ap;
2675 	int count = 0;
2676 
2677 	stats_counter_bump(fmep->Tcallcount);
2678 	for (bp = itree_next_bubble(ep, NULL); bp;
2679 	    bp = itree_next_bubble(ep, bp)) {
2680 		if (bp->t != B_TO)
2681 			continue;
2682 		for (ap = itree_next_arrow(bp, NULL); ap;
2683 		    ap = itree_next_arrow(bp, ap)) {
2684 			/* check count of marks against K in the bubble */
2685 			if ((ap->arrowp->mark & mark) &&
2686 			    ++count >= bp->nork)
2687 				return (1);
2688 		}
2689 	}
2690 	return (0);
2691 }
2692 
2693 static int
2694 mark_arrows(struct fme *fmep, struct event *ep, int mark,
2695     unsigned long long at_latest_by, unsigned long long *pdelay, int keep)
2696 {
2697 	struct bubble *bp;
2698 	struct arrowlist *ap;
2699 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
2700 	unsigned long long my_delay;
2701 	enum fme_state result;
2702 	int retval = 0;
2703 
2704 	for (bp = itree_next_bubble(ep, NULL); bp;
2705 	    bp = itree_next_bubble(ep, bp)) {
2706 		if (bp->t != B_FROM)
2707 			continue;
2708 		stats_counter_bump(fmep->Marrowcount);
2709 		for (ap = itree_next_arrow(bp, NULL); ap;
2710 		    ap = itree_next_arrow(bp, ap)) {
2711 			struct event *ep2 = ap->arrowp->head->myevent;
2712 			/*
2713 			 * if we're clearing marks, we can avoid doing
2714 			 * all that work evaluating constraints.
2715 			 */
2716 			if (mark == 0) {
2717 				ap->arrowp->mark &= ~EFFECTS_COUNTER;
2718 				if (keep && (ep2->cached_state &
2719 				    (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT)))
2720 					ep2->keep_in_tree = 1;
2721 				ep2->cached_state &=
2722 				    ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT);
2723 				(void) mark_arrows(fmep, ep2, mark, 0, NULL,
2724 				    keep);
2725 				continue;
2726 			}
2727 			if (ep2->cached_state & REQMNTS_DISPROVED) {
2728 				indent();
2729 				out(O_ALTFP|O_VERB|O_NONL,
2730 				    "  ALREADY DISPROVED ");
2731 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2732 				out(O_ALTFP|O_VERB, NULL);
2733 				continue;
2734 			}
2735 			if (ep2->cached_state & WAIT_EFFECT) {
2736 				indent();
2737 				out(O_ALTFP|O_VERB|O_NONL,
2738 				    "  ALREADY EFFECTS WAIT ");
2739 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2740 				out(O_ALTFP|O_VERB, NULL);
2741 				continue;
2742 			}
2743 			if (ep2->cached_state & CREDIBLE_EFFECT) {
2744 				indent();
2745 				out(O_ALTFP|O_VERB|O_NONL,
2746 				    "  ALREADY EFFECTS CREDIBLE ");
2747 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2748 				out(O_ALTFP|O_VERB, NULL);
2749 				continue;
2750 			}
2751 			if ((ep2->cached_state & PARENT_WAIT) &&
2752 			    (mark & PARENT_WAIT)) {
2753 				indent();
2754 				out(O_ALTFP|O_VERB|O_NONL,
2755 				    "  ALREADY PARENT EFFECTS WAIT ");
2756 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2757 				out(O_ALTFP|O_VERB, NULL);
2758 				continue;
2759 			}
2760 			platform_set_payloadnvp(ep2->nvp);
2761 			if (checkconstraints(fmep, ap->arrowp) == 0) {
2762 				platform_set_payloadnvp(NULL);
2763 				indent();
2764 				out(O_ALTFP|O_VERB|O_NONL,
2765 				    "  CONSTRAINTS FAIL ");
2766 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2767 				out(O_ALTFP|O_VERB, NULL);
2768 				continue;
2769 			}
2770 			platform_set_payloadnvp(NULL);
2771 			ap->arrowp->mark |= EFFECTS_COUNTER;
2772 			if (!triggered(fmep, ep2, EFFECTS_COUNTER)) {
2773 				indent();
2774 				out(O_ALTFP|O_VERB|O_NONL,
2775 				    "  K-COUNT NOT YET MET ");
2776 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2777 				out(O_ALTFP|O_VERB, NULL);
2778 				continue;
2779 			}
2780 			ep2->cached_state &= ~PARENT_WAIT;
2781 			result = requirements_test(fmep, ep2, at_latest_by +
2782 			    ap->arrowp->maxdelay,
2783 			    &my_delay);
2784 			if (result == FME_WAIT) {
2785 				retval = WAIT_EFFECT;
2786 				if (overall_delay > my_delay)
2787 					overall_delay = my_delay;
2788 				ep2->cached_state |= WAIT_EFFECT;
2789 				indent();
2790 				out(O_ALTFP|O_VERB|O_NONL, "  EFFECTS WAIT ");
2791 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2792 				out(O_ALTFP|O_VERB, NULL);
2793 				indent_push("  E");
2794 				if (mark_arrows(fmep, ep2, PARENT_WAIT,
2795 				    at_latest_by, &my_delay, 0) ==
2796 				    WAIT_EFFECT) {
2797 					retval = WAIT_EFFECT;
2798 					if (overall_delay > my_delay)
2799 						overall_delay = my_delay;
2800 				}
2801 				indent_pop();
2802 			} else if (result == FME_DISPROVED) {
2803 				indent();
2804 				out(O_ALTFP|O_VERB|O_NONL,
2805 				    "  EFFECTS DISPROVED ");
2806 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2807 				out(O_ALTFP|O_VERB, NULL);
2808 			} else {
2809 				ep2->cached_state |= mark;
2810 				indent();
2811 				if (mark == CREDIBLE_EFFECT)
2812 					out(O_ALTFP|O_VERB|O_NONL,
2813 					    "  EFFECTS CREDIBLE ");
2814 				else
2815 					out(O_ALTFP|O_VERB|O_NONL,
2816 					    "  PARENT EFFECTS WAIT ");
2817 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2818 				out(O_ALTFP|O_VERB, NULL);
2819 				indent_push("  E");
2820 				if (mark_arrows(fmep, ep2, mark, at_latest_by,
2821 				    &my_delay, 0) == WAIT_EFFECT) {
2822 					retval = WAIT_EFFECT;
2823 					if (overall_delay > my_delay)
2824 						overall_delay = my_delay;
2825 				}
2826 				indent_pop();
2827 			}
2828 		}
2829 	}
2830 	if (retval == WAIT_EFFECT)
2831 		*pdelay = overall_delay;
2832 	return (retval);
2833 }
2834 
2835 static enum fme_state
2836 effects_test(struct fme *fmep, struct event *fault_event,
2837     unsigned long long at_latest_by, unsigned long long *pdelay)
2838 {
2839 	struct event *error_event;
2840 	enum fme_state return_value = FME_CREDIBLE;
2841 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
2842 	unsigned long long my_delay;
2843 
2844 	stats_counter_bump(fmep->Ecallcount);
2845 	indent_push("  E");
2846 	indent();
2847 	out(O_ALTFP|O_VERB|O_NONL, "->");
2848 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
2849 	out(O_ALTFP|O_VERB, NULL);
2850 
2851 	(void) mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by,
2852 	    &my_delay, 0);
2853 	for (error_event = fmep->observations;
2854 	    error_event; error_event = error_event->observations) {
2855 		indent();
2856 		out(O_ALTFP|O_VERB|O_NONL, " ");
2857 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event);
2858 		if (!(error_event->cached_state & CREDIBLE_EFFECT)) {
2859 			if (error_event->cached_state &
2860 			    (PARENT_WAIT|WAIT_EFFECT)) {
2861 				return_value = FME_WAIT;
2862 				if (overall_delay > my_delay)
2863 					overall_delay = my_delay;
2864 				out(O_ALTFP|O_VERB, " NOT YET triggered");
2865 				continue;
2866 			}
2867 			return_value = FME_DISPROVED;
2868 			out(O_ALTFP|O_VERB, " NOT triggered");
2869 			break;
2870 		} else {
2871 			out(O_ALTFP|O_VERB, " triggered");
2872 		}
2873 	}
2874 	if (return_value == FME_DISPROVED) {
2875 		(void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0);
2876 	} else {
2877 		fault_event->keep_in_tree = 1;
2878 		(void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1);
2879 	}
2880 
2881 	indent();
2882 	out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ",
2883 	    fme_state2str(return_value));
2884 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
2885 	out(O_ALTFP|O_VERB, NULL);
2886 	indent_pop();
2887 	if (return_value == FME_WAIT)
2888 		*pdelay = overall_delay;
2889 	return (return_value);
2890 }
2891 
2892 static enum fme_state
2893 requirements_test(struct fme *fmep, struct event *ep,
2894     unsigned long long at_latest_by, unsigned long long *pdelay)
2895 {
2896 	int waiting_events;
2897 	int credible_events;
2898 	int deferred_events;
2899 	enum fme_state return_value = FME_CREDIBLE;
2900 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
2901 	unsigned long long arrow_delay;
2902 	unsigned long long my_delay;
2903 	struct event *ep2;
2904 	struct bubble *bp;
2905 	struct arrowlist *ap;
2906 
2907 	if (ep->cached_state & REQMNTS_CREDIBLE) {
2908 		indent();
2909 		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY CREDIBLE ");
2910 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2911 		out(O_ALTFP|O_VERB, NULL);
2912 		return (FME_CREDIBLE);
2913 	}
2914 	if (ep->cached_state & REQMNTS_DISPROVED) {
2915 		indent();
2916 		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY DISPROVED ");
2917 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2918 		out(O_ALTFP|O_VERB, NULL);
2919 		return (FME_DISPROVED);
2920 	}
2921 	if (ep->cached_state & REQMNTS_WAIT) {
2922 		indent();
2923 		*pdelay = ep->cached_delay;
2924 		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY WAIT ");
2925 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2926 		out(O_ALTFP|O_VERB|O_NONL, ", wait for: ");
2927 		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
2928 		out(O_ALTFP|O_VERB, NULL);
2929 		return (FME_WAIT);
2930 	}
2931 	stats_counter_bump(fmep->Rcallcount);
2932 	indent_push("  R");
2933 	indent();
2934 	out(O_ALTFP|O_VERB|O_NONL, "->");
2935 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2936 	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
2937 	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
2938 	out(O_ALTFP|O_VERB, NULL);
2939 
2940 	if (ep->t == N_EREPORT) {
2941 		if (ep->count == 0) {
2942 			if (fmep->pull >= at_latest_by) {
2943 				return_value = FME_DISPROVED;
2944 			} else {
2945 				ep->cached_delay = *pdelay = at_latest_by;
2946 				return_value = FME_WAIT;
2947 			}
2948 		}
2949 
2950 		indent();
2951 		switch (return_value) {
2952 		case FME_CREDIBLE:
2953 			ep->cached_state |= REQMNTS_CREDIBLE;
2954 			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE ");
2955 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2956 			break;
2957 		case FME_DISPROVED:
2958 			ep->cached_state |= REQMNTS_DISPROVED;
2959 			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED ");
2960 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2961 			break;
2962 		case FME_WAIT:
2963 			ep->cached_state |= REQMNTS_WAIT;
2964 			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT ");
2965 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2966 			out(O_ALTFP|O_VERB|O_NONL, " to ");
2967 			ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
2968 			break;
2969 		default:
2970 			out(O_DIE, "requirements_test: unexpected fme_state");
2971 			break;
2972 		}
2973 		out(O_ALTFP|O_VERB, NULL);
2974 		indent_pop();
2975 
2976 		return (return_value);
2977 	}
2978 
2979 	/* this event is not a report, descend the tree */
2980 	for (bp = itree_next_bubble(ep, NULL); bp;
2981 	    bp = itree_next_bubble(ep, bp)) {
2982 		int n;
2983 
2984 		if (bp->t != B_FROM)
2985 			continue;
2986 
2987 		n = bp->nork;
2988 
2989 		credible_events = 0;
2990 		waiting_events = 0;
2991 		deferred_events = 0;
2992 		arrow_delay = TIMEVAL_EVENTUALLY;
2993 		/*
2994 		 * n is -1 for 'A' so adjust it.
2995 		 * XXX just count up the arrows for now.
2996 		 */
2997 		if (n < 0) {
2998 			n = 0;
2999 			for (ap = itree_next_arrow(bp, NULL); ap;
3000 			    ap = itree_next_arrow(bp, ap))
3001 				n++;
3002 			indent();
3003 			out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n);
3004 		} else {
3005 			indent();
3006 			out(O_ALTFP|O_VERB, " Bubble N=%d", n);
3007 		}
3008 
3009 		if (n == 0)
3010 			continue;
3011 		if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) {
3012 			for (ap = itree_next_arrow(bp, NULL); ap;
3013 			    ap = itree_next_arrow(bp, ap)) {
3014 				ep2 = ap->arrowp->head->myevent;
3015 				platform_set_payloadnvp(ep2->nvp);
3016 				if (checkconstraints(fmep, ap->arrowp) == 0) {
3017 					/*
3018 					 * if any arrow is invalidated by the
3019 					 * constraints, then we should elide the
3020 					 * whole bubble to be consistant with
3021 					 * the tree creation time behaviour
3022 					 */
3023 					bp->mark |= BUBBLE_ELIDED;
3024 					platform_set_payloadnvp(NULL);
3025 					break;
3026 				}
3027 				platform_set_payloadnvp(NULL);
3028 			}
3029 		}
3030 		if (bp->mark & BUBBLE_ELIDED)
3031 			continue;
3032 		bp->mark |= BUBBLE_OK;
3033 		for (ap = itree_next_arrow(bp, NULL); ap;
3034 		    ap = itree_next_arrow(bp, ap)) {
3035 			ep2 = ap->arrowp->head->myevent;
3036 			if (n <= credible_events)
3037 				break;
3038 
3039 			ap->arrowp->mark |= REQMNTS_COUNTER;
3040 			if (triggered(fmep, ep2, REQMNTS_COUNTER))
3041 				/* XXX adding max timevals! */
3042 				switch (requirements_test(fmep, ep2,
3043 				    at_latest_by + ap->arrowp->maxdelay,
3044 				    &my_delay)) {
3045 				case FME_DEFERRED:
3046 					deferred_events++;
3047 					break;
3048 				case FME_CREDIBLE:
3049 					credible_events++;
3050 					break;
3051 				case FME_DISPROVED:
3052 					break;
3053 				case FME_WAIT:
3054 					if (my_delay < arrow_delay)
3055 						arrow_delay = my_delay;
3056 					waiting_events++;
3057 					break;
3058 				default:
3059 					out(O_DIE,
3060 					"Bug in requirements_test.");
3061 				}
3062 			else
3063 				deferred_events++;
3064 		}
3065 		indent();
3066 		out(O_ALTFP|O_VERB, " Credible: %d Waiting %d",
3067 		    credible_events + deferred_events, waiting_events);
3068 		if (credible_events + deferred_events + waiting_events < n) {
3069 			/* Can never meet requirements */
3070 			ep->cached_state |= REQMNTS_DISPROVED;
3071 			indent();
3072 			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED ");
3073 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3074 			out(O_ALTFP|O_VERB, NULL);
3075 			indent_pop();
3076 			return (FME_DISPROVED);
3077 		}
3078 		if (credible_events + deferred_events < n) {
3079 			/* will have to wait */
3080 			/* wait time is shortest known */
3081 			if (arrow_delay < overall_delay)
3082 				overall_delay = arrow_delay;
3083 			return_value = FME_WAIT;
3084 		} else if (credible_events < n) {
3085 			if (return_value != FME_WAIT)
3086 				return_value = FME_DEFERRED;
3087 		}
3088 	}
3089 
3090 	/*
3091 	 * don't mark as FME_DEFERRED. If this event isn't reached by another
3092 	 * path, then this will be considered FME_CREDIBLE. But if it is
3093 	 * reached by a different path so the K-count is met, then might
3094 	 * get overridden by FME_WAIT or FME_DISPROVED.
3095 	 */
3096 	if (return_value == FME_WAIT) {
3097 		ep->cached_state |= REQMNTS_WAIT;
3098 		ep->cached_delay = *pdelay = overall_delay;
3099 	} else if (return_value == FME_CREDIBLE) {
3100 		ep->cached_state |= REQMNTS_CREDIBLE;
3101 	}
3102 	indent();
3103 	out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ",
3104 	    fme_state2str(return_value));
3105 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3106 	out(O_ALTFP|O_VERB, NULL);
3107 	indent_pop();
3108 	return (return_value);
3109 }
3110 
3111 static enum fme_state
3112 causes_test(struct fme *fmep, struct event *ep,
3113     unsigned long long at_latest_by, unsigned long long *pdelay)
3114 {
3115 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
3116 	unsigned long long my_delay;
3117 	int credible_results = 0;
3118 	int waiting_results = 0;
3119 	enum fme_state fstate;
3120 	struct event *tail_event;
3121 	struct bubble *bp;
3122 	struct arrowlist *ap;
3123 	int k = 1;
3124 
3125 	stats_counter_bump(fmep->Ccallcount);
3126 	indent_push("  C");
3127 	indent();
3128 	out(O_ALTFP|O_VERB|O_NONL, "->");
3129 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3130 	out(O_ALTFP|O_VERB, NULL);
3131 
3132 	for (bp = itree_next_bubble(ep, NULL); bp;
3133 	    bp = itree_next_bubble(ep, bp)) {
3134 		if (bp->t != B_TO)
3135 			continue;
3136 		k = bp->nork;	/* remember the K value */
3137 		for (ap = itree_next_arrow(bp, NULL); ap;
3138 		    ap = itree_next_arrow(bp, ap)) {
3139 			int do_not_follow = 0;
3140 
3141 			/*
3142 			 * if we get to the same event multiple times
3143 			 * only worry about the first one.
3144 			 */
3145 			if (ap->arrowp->tail->myevent->cached_state &
3146 			    CAUSES_TESTED) {
3147 				indent();
3148 				out(O_ALTFP|O_VERB|O_NONL,
3149 				    "  causes test already run for ");
3150 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
3151 				    ap->arrowp->tail->myevent);
3152 				out(O_ALTFP|O_VERB, NULL);
3153 				continue;
3154 			}
3155 
3156 			/*
3157 			 * see if false constraint prevents us
3158 			 * from traversing this arrow
3159 			 */
3160 			platform_set_payloadnvp(ep->nvp);
3161 			if (checkconstraints(fmep, ap->arrowp) == 0)
3162 				do_not_follow = 1;
3163 			platform_set_payloadnvp(NULL);
3164 			if (do_not_follow) {
3165 				indent();
3166 				out(O_ALTFP|O_VERB|O_NONL,
3167 				    "  False arrow from ");
3168 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
3169 				    ap->arrowp->tail->myevent);
3170 				out(O_ALTFP|O_VERB, NULL);
3171 				continue;
3172 			}
3173 
3174 			ap->arrowp->tail->myevent->cached_state |=
3175 			    CAUSES_TESTED;
3176 			tail_event = ap->arrowp->tail->myevent;
3177 			fstate = hypothesise(fmep, tail_event, at_latest_by,
3178 			    &my_delay);
3179 
3180 			switch (fstate) {
3181 			case FME_WAIT:
3182 				if (my_delay < overall_delay)
3183 					overall_delay = my_delay;
3184 				waiting_results++;
3185 				break;
3186 			case FME_CREDIBLE:
3187 				credible_results++;
3188 				break;
3189 			case FME_DISPROVED:
3190 				break;
3191 			default:
3192 				out(O_DIE, "Bug in causes_test");
3193 			}
3194 		}
3195 	}
3196 	/* compare against K */
3197 	if (credible_results + waiting_results < k) {
3198 		indent();
3199 		out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED ");
3200 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3201 		out(O_ALTFP|O_VERB, NULL);
3202 		indent_pop();
3203 		return (FME_DISPROVED);
3204 	}
3205 	if (waiting_results != 0) {
3206 		*pdelay = overall_delay;
3207 		indent();
3208 		out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT ");
3209 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3210 		out(O_ALTFP|O_VERB|O_NONL, " to ");
3211 		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
3212 		out(O_ALTFP|O_VERB, NULL);
3213 		indent_pop();
3214 		return (FME_WAIT);
3215 	}
3216 	indent();
3217 	out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES CREDIBLE ");
3218 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3219 	out(O_ALTFP|O_VERB, NULL);
3220 	indent_pop();
3221 	return (FME_CREDIBLE);
3222 }
3223 
3224 static enum fme_state
3225 hypothesise(struct fme *fmep, struct event *ep,
3226 	unsigned long long at_latest_by, unsigned long long *pdelay)
3227 {
3228 	enum fme_state rtr, otr;
3229 	unsigned long long my_delay;
3230 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
3231 
3232 	stats_counter_bump(fmep->Hcallcount);
3233 	indent_push("  H");
3234 	indent();
3235 	out(O_ALTFP|O_VERB|O_NONL, "->");
3236 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3237 	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
3238 	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
3239 	out(O_ALTFP|O_VERB, NULL);
3240 
3241 	rtr = requirements_test(fmep, ep, at_latest_by, &my_delay);
3242 	if ((rtr == FME_WAIT) && (my_delay < overall_delay))
3243 		overall_delay = my_delay;
3244 	if (rtr != FME_DISPROVED) {
3245 		if (is_problem(ep->t)) {
3246 			otr = effects_test(fmep, ep, at_latest_by, &my_delay);
3247 			if (otr != FME_DISPROVED) {
3248 				if (fmep->peek == 0 && ep->is_suspect++ == 0) {
3249 					ep->suspects = fmep->suspects;
3250 					fmep->suspects = ep;
3251 					fmep->nsuspects++;
3252 					if (!is_fault(ep->t))
3253 						fmep->nonfault++;
3254 				}
3255 			}
3256 		} else
3257 			otr = causes_test(fmep, ep, at_latest_by, &my_delay);
3258 		if ((otr == FME_WAIT) && (my_delay < overall_delay))
3259 			overall_delay = my_delay;
3260 		if ((otr != FME_DISPROVED) &&
3261 		    ((rtr == FME_WAIT) || (otr == FME_WAIT)))
3262 			*pdelay = overall_delay;
3263 	}
3264 	if (rtr == FME_DISPROVED) {
3265 		indent();
3266 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
3267 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3268 		out(O_ALTFP|O_VERB, " (doesn't meet requirements)");
3269 		indent_pop();
3270 		return (FME_DISPROVED);
3271 	}
3272 	if ((otr == FME_DISPROVED) && is_problem(ep->t)) {
3273 		indent();
3274 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
3275 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3276 		out(O_ALTFP|O_VERB, " (doesn't explain all reports)");
3277 		indent_pop();
3278 		return (FME_DISPROVED);
3279 	}
3280 	if (otr == FME_DISPROVED) {
3281 		indent();
3282 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
3283 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3284 		out(O_ALTFP|O_VERB, " (causes are not credible)");
3285 		indent_pop();
3286 		return (FME_DISPROVED);
3287 	}
3288 	if ((rtr == FME_WAIT) || (otr == FME_WAIT)) {
3289 		indent();
3290 		out(O_ALTFP|O_VERB|O_NONL, "<-WAIT ");
3291 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3292 		out(O_ALTFP|O_VERB|O_NONL, " to ");
3293 		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay);
3294 		out(O_ALTFP|O_VERB, NULL);
3295 		indent_pop();
3296 		return (FME_WAIT);
3297 	}
3298 	indent();
3299 	out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE ");
3300 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3301 	out(O_ALTFP|O_VERB, NULL);
3302 	indent_pop();
3303 	return (FME_CREDIBLE);
3304 }
3305 
3306 /*
3307  * fme_istat_load -- reconstitute any persistent istats
3308  */
3309 void
3310 fme_istat_load(fmd_hdl_t *hdl)
3311 {
3312 	int sz;
3313 	char *sbuf;
3314 	char *ptr;
3315 
3316 	if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) {
3317 		out(O_ALTFP, "fme_istat_load: No stats");
3318 		return;
3319 	}
3320 
3321 	sbuf = alloca(sz);
3322 
3323 	fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz);
3324 
3325 	/*
3326 	 * pick apart the serialized stats
3327 	 *
3328 	 * format is:
3329 	 *	<class-name>, '@', <path>, '\0', <value>, '\0'
3330 	 * for example:
3331 	 *	"stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0"
3332 	 *
3333 	 * since this is parsing our own serialized data, any parsing issues
3334 	 * are fatal, so we check for them all with ASSERT() below.
3335 	 */
3336 	ptr = sbuf;
3337 	while (ptr < &sbuf[sz]) {
3338 		char *sepptr;
3339 		struct node *np;
3340 		int val;
3341 
3342 		sepptr = strchr(ptr, '@');
3343 		ASSERT(sepptr != NULL);
3344 		*sepptr = '\0';
3345 
3346 		/* construct the event */
3347 		np = newnode(T_EVENT, NULL, 0);
3348 		np->u.event.ename = newnode(T_NAME, NULL, 0);
3349 		np->u.event.ename->u.name.t = N_STAT;
3350 		np->u.event.ename->u.name.s = stable(ptr);
3351 		np->u.event.ename->u.name.it = IT_ENAME;
3352 		np->u.event.ename->u.name.last = np->u.event.ename;
3353 
3354 		ptr = sepptr + 1;
3355 		ASSERT(ptr < &sbuf[sz]);
3356 		ptr += strlen(ptr);
3357 		ptr++;	/* move past the '\0' separating path from value */
3358 		ASSERT(ptr < &sbuf[sz]);
3359 		ASSERT(isdigit(*ptr));
3360 		val = atoi(ptr);
3361 		ASSERT(val > 0);
3362 		ptr += strlen(ptr);
3363 		ptr++;	/* move past the final '\0' for this entry */
3364 
3365 		np->u.event.epname = pathstring2epnamenp(sepptr + 1);
3366 		ASSERT(np->u.event.epname != NULL);
3367 
3368 		istat_bump(np, val);
3369 		tree_free(np);
3370 	}
3371 
3372 	istat_save();
3373 }
3374