xref: /titanic_41/usr/src/cmd/fm/modules/common/eversholt/fme.c (revision 65d0d3dc14f60cfaf85c61a0d1e3e9f34e997781)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  *
26  * fme.c -- fault management exercise module
27  *
28  * this module provides the simulated fault management exercise.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <strings.h>
37 #include <ctype.h>
38 #include <alloca.h>
39 #include <libnvpair.h>
40 #include <sys/fm/protocol.h>
41 #include <fm/fmd_api.h>
42 #include "alloc.h"
43 #include "out.h"
44 #include "stats.h"
45 #include "stable.h"
46 #include "literals.h"
47 #include "lut.h"
48 #include "tree.h"
49 #include "ptree.h"
50 #include "itree.h"
51 #include "ipath.h"
52 #include "fme.h"
53 #include "evnv.h"
54 #include "eval.h"
55 #include "config.h"
56 #include "platform.h"
57 
/* imported from eft.c... */
extern char *Autoclose;
extern int Dupclose;
extern hrtime_t Hesitate;
/* allocator handle passed to nvlist_xpack()/nvlist_xunpack() below */
extern nv_alloc_t Eft_nv_hdl;
/* when non-zero, the number of open FMEs at which overflow handling starts */
extern int Max_fme;
extern fmd_hdl_t *Hdl;

static int Istat_need_save;

/* fme under construction is global so we can free it on module abort */
static struct fme *Nfmep;

/*
 * UD_* reason recorded at the point a diagnosis attempt fails; it is
 * attached to the undiagnosable defect when one is published.
 */
static const char *Undiag_reason;

/* next FME id to hand out; fme_restart() bumps it past any restored id */
static int Nextid = 0;

static int Open_fme_count = 0;	/* Count of open FMEs */
76 
/*
 * list of fault management exercises underway
 *
 * Three list heads are declared with the structure:
 *	FMElist		head of the list of FMEs; fme_ready() appends here
 *	EFMElist	tail of that same list (last FME appended)
 *	ClosedFMEs	FMEs waiting to be destroyed; the list is drained by
 *			fme_fini() and by fme_receive_report()
 */
static struct fme {
	struct fme *next;		/* next exercise */
	unsigned long long ull;		/* time when fme was created */
	int id;				/* FME id */
	struct cfgdata *cfgdata;	/* full configuration data */
	struct lut *eventtree;		/* propagation tree for this FME */
	/*
	 * The initial error report that created this FME is kept in
	 * two forms.  e0 points to the instance tree node and is used
	 * by fme_eval() as the starting point for the inference
	 * algorithm.  e0r is the event handle FMD passed to us when
	 * the ereport first arrived and is used when setting timers,
	 * which are always relative to the time of this initial
	 * report.
	 */
	struct event *e0;
	fmd_event_t *e0r;

	id_t    timer;			/* for setting an fmd time-out */
	id_t	htid;			/* for setting hesitation timer */

	struct event *ecurrent;		/* ereport under consideration */
	struct event *suspects;		/* current suspect list */
	struct event *psuspects;	/* previous suspect list */
	int nsuspects;			/* count of suspects */
	int nonfault;			/* zero if all suspects T_FAULT */
	int posted_suspects;		/* true if we've posted a diagnosis */
	int hesitated;			/* true if we hesitated */
	/* uniqobs is also the index used to name the next "observed%d" buf */
	int uniqobs;			/* number of unique events observed */
	int peek;			/* just peeking, don't track suspects */
	int overflow;			/* true if overflow FME */
	enum fme_state {
		FME_NOTHING = 5000,	/* not evaluated yet */
		FME_WAIT,		/* need to wait for more info */
		FME_CREDIBLE,		/* suspect list is credible */
		FME_DISPROVED,		/* no valid suspects found */
		FME_DEFERRED		/* don't know yet (k-count not met) */
	} state;

	unsigned long long pull;	/* time passed since created */
	unsigned long long wull;	/* wait until this time for re-eval */
	/* chained through each event's own "observations" member */
	struct event *observations;	/* observation list */
	struct lut *globals;		/* values of global variables */
	/* fmd interfacing */
	fmd_hdl_t *hdl;			/* handle for talking with fmd */
	fmd_case_t *fmcase;		/* what fmd 'case' we associate with */
	/* stats -- created in fme_ready(), deleted in destroy_fme() */
	struct stats *Rcount;
	struct stats *Hcallcount;
	struct stats *Rcallcount;
	struct stats *Ccallcount;
	struct stats *Ecallcount;
	struct stats *Tcallcount;
	struct stats *Marrowcount;
	struct stats *diags;
} *FMElist, *EFMElist, *ClosedFMEs;
134 
/*
 * Singly-linked list of fmd cases that could not be restarted.
 * fme_restart() pushes new entries on the head of Undiagablecaselist
 * and fme_fini() frees the list nodes.
 */
static struct case_list {
	fmd_case_t *fmcase;
	struct case_list *next;
} *Undiagablecaselist;
139 
/*
 * Forward declarations of file-local routines that are referenced
 * before the point at which they are defined.
 */
static void fme_eval(struct fme *fmep, fmd_event_t *ffep);
static enum fme_state hypothesise(struct fme *fmep, struct event *ep,
	unsigned long long at_latest_by, unsigned long long *pdelay);
static struct node *eventprop_lookup(struct event *ep, const char *propname);
static struct node *pathstring2epnamenp(char *path);
static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep);
static void restore_suspects(struct fme *fmep);
static void save_suspects(struct fme *fmep);
static void destroy_fme(struct fme *f);
static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
    const char *eventstring, const struct ipath *ipp, nvlist_t *nvl);
151 
152 static struct fme *
153 alloc_fme(void)
154 {
155 	struct fme *fmep;
156 
157 	fmep = MALLOC(sizeof (*fmep));
158 	bzero(fmep, sizeof (*fmep));
159 	return (fmep);
160 }
161 
162 /*
163  * fme_ready -- called when all initialization of the FME (except for
164  *	stats) has completed successfully.  Adds the fme to global lists
165  *	and establishes its stats.
166  */
167 static struct fme *
168 fme_ready(struct fme *fmep)
169 {
170 	char nbuf[100];
171 
172 	Nfmep = NULL;	/* don't need to free this on module abort now */
173 
174 	if (EFMElist) {
175 		EFMElist->next = fmep;
176 		EFMElist = fmep;
177 	} else
178 		FMElist = EFMElist = fmep;
179 
180 	(void) sprintf(nbuf, "fme%d.Rcount", fmep->id);
181 	fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
182 	(void) sprintf(nbuf, "fme%d.Hcall", fmep->id);
183 	fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1);
184 	(void) sprintf(nbuf, "fme%d.Rcall", fmep->id);
185 	fmep->Rcallcount = stats_new_counter(nbuf,
186 	    "calls to requirements_test()", 1);
187 	(void) sprintf(nbuf, "fme%d.Ccall", fmep->id);
188 	fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1);
189 	(void) sprintf(nbuf, "fme%d.Ecall", fmep->id);
190 	fmep->Ecallcount =
191 	    stats_new_counter(nbuf, "calls to effects_test()", 1);
192 	(void) sprintf(nbuf, "fme%d.Tcall", fmep->id);
193 	fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
194 	(void) sprintf(nbuf, "fme%d.Marrow", fmep->id);
195 	fmep->Marrowcount = stats_new_counter(nbuf,
196 	    "arrows marked by mark_arrows()", 1);
197 	(void) sprintf(nbuf, "fme%d.diags", fmep->id);
198 	fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);
199 
200 	out(O_ALTFP|O_VERB2, "newfme: config snapshot contains...");
201 	config_print(O_ALTFP|O_VERB2, fmep->cfgdata->cooked);
202 
203 	return (fmep);
204 }
205 
206 static struct fme *
207 newfme(const char *e0class, const struct ipath *e0ipp)
208 {
209 	struct cfgdata *cfgdata;
210 
211 	if ((cfgdata = config_snapshot()) == NULL) {
212 		out(O_ALTFP, "newfme: NULL configuration");
213 		Undiag_reason = UD_NOCONF;
214 		return (NULL);
215 	}
216 
217 	Nfmep = alloc_fme();
218 
219 	Nfmep->id = Nextid++;
220 	Nfmep->cfgdata = cfgdata;
221 	Nfmep->posted_suspects = 0;
222 	Nfmep->uniqobs = 0;
223 	Nfmep->state = FME_NOTHING;
224 	Nfmep->pull = 0ULL;
225 	Nfmep->overflow = 0;
226 
227 	Nfmep->fmcase = NULL;
228 	Nfmep->hdl = NULL;
229 
230 	if ((Nfmep->eventtree = itree_create(cfgdata->cooked)) == NULL) {
231 		out(O_ALTFP, "newfme: NULL instance tree");
232 		Undiag_reason = UD_INSTFAIL;
233 		config_free(cfgdata);
234 		FREE(Nfmep);
235 		Nfmep = NULL;
236 		return (NULL);
237 	}
238 
239 	itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree);
240 
241 	if ((Nfmep->e0 =
242 	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
243 		out(O_ALTFP, "newfme: e0 not in instance tree");
244 		Undiag_reason = UD_BADEVENTI;
245 		itree_free(Nfmep->eventtree);
246 		config_free(cfgdata);
247 		FREE(Nfmep);
248 		Nfmep = NULL;
249 		return (NULL);
250 	}
251 
252 	return (fme_ready(Nfmep));
253 }
254 
255 void
256 fme_fini(void)
257 {
258 	struct fme *sfp, *fp;
259 	struct case_list *ucasep, *nextcasep;
260 
261 	ucasep = Undiagablecaselist;
262 	while (ucasep != NULL) {
263 		nextcasep = ucasep->next;
264 		FREE(ucasep);
265 		ucasep = nextcasep;
266 	}
267 	Undiagablecaselist = NULL;
268 
269 	/* clean up closed fmes */
270 	fp = ClosedFMEs;
271 	while (fp != NULL) {
272 		sfp = fp->next;
273 		destroy_fme(fp);
274 		fp = sfp;
275 	}
276 	ClosedFMEs = NULL;
277 
278 	fp = FMElist;
279 	while (fp != NULL) {
280 		sfp = fp->next;
281 		destroy_fme(fp);
282 		fp = sfp;
283 	}
284 	FMElist = EFMElist = NULL;
285 
286 	/* if we were in the middle of creating an fme, free it now */
287 	if (Nfmep) {
288 		destroy_fme(Nfmep);
289 		Nfmep = NULL;
290 	}
291 }
292 
/*
 * Allocated space for a buffer name.  The longest name built in this
 * file is "observed%d.nvp": 12 fixed characters plus the terminating
 * NUL leave room for 7 digits, so 20 bytes allows for
 * a ridiculous 9,999,999 unique observations.
 */
#define	OBBUFNMSZ 20
298 
299 /*
300  *  serialize_observation
301  *
302  *  Create a recoverable version of the current observation
303  *  (f->ecurrent).  We keep a serialized version of each unique
304  *  observation in order that we may resume correctly the fme in the
305  *  correct state if eft or fmd crashes and we're restarted.
306  */
307 static void
308 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp)
309 {
310 	size_t pkdlen;
311 	char tmpbuf[OBBUFNMSZ];
312 	char *pkd = NULL;
313 	char *estr;
314 
315 	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs);
316 	estr = ipath2str(cls, ipp);
317 	fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1);
318 	fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr,
319 	    strlen(estr) + 1);
320 	FREE(estr);
321 
322 	if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) {
323 		(void) snprintf(tmpbuf,
324 		    OBBUFNMSZ, "observed%d.nvp", fp->uniqobs);
325 		if (nvlist_xpack(fp->ecurrent->nvp,
326 		    &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0)
327 			out(O_DIE|O_SYS, "pack of observed nvl failed");
328 		fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen);
329 		fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen);
330 		FREE(pkd);
331 	}
332 
333 	fp->uniqobs++;
334 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
335 	    sizeof (fp->uniqobs));
336 }
337 
338 /*
339  *  init_fme_bufs -- We keep several bits of state about an fme for
340  *	use if eft or fmd crashes and we're restarted.
341  */
342 static void
343 init_fme_bufs(struct fme *fp)
344 {
345 	size_t cfglen = fp->cfgdata->nextfree - fp->cfgdata->begin;
346 
347 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_CFGLEN, sizeof (cfglen));
348 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_CFGLEN, (void *)&cfglen,
349 	    sizeof (cfglen));
350 	if (cfglen != 0) {
351 		fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_CFG, cfglen);
352 		fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_CFG,
353 		    fp->cfgdata->begin, cfglen);
354 	}
355 
356 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull));
357 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull,
358 	    sizeof (fp->pull));
359 
360 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id));
361 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id,
362 	    sizeof (fp->id));
363 
364 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs));
365 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
366 	    sizeof (fp->uniqobs));
367 
368 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD,
369 	    sizeof (fp->posted_suspects));
370 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD,
371 	    (void *)&fp->posted_suspects, sizeof (fp->posted_suspects));
372 }
373 
374 static void
375 destroy_fme_bufs(struct fme *fp)
376 {
377 	char tmpbuf[OBBUFNMSZ];
378 	int o;
379 
380 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN);
381 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG);
382 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL);
383 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID);
384 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD);
385 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS);
386 
387 	for (o = 0; o < fp->uniqobs; o++) {
388 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o);
389 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
390 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o);
391 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
392 	}
393 }
394 
395 /*
396  * reconstitute_observations -- convert a case's serialized observations
397  *	back into struct events.  Returns zero if all observations are
398  *	successfully reconstituted.
399  */
400 static int
401 reconstitute_observations(struct fme *fmep)
402 {
403 	struct event *ep;
404 	struct node *epnamenp = NULL;
405 	size_t pkdlen;
406 	char *pkd = NULL;
407 	char *tmpbuf = alloca(OBBUFNMSZ);
408 	char *sepptr;
409 	char *estr;
410 	int ocnt;
411 	int elen;
412 
413 	for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) {
414 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt);
415 		elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
416 		if (elen == 0) {
417 			out(O_ALTFP,
418 			    "reconstitute_observation: no %s buffer found.",
419 			    tmpbuf);
420 			Undiag_reason = UD_MISSINGOBS;
421 			break;
422 		}
423 
424 		estr = MALLOC(elen);
425 		fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
426 		sepptr = strchr(estr, '@');
427 		if (sepptr == NULL) {
428 			out(O_ALTFP,
429 			    "reconstitute_observation: %s: "
430 			    "missing @ separator in %s.",
431 			    tmpbuf, estr);
432 			Undiag_reason = UD_MISSINGPATH;
433 			FREE(estr);
434 			break;
435 		}
436 
437 		*sepptr = '\0';
438 		if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
439 			out(O_ALTFP,
440 			    "reconstitute_observation: %s: "
441 			    "trouble converting path string \"%s\" "
442 			    "to internal representation.",
443 			    tmpbuf, sepptr + 1);
444 			Undiag_reason = UD_MISSINGPATH;
445 			FREE(estr);
446 			break;
447 		}
448 
449 		/* construct the event */
450 		ep = itree_lookup(fmep->eventtree,
451 		    stable(estr), ipath(epnamenp));
452 		if (ep == NULL) {
453 			out(O_ALTFP,
454 			    "reconstitute_observation: %s: "
455 			    "lookup of  \"%s\" in itree failed.",
456 			    tmpbuf, ipath2str(estr, ipath(epnamenp)));
457 			Undiag_reason = UD_BADOBS;
458 			tree_free(epnamenp);
459 			FREE(estr);
460 			break;
461 		}
462 		tree_free(epnamenp);
463 
464 		/*
465 		 * We may or may not have a saved nvlist for the observation
466 		 */
467 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt);
468 		pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
469 		if (pkdlen != 0) {
470 			pkd = MALLOC(pkdlen);
471 			fmd_buf_read(fmep->hdl,
472 			    fmep->fmcase, tmpbuf, pkd, pkdlen);
473 			ASSERT(ep->nvp == NULL);
474 			if (nvlist_xunpack(pkd,
475 			    pkdlen, &ep->nvp, &Eft_nv_hdl) != 0)
476 				out(O_DIE|O_SYS, "pack of observed nvl failed");
477 			FREE(pkd);
478 		}
479 
480 		if (ocnt == 0)
481 			fmep->e0 = ep;
482 
483 		FREE(estr);
484 		fmep->ecurrent = ep;
485 		ep->count++;
486 
487 		/* link it into list of observations seen */
488 		ep->observations = fmep->observations;
489 		fmep->observations = ep;
490 	}
491 
492 	if (ocnt == fmep->uniqobs) {
493 		(void) fme_ready(fmep);
494 		return (0);
495 	}
496 
497 	return (1);
498 }
499 
500 /*
501  * restart_fme -- called during eft initialization.  Reconstitutes
502  *	an in-progress fme.
503  */
504 void
505 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress)
506 {
507 	nvlist_t *defect;
508 	struct case_list *bad;
509 	struct fme *fmep;
510 	struct cfgdata *cfgdata = NULL;
511 	size_t rawsz;
512 
513 	fmep = alloc_fme();
514 	fmep->fmcase = inprogress;
515 	fmep->hdl = hdl;
516 
517 	if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) {
518 		out(O_ALTFP, "restart_fme: No config data");
519 		Undiag_reason = UD_MISSINGINFO;
520 		goto badcase;
521 	}
522 	fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz,
523 	    sizeof (size_t));
524 
525 	if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) {
526 		out(O_ALTFP, "restart_fme: No event zero");
527 		Undiag_reason = UD_MISSINGZERO;
528 		goto badcase;
529 	}
530 
531 	cfgdata = MALLOC(sizeof (struct cfgdata));
532 	cfgdata->cooked = NULL;
533 	cfgdata->devcache = NULL;
534 	cfgdata->cpucache = NULL;
535 	cfgdata->refcnt = 1;
536 
537 	if (rawsz > 0) {
538 		if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) {
539 			out(O_ALTFP, "restart_fme: Config data size mismatch");
540 			Undiag_reason = UD_CFGMISMATCH;
541 			goto badcase;
542 		}
543 		cfgdata->begin = MALLOC(rawsz);
544 		cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz;
545 		fmd_buf_read(hdl,
546 		    inprogress, WOBUF_CFG, cfgdata->begin, rawsz);
547 	} else {
548 		cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL;
549 	}
550 	fmep->cfgdata = cfgdata;
551 
552 	config_cook(cfgdata);
553 	if ((fmep->eventtree = itree_create(cfgdata->cooked)) == NULL) {
554 		/* case not properly saved or irretrievable */
555 		out(O_ALTFP, "restart_fme: NULL instance tree");
556 		Undiag_reason = UD_INSTFAIL;
557 		goto badcase;
558 	}
559 
560 	itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree);
561 
562 	if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) {
563 		out(O_ALTFP, "restart_fme: no saved wait time");
564 		Undiag_reason = UD_MISSINGINFO;
565 		goto badcase;
566 	} else {
567 		fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull,
568 		    sizeof (fmep->pull));
569 	}
570 
571 	if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) {
572 		out(O_ALTFP, "restart_fme: no saved posted status");
573 		Undiag_reason = UD_MISSINGINFO;
574 		goto badcase;
575 	} else {
576 		fmd_buf_read(hdl, inprogress, WOBUF_POSTD,
577 		    (void *)&fmep->posted_suspects,
578 		    sizeof (fmep->posted_suspects));
579 	}
580 
581 	if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) {
582 		out(O_ALTFP, "restart_fme: no saved id");
583 		Undiag_reason = UD_MISSINGINFO;
584 		goto badcase;
585 	} else {
586 		fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id,
587 		    sizeof (fmep->id));
588 	}
589 	if (Nextid <= fmep->id)
590 		Nextid = fmep->id + 1;
591 
592 	if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) {
593 		out(O_ALTFP, "restart_fme: no count of observations");
594 		Undiag_reason = UD_MISSINGINFO;
595 		goto badcase;
596 	} else {
597 		fmd_buf_read(hdl, inprogress, WOBUF_NOBS,
598 		    (void *)&fmep->uniqobs, sizeof (fmep->uniqobs));
599 	}
600 
601 	if (reconstitute_observations(fmep) != 0)
602 		goto badcase;
603 
604 	Open_fme_count++;
605 
606 	/* give the diagnosis algorithm a shot at the new FME state */
607 	fme_eval(fmep, NULL);
608 	return;
609 
610 badcase:
611 	if (fmep->eventtree != NULL)
612 		itree_free(fmep->eventtree);
613 	config_free(cfgdata);
614 	destroy_fme_bufs(fmep);
615 	FREE(fmep);
616 
617 	/*
618 	 * Since we're unable to restart the case, add it to the undiagable
619 	 * list and solve and close it as appropriate.
620 	 */
621 	bad = MALLOC(sizeof (struct case_list));
622 	bad->next = NULL;
623 
624 	if (Undiagablecaselist != NULL)
625 		bad->next = Undiagablecaselist;
626 	Undiagablecaselist = bad;
627 	bad->fmcase = inprogress;
628 
629 	out(O_ALTFP, "[case %s (unable to restart), ",
630 	    fmd_case_uuid(hdl, bad->fmcase));
631 
632 	if (fmd_case_solved(hdl, bad->fmcase)) {
633 		out(O_ALTFP, "already solved, ");
634 	} else {
635 		out(O_ALTFP, "solving, ");
636 		defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
637 		    NULL, NULL, NULL);
638 		if (Undiag_reason != NULL)
639 			(void) nvlist_add_string(defect,
640 			    UNDIAG_REASON, Undiag_reason);
641 		fmd_case_add_suspect(hdl, bad->fmcase, defect);
642 		fmd_case_solve(hdl, bad->fmcase);
643 	}
644 
645 	if (fmd_case_closed(hdl, bad->fmcase)) {
646 		out(O_ALTFP, "already closed ]");
647 	} else {
648 		out(O_ALTFP, "closing ]");
649 		fmd_case_close(hdl, bad->fmcase);
650 	}
651 }
652 
653 /*ARGSUSED*/
654 static void
655 globals_destructor(void *left, void *right, void *arg)
656 {
657 	struct evalue *evp = (struct evalue *)right;
658 	if (evp->t == NODEPTR)
659 		tree_free((struct node *)(uintptr_t)evp->v);
660 	evp->v = NULL;
661 	FREE(evp);
662 }
663 
664 void
665 destroy_fme(struct fme *f)
666 {
667 	stats_delete(f->Rcount);
668 	stats_delete(f->Hcallcount);
669 	stats_delete(f->Rcallcount);
670 	stats_delete(f->Ccallcount);
671 	stats_delete(f->Ecallcount);
672 	stats_delete(f->Tcallcount);
673 	stats_delete(f->Marrowcount);
674 	stats_delete(f->diags);
675 
676 	itree_free(f->eventtree);
677 	config_free(f->cfgdata);
678 	lut_free(f->globals, globals_destructor, NULL);
679 	FREE(f);
680 }
681 
682 static const char *
683 fme_state2str(enum fme_state s)
684 {
685 	switch (s) {
686 	case FME_NOTHING:	return ("NOTHING");
687 	case FME_WAIT:		return ("WAIT");
688 	case FME_CREDIBLE:	return ("CREDIBLE");
689 	case FME_DISPROVED:	return ("DISPROVED");
690 	case FME_DEFERRED:	return ("DEFERRED");
691 	default:		return ("UNKNOWN");
692 	}
693 }
694 
695 static int
696 is_problem(enum nametype t)
697 {
698 	return (t == N_FAULT || t == N_DEFECT || t == N_UPSET);
699 }
700 
701 static int
702 is_fault(enum nametype t)
703 {
704 	return (t == N_FAULT);
705 }
706 
707 static int
708 is_defect(enum nametype t)
709 {
710 	return (t == N_DEFECT);
711 }
712 
713 static int
714 is_upset(enum nametype t)
715 {
716 	return (t == N_UPSET);
717 }
718 
719 static void
720 fme_print(int flags, struct fme *fmep)
721 {
722 	struct event *ep;
723 
724 	out(flags, "Fault Management Exercise %d", fmep->id);
725 	out(flags, "\t       State: %s", fme_state2str(fmep->state));
726 	out(flags|O_NONL, "\t  Start time: ");
727 	ptree_timeval(flags|O_NONL, &fmep->ull);
728 	out(flags, NULL);
729 	if (fmep->wull) {
730 		out(flags|O_NONL, "\t   Wait time: ");
731 		ptree_timeval(flags|O_NONL, &fmep->wull);
732 		out(flags, NULL);
733 	}
734 	out(flags|O_NONL, "\t          E0: ");
735 	if (fmep->e0)
736 		itree_pevent_brief(flags|O_NONL, fmep->e0);
737 	else
738 		out(flags|O_NONL, "NULL");
739 	out(flags, NULL);
740 	out(flags|O_NONL, "\tObservations:");
741 	for (ep = fmep->observations; ep; ep = ep->observations) {
742 		out(flags|O_NONL, " ");
743 		itree_pevent_brief(flags|O_NONL, ep);
744 	}
745 	out(flags, NULL);
746 	out(flags|O_NONL, "\tSuspect list:");
747 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
748 		out(flags|O_NONL, " ");
749 		itree_pevent_brief(flags|O_NONL, ep);
750 	}
751 	out(flags, NULL);
752 	out(flags|O_VERB2, "\t        Tree:");
753 	itree_ptree(flags|O_VERB2, fmep->eventtree);
754 }
755 
756 static struct node *
757 pathstring2epnamenp(char *path)
758 {
759 	char *sep = "/";
760 	struct node *ret;
761 	char *ptr;
762 
763 	if ((ptr = strtok(path, sep)) == NULL)
764 		out(O_DIE, "pathstring2epnamenp: invalid empty class");
765 
766 	ret = tree_iname(stable(ptr), NULL, 0);
767 
768 	while ((ptr = strtok(NULL, sep)) != NULL)
769 		ret = tree_name_append(ret,
770 		    tree_iname(stable(ptr), NULL, 0));
771 
772 	return (ret);
773 }
774 
775 /*
776  * for a given upset sp, increment the corresponding SERD engine.  if the
777  * SERD engine trips, return the ename and ipp of the resulting ereport.
778  * returns true if engine tripped and *enamep and *ippp were filled in.
779  */
780 static int
781 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep,
782     fmd_case_t *fmcase, struct event *sp, const char **enamep,
783     const struct ipath **ippp)
784 {
785 	struct node *serdinst;
786 	char *serdname;
787 	struct node *nid;
788 
789 	ASSERT(sp->t == N_UPSET);
790 	ASSERT(ffep != NULL);
791 
792 	/*
793 	 * obtain instanced SERD engine from the upset sp.  from this
794 	 * derive serdname, the string used to identify the SERD engine.
795 	 */
796 	serdinst = eventprop_lookup(sp, L_engine);
797 
798 	if (serdinst == NULL)
799 		return (NULL);
800 
801 	serdname = ipath2str(serdinst->u.stmt.np->u.event.ename->u.name.s,
802 	    ipath(serdinst->u.stmt.np->u.event.epname));
803 
804 	/* handle serd engine "id" property, if there is one */
805 	if ((nid =
806 	    lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) {
807 		struct evalue *gval;
808 		char suffixbuf[200];
809 		char *suffix;
810 		char *nserdname;
811 		size_t nname;
812 
813 		out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname);
814 		ptree_name_iter(O_ALTFP|O_NONL, nid);
815 
816 		ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t));
817 
818 		if ((gval = lut_lookup(fmep->globals,
819 		    (void *)nid->u.globid.s, NULL)) == NULL) {
820 			out(O_ALTFP, " undefined");
821 		} else if (gval->t == UINT64) {
822 			out(O_ALTFP, " %llu", gval->v);
823 			(void) sprintf(suffixbuf, "%llu", gval->v);
824 			suffix = suffixbuf;
825 		} else {
826 			out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v);
827 			suffix = (char *)(uintptr_t)gval->v;
828 		}
829 
830 		nname = strlen(serdname) + strlen(suffix) + 2;
831 		nserdname = MALLOC(nname);
832 		(void) snprintf(nserdname, nname, "%s:%s", serdname, suffix);
833 		FREE(serdname);
834 		serdname = nserdname;
835 	}
836 
837 	if (!fmd_serd_exists(hdl, serdname)) {
838 		struct node *nN, *nT;
839 
840 		/* no SERD engine yet, so create it */
841 		nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N, NULL);
842 		nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T, NULL);
843 
844 		ASSERT(nN->t == T_NUM);
845 		ASSERT(nT->t == T_TIMEVAL);
846 
847 		fmd_serd_create(hdl, serdname, (uint_t)nN->u.ull,
848 		    (hrtime_t)nT->u.ull);
849 	}
850 
851 
852 	/*
853 	 * increment SERD engine.  if engine fires, reset serd
854 	 * engine and return trip_strcode
855 	 */
856 	if (fmd_serd_record(hdl, serdname, ffep)) {
857 		struct node *tripinst = lut_lookup(serdinst->u.stmt.lutp,
858 		    (void *)L_trip, NULL);
859 
860 		ASSERT(tripinst != NULL);
861 
862 		*enamep = tripinst->u.event.ename->u.name.s;
863 		*ippp = ipath(tripinst->u.event.epname);
864 
865 		fmd_case_add_serd(hdl, fmcase, serdname);
866 		fmd_serd_reset(hdl, serdname);
867 		out(O_ALTFP|O_NONL, "[engine fired: %s, sending: ", serdname);
868 		ipath_print(O_ALTFP|O_NONL, *enamep, *ippp);
869 		out(O_ALTFP, "]");
870 
871 		FREE(serdname);
872 		return (1);
873 	}
874 
875 	FREE(serdname);
876 	return (0);
877 }
878 
879 /*
880  * search a suspect list for upsets.  feed each upset to serd_eval() and
881  * build up tripped[], an array of ereports produced by the firing of
882  * any SERD engines.  then feed each ereport back into
883  * fme_receive_report().
884  *
885  * returns ntrip, the number of these ereports produced.
886  */
887 static int
888 upsets_eval(struct fme *fmep, fmd_event_t *ffep)
889 {
890 	/* we build an array of tripped ereports that we send ourselves */
891 	struct {
892 		const char *ename;
893 		const struct ipath *ipp;
894 	} *tripped;
895 	struct event *sp;
896 	int ntrip, nupset, i;
897 
898 	/*
899 	 * count the number of upsets to determine the upper limit on
900 	 * expected trip ereport strings.  remember that one upset can
901 	 * lead to at most one ereport.
902 	 */
903 	nupset = 0;
904 	for (sp = fmep->suspects; sp; sp = sp->suspects) {
905 		if (sp->t == N_UPSET)
906 			nupset++;
907 	}
908 
909 	if (nupset == 0)
910 		return (0);
911 
912 	/*
913 	 * get to this point if we have upsets and expect some trip
914 	 * ereports
915 	 */
916 	tripped = alloca(sizeof (*tripped) * nupset);
917 	bzero((void *)tripped, sizeof (*tripped) * nupset);
918 
919 	ntrip = 0;
920 	for (sp = fmep->suspects; sp; sp = sp->suspects)
921 		if (sp->t == N_UPSET &&
922 		    serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp,
923 			    &tripped[ntrip].ename, &tripped[ntrip].ipp))
924 			ntrip++;
925 
926 	for (i = 0; i < ntrip; i++)
927 		fme_receive_report(fmep->hdl, ffep,
928 		    tripped[i].ename, tripped[i].ipp, NULL);
929 
930 	return (ntrip);
931 }
932 
933 /*
934  * fme_receive_external_report -- call when an external ereport comes in
935  *
936  * this routine just converts the relevant information from the ereport
937  * into a format used internally and passes it on to fme_receive_report().
938  */
939 void
940 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
941     const char *eventstring)
942 {
943 	struct node *epnamenp = platform_getpath(nvl);
944 	const struct ipath *ipp;
945 
946 	/*
947 	 * XFILE: If we ended up without a path, it's an X-file.
948 	 * For now, use our undiagnosable interface.
949 	 */
950 	if (epnamenp == NULL) {
951 		out(O_ALTFP, "XFILE: Unable to get path from ereport");
952 		Undiag_reason = UD_NOPATH;
953 		publish_undiagnosable(hdl, ffep);
954 		return;
955 	}
956 
957 	ipp = ipath(epnamenp);
958 	tree_free(epnamenp);
959 	fme_receive_report(hdl, ffep, stable(eventstring), ipp, nvl);
960 }
961 
/*
 * Forward declaration; the definition of mark_arrows() is not part of
 * this section of the file.
 */
static int mark_arrows(struct fme *fmep, struct event *ep, int mark,
    unsigned long long at_latest_by, unsigned long long *pdelay, int keep);
964 
965 /* ARGSUSED */
966 static void
967 clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
968 {
969 	struct bubble *bp;
970 	struct arrowlist *ap;
971 
972 	ep->cached_state = 0;
973 	ep->keep_in_tree = 0;
974 	for (bp = itree_next_bubble(ep, NULL); bp;
975 	    bp = itree_next_bubble(ep, bp)) {
976 		if (bp->t != B_FROM)
977 			continue;
978 		bp->mark = 0;
979 		for (ap = itree_next_arrow(bp, NULL); ap;
980 		    ap = itree_next_arrow(bp, ap))
981 			ap->arrowp->mark = 0;
982 	}
983 }
984 
985 static void
986 fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
987     const char *eventstring, const struct ipath *ipp, nvlist_t *nvl)
988 {
989 	struct event *ep;
990 	struct fme *fmep = NULL;
991 	struct fme *ofmep = NULL;
992 	struct fme *cfmep, *svfmep;
993 	int matched = 0;
994 	nvlist_t *defect;
995 
996 	out(O_ALTFP|O_NONL, "fme_receive_report: ");
997 	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
998 	out(O_ALTFP|O_STAMP, NULL);
999 
1000 	/* decide which FME it goes to */
1001 	for (fmep = FMElist; fmep; fmep = fmep->next) {
1002 		int prev_verbose;
1003 		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
1004 		enum fme_state state;
1005 		nvlist_t *pre_peek_nvp = NULL;
1006 
1007 		if (fmep->overflow) {
1008 			if (!(fmd_case_closed(fmep->hdl, fmep->fmcase)))
1009 				ofmep = fmep;
1010 
1011 			continue;
1012 		}
1013 
1014 		/* look up event in event tree for this FME */
1015 		if ((ep = itree_lookup(fmep->eventtree,
1016 		    eventstring, ipp)) == NULL)
1017 			continue;
1018 
1019 		/* note observation */
1020 		fmep->ecurrent = ep;
1021 		if (ep->count++ == 0) {
1022 			/* link it into list of observations seen */
1023 			ep->observations = fmep->observations;
1024 			fmep->observations = ep;
1025 			ep->nvp = evnv_dupnvl(nvl);
1026 		} else {
1027 			/* use new payload values for peek */
1028 			pre_peek_nvp = ep->nvp;
1029 			ep->nvp = evnv_dupnvl(nvl);
1030 		}
1031 
1032 		/* tell hypothesise() not to mess with suspect list */
1033 		fmep->peek = 1;
1034 
1035 		/* don't want this to be verbose (unless Debug is set) */
1036 		prev_verbose = Verbose;
1037 		if (Debug == 0)
1038 			Verbose = 0;
1039 
1040 		lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
1041 		state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);
1042 
1043 		fmep->peek = 0;
1044 
1045 		/* put verbose flag back */
1046 		Verbose = prev_verbose;
1047 
1048 		if (state != FME_DISPROVED) {
1049 			/* found an FME that explains the ereport */
1050 			matched++;
1051 			out(O_ALTFP|O_NONL, "[");
1052 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1053 			out(O_ALTFP, " explained by FME%d]", fmep->id);
1054 
1055 			if (pre_peek_nvp)
1056 				nvlist_free(pre_peek_nvp);
1057 
1058 			if (ep->count == 1)
1059 				serialize_observation(fmep, eventstring, ipp);
1060 
1061 			if (ffep)
1062 				fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1063 
1064 			stats_counter_bump(fmep->Rcount);
1065 
1066 			/* re-eval FME */
1067 			fme_eval(fmep, ffep);
1068 		} else {
1069 
1070 			/* not a match, undo noting of observation */
1071 			fmep->ecurrent = NULL;
1072 			if (--ep->count == 0) {
1073 				/* unlink it from observations */
1074 				fmep->observations = ep->observations;
1075 				ep->observations = NULL;
1076 				nvlist_free(ep->nvp);
1077 				ep->nvp = NULL;
1078 			} else {
1079 				nvlist_free(ep->nvp);
1080 				ep->nvp = pre_peek_nvp;
1081 			}
1082 		}
1083 	}
1084 
1085 	if (matched)
1086 		return;	/* explained by at least one existing FME */
1087 
1088 	/* clean up closed fmes */
1089 	cfmep = ClosedFMEs;
1090 	while (cfmep != NULL) {
1091 		svfmep = cfmep->next;
1092 		destroy_fme(cfmep);
1093 		cfmep = svfmep;
1094 	}
1095 	ClosedFMEs = NULL;
1096 
1097 	if (ofmep) {
1098 		out(O_ALTFP|O_NONL, "[");
1099 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1100 		out(O_ALTFP, " ADDING TO OVERFLOW FME]");
1101 		if (ffep)
1102 			fmd_case_add_ereport(hdl, ofmep->fmcase, ffep);
1103 
1104 		return;
1105 
1106 	} else if (Max_fme && (Open_fme_count >= Max_fme)) {
1107 		out(O_ALTFP|O_NONL, "[");
1108 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1109 		out(O_ALTFP, " MAX OPEN FME REACHED]");
1110 		/* Create overflow fme */
1111 		if ((fmep = newfme(eventstring, ipp)) == NULL) {
1112 			out(O_ALTFP|O_NONL, "[");
1113 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1114 			out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]");
1115 			publish_undiagnosable(hdl, ffep);
1116 			return;
1117 		}
1118 
1119 		Open_fme_count++;
1120 
1121 		fmep->fmcase = fmd_case_open(hdl, NULL);
1122 		fmep->hdl = hdl;
1123 		init_fme_bufs(fmep);
1124 		fmep->overflow = B_TRUE;
1125 
1126 		if (ffep)
1127 			fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1128 
1129 		defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
1130 		    NULL, NULL, NULL);
1131 		(void) nvlist_add_string(defect, UNDIAG_REASON, UD_MAXFME);
1132 		fmd_case_add_suspect(hdl, fmep->fmcase, defect);
1133 		fmd_case_solve(hdl, fmep->fmcase);
1134 		return;
1135 	}
1136 
1137 	/* start a new FME */
1138 	if ((fmep = newfme(eventstring, ipp)) == NULL) {
1139 		out(O_ALTFP|O_NONL, "[");
1140 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1141 		out(O_ALTFP, " CANNOT DIAGNOSE]");
1142 		publish_undiagnosable(hdl, ffep);
1143 		return;
1144 	}
1145 
1146 	Open_fme_count++;
1147 
1148 	/* open a case */
1149 	fmep->fmcase = fmd_case_open(hdl, NULL);
1150 	fmep->hdl = hdl;
1151 	init_fme_bufs(fmep);
1152 
1153 	out(O_ALTFP|O_NONL, "[");
1154 	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1155 	out(O_ALTFP, " created FME%d, case %s]", fmep->id,
1156 	    fmd_case_uuid(hdl, fmep->fmcase));
1157 
1158 	ep = fmep->e0;
1159 	ASSERT(ep != NULL);
1160 
1161 	/* note observation */
1162 	fmep->ecurrent = ep;
1163 	if (ep->count++ == 0) {
1164 		/* link it into list of observations seen */
1165 		ep->observations = fmep->observations;
1166 		fmep->observations = ep;
1167 		ep->nvp = evnv_dupnvl(nvl);
1168 		serialize_observation(fmep, eventstring, ipp);
1169 	} else {
1170 		/* new payload overrides any previous */
1171 		nvlist_free(ep->nvp);
1172 		ep->nvp = evnv_dupnvl(nvl);
1173 	}
1174 
1175 	stats_counter_bump(fmep->Rcount);
1176 
1177 	if (ffep) {
1178 		fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1179 		fmd_case_setprincipal(hdl, fmep->fmcase, ffep);
1180 		fmep->e0r = ffep;
1181 	}
1182 
1183 	/* give the diagnosis algorithm a shot at the new FME state */
1184 	fme_eval(fmep, ffep);
1185 }
1186 
1187 void
1188 fme_status(int flags)
1189 {
1190 	struct fme *fmep;
1191 
1192 	if (FMElist == NULL) {
1193 		out(flags, "No fault management exercises underway.");
1194 		return;
1195 	}
1196 
1197 	for (fmep = FMElist; fmep; fmep = fmep->next)
1198 		fme_print(flags, fmep);
1199 }
1200 
1201 /*
1202  * "indent" routines used mostly for nicely formatted debug output, but also
1203  * for sanity checking for infinite recursion bugs.
1204  */
1205 
1206 #define	MAX_INDENT 1024
1207 static const char *indent_s[MAX_INDENT];
1208 static int current_indent;
1209 
1210 static void
1211 indent_push(const char *s)
1212 {
1213 	if (current_indent < MAX_INDENT)
1214 		indent_s[current_indent++] = s;
1215 	else
1216 		out(O_DIE, "unexpected recursion depth (%d)", current_indent);
1217 }
1218 
1219 static void
1220 indent_set(const char *s)
1221 {
1222 	current_indent = 0;
1223 	indent_push(s);
1224 }
1225 
1226 static void
1227 indent_pop(void)
1228 {
1229 	if (current_indent > 0)
1230 		current_indent--;
1231 	else
1232 		out(O_DIE, "recursion underflow");
1233 }
1234 
1235 static void
1236 indent(void)
1237 {
1238 	int i;
1239 	if (!Verbose)
1240 		return;
1241 	for (i = 0; i < current_indent; i++)
1242 		out(O_ALTFP|O_VERB|O_NONL, indent_s[i]);
1243 }
1244 
1245 static int
1246 suspects_changed(struct fme *fmep)
1247 {
1248 	struct event *suspects = fmep->suspects;
1249 	struct event *psuspects = fmep->psuspects;
1250 
1251 	while (suspects != NULL && psuspects != NULL) {
1252 		if (suspects != psuspects)
1253 			return (1);
1254 		suspects = suspects->suspects;
1255 		psuspects = psuspects->psuspects;
1256 	}
1257 
1258 	return (suspects != psuspects);
1259 }
1260 
1261 #define	SLNEW		1
1262 #define	SLCHANGED	2
1263 #define	SLWAIT		3
1264 #define	SLDISPROVED	4
1265 
1266 static void
1267 print_suspects(int circumstance, struct fme *fmep)
1268 {
1269 	struct event *ep;
1270 
1271 	out(O_ALTFP|O_NONL, "[");
1272 	if (circumstance == SLCHANGED) {
1273 		out(O_ALTFP|O_NONL, "FME%d diagnosis changed. state: %s, "
1274 		    "suspect list:", fmep->id, fme_state2str(fmep->state));
1275 	} else if (circumstance == SLWAIT) {
1276 		out(O_ALTFP|O_NONL, "FME%d set wait timer ", fmep->id);
1277 		ptree_timeval(O_ALTFP|O_NONL, &fmep->wull);
1278 	} else if (circumstance == SLDISPROVED) {
1279 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id);
1280 	} else {
1281 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id);
1282 	}
1283 
1284 	if (circumstance == SLWAIT || circumstance == SLDISPROVED) {
1285 		out(O_ALTFP, "]");
1286 		return;
1287 	}
1288 
1289 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
1290 		out(O_ALTFP|O_NONL, " ");
1291 		itree_pevent_brief(O_ALTFP|O_NONL, ep);
1292 	}
1293 	out(O_ALTFP, "]");
1294 }
1295 
1296 static struct node *
1297 eventprop_lookup(struct event *ep, const char *propname)
1298 {
1299 	return (lut_lookup(ep->props, (void *)propname, NULL));
1300 }
1301 
1302 #define	MAXDIGITIDX	23
1303 static char numbuf[MAXDIGITIDX + 1];
1304 
1305 static int
1306 node2uint(struct node *n, uint_t *valp)
1307 {
1308 	struct evalue value;
1309 	struct lut *globals = NULL;
1310 
1311 	if (n == NULL)
1312 		return (1);
1313 
1314 	/*
1315 	 * check value.v since we are being asked to convert an unsigned
1316 	 * long long int to an unsigned int
1317 	 */
1318 	if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) ||
1319 	    value.t != UINT64 || value.v > (1ULL << 32))
1320 		return (1);
1321 
1322 	*valp = (uint_t)value.v;
1323 
1324 	return (0);
1325 }
1326 
1327 static nvlist_t *
1328 node2fmri(struct node *n)
1329 {
1330 	nvlist_t **pa, *f, *p;
1331 	struct node *nc;
1332 	uint_t depth = 0;
1333 	char *numstr, *nullbyte;
1334 	char *failure;
1335 	int err, i;
1336 
1337 	/* XXX do we need to be able to handle a non-T_NAME node? */
1338 	if (n == NULL || n->t != T_NAME)
1339 		return (NULL);
1340 
1341 	for (nc = n; nc != NULL; nc = nc->u.name.next) {
1342 		if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM)
1343 			break;
1344 		depth++;
1345 	}
1346 
1347 	if (nc != NULL) {
1348 		/* We bailed early, something went wrong */
1349 		return (NULL);
1350 	}
1351 
1352 	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
1353 		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
1354 	pa = alloca(depth * sizeof (nvlist_t *));
1355 	for (i = 0; i < depth; i++)
1356 		pa[i] = NULL;
1357 
1358 	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
1359 	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
1360 	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
1361 	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
1362 	if (err != 0) {
1363 		failure = "basic construction of FMRI failed";
1364 		goto boom;
1365 	}
1366 
1367 	numbuf[MAXDIGITIDX] = '\0';
1368 	nullbyte = &numbuf[MAXDIGITIDX];
1369 	i = 0;
1370 
1371 	for (nc = n; nc != NULL; nc = nc->u.name.next) {
1372 		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
1373 		if (err != 0) {
1374 			failure = "alloc of an hc-pair failed";
1375 			goto boom;
1376 		}
1377 		err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s);
1378 		numstr = ulltostr(nc->u.name.child->u.ull, nullbyte);
1379 		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
1380 		if (err != 0) {
1381 			failure = "construction of an hc-pair failed";
1382 			goto boom;
1383 		}
1384 		pa[i++] = p;
1385 	}
1386 
1387 	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
1388 	if (err == 0) {
1389 		for (i = 0; i < depth; i++)
1390 			if (pa[i] != NULL)
1391 				nvlist_free(pa[i]);
1392 		return (f);
1393 	}
1394 	failure = "addition of hc-pair array to FMRI failed";
1395 
1396 boom:
1397 	for (i = 0; i < depth; i++)
1398 		if (pa[i] != NULL)
1399 			nvlist_free(pa[i]);
1400 	nvlist_free(f);
1401 	out(O_DIE, "%s", failure);
1402 	/*NOTREACHED*/
1403 	return (NULL);
1404 }
1405 
/*
 * avg -- return sum / cnt rounded to the nearest integer (halves round
 *	up).  Returns 0 when cnt is 0.
 *
 * The scaling by 10 is done in 64 bits; multiplying in uint_t first
 * would wrap for sums above UINT_MAX / 10.
 */
static uint_t
avg(uint_t sum, uint_t cnt)
{
	unsigned long long s = (unsigned long long)sum * 10;
	unsigned long long tenths;

	if (cnt == 0)
		return (0);

	/* tenths is the average times ten; its last digit drives rounding */
	tenths = s / cnt;

	return ((tenths / 10) + (((tenths % 10) >= 5) ? 1 : 0));
}
1413 
/*
 * percentof -- return part as a percentage of whole, rounded to the
 *	nearest integer (halves round up).  Returns 0 when whole is 0.
 *
 * The scaling by 1000 is done in 64 bits; multiplying in uint_t first
 * would wrap for parts above UINT_MAX / 1000.
 */
static uint8_t
percentof(uint_t part, uint_t whole)
{
	unsigned long long p = (unsigned long long)part * 1000;
	unsigned long long permille;

	if (whole == 0)
		return (0);

	/* permille is the ratio in tenths of a percent */
	permille = p / whole;

	return ((permille / 10) + (((permille % 10) >= 5) ? 1 : 0));
}
1421 
/*
 * struct rsl -- one entry in the array of suspects being prepared for
 *	publication, pairing a suspect event with the nvlists to be
 *	advertised for it.  Filled in by get_resources().
 */
struct rsl {
	struct event *suspect;	/* the suspect; NULLed by rsluniq() on dups */
	nvlist_t *asru;		/* ASRU nvlist, may be NULL */
	nvlist_t *fru;		/* FRU nvlist, may be NULL */
	nvlist_t *rsrc;		/* resource nvlist, may be NULL */
};
1428 
1429 /*
1430  *  rslfree -- free internal members of struct rsl not expected to be
1431  *	freed elsewhere.
1432  */
1433 static void
1434 rslfree(struct rsl *freeme)
1435 {
1436 	if (freeme->asru != NULL)
1437 		nvlist_free(freeme->asru);
1438 	if (freeme->fru != NULL)
1439 		nvlist_free(freeme->fru);
1440 	if (freeme->rsrc != NULL && freeme->rsrc != freeme->asru)
1441 		nvlist_free(freeme->rsrc);
1442 }
1443 
1444 /*
1445  *  rslcmp -- compare two rsl structures.  Use the following
1446  *	comparisons to establish cardinality:
1447  *
1448  *	1. Name of the suspect's class. (simple strcmp)
1449  *	2. Name of the suspect's ASRU. (trickier, since nvlist)
1450  *
1451  */
1452 static int
1453 rslcmp(const void *a, const void *b)
1454 {
1455 	struct rsl *r1 = (struct rsl *)a;
1456 	struct rsl *r2 = (struct rsl *)b;
1457 	int rv;
1458 
1459 	rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s,
1460 	    r2->suspect->enode->u.event.ename->u.name.s);
1461 	if (rv != 0)
1462 		return (rv);
1463 
1464 	if (r1->asru == NULL && r2->asru == NULL)
1465 		return (0);
1466 	if (r1->asru == NULL)
1467 		return (-1);
1468 	if (r2->asru == NULL)
1469 		return (1);
1470 	return (evnv_cmpnvl(r1->asru, r2->asru, 0));
1471 }
1472 
1473 /*
1474  *  rsluniq -- given an array of rsl structures, seek out and "remove"
1475  *	any duplicates.  Dups are "remove"d by NULLing the suspect pointer
1476  *	of the array element.  Removal also means updating the number of
1477  *	problems and the number of problems which are not faults.  User
1478  *	provides the first and last element pointers.
1479  */
1480 static void
1481 rsluniq(struct rsl *first, struct rsl *last, int *nprobs, int *nnonf)
1482 {
1483 	struct rsl *cr;
1484 
1485 	if (*nprobs == 1)
1486 		return;
1487 
1488 	/*
1489 	 *  At this point, we only expect duplicate defects.
1490 	 *  Eversholt's diagnosis algorithm prevents duplicate
1491 	 *  suspects, but we rewrite defects in the platform code after
1492 	 *  the diagnosis is made, and that can introduce new
1493 	 *  duplicates.
1494 	 */
1495 	while (first <= last) {
1496 		if (first->suspect == NULL || !is_defect(first->suspect->t)) {
1497 			first++;
1498 			continue;
1499 		}
1500 		cr = first + 1;
1501 		while (cr <= last) {
1502 			if (is_defect(first->suspect->t)) {
1503 				if (rslcmp(first, cr) == 0) {
1504 					cr->suspect = NULL;
1505 					rslfree(cr);
1506 					(*nprobs)--;
1507 					(*nnonf)--;
1508 				}
1509 			}
1510 			/*
1511 			 * assume all defects are in order after our
1512 			 * sort and short circuit here with "else break" ?
1513 			 */
1514 			cr++;
1515 		}
1516 		first++;
1517 	}
1518 }
1519 
1520 /*
1521  * get_resources -- for a given suspect, determine what ASRU, FRU and
1522  *     RSRC nvlists should be advertised in the final suspect list.
1523  */
1524 void
1525 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot)
1526 {
1527 	struct node *asrudef, *frudef;
1528 	nvlist_t *asru, *fru;
1529 	nvlist_t *rsrc = NULL;
1530 	char *pathstr;
1531 
1532 	/*
1533 	 * First find any ASRU and/or FRU defined in the
1534 	 * initial fault tree.
1535 	 */
1536 	asrudef = eventprop_lookup(sp, L_ASRU);
1537 	frudef = eventprop_lookup(sp, L_FRU);
1538 
1539 	/*
1540 	 * Create FMRIs based on those definitions
1541 	 */
1542 	asru = node2fmri(asrudef);
1543 	fru = node2fmri(frudef);
1544 	pathstr = ipath2str(NULL, sp->ipp);
1545 
1546 	/*
1547 	 * Allow for platform translations of the FMRIs
1548 	 */
1549 	platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc,
1550 	    pathstr);
1551 
1552 	FREE(pathstr);
1553 	rsrcs->suspect = sp;
1554 	rsrcs->asru = asru;
1555 	rsrcs->fru = fru;
1556 	rsrcs->rsrc = rsrc;
1557 }
1558 
1559 /*
1560  * trim_suspects -- prior to publishing, we may need to remove some
1561  *    suspects from the list.  If we're auto-closing upsets, we don't
1562  *    want any of those in the published list.  If the ASRUs for multiple
1563  *    defects resolve to the same ASRU (driver) we only want to publish
1564  *    that as a single suspect.
1565  */
1566 static void
1567 trim_suspects(struct fme *fmep, boolean_t no_upsets, struct rsl **begin,
1568     struct rsl **end)
1569 {
1570 	struct event *ep;
1571 	struct rsl *rp;
1572 	int rpcnt;
1573 
1574 	/*
1575 	 * First save the suspects in the psuspects, then copy back
1576 	 * only the ones we wish to retain.  This resets nsuspects to
1577 	 * zero.
1578 	 */
1579 	rpcnt = fmep->nsuspects;
1580 	save_suspects(fmep);
1581 
1582 	/*
1583 	 * allocate an array of resource pointers for the suspects.
1584 	 * We may end up using less than the full allocation, but this
1585 	 * is a very short-lived array.  publish_suspects() will free
1586 	 * this array when it's done using it.
1587 	 */
1588 	rp = *begin = MALLOC(rpcnt * sizeof (struct rsl));
1589 	bzero(rp, rpcnt * sizeof (struct rsl));
1590 
1591 	/* first pass, remove any unwanted upsets and populate our array */
1592 	for (ep = fmep->psuspects; ep; ep = ep->psuspects) {
1593 		if (no_upsets && is_upset(ep->t))
1594 			continue;
1595 		get_resources(ep, rp, fmep->cfgdata->cooked);
1596 		rp++;
1597 		fmep->nsuspects++;
1598 		if (!is_fault(ep->t))
1599 			fmep->nonfault++;
1600 	}
1601 
1602 	/* if all we had was unwanted upsets, we're done */
1603 	if (fmep->nsuspects == 0)
1604 		return;
1605 
1606 	*end = rp - 1;
1607 
1608 	/* sort the array */
1609 	qsort(*begin, fmep->nsuspects, sizeof (struct rsl), rslcmp);
1610 	rsluniq(*begin, *end, &fmep->nsuspects, &fmep->nonfault);
1611 }
1612 
1613 /*
1614  * addpayloadprop -- add a payload prop to a problem
1615  */
1616 static void
1617 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault)
1618 {
1619 	ASSERT(fault != NULL);
1620 	ASSERT(lhs != NULL);
1621 	ASSERT(rhs != NULL);
1622 
1623 	if (rhs->t == UINT64) {
1624 		out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v);
1625 
1626 		if (nvlist_add_uint64(fault, lhs, rhs->v) != 0)
1627 			out(O_DIE,
1628 			    "cannot add payloadprop \"%s\" to fault", lhs);
1629 	} else {
1630 		out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"",
1631 		    lhs, (char *)(uintptr_t)rhs->v);
1632 
1633 		if (nvlist_add_string(fault, lhs, (char *)(uintptr_t)rhs->v) !=
1634 		    0)
1635 			out(O_DIE,
1636 			    "cannot add payloadprop \"%s\" to fault", lhs);
1637 	}
1638 }
1639 
1640 static char *Istatbuf;
1641 static char *Istatbufptr;
1642 static int Istatsz;
1643 
1644 /*
1645  * istataddsize -- calculate size of istat and add it to Istatsz
1646  */
1647 /*ARGSUSED2*/
1648 static void
1649 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg)
1650 {
1651 	int val;
1652 
1653 	ASSERT(lhs != NULL);
1654 	ASSERT(rhs != NULL);
1655 
1656 	if ((val = stats_counter_value(rhs)) == 0)
1657 		return;	/* skip zero-valued stats */
1658 
1659 	/* count up the size of the stat name */
1660 	Istatsz += ipath2strlen(lhs->ename, lhs->ipath);
1661 	Istatsz++;	/* for the trailing NULL byte */
1662 
1663 	/* count up the size of the stat value */
1664 	Istatsz += snprintf(NULL, 0, "%d", val);
1665 	Istatsz++;	/* for the trailing NULL byte */
1666 }
1667 
1668 /*
1669  * istat2str -- serialize an istat, writing result to *Istatbufptr
1670  */
1671 /*ARGSUSED2*/
1672 static void
1673 istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg)
1674 {
1675 	char *str;
1676 	int len;
1677 	int val;
1678 
1679 	ASSERT(lhs != NULL);
1680 	ASSERT(rhs != NULL);
1681 
1682 	if ((val = stats_counter_value(rhs)) == 0)
1683 		return;	/* skip zero-valued stats */
1684 
1685 	/* serialize the stat name */
1686 	str = ipath2str(lhs->ename, lhs->ipath);
1687 	len = strlen(str);
1688 
1689 	ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]);
1690 	(void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr);
1691 	Istatbufptr += len;
1692 	FREE(str);
1693 	*Istatbufptr++ = '\0';
1694 
1695 	/* serialize the stat value */
1696 	Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr,
1697 	    "%d", val);
1698 	*Istatbufptr++ = '\0';
1699 
1700 	ASSERT(Istatbufptr <= &Istatbuf[Istatsz]);
1701 }
1702 
1703 void
1704 istat_save()
1705 {
1706 	if (Istat_need_save == 0)
1707 		return;
1708 
1709 	/* figure out how big the serialzed info is */
1710 	Istatsz = 0;
1711 	lut_walk(Istats, (lut_cb)istataddsize, NULL);
1712 
1713 	if (Istatsz == 0) {
1714 		/* no stats to save */
1715 		fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS);
1716 		return;
1717 	}
1718 
1719 	/* create the serialized buffer */
1720 	Istatbufptr = Istatbuf = MALLOC(Istatsz);
1721 	lut_walk(Istats, (lut_cb)istat2str, NULL);
1722 
1723 	/* clear out current saved stats */
1724 	fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS);
1725 
1726 	/* write out the new version */
1727 	fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz);
1728 	FREE(Istatbuf);
1729 
1730 	Istat_need_save = 0;
1731 }
1732 
1733 int
1734 istat_cmp(struct istat_entry *ent1, struct istat_entry *ent2)
1735 {
1736 	if (ent1->ename != ent2->ename)
1737 		return (ent2->ename - ent1->ename);
1738 	if (ent1->ipath != ent2->ipath)
1739 		return ((char *)ent2->ipath - (char *)ent1->ipath);
1740 
1741 	return (0);
1742 }
1743 
1744 /*
1745  * istat-verify -- verify the component associated with a stat still exists
1746  *
1747  * if the component no longer exists, this routine resets the stat and
1748  * returns 0.  if the component still exists, it returns 1.
1749  */
1750 static int
1751 istat_verify(struct node *snp, struct istat_entry *entp)
1752 {
1753 	struct stats *statp;
1754 	nvlist_t *fmri;
1755 
1756 	fmri = node2fmri(snp->u.event.epname);
1757 	if (platform_path_exists(fmri)) {
1758 		nvlist_free(fmri);
1759 		return (1);
1760 	}
1761 	nvlist_free(fmri);
1762 
1763 	/* component no longer in system.  zero out the associated stats */
1764 	if ((statp = (struct stats *)
1765 	    lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL ||
1766 	    stats_counter_value(statp) == 0)
1767 		return (0);	/* stat is already reset */
1768 
1769 	Istat_need_save = 1;
1770 	stats_counter_reset(statp);
1771 	return (0);
1772 }
1773 
1774 static void
1775 istat_bump(struct node *snp, int n)
1776 {
1777 	struct stats *statp;
1778 	struct istat_entry ent;
1779 
1780 	ASSERT(snp != NULL);
1781 	ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t));
1782 	ASSERT(snp->u.event.epname != NULL);
1783 
1784 	/* class name should be hoisted into a single stable entry */
1785 	ASSERT(snp->u.event.ename->u.name.next == NULL);
1786 	ent.ename = snp->u.event.ename->u.name.s;
1787 	ent.ipath = ipath(snp->u.event.epname);
1788 
1789 	if (!istat_verify(snp, &ent)) {
1790 		/* component no longer exists in system, nothing to do */
1791 		return;
1792 	}
1793 
1794 	if ((statp = (struct stats *)
1795 	    lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) {
1796 		/* need to create the counter */
1797 		int cnt = 0;
1798 		struct node *np;
1799 		char *sname;
1800 		char *snamep;
1801 		struct istat_entry *newentp;
1802 
1803 		/* count up the size of the stat name */
1804 		np = snp->u.event.ename;
1805 		while (np != NULL) {
1806 			cnt += strlen(np->u.name.s);
1807 			cnt++;	/* for the '.' or '@' */
1808 			np = np->u.name.next;
1809 		}
1810 		np = snp->u.event.epname;
1811 		while (np != NULL) {
1812 			cnt += snprintf(NULL, 0, "%s%llu",
1813 			    np->u.name.s, np->u.name.child->u.ull);
1814 			cnt++;	/* for the '/' or trailing NULL byte */
1815 			np = np->u.name.next;
1816 		}
1817 
1818 		/* build the stat name */
1819 		snamep = sname = alloca(cnt);
1820 		np = snp->u.event.ename;
1821 		while (np != NULL) {
1822 			snamep += snprintf(snamep, &sname[cnt] - snamep,
1823 			    "%s", np->u.name.s);
1824 			np = np->u.name.next;
1825 			if (np)
1826 				*snamep++ = '.';
1827 		}
1828 		*snamep++ = '@';
1829 		np = snp->u.event.epname;
1830 		while (np != NULL) {
1831 			snamep += snprintf(snamep, &sname[cnt] - snamep,
1832 			    "%s%llu", np->u.name.s, np->u.name.child->u.ull);
1833 			np = np->u.name.next;
1834 			if (np)
1835 				*snamep++ = '/';
1836 		}
1837 		*snamep++ = '\0';
1838 
1839 		/* create the new stat & add it to our list */
1840 		newentp = MALLOC(sizeof (*newentp));
1841 		*newentp = ent;
1842 		statp = stats_new_counter(NULL, sname, 0);
1843 		Istats = lut_add(Istats, (void *)newentp, (void *)statp,
1844 		    (lut_cmp)istat_cmp);
1845 	}
1846 
1847 	/* if n is non-zero, set that value instead of bumping */
1848 	if (n) {
1849 		stats_counter_reset(statp);
1850 		stats_counter_add(statp, n);
1851 	} else
1852 		stats_counter_bump(statp);
1853 	Istat_need_save = 1;
1854 }
1855 
/*
 * istat_destructor -- lut_free() callback for Istats: frees the
 *	istat_entry key and deletes the stats value.
 */
/*ARGSUSED*/
static void
istat_destructor(void *left, void *right, void *arg)
{
	struct istat_entry *key = left;
	struct stats *counter = right;

	FREE(key);
	stats_delete(counter);
}
1865 
1866 void
1867 istat_fini(void)
1868 {
1869 	lut_free(Istats, istat_destructor, NULL);
1870 }
1871 
1872 static void
1873 publish_suspects(struct fme *fmep)
1874 {
1875 	struct event *ep;
1876 	struct rsl *srl = NULL;
1877 	struct rsl *erl;
1878 	struct rsl *rp;
1879 	nvlist_t *fault;
1880 	uint8_t cert;
1881 	uint_t *frs;
1882 	uint_t fravg, frsum, fr;
1883 	uint_t messval;
1884 	struct node *snp;
1885 	int frcnt, fridx;
1886 	boolean_t no_upsets = B_FALSE;
1887 	boolean_t allfaulty = B_TRUE;
1888 
1889 	stats_counter_bump(fmep->diags);
1890 
1891 	/*
1892 	 * The current fmd interfaces don't allow us to solve a case
1893 	 * that's already solved.  If we make a new case, what of the
1894 	 * ereports?  We don't appear to have an interface that allows
1895 	 * us to access the ereports attached to a case (if we wanted
1896 	 * to copy the original case's ereport attachments to the new
1897 	 * case) and it's also a bit unclear if there would be any
1898 	 * problems with having ereports attached to multiple cases
1899 	 * and/or attaching DIAGNOSED ereports to a case.  For now,
1900 	 * we'll just output a message.
1901 	 */
1902 	if (fmep->posted_suspects ||
1903 	    fmd_case_solved(fmep->hdl, fmep->fmcase)) {
1904 		out(O_ALTFP|O_NONL, "Revised diagnosis for case %s: ",
1905 		    fmd_case_uuid(fmep->hdl, fmep->fmcase));
1906 		for (ep = fmep->suspects; ep; ep = ep->suspects) {
1907 			out(O_ALTFP|O_NONL, " ");
1908 			itree_pevent_brief(O_ALTFP|O_NONL, ep);
1909 		}
1910 		out(O_ALTFP, NULL);
1911 		return;
1912 	}
1913 
1914 	/*
1915 	 * If we're auto-closing upsets, we don't want to include them
1916 	 * in any produced suspect lists or certainty accounting.
1917 	 */
1918 	if (Autoclose != NULL)
1919 		if (strcmp(Autoclose, "true") == 0 ||
1920 		    strcmp(Autoclose, "all") == 0 ||
1921 		    strcmp(Autoclose, "upsets") == 0)
1922 			no_upsets = B_TRUE;
1923 
1924 	trim_suspects(fmep, no_upsets, &srl, &erl);
1925 
1926 	/*
1927 	 * If the resulting suspect list has no members, we're
1928 	 * done.  Returning here will simply close the case.
1929 	 */
1930 	if (fmep->nsuspects == 0) {
1931 		out(O_ALTFP,
1932 		    "[FME%d, case %s (all suspects are upsets)]",
1933 		    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
1934 		FREE(srl);
1935 		restore_suspects(fmep);
1936 		return;
1937 	}
1938 
1939 	/*
1940 	 * If the suspect list is all faults, then for a given fault,
1941 	 * say X of N, X's certainty is computed via:
1942 	 *
1943 	 * fitrate(X) / (fitrate(1) + ... + fitrate(N)) * 100
1944 	 *
1945 	 * If none of the suspects are faults, and there are N suspects,
1946 	 * the certainty of a given suspect is 100/N.
1947 	 *
1948 	 * If there are are a mixture of faults and other problems in
1949 	 * the suspect list, we take an average of the faults'
1950 	 * FITrates and treat this average as the FITrate for any
1951 	 * non-faults.  The fitrate of any given suspect is then
1952 	 * computed per the first formula above.
1953 	 */
1954 	if (fmep->nonfault == fmep->nsuspects) {
1955 		/* NO faults in the suspect list */
1956 		cert = percentof(1, fmep->nsuspects);
1957 	} else {
1958 		/* sum the fitrates */
1959 		frs = alloca(fmep->nsuspects * sizeof (uint_t));
1960 		fridx = frcnt = frsum = 0;
1961 
1962 		for (rp = srl; rp <= erl; rp++) {
1963 			struct node *n;
1964 
1965 			if (rp->suspect == NULL)
1966 				continue;
1967 			if (!is_fault(rp->suspect->t)) {
1968 				frs[fridx++] = 0;
1969 				continue;
1970 			}
1971 			n = eventprop_lookup(rp->suspect, L_FITrate);
1972 			if (node2uint(n, &fr) != 0) {
1973 				out(O_DEBUG|O_NONL, "event ");
1974 				ipath_print(O_DEBUG|O_NONL,
1975 				    ep->enode->u.event.ename->u.name.s,
1976 				    ep->ipp);
1977 				out(O_DEBUG, " has no FITrate (using 1)");
1978 				fr = 1;
1979 			} else if (fr == 0) {
1980 				out(O_DEBUG|O_NONL, "event ");
1981 				ipath_print(O_DEBUG|O_NONL,
1982 				    ep->enode->u.event.ename->u.name.s,
1983 				    ep->ipp);
1984 				out(O_DEBUG, " has zero FITrate (using 1)");
1985 				fr = 1;
1986 			}
1987 
1988 			frs[fridx++] = fr;
1989 			frsum += fr;
1990 			frcnt++;
1991 		}
1992 		fravg = avg(frsum, frcnt);
1993 		for (fridx = 0; fridx < fmep->nsuspects; fridx++)
1994 			if (frs[fridx] == 0) {
1995 				frs[fridx] = fravg;
1996 				frsum += fravg;
1997 			}
1998 	}
1999 
2000 	/* Add them in reverse order of our sort, as fmd reverses order */
2001 	for (rp = erl; rp >= srl; rp--) {
2002 		if (rp->suspect == NULL)
2003 			continue;
2004 		if (!is_fault(rp->suspect->t))
2005 			allfaulty = B_FALSE;
2006 		if (fmep->nonfault != fmep->nsuspects)
2007 			cert = percentof(frs[--fridx], frsum);
2008 		fault = fmd_nvl_create_fault(fmep->hdl,
2009 		    rp->suspect->enode->u.event.ename->u.name.s,
2010 		    cert,
2011 		    rp->asru,
2012 		    rp->fru,
2013 		    rp->rsrc);
2014 		if (fault == NULL)
2015 			out(O_DIE, "fault creation failed");
2016 		/* if "message" property exists, add it to the fault */
2017 		if (node2uint(eventprop_lookup(rp->suspect, L_message),
2018 		    &messval) == 0) {
2019 
2020 			out(O_ALTFP,
2021 			    "[FME%d, %s adds message=%d to suspect list]",
2022 			    fmep->id,
2023 			    rp->suspect->enode->u.event.ename->u.name.s,
2024 			    messval);
2025 			if (nvlist_add_boolean_value(fault,
2026 			    FM_SUSPECT_MESSAGE,
2027 			    (messval) ? B_TRUE : B_FALSE) != 0) {
2028 				out(O_DIE, "cannot add no-message to fault");
2029 			}
2030 		}
2031 		/* add any payload properties */
2032 		lut_walk(rp->suspect->payloadprops,
2033 		    (lut_cb)addpayloadprop, (void *)fault);
2034 		fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault);
2035 		rp->suspect->fault = fault;
2036 		rslfree(rp);
2037 
2038 		/*
2039 		 * If "action" property exists, evaluate it;  this must be done
2040 		 * before the dupclose check below since some actions may
2041 		 * modify the asru to be used in fmd_nvl_fmri_faulty.  This
2042 		 * needs to be restructured if any new actions are introduced
2043 		 * that have effects that we do not want to be visible if
2044 		 * we decide not to publish in the dupclose check below.
2045 		 */
2046 		if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) {
2047 			struct evalue evalue;
2048 
2049 			out(O_ALTFP|O_NONL,
2050 			    "[FME%d, %s action ", fmep->id,
2051 			    rp->suspect->enode->u.event.ename->u.name.s);
2052 			ptree_name_iter(O_ALTFP|O_NONL, snp);
2053 			out(O_ALTFP, "]");
2054 			Action_nvl = fault;
2055 			(void) eval_expr(snp, NULL, NULL, NULL, NULL,
2056 			    NULL, 0, &evalue);
2057 		}
2058 
2059 		/*
2060 		 * if "dupclose" tunable is set, check if the asru is
2061 		 * already marked as "faulty".
2062 		 */
2063 		if (Dupclose && allfaulty) {
2064 			nvlist_t *asru;
2065 
2066 			out(O_ALTFP|O_VERB, "FMD%d dupclose check ", fmep->id);
2067 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect);
2068 			out(O_ALTFP|O_VERB|O_NONL, " ");
2069 			if (nvlist_lookup_nvlist(fault,
2070 			    FM_FAULT_ASRU, &asru) != 0) {
2071 				out(O_ALTFP|O_VERB, "NULL asru");
2072 				allfaulty = B_FALSE;
2073 			} else if (fmd_nvl_fmri_faulty(fmep->hdl, asru)) {
2074 				out(O_ALTFP|O_VERB, "faulty");
2075 			} else {
2076 				out(O_ALTFP|O_VERB, "not faulty");
2077 				allfaulty = B_FALSE;
2078 			}
2079 		}
2080 
2081 	}
2082 
2083 	/*
2084 	 * Close the case if all asrus are already known to be faulty and if
2085 	 * Dupclose is enabled.  Otherwise we are going to publish so take
2086 	 * any pre-publication actions.
2087 	 */
2088 	if (Dupclose && allfaulty) {
2089 		out(O_ALTFP, "[dupclose FME%d, case %s]", fmep->id,
2090 		    fmd_case_uuid(fmep->hdl, fmep->fmcase));
2091 		fmd_case_close(fmep->hdl, fmep->fmcase);
2092 	} else {
2093 		for (rp = erl; rp >= srl; rp--) {
2094 			struct event *suspect = rp->suspect;
2095 
2096 			if (suspect == NULL)
2097 				continue;
2098 
2099 			fault = suspect->fault;
2100 
2101 			/* if "count" exists, increment the appropriate stat */
2102 			if ((snp = eventprop_lookup(suspect,
2103 			    L_count)) != NULL) {
2104 				out(O_ALTFP|O_NONL,
2105 				    "[FME%d, %s count ", fmep->id,
2106 				    suspect->enode->u.event.ename->u.name.s);
2107 				ptree_name_iter(O_ALTFP|O_NONL, snp);
2108 				out(O_ALTFP, "]");
2109 				istat_bump(snp, 0);
2110 
2111 			}
2112 		}
2113 		istat_save();	/* write out any istat changes */
2114 
2115 		out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
2116 		    fmd_case_uuid(fmep->hdl, fmep->fmcase));
2117 		fmd_case_solve(fmep->hdl, fmep->fmcase);
2118 	}
2119 
2120 	/*
2121 	 * revert to the original suspect list
2122 	 */
2123 	FREE(srl);
2124 	restore_suspects(fmep);
2125 }
2126 
2127 static void
2128 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep)
2129 {
2130 	struct case_list *newcase;
2131 	nvlist_t *defect;
2132 
2133 	out(O_ALTFP,
2134 	    "[undiagnosable ereport received, "
2135 	    "creating and closing a new case (%s)]",
2136 	    Undiag_reason ? Undiag_reason : "reason not provided");
2137 
2138 	newcase = MALLOC(sizeof (struct case_list));
2139 	newcase->next = NULL;
2140 
2141 	newcase->fmcase = fmd_case_open(hdl, NULL);
2142 	if (Undiagablecaselist != NULL)
2143 		newcase->next = Undiagablecaselist;
2144 	Undiagablecaselist = newcase;
2145 
2146 	if (ffep != NULL)
2147 		fmd_case_add_ereport(hdl, newcase->fmcase, ffep);
2148 
2149 	defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
2150 	    NULL, NULL, NULL);
2151 	if (Undiag_reason != NULL)
2152 		(void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason);
2153 	fmd_case_add_suspect(hdl, newcase->fmcase, defect);
2154 
2155 	fmd_case_solve(hdl, newcase->fmcase);
2156 	fmd_case_close(hdl, newcase->fmcase);
2157 }
2158 
2159 static void
2160 fme_undiagnosable(struct fme *f)
2161 {
2162 	nvlist_t *defect;
2163 
2164 	out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]",
2165 	    f->id, fmd_case_uuid(f->hdl, f->fmcase),
2166 	    Undiag_reason ? Undiag_reason : "undiagnosable");
2167 
2168 	defect = fmd_nvl_create_fault(f->hdl, UNDIAGNOSABLE_DEFECT, 100,
2169 	    NULL, NULL, NULL);
2170 	if (Undiag_reason != NULL)
2171 		(void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason);
2172 	fmd_case_add_suspect(f->hdl, f->fmcase, defect);
2173 	fmd_case_solve(f->hdl, f->fmcase);
2174 	destroy_fme_bufs(f);
2175 	fmd_case_close(f->hdl, f->fmcase);
2176 }
2177 
2178 /*
2179  * fme_close_case
2180  *
2181  *	Find the requested case amongst our fmes and close it.  Free up
2182  *	the related fme.
2183  */
2184 void
2185 fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase)
2186 {
2187 	struct case_list *ucasep, *prevcasep = NULL;
2188 	struct fme *prev = NULL;
2189 	struct fme *fmep;
2190 
2191 	for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) {
2192 		if (fmcase != ucasep->fmcase) {
2193 			prevcasep = ucasep;
2194 			continue;
2195 		}
2196 
2197 		if (prevcasep == NULL)
2198 			Undiagablecaselist = Undiagablecaselist->next;
2199 		else
2200 			prevcasep->next = ucasep->next;
2201 
2202 		FREE(ucasep);
2203 		return;
2204 	}
2205 
2206 	for (fmep = FMElist; fmep; fmep = fmep->next) {
2207 		if (fmep->hdl == hdl && fmep->fmcase == fmcase)
2208 			break;
2209 		prev = fmep;
2210 	}
2211 
2212 	if (fmep == NULL) {
2213 		out(O_WARN, "Eft asked to close unrecognized case [%s].",
2214 		    fmd_case_uuid(hdl, fmcase));
2215 		return;
2216 	}
2217 
2218 	if (EFMElist == fmep)
2219 		EFMElist = prev;
2220 
2221 	if (prev == NULL)
2222 		FMElist = FMElist->next;
2223 	else
2224 		prev->next = fmep->next;
2225 
2226 	fmep->next = NULL;
2227 
2228 	/* Get rid of any timer this fme has set */
2229 	if (fmep->wull != 0)
2230 		fmd_timer_remove(fmep->hdl, fmep->timer);
2231 
2232 	if (ClosedFMEs == NULL) {
2233 		ClosedFMEs = fmep;
2234 	} else {
2235 		fmep->next = ClosedFMEs;
2236 		ClosedFMEs = fmep;
2237 	}
2238 
2239 	Open_fme_count--;
2240 
2241 	/* See if we can close the overflow FME */
2242 	if (Open_fme_count <= Max_fme) {
2243 		for (fmep = FMElist; fmep; fmep = fmep->next) {
2244 			if (fmep->overflow && !(fmd_case_closed(fmep->hdl,
2245 			    fmep->fmcase)))
2246 				break;
2247 		}
2248 
2249 		if (fmep != NULL)
2250 			fmd_case_close(fmep->hdl, fmep->fmcase);
2251 	}
2252 }
2253 
2254 /*
2255  * fme_set_timer()
2256  *	If the time we need to wait for the given FME is less than the
2257  *	current timer, kick that old timer out and establish a new one.
2258  */
2259 static int
2260 fme_set_timer(struct fme *fmep, unsigned long long wull)
2261 {
2262 	out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait ");
2263 	ptree_timeval(O_ALTFP|O_VERB, &wull);
2264 
2265 	if (wull <= fmep->pull) {
2266 		out(O_ALTFP|O_VERB|O_NONL, "already have waited at least ");
2267 		ptree_timeval(O_ALTFP|O_VERB, &fmep->pull);
2268 		out(O_ALTFP|O_VERB, NULL);
2269 		/* we've waited at least wull already, don't need timer */
2270 		return (0);
2271 	}
2272 
2273 	out(O_ALTFP|O_VERB|O_NONL, " currently ");
2274 	if (fmep->wull != 0) {
2275 		out(O_ALTFP|O_VERB|O_NONL, "waiting ");
2276 		ptree_timeval(O_ALTFP|O_VERB, &fmep->wull);
2277 		out(O_ALTFP|O_VERB, NULL);
2278 	} else {
2279 		out(O_ALTFP|O_VERB|O_NONL, "not waiting");
2280 		out(O_ALTFP|O_VERB, NULL);
2281 	}
2282 
2283 	if (fmep->wull != 0)
2284 		if (wull >= fmep->wull)
2285 			/* New timer would fire later than established timer */
2286 			return (0);
2287 
2288 	if (fmep->wull != 0) {
2289 		fmd_timer_remove(fmep->hdl, fmep->timer);
2290 		if (fmep->timer == fmep->htid) {
2291 			out(O_ALTFP,
2292 			    "[stopped hesitating FME%d, case %s]",
2293 			    fmep->id,
2294 			    fmd_case_uuid(fmep->hdl,
2295 			    fmep->fmcase));
2296 			fmep->htid = 0;
2297 		}
2298 	}
2299 
2300 	fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep,
2301 	    fmep->e0r, wull);
2302 	out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer);
2303 	fmep->wull = wull;
2304 	return (1);
2305 }
2306 
2307 void
2308 fme_timer_fired(struct fme *fmep, id_t tid)
2309 {
2310 	struct fme *ffmep = NULL;
2311 
2312 	for (ffmep = FMElist; ffmep; ffmep = ffmep->next)
2313 		if (ffmep == fmep)
2314 			break;
2315 
2316 	if (ffmep == NULL) {
2317 		out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.",
2318 		    (void *)fmep);
2319 		return;
2320 	}
2321 
2322 	out(O_ALTFP, "Timer fired %lx %lx", tid, fmep->htid);
2323 	if (tid != fmep->htid) {
2324 		/*
2325 		 * normal timer (not the hesitation timer)
2326 		 */
2327 		fmep->pull = fmep->wull;
2328 		fmep->wull = 0;
2329 		fmd_buf_write(fmep->hdl, fmep->fmcase,
2330 		    WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull));
2331 		/*
2332 		 * no point in heistating if we've already waited.
2333 		 */
2334 		fmep->hesitated = 1;
2335 	} else {
2336 		fmep->hesitated = 1;
2337 	}
2338 	fme_eval(fmep, fmep->e0r);
2339 }
2340 
2341 /*
2342  * Preserve the fme's suspect list in its psuspects list, NULLing the
2343  * suspects list in the meantime.
2344  */
2345 static void
2346 save_suspects(struct fme *fmep)
2347 {
2348 	struct event *ep;
2349 	struct event *nextep;
2350 
2351 	/* zero out the previous suspect list */
2352 	for (ep = fmep->psuspects; ep; ep = nextep) {
2353 		nextep = ep->psuspects;
2354 		ep->psuspects = NULL;
2355 	}
2356 	fmep->psuspects = NULL;
2357 
2358 	/* zero out the suspect list, copying it to previous suspect list */
2359 	fmep->psuspects = fmep->suspects;
2360 	for (ep = fmep->suspects; ep; ep = nextep) {
2361 		nextep = ep->suspects;
2362 		ep->psuspects = ep->suspects;
2363 		ep->suspects = NULL;
2364 		ep->is_suspect = 0;
2365 	}
2366 	fmep->suspects = NULL;
2367 	fmep->nsuspects = 0;
2368 	fmep->nonfault = 0;
2369 }
2370 
2371 /*
2372  * Retrieve the fme's suspect list from its psuspects list.
2373  */
2374 static void
2375 restore_suspects(struct fme *fmep)
2376 {
2377 	struct event *ep;
2378 	struct event *nextep;
2379 
2380 	fmep->nsuspects = fmep->nonfault = 0;
2381 	fmep->suspects = fmep->psuspects;
2382 	for (ep = fmep->psuspects; ep; ep = nextep) {
2383 		fmep->nsuspects++;
2384 		if (!is_fault(ep->t))
2385 			fmep->nonfault++;
2386 		nextep = ep->psuspects;
2387 		ep->suspects = ep->psuspects;
2388 	}
2389 }
2390 
2391 /*
2392  * this is what we use to call the Emrys prototype code instead of main()
2393  */
2394 static void
2395 fme_eval(struct fme *fmep, fmd_event_t *ffep)
2396 {
2397 	struct event *ep;
2398 	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
2399 
2400 	save_suspects(fmep);
2401 
2402 	out(O_ALTFP|O_VERB, "Evaluate FME %d", fmep->id);
2403 	indent_set("  ");
2404 
2405 	lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
2406 	fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);
2407 
2408 	out(O_ALTFP|O_VERB|O_NONL, "FME%d state: %s, suspect list:", fmep->id,
2409 	    fme_state2str(fmep->state));
2410 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
2411 		out(O_ALTFP|O_VERB|O_NONL, " ");
2412 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2413 	}
2414 	out(O_ALTFP|O_VERB, NULL);
2415 
2416 	if (fmep->posted_suspects) {
2417 		/*
2418 		 * this FME has already posted a diagnosis, so see if
2419 		 * the event changed the diagnosis and print a warning
2420 		 * if it did.
2421 		 *
2422 		 */
2423 		if (suspects_changed(fmep)) {
2424 			print_suspects(SLCHANGED, fmep);
2425 			publish_suspects(fmep);
2426 		}
2427 	} else {
2428 		switch (fmep->state) {
2429 		case FME_CREDIBLE:
2430 			/*
2431 			 * if the suspect list contains any upsets, we
2432 			 * turn off the hesitation logic (by setting
2433 			 * the hesitate flag which normally indicates
2434 			 * we've already done the hesitate logic).
2435 			 * this is done because hesitating with upsets
2436 			 * causes us to explain away additional soft errors
2437 			 * while the upset FME stays open.
2438 			 */
2439 			if (fmep->hesitated == 0) {
2440 				struct event *s;
2441 
2442 				for (s = fmep->suspects; s; s = s->suspects) {
2443 					if (s->t == N_UPSET) {
2444 						fmep->hesitated = 1;
2445 						break;
2446 					}
2447 				}
2448 			}
2449 
2450 			if (Hesitate &&
2451 			    fmep->suspects != NULL &&
2452 			    fmep->suspects->suspects != NULL &&
2453 			    fmep->hesitated == 0) {
2454 				/*
2455 				 * about to publish multi-entry suspect list,
2456 				 * set the hesitation timer if not already set.
2457 				 */
2458 				if (fmep->htid == 0) {
2459 					out(O_ALTFP|O_NONL,
2460 					    "[hesitate FME%d, case %s ",
2461 					    fmep->id,
2462 					    fmd_case_uuid(fmep->hdl,
2463 					    fmep->fmcase));
2464 					ptree_timeval(O_ALTFP|O_NONL,
2465 					    (unsigned long long *)&Hesitate);
2466 					out(O_ALTFP, "]");
2467 					if (fme_set_timer(fmep, Hesitate))
2468 						fmep->htid = fmep->timer;
2469 				} else {
2470 					out(O_ALTFP,
2471 					    "[still hesitating FME%d, case %s]",
2472 					    fmep->id,
2473 					    fmd_case_uuid(fmep->hdl,
2474 					    fmep->fmcase));
2475 				}
2476 			} else {
2477 				print_suspects(SLNEW, fmep);
2478 				(void) upsets_eval(fmep, ffep);
2479 				publish_suspects(fmep);
2480 				fmep->posted_suspects = 1;
2481 				fmd_buf_write(fmep->hdl, fmep->fmcase,
2482 				    WOBUF_POSTD,
2483 				    (void *)&fmep->posted_suspects,
2484 				    sizeof (fmep->posted_suspects));
2485 			}
2486 			break;
2487 
2488 		case FME_WAIT:
2489 			/*
2490 			 * singleton suspect list implies
2491 			 * no point in waiting
2492 			 */
2493 			if (fmep->suspects &&
2494 			    fmep->suspects->suspects == NULL) {
2495 				print_suspects(SLNEW, fmep);
2496 				(void) upsets_eval(fmep, ffep);
2497 				publish_suspects(fmep);
2498 				fmep->posted_suspects = 1;
2499 				fmd_buf_write(fmep->hdl, fmep->fmcase,
2500 				    WOBUF_POSTD,
2501 				    (void *)&fmep->posted_suspects,
2502 				    sizeof (fmep->posted_suspects));
2503 				fmep->state = FME_CREDIBLE;
2504 			} else {
2505 				ASSERT(my_delay > fmep->ull);
2506 				(void) fme_set_timer(fmep, my_delay);
2507 				print_suspects(SLWAIT, fmep);
2508 			}
2509 			break;
2510 
2511 		case FME_DISPROVED:
2512 			print_suspects(SLDISPROVED, fmep);
2513 			Undiag_reason = UD_UNSOLVD;
2514 			fme_undiagnosable(fmep);
2515 			break;
2516 		}
2517 	}
2518 
2519 	if (fmep->posted_suspects == 1 && Autoclose != NULL) {
2520 		int doclose = 0;
2521 
2522 		if (strcmp(Autoclose, "true") == 0 ||
2523 		    strcmp(Autoclose, "all") == 0)
2524 			doclose = 1;
2525 
2526 		if (strcmp(Autoclose, "upsets") == 0) {
2527 			doclose = 1;
2528 			for (ep = fmep->suspects; ep; ep = ep->suspects) {
2529 				if (ep->t != N_UPSET) {
2530 					doclose = 0;
2531 					break;
2532 				}
2533 			}
2534 		}
2535 
2536 		if (doclose) {
2537 			out(O_ALTFP, "[closing FME%d, case %s (autoclose)]",
2538 			    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
2539 
2540 			destroy_fme_bufs(fmep);
2541 			fmd_case_close(fmep->hdl, fmep->fmcase);
2542 		}
2543 	}
2544 	itree_prune(fmep->eventtree);
2545 }
2546 
2547 static void indent(void);
2548 static int triggered(struct fme *fmep, struct event *ep, int mark);
2549 static enum fme_state effects_test(struct fme *fmep,
2550     struct event *fault_event, unsigned long long at_latest_by,
2551     unsigned long long *pdelay);
2552 static enum fme_state requirements_test(struct fme *fmep, struct event *ep,
2553     unsigned long long at_latest_by, unsigned long long *pdelay);
2554 static enum fme_state causes_test(struct fme *fmep, struct event *ep,
2555     unsigned long long at_latest_by, unsigned long long *pdelay);
2556 
2557 static int
2558 checkconstraints(struct fme *fmep, struct arrow *arrowp)
2559 {
2560 	struct constraintlist *ctp;
2561 	struct evalue value;
2562 
2563 	if (arrowp->forever_false) {
2564 		char *sep = "";
2565 		indent();
2566 		out(O_ALTFP|O_VERB|O_NONL, "  Forever false constraint: ");
2567 		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
2568 			out(O_ALTFP|O_VERB|O_NONL, sep);
2569 			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
2570 			sep = ", ";
2571 		}
2572 		out(O_ALTFP|O_VERB, NULL);
2573 		return (0);
2574 	}
2575 
2576 	for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
2577 		if (eval_expr(ctp->cnode, NULL, NULL,
2578 		    &fmep->globals, fmep->cfgdata->cooked,
2579 		    arrowp, 0, &value)) {
2580 			/* evaluation successful */
2581 			if (value.t == UNDEFINED || value.v == 0) {
2582 				/* known false */
2583 				arrowp->forever_false = 1;
2584 				indent();
2585 				out(O_ALTFP|O_VERB|O_NONL,
2586 				    "  False constraint: ");
2587 				ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
2588 				out(O_ALTFP|O_VERB, NULL);
2589 				return (0);
2590 			}
2591 		} else {
2592 			/* evaluation unsuccessful -- unknown value */
2593 			indent();
2594 			out(O_ALTFP|O_VERB|O_NONL,
2595 			    "  Deferred constraint: ");
2596 			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
2597 			out(O_ALTFP|O_VERB, NULL);
2598 			return (2);
2599 		}
2600 	}
2601 	/* known true */
2602 	return (1);
2603 }
2604 
2605 static int
2606 triggered(struct fme *fmep, struct event *ep, int mark)
2607 {
2608 	struct bubble *bp;
2609 	struct arrowlist *ap;
2610 	int count = 0;
2611 
2612 	stats_counter_bump(fmep->Tcallcount);
2613 	for (bp = itree_next_bubble(ep, NULL); bp;
2614 	    bp = itree_next_bubble(ep, bp)) {
2615 		if (bp->t != B_TO)
2616 			continue;
2617 		for (ap = itree_next_arrow(bp, NULL); ap;
2618 		    ap = itree_next_arrow(bp, ap)) {
2619 			/* check count of marks against K in the bubble */
2620 			if ((ap->arrowp->mark & mark) &&
2621 			    ++count >= bp->nork)
2622 				return (1);
2623 		}
2624 	}
2625 	return (0);
2626 }
2627 
2628 static int
2629 mark_arrows(struct fme *fmep, struct event *ep, int mark,
2630     unsigned long long at_latest_by, unsigned long long *pdelay, int keep)
2631 {
2632 	struct bubble *bp;
2633 	struct arrowlist *ap;
2634 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
2635 	unsigned long long my_delay;
2636 	enum fme_state result;
2637 	int retval = 0;
2638 
2639 	for (bp = itree_next_bubble(ep, NULL); bp;
2640 	    bp = itree_next_bubble(ep, bp)) {
2641 		if (bp->t != B_FROM)
2642 			continue;
2643 		stats_counter_bump(fmep->Marrowcount);
2644 		for (ap = itree_next_arrow(bp, NULL); ap;
2645 		    ap = itree_next_arrow(bp, ap)) {
2646 			struct event *ep2 = ap->arrowp->head->myevent;
2647 			/*
2648 			 * if we're clearing marks, we can avoid doing
2649 			 * all that work evaluating constraints.
2650 			 */
2651 			if (mark == 0) {
2652 				ap->arrowp->mark &= ~EFFECTS_COUNTER;
2653 				if (keep && (ep2->cached_state &
2654 				    (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT)))
2655 					ep2->keep_in_tree = 1;
2656 				ep2->cached_state &=
2657 				    ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT);
2658 				(void) mark_arrows(fmep, ep2, mark, 0, NULL,
2659 				    keep);
2660 				continue;
2661 			}
2662 			if (ep2->cached_state & REQMNTS_DISPROVED) {
2663 				indent();
2664 				out(O_ALTFP|O_VERB|O_NONL,
2665 				    "  ALREADY DISPROVED ");
2666 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2667 				out(O_ALTFP|O_VERB, NULL);
2668 				continue;
2669 			}
2670 			if (ep2->cached_state & WAIT_EFFECT) {
2671 				indent();
2672 				out(O_ALTFP|O_VERB|O_NONL,
2673 				    "  ALREADY EFFECTS WAIT ");
2674 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2675 				out(O_ALTFP|O_VERB, NULL);
2676 				continue;
2677 			}
2678 			if (ep2->cached_state & CREDIBLE_EFFECT) {
2679 				indent();
2680 				out(O_ALTFP|O_VERB|O_NONL,
2681 				    "  ALREADY EFFECTS CREDIBLE ");
2682 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2683 				out(O_ALTFP|O_VERB, NULL);
2684 				continue;
2685 			}
2686 			if ((ep2->cached_state & PARENT_WAIT) &&
2687 			    (mark & PARENT_WAIT)) {
2688 				indent();
2689 				out(O_ALTFP|O_VERB|O_NONL,
2690 				    "  ALREADY PARENT EFFECTS WAIT ");
2691 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2692 				out(O_ALTFP|O_VERB, NULL);
2693 				continue;
2694 			}
2695 			platform_set_payloadnvp(ep2->nvp);
2696 			if (checkconstraints(fmep, ap->arrowp) == 0) {
2697 				platform_set_payloadnvp(NULL);
2698 				indent();
2699 				out(O_ALTFP|O_VERB|O_NONL,
2700 				    "  CONSTRAINTS FAIL ");
2701 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2702 				out(O_ALTFP|O_VERB, NULL);
2703 				continue;
2704 			}
2705 			platform_set_payloadnvp(NULL);
2706 			ap->arrowp->mark |= EFFECTS_COUNTER;
2707 			if (!triggered(fmep, ep2, EFFECTS_COUNTER)) {
2708 				indent();
2709 				out(O_ALTFP|O_VERB|O_NONL,
2710 				    "  K-COUNT NOT YET MET ");
2711 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2712 				out(O_ALTFP|O_VERB, NULL);
2713 				continue;
2714 			}
2715 			ep2->cached_state &= ~PARENT_WAIT;
2716 			result = requirements_test(fmep, ep2, at_latest_by +
2717 			    ap->arrowp->maxdelay,
2718 			    &my_delay);
2719 			if (result == FME_WAIT) {
2720 				retval = WAIT_EFFECT;
2721 				if (overall_delay > my_delay)
2722 					overall_delay = my_delay;
2723 				ep2->cached_state |= WAIT_EFFECT;
2724 				indent();
2725 				out(O_ALTFP|O_VERB|O_NONL, "  EFFECTS WAIT ");
2726 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2727 				out(O_ALTFP|O_VERB, NULL);
2728 				indent_push("  E");
2729 				if (mark_arrows(fmep, ep2, PARENT_WAIT,
2730 				    at_latest_by, &my_delay, 0) ==
2731 				    WAIT_EFFECT) {
2732 					retval = WAIT_EFFECT;
2733 					if (overall_delay > my_delay)
2734 						overall_delay = my_delay;
2735 				}
2736 				indent_pop();
2737 			} else if (result == FME_DISPROVED) {
2738 				indent();
2739 				out(O_ALTFP|O_VERB|O_NONL,
2740 				    "  EFFECTS DISPROVED ");
2741 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2742 				out(O_ALTFP|O_VERB, NULL);
2743 			} else {
2744 				ep2->cached_state |= mark;
2745 				indent();
2746 				if (mark == CREDIBLE_EFFECT)
2747 					out(O_ALTFP|O_VERB|O_NONL,
2748 					    "  EFFECTS CREDIBLE ");
2749 				else
2750 					out(O_ALTFP|O_VERB|O_NONL,
2751 					    "  PARENT EFFECTS WAIT ");
2752 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
2753 				out(O_ALTFP|O_VERB, NULL);
2754 				indent_push("  E");
2755 				if (mark_arrows(fmep, ep2, mark, at_latest_by,
2756 				    &my_delay, 0) == WAIT_EFFECT) {
2757 					retval = WAIT_EFFECT;
2758 					if (overall_delay > my_delay)
2759 						overall_delay = my_delay;
2760 				}
2761 				indent_pop();
2762 			}
2763 		}
2764 	}
2765 	if (retval == WAIT_EFFECT)
2766 		*pdelay = overall_delay;
2767 	return (retval);
2768 }
2769 
2770 static enum fme_state
2771 effects_test(struct fme *fmep, struct event *fault_event,
2772     unsigned long long at_latest_by, unsigned long long *pdelay)
2773 {
2774 	struct event *error_event;
2775 	enum fme_state return_value = FME_CREDIBLE;
2776 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
2777 	unsigned long long my_delay;
2778 
2779 	stats_counter_bump(fmep->Ecallcount);
2780 	indent_push("  E");
2781 	indent();
2782 	out(O_ALTFP|O_VERB|O_NONL, "->");
2783 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
2784 	out(O_ALTFP|O_VERB, NULL);
2785 
2786 	(void) mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by,
2787 	    &my_delay, 0);
2788 	for (error_event = fmep->observations;
2789 	    error_event; error_event = error_event->observations) {
2790 		indent();
2791 		out(O_ALTFP|O_VERB|O_NONL, " ");
2792 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event);
2793 		if (!(error_event->cached_state & CREDIBLE_EFFECT)) {
2794 			if (error_event->cached_state &
2795 			    (PARENT_WAIT|WAIT_EFFECT)) {
2796 				return_value = FME_WAIT;
2797 				if (overall_delay > my_delay)
2798 					overall_delay = my_delay;
2799 				out(O_ALTFP|O_VERB, " NOT YET triggered");
2800 				continue;
2801 			}
2802 			return_value = FME_DISPROVED;
2803 			out(O_ALTFP|O_VERB, " NOT triggered");
2804 			break;
2805 		} else {
2806 			out(O_ALTFP|O_VERB, " triggered");
2807 		}
2808 	}
2809 	if (return_value == FME_DISPROVED) {
2810 		(void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0);
2811 	} else {
2812 		fault_event->keep_in_tree = 1;
2813 		(void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1);
2814 	}
2815 
2816 	indent();
2817 	out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ",
2818 	    fme_state2str(return_value));
2819 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
2820 	out(O_ALTFP|O_VERB, NULL);
2821 	indent_pop();
2822 	if (return_value == FME_WAIT)
2823 		*pdelay = overall_delay;
2824 	return (return_value);
2825 }
2826 
2827 static enum fme_state
2828 requirements_test(struct fme *fmep, struct event *ep,
2829     unsigned long long at_latest_by, unsigned long long *pdelay)
2830 {
2831 	int waiting_events;
2832 	int credible_events;
2833 	int deferred_events;
2834 	enum fme_state return_value = FME_CREDIBLE;
2835 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
2836 	unsigned long long arrow_delay;
2837 	unsigned long long my_delay;
2838 	struct event *ep2;
2839 	struct bubble *bp;
2840 	struct arrowlist *ap;
2841 
2842 	if (ep->cached_state & REQMNTS_CREDIBLE) {
2843 		indent();
2844 		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY CREDIBLE ");
2845 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2846 		out(O_ALTFP|O_VERB, NULL);
2847 		return (FME_CREDIBLE);
2848 	}
2849 	if (ep->cached_state & REQMNTS_DISPROVED) {
2850 		indent();
2851 		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY DISPROVED ");
2852 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2853 		out(O_ALTFP|O_VERB, NULL);
2854 		return (FME_DISPROVED);
2855 	}
2856 	if (ep->cached_state & REQMNTS_WAIT) {
2857 		indent();
2858 		*pdelay = ep->cached_delay;
2859 		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY WAIT ");
2860 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2861 		out(O_ALTFP|O_VERB|O_NONL, ", wait for: ");
2862 		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
2863 		out(O_ALTFP|O_VERB, NULL);
2864 		return (FME_WAIT);
2865 	}
2866 	stats_counter_bump(fmep->Rcallcount);
2867 	indent_push("  R");
2868 	indent();
2869 	out(O_ALTFP|O_VERB|O_NONL, "->");
2870 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2871 	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
2872 	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
2873 	out(O_ALTFP|O_VERB, NULL);
2874 
2875 	if (ep->t == N_EREPORT) {
2876 		if (ep->count == 0) {
2877 			if (fmep->pull >= at_latest_by) {
2878 				return_value = FME_DISPROVED;
2879 			} else {
2880 				ep->cached_delay = *pdelay = at_latest_by;
2881 				return_value = FME_WAIT;
2882 			}
2883 		}
2884 
2885 		indent();
2886 		switch (return_value) {
2887 		case FME_CREDIBLE:
2888 			ep->cached_state |= REQMNTS_CREDIBLE;
2889 			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE ");
2890 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2891 			break;
2892 		case FME_DISPROVED:
2893 			ep->cached_state |= REQMNTS_DISPROVED;
2894 			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED ");
2895 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2896 			break;
2897 		case FME_WAIT:
2898 			ep->cached_state |= REQMNTS_WAIT;
2899 			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT ");
2900 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2901 			out(O_ALTFP|O_VERB|O_NONL, " to ");
2902 			ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
2903 			break;
2904 		default:
2905 			out(O_DIE, "requirements_test: unexpected fme_state");
2906 			break;
2907 		}
2908 		out(O_ALTFP|O_VERB, NULL);
2909 		indent_pop();
2910 
2911 		return (return_value);
2912 	}
2913 
2914 	/* this event is not a report, descend the tree */
2915 	for (bp = itree_next_bubble(ep, NULL); bp;
2916 	    bp = itree_next_bubble(ep, bp)) {
2917 		int n;
2918 
2919 		if (bp->t != B_FROM)
2920 			continue;
2921 
2922 		n = bp->nork;
2923 
2924 		credible_events = 0;
2925 		waiting_events = 0;
2926 		deferred_events = 0;
2927 		arrow_delay = TIMEVAL_EVENTUALLY;
2928 		/*
2929 		 * n is -1 for 'A' so adjust it.
2930 		 * XXX just count up the arrows for now.
2931 		 */
2932 		if (n < 0) {
2933 			n = 0;
2934 			for (ap = itree_next_arrow(bp, NULL); ap;
2935 			    ap = itree_next_arrow(bp, ap))
2936 				n++;
2937 			indent();
2938 			out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n);
2939 		} else {
2940 			indent();
2941 			out(O_ALTFP|O_VERB, " Bubble N=%d", n);
2942 		}
2943 
2944 		if (n == 0)
2945 			continue;
2946 		if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) {
2947 			for (ap = itree_next_arrow(bp, NULL); ap;
2948 			    ap = itree_next_arrow(bp, ap)) {
2949 				ep2 = ap->arrowp->head->myevent;
2950 				platform_set_payloadnvp(ep2->nvp);
2951 				if (checkconstraints(fmep, ap->arrowp) == 0) {
2952 					/*
2953 					 * if any arrow is invalidated by the
2954 					 * constraints, then we should elide the
2955 					 * whole bubble to be consistant with
2956 					 * the tree creation time behaviour
2957 					 */
2958 					bp->mark |= BUBBLE_ELIDED;
2959 					platform_set_payloadnvp(NULL);
2960 					break;
2961 				}
2962 				platform_set_payloadnvp(NULL);
2963 			}
2964 		}
2965 		if (bp->mark & BUBBLE_ELIDED)
2966 			continue;
2967 		bp->mark |= BUBBLE_OK;
2968 		for (ap = itree_next_arrow(bp, NULL); ap;
2969 		    ap = itree_next_arrow(bp, ap)) {
2970 			ep2 = ap->arrowp->head->myevent;
2971 			if (n <= credible_events)
2972 				break;
2973 
2974 			ap->arrowp->mark |= REQMNTS_COUNTER;
2975 			if (triggered(fmep, ep2, REQMNTS_COUNTER))
2976 				/* XXX adding max timevals! */
2977 				switch (requirements_test(fmep, ep2,
2978 				    at_latest_by + ap->arrowp->maxdelay,
2979 				    &my_delay)) {
2980 				case FME_DEFERRED:
2981 					deferred_events++;
2982 					break;
2983 				case FME_CREDIBLE:
2984 					credible_events++;
2985 					break;
2986 				case FME_DISPROVED:
2987 					break;
2988 				case FME_WAIT:
2989 					if (my_delay < arrow_delay)
2990 						arrow_delay = my_delay;
2991 					waiting_events++;
2992 					break;
2993 				default:
2994 					out(O_DIE,
2995 					"Bug in requirements_test.");
2996 				}
2997 			else
2998 				deferred_events++;
2999 		}
3000 		indent();
3001 		out(O_ALTFP|O_VERB, " Credible: %d Waiting %d",
3002 		    credible_events + deferred_events, waiting_events);
3003 		if (credible_events + deferred_events + waiting_events < n) {
3004 			/* Can never meet requirements */
3005 			ep->cached_state |= REQMNTS_DISPROVED;
3006 			indent();
3007 			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED ");
3008 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3009 			out(O_ALTFP|O_VERB, NULL);
3010 			indent_pop();
3011 			return (FME_DISPROVED);
3012 		}
3013 		if (credible_events + deferred_events < n) {
3014 			/* will have to wait */
3015 			/* wait time is shortest known */
3016 			if (arrow_delay < overall_delay)
3017 				overall_delay = arrow_delay;
3018 			return_value = FME_WAIT;
3019 		} else if (credible_events < n) {
3020 			if (return_value != FME_WAIT)
3021 				return_value = FME_DEFERRED;
3022 		}
3023 	}
3024 
3025 	/*
3026 	 * don't mark as FME_DEFERRED. If this event isn't reached by another
3027 	 * path, then this will be considered FME_CREDIBLE. But if it is
3028 	 * reached by a different path so the K-count is met, then might
3029 	 * get overridden by FME_WAIT or FME_DISPROVED.
3030 	 */
3031 	if (return_value == FME_WAIT) {
3032 		ep->cached_state |= REQMNTS_WAIT;
3033 		ep->cached_delay = *pdelay = overall_delay;
3034 	} else if (return_value == FME_CREDIBLE) {
3035 		ep->cached_state |= REQMNTS_CREDIBLE;
3036 	}
3037 	indent();
3038 	out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ",
3039 	    fme_state2str(return_value));
3040 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3041 	out(O_ALTFP|O_VERB, NULL);
3042 	indent_pop();
3043 	return (return_value);
3044 }
3045 
3046 static enum fme_state
3047 causes_test(struct fme *fmep, struct event *ep,
3048     unsigned long long at_latest_by, unsigned long long *pdelay)
3049 {
3050 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
3051 	unsigned long long my_delay;
3052 	int credible_results = 0;
3053 	int waiting_results = 0;
3054 	enum fme_state fstate;
3055 	struct event *tail_event;
3056 	struct bubble *bp;
3057 	struct arrowlist *ap;
3058 	int k = 1;
3059 
3060 	stats_counter_bump(fmep->Ccallcount);
3061 	indent_push("  C");
3062 	indent();
3063 	out(O_ALTFP|O_VERB|O_NONL, "->");
3064 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3065 	out(O_ALTFP|O_VERB, NULL);
3066 
3067 	for (bp = itree_next_bubble(ep, NULL); bp;
3068 	    bp = itree_next_bubble(ep, bp)) {
3069 		if (bp->t != B_TO)
3070 			continue;
3071 		k = bp->nork;	/* remember the K value */
3072 		for (ap = itree_next_arrow(bp, NULL); ap;
3073 		    ap = itree_next_arrow(bp, ap)) {
3074 			int do_not_follow = 0;
3075 
3076 			/*
3077 			 * if we get to the same event multiple times
3078 			 * only worry about the first one.
3079 			 */
3080 			if (ap->arrowp->tail->myevent->cached_state &
3081 			    CAUSES_TESTED) {
3082 				indent();
3083 				out(O_ALTFP|O_VERB|O_NONL,
3084 				    "  causes test already run for ");
3085 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
3086 				    ap->arrowp->tail->myevent);
3087 				out(O_ALTFP|O_VERB, NULL);
3088 				continue;
3089 			}
3090 
3091 			/*
3092 			 * see if false constraint prevents us
3093 			 * from traversing this arrow
3094 			 */
3095 			platform_set_payloadnvp(ep->nvp);
3096 			if (checkconstraints(fmep, ap->arrowp) == 0)
3097 				do_not_follow = 1;
3098 			platform_set_payloadnvp(NULL);
3099 			if (do_not_follow) {
3100 				indent();
3101 				out(O_ALTFP|O_VERB|O_NONL,
3102 				    "  False arrow from ");
3103 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
3104 				    ap->arrowp->tail->myevent);
3105 				out(O_ALTFP|O_VERB, NULL);
3106 				continue;
3107 			}
3108 
3109 			ap->arrowp->tail->myevent->cached_state |=
3110 			    CAUSES_TESTED;
3111 			tail_event = ap->arrowp->tail->myevent;
3112 			fstate = hypothesise(fmep, tail_event, at_latest_by,
3113 			    &my_delay);
3114 
3115 			switch (fstate) {
3116 			case FME_WAIT:
3117 				if (my_delay < overall_delay)
3118 					overall_delay = my_delay;
3119 				waiting_results++;
3120 				break;
3121 			case FME_CREDIBLE:
3122 				credible_results++;
3123 				break;
3124 			case FME_DISPROVED:
3125 				break;
3126 			default:
3127 				out(O_DIE, "Bug in causes_test");
3128 			}
3129 		}
3130 	}
3131 	/* compare against K */
3132 	if (credible_results + waiting_results < k) {
3133 		indent();
3134 		out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED ");
3135 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3136 		out(O_ALTFP|O_VERB, NULL);
3137 		indent_pop();
3138 		return (FME_DISPROVED);
3139 	}
3140 	if (waiting_results != 0) {
3141 		*pdelay = overall_delay;
3142 		indent();
3143 		out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT ");
3144 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3145 		out(O_ALTFP|O_VERB|O_NONL, " to ");
3146 		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
3147 		out(O_ALTFP|O_VERB, NULL);
3148 		indent_pop();
3149 		return (FME_WAIT);
3150 	}
3151 	indent();
3152 	out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES CREDIBLE ");
3153 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3154 	out(O_ALTFP|O_VERB, NULL);
3155 	indent_pop();
3156 	return (FME_CREDIBLE);
3157 }
3158 
3159 static enum fme_state
3160 hypothesise(struct fme *fmep, struct event *ep,
3161 	unsigned long long at_latest_by, unsigned long long *pdelay)
3162 {
3163 	enum fme_state rtr, otr;
3164 	unsigned long long my_delay;
3165 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
3166 
3167 	stats_counter_bump(fmep->Hcallcount);
3168 	indent_push("  H");
3169 	indent();
3170 	out(O_ALTFP|O_VERB|O_NONL, "->");
3171 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3172 	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
3173 	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
3174 	out(O_ALTFP|O_VERB, NULL);
3175 
3176 	rtr = requirements_test(fmep, ep, at_latest_by, &my_delay);
3177 	if ((rtr == FME_WAIT) && (my_delay < overall_delay))
3178 		overall_delay = my_delay;
3179 	if (rtr != FME_DISPROVED) {
3180 		if (is_problem(ep->t)) {
3181 			otr = effects_test(fmep, ep, at_latest_by, &my_delay);
3182 			if (otr != FME_DISPROVED) {
3183 				if (fmep->peek == 0 && ep->is_suspect++ == 0) {
3184 					ep->suspects = fmep->suspects;
3185 					fmep->suspects = ep;
3186 					fmep->nsuspects++;
3187 					if (!is_fault(ep->t))
3188 						fmep->nonfault++;
3189 				}
3190 			}
3191 		} else
3192 			otr = causes_test(fmep, ep, at_latest_by, &my_delay);
3193 		if ((otr == FME_WAIT) && (my_delay < overall_delay))
3194 			overall_delay = my_delay;
3195 		if ((otr != FME_DISPROVED) &&
3196 		    ((rtr == FME_WAIT) || (otr == FME_WAIT)))
3197 			*pdelay = overall_delay;
3198 	}
3199 	if (rtr == FME_DISPROVED) {
3200 		indent();
3201 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
3202 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3203 		out(O_ALTFP|O_VERB, " (doesn't meet requirements)");
3204 		indent_pop();
3205 		return (FME_DISPROVED);
3206 	}
3207 	if ((otr == FME_DISPROVED) && is_problem(ep->t)) {
3208 		indent();
3209 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
3210 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3211 		out(O_ALTFP|O_VERB, " (doesn't explain all reports)");
3212 		indent_pop();
3213 		return (FME_DISPROVED);
3214 	}
3215 	if (otr == FME_DISPROVED) {
3216 		indent();
3217 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
3218 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3219 		out(O_ALTFP|O_VERB, " (causes are not credible)");
3220 		indent_pop();
3221 		return (FME_DISPROVED);
3222 	}
3223 	if ((rtr == FME_WAIT) || (otr == FME_WAIT)) {
3224 		indent();
3225 		out(O_ALTFP|O_VERB|O_NONL, "<-WAIT ");
3226 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3227 		out(O_ALTFP|O_VERB|O_NONL, " to ");
3228 		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay);
3229 		out(O_ALTFP|O_VERB, NULL);
3230 		indent_pop();
3231 		return (FME_WAIT);
3232 	}
3233 	indent();
3234 	out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE ");
3235 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3236 	out(O_ALTFP|O_VERB, NULL);
3237 	indent_pop();
3238 	return (FME_CREDIBLE);
3239 }
3240 
3241 /*
3242  * fme_istat_load -- reconstitute any persistent istats
3243  */
3244 void
3245 fme_istat_load(fmd_hdl_t *hdl)
3246 {
3247 	int sz;
3248 	char *sbuf;
3249 	char *ptr;
3250 
3251 	if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) {
3252 		out(O_ALTFP, "fme_istat_load: No stats");
3253 		return;
3254 	}
3255 
3256 	sbuf = alloca(sz);
3257 
3258 	fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz);
3259 
3260 	/*
3261 	 * pick apart the serialized stats
3262 	 *
3263 	 * format is:
3264 	 *	<class-name>, '@', <path>, '\0', <value>, '\0'
3265 	 * for example:
3266 	 *	"stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0"
3267 	 *
3268 	 * since this is parsing our own serialized data, any parsing issues
3269 	 * are fatal, so we check for them all with ASSERT() below.
3270 	 */
3271 	ptr = sbuf;
3272 	while (ptr < &sbuf[sz]) {
3273 		char *sepptr;
3274 		struct node *np;
3275 		int val;
3276 
3277 		sepptr = strchr(ptr, '@');
3278 		ASSERT(sepptr != NULL);
3279 		*sepptr = '\0';
3280 
3281 		/* construct the event */
3282 		np = newnode(T_EVENT, NULL, 0);
3283 		np->u.event.ename = newnode(T_NAME, NULL, 0);
3284 		np->u.event.ename->u.name.t = N_STAT;
3285 		np->u.event.ename->u.name.s = stable(ptr);
3286 		np->u.event.ename->u.name.it = IT_ENAME;
3287 		np->u.event.ename->u.name.last = np->u.event.ename;
3288 
3289 		ptr = sepptr + 1;
3290 		ASSERT(ptr < &sbuf[sz]);
3291 		ptr += strlen(ptr);
3292 		ptr++;	/* move past the '\0' separating path from value */
3293 		ASSERT(ptr < &sbuf[sz]);
3294 		ASSERT(isdigit(*ptr));
3295 		val = atoi(ptr);
3296 		ASSERT(val > 0);
3297 		ptr += strlen(ptr);
3298 		ptr++;	/* move past the final '\0' for this entry */
3299 
3300 		np->u.event.epname = pathstring2epnamenp(sepptr + 1);
3301 		ASSERT(np->u.event.epname != NULL);
3302 
3303 		istat_bump(np, val);
3304 		tree_free(np);
3305 	}
3306 
3307 	istat_save();
3308 }
3309