xref: /titanic_52/usr/src/cmd/fm/modules/common/eversholt/fme.c (revision f56c1286e5113aa46bd6e723da14d30c123153f2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  *
26  * fme.c -- fault management exercise module
27  *
28  * this module provides the simulated fault management exercise.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <strings.h>
37 #include <ctype.h>
38 #include <alloca.h>
39 #include <libnvpair.h>
40 #include <sys/fm/protocol.h>
41 #include <fm/fmd_api.h>
42 #include "alloc.h"
43 #include "out.h"
44 #include "stats.h"
45 #include "stable.h"
46 #include "literals.h"
47 #include "lut.h"
48 #include "tree.h"
49 #include "ptree.h"
50 #include "itree.h"
51 #include "ipath.h"
52 #include "fme.h"
53 #include "evnv.h"
54 #include "eval.h"
55 #include "config.h"
56 #include "platform.h"
57 
58 /* imported from eft.c... */
59 extern int Autoconvict;
60 extern char *Autoclose;
61 extern hrtime_t Hesitate;
62 extern nv_alloc_t Eft_nv_hdl;
63 extern int Max_fme;
64 
65 /* fme under construction is global so we can free it on module abort */
66 static struct fme *Nfmep;
67 
68 static const char *Undiag_reason;
69 
70 static int Nextid = 0;
71 
72 static int Open_fme_count = 0;	/* Count of open FMEs */
73 
74 /* list of fault management exercises underway */
/*
 * struct fme -- one fault management exercise.
 *
 * Three lists of these are kept: FMElist/EFMElist are the head and
 * tail of the list that fme_ready() appends to, and ClosedFMEs is a
 * list whose members are handed to destroy_fme() during cleanup.
 */
static struct fme {
	struct fme *next;		/* next exercise */
	unsigned long long ull;		/* time when fme was created */
	int id;				/* FME id */
	struct cfgdata *cfgdata;	/* full configuration data */
	struct lut *eventtree;		/* propagation tree for this FME */
	/*
	 * The initial error report that created this FME is kept in
	 * two forms.  e0 points to the instance tree node and is used
	 * by fme_eval() as the starting point for the inference
	 * algorithm.  e0r is the event handle FMD passed to us when
	 * the ereport first arrived and is used when setting timers,
	 * which are always relative to the time of this initial
	 * report.
	 */
	struct event *e0;
	fmd_event_t *e0r;

	id_t    timer;			/* for setting an fmd time-out */
	id_t	htid;			/* for setting hesitation timer */

	struct event *ecurrent;		/* ereport under consideration */
	struct event *suspects;		/* current suspect list */
	struct event *psuspects;	/* previous suspect list */
	int nsuspects;			/* count of suspects */
	int nonfault;			/* zero if all suspects T_FAULT */
	int posted_suspects;		/* true if we've posted a diagnosis */
	int hesitated;			/* true if we hesitated */
	int uniqobs;			/* number of unique events observed */
	int peek;			/* just peeking, don't track suspects */
	int overflow;			/* true if overflow FME */
	enum fme_state {
		FME_NOTHING = 5000,	/* not evaluated yet */
		FME_WAIT,		/* need to wait for more info */
		FME_CREDIBLE,		/* suspect list is credible */
		FME_DISPROVED		/* no valid suspects found */
	} state;

	unsigned long long pull;	/* time passed since created */
	unsigned long long wull;	/* wait until this time for re-eval */
	struct event *observations;	/* observation list */
	struct lut *globals;		/* values of global variables */
	/* fmd interfacing */
	fmd_hdl_t *hdl;			/* handle for talking with fmd */
	fmd_case_t *fmcase;		/* what fmd 'case' we associate with */
	/* stats -- all created by fme_ready(), named "fme<id>.<suffix>" */
	struct stats *Rcount;		/* ereports received */
	struct stats *Hcallcount;	/* calls to hypothesise() */
	struct stats *Rcallcount;	/* calls to requirements_test() */
	struct stats *Ccallcount;	/* calls to causes_test() */
	struct stats *Ecallcount;	/* calls to effects_test() */
	struct stats *Tcallcount;	/* calls to triggered() */
	struct stats *Marrowcount;	/* arrows marked by mark_arrows() */
	struct stats *diags;		/* suspect lists diagnosed */
} *FMElist, *EFMElist, *ClosedFMEs;
130 
/*
 * Singly-linked list of fmd cases that fme_restart() was unable to
 * restart.  The nodes are freed by fme_fini().
 */
static struct case_list {
	fmd_case_t *fmcase;		/* the case that could not be restarted */
	struct case_list *next;		/* next entry, NULL at end of list */
} *Undiagablecaselist;
135 
136 static void fme_eval(struct fme *fmep, fmd_event_t *ffep);
137 static enum fme_state hypothesise(struct fme *fmep, struct event *ep,
138 	unsigned long long at_latest_by, unsigned long long *pdelay,
139 	struct arrow *arrowp);
140 static struct node *eventprop_lookup(struct event *ep, const char *propname);
141 static struct node *pathstring2epnamenp(char *path);
142 static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep);
143 static void restore_suspects(struct fme *fmep);
144 static void save_suspects(struct fme *fmep);
145 static void destroy_fme(struct fme *f);
146 static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
147     const char *eventstring, const struct ipath *ipp, nvlist_t *nvl);
148 
149 static struct fme *
150 alloc_fme(void)
151 {
152 	struct fme *fmep;
153 
154 	fmep = MALLOC(sizeof (*fmep));
155 	bzero(fmep, sizeof (*fmep));
156 	return (fmep);
157 }
158 
159 /*
160  * fme_ready -- called when all initialization of the FME (except for
161  *	stats) has completed successfully.  Adds the fme to global lists
162  *	and establishes its stats.
163  */
164 static struct fme *
165 fme_ready(struct fme *fmep)
166 {
167 	char nbuf[100];
168 
169 	Nfmep = NULL;	/* don't need to free this on module abort now */
170 
171 	if (EFMElist) {
172 		EFMElist->next = fmep;
173 		EFMElist = fmep;
174 	} else
175 		FMElist = EFMElist = fmep;
176 
177 	(void) sprintf(nbuf, "fme%d.Rcount", fmep->id);
178 	fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
179 	(void) sprintf(nbuf, "fme%d.Hcall", fmep->id);
180 	fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1);
181 	(void) sprintf(nbuf, "fme%d.Rcall", fmep->id);
182 	fmep->Rcallcount = stats_new_counter(nbuf,
183 	    "calls to requirements_test()", 1);
184 	(void) sprintf(nbuf, "fme%d.Ccall", fmep->id);
185 	fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1);
186 	(void) sprintf(nbuf, "fme%d.Ecall", fmep->id);
187 	fmep->Ecallcount =
188 	    stats_new_counter(nbuf, "calls to effects_test()", 1);
189 	(void) sprintf(nbuf, "fme%d.Tcall", fmep->id);
190 	fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
191 	(void) sprintf(nbuf, "fme%d.Marrow", fmep->id);
192 	fmep->Marrowcount = stats_new_counter(nbuf,
193 	    "arrows marked by mark_arrows()", 1);
194 	(void) sprintf(nbuf, "fme%d.diags", fmep->id);
195 	fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);
196 
197 	out(O_ALTFP|O_VERB2, "newfme: config snapshot contains...");
198 	config_print(O_ALTFP|O_VERB2, fmep->cfgdata->cooked);
199 
200 	return (fmep);
201 }
202 
203 static struct fme *
204 newfme(const char *e0class, const struct ipath *e0ipp)
205 {
206 	struct cfgdata *cfgdata;
207 
208 	if ((cfgdata = config_snapshot()) == NULL) {
209 		out(O_ALTFP, "newfme: NULL configuration");
210 		Undiag_reason = UD_NOCONF;
211 		return (NULL);
212 	}
213 
214 	Nfmep = alloc_fme();
215 
216 	Nfmep->id = Nextid++;
217 	Nfmep->cfgdata = cfgdata;
218 	Nfmep->posted_suspects = 0;
219 	Nfmep->uniqobs = 0;
220 	Nfmep->state = FME_NOTHING;
221 	Nfmep->pull = 0ULL;
222 	Nfmep->overflow = 0;
223 
224 	Nfmep->fmcase = NULL;
225 	Nfmep->hdl = NULL;
226 
227 	if ((Nfmep->eventtree = itree_create(cfgdata->cooked)) == NULL) {
228 		out(O_ALTFP, "newfme: NULL instance tree");
229 		Undiag_reason = UD_INSTFAIL;
230 		config_free(cfgdata);
231 		FREE(Nfmep);
232 		Nfmep = NULL;
233 		return (NULL);
234 	}
235 
236 	itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree);
237 
238 	if ((Nfmep->e0 =
239 	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
240 		out(O_ALTFP, "newfme: e0 not in instance tree");
241 		Undiag_reason = UD_BADEVENTI;
242 		itree_free(Nfmep->eventtree);
243 		config_free(cfgdata);
244 		FREE(Nfmep);
245 		Nfmep = NULL;
246 		return (NULL);
247 	}
248 
249 	return (fme_ready(Nfmep));
250 }
251 
252 void
253 fme_fini(void)
254 {
255 	struct fme *sfp, *fp;
256 	struct case_list *ucasep, *nextcasep;
257 
258 	ucasep = Undiagablecaselist;
259 	while (ucasep != NULL) {
260 		nextcasep = ucasep->next;
261 		FREE(ucasep);
262 		ucasep = nextcasep;
263 	}
264 	Undiagablecaselist = NULL;
265 
266 	/* clean up closed fmes */
267 	fp = ClosedFMEs;
268 	while (fp != NULL) {
269 		sfp = fp->next;
270 		destroy_fme(fp);
271 		fp = sfp;
272 	}
273 	ClosedFMEs = NULL;
274 
275 	fp = FMElist;
276 	while (fp != NULL) {
277 		sfp = fp->next;
278 		destroy_fme(fp);
279 		fp = sfp;
280 	}
281 	FMElist = EFMElist = NULL;
282 
283 	/* if we were in the middle of creating an fme, free it now */
284 	if (Nfmep) {
285 		destroy_fme(Nfmep);
286 		Nfmep = NULL;
287 	}
288 }
289 
290 /*
291  * Allocated space for a buffer name.  20 bytes allows for
292  * a ridiculous 9,999,999 unique observations.
293  */
294 #define	OBBUFNMSZ 20
295 
296 /*
297  *  serialize_observation
298  *
299  *  Create a recoverable version of the current observation
300  *  (f->ecurrent).  We keep a serialized version of each unique
301  *  observation in order that we may resume correctly the fme in the
302  *  correct state if eft or fmd crashes and we're restarted.
303  */
304 static void
305 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp)
306 {
307 	size_t pkdlen;
308 	char tmpbuf[OBBUFNMSZ];
309 	char *pkd = NULL;
310 	char *estr;
311 
312 	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs);
313 	estr = ipath2str(cls, ipp);
314 	fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1);
315 	fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr,
316 	    strlen(estr) + 1);
317 	FREE(estr);
318 
319 	if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) {
320 		(void) snprintf(tmpbuf,
321 		    OBBUFNMSZ, "observed%d.nvp", fp->uniqobs);
322 		if (nvlist_xpack(fp->ecurrent->nvp,
323 		    &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0)
324 			out(O_DIE|O_SYS, "pack of observed nvl failed");
325 		fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen);
326 		fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen);
327 		FREE(pkd);
328 	}
329 
330 	fp->uniqobs++;
331 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
332 	    sizeof (fp->uniqobs));
333 }
334 
335 /*
336  *  init_fme_bufs -- We keep several bits of state about an fme for
337  *	use if eft or fmd crashes and we're restarted.
338  */
339 static void
340 init_fme_bufs(struct fme *fp)
341 {
342 	size_t cfglen = fp->cfgdata->nextfree - fp->cfgdata->begin;
343 
344 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_CFGLEN, sizeof (cfglen));
345 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_CFGLEN, (void *)&cfglen,
346 	    sizeof (cfglen));
347 	if (cfglen != 0) {
348 		fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_CFG, cfglen);
349 		fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_CFG,
350 		    fp->cfgdata->begin, cfglen);
351 	}
352 
353 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull));
354 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull,
355 	    sizeof (fp->pull));
356 
357 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id));
358 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id,
359 	    sizeof (fp->id));
360 
361 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs));
362 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
363 	    sizeof (fp->uniqobs));
364 
365 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD,
366 	    sizeof (fp->posted_suspects));
367 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD,
368 	    (void *)&fp->posted_suspects, sizeof (fp->posted_suspects));
369 }
370 
371 static void
372 destroy_fme_bufs(struct fme *fp)
373 {
374 	char tmpbuf[OBBUFNMSZ];
375 	int o;
376 
377 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN);
378 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG);
379 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL);
380 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID);
381 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD);
382 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS);
383 
384 	for (o = 0; o < fp->uniqobs; o++) {
385 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o);
386 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
387 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o);
388 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
389 	}
390 }
391 
392 /*
393  * reconstitute_observations -- convert a case's serialized observations
394  *	back into struct events.  Returns zero if all observations are
395  *	successfully reconstituted.
396  */
397 static int
398 reconstitute_observations(struct fme *fmep)
399 {
400 	struct event *ep;
401 	struct node *epnamenp = NULL;
402 	size_t pkdlen;
403 	char *pkd = NULL;
404 	char *tmpbuf = alloca(OBBUFNMSZ);
405 	char *sepptr;
406 	char *estr;
407 	int ocnt;
408 	int elen;
409 
410 	for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) {
411 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt);
412 		elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
413 		if (elen == 0) {
414 			out(O_ALTFP,
415 			    "reconstitute_observation: no %s buffer found.",
416 			    tmpbuf);
417 			Undiag_reason = UD_MISSINGOBS;
418 			break;
419 		}
420 
421 		estr = MALLOC(elen);
422 		fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
423 		sepptr = strchr(estr, '@');
424 		if (sepptr == NULL) {
425 			out(O_ALTFP,
426 			    "reconstitute_observation: %s: "
427 			    "missing @ separator in %s.",
428 			    tmpbuf, estr);
429 			Undiag_reason = UD_MISSINGPATH;
430 			FREE(estr);
431 			break;
432 		}
433 
434 		*sepptr = '\0';
435 		if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
436 			out(O_ALTFP,
437 			    "reconstitute_observation: %s: "
438 			    "trouble converting path string \"%s\" "
439 			    "to internal representation.",
440 			    tmpbuf, sepptr + 1);
441 			Undiag_reason = UD_MISSINGPATH;
442 			FREE(estr);
443 			break;
444 		}
445 
446 		/* construct the event */
447 		ep = itree_lookup(fmep->eventtree,
448 		    stable(estr), ipath(epnamenp));
449 		if (ep == NULL) {
450 			out(O_ALTFP,
451 			    "reconstitute_observation: %s: "
452 			    "lookup of  \"%s\" in itree failed.",
453 			    tmpbuf, ipath2str(estr, ipath(epnamenp)));
454 			Undiag_reason = UD_BADOBS;
455 			tree_free(epnamenp);
456 			FREE(estr);
457 			break;
458 		}
459 		tree_free(epnamenp);
460 
461 		/*
462 		 * We may or may not have a saved nvlist for the observation
463 		 */
464 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt);
465 		pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
466 		if (pkdlen != 0) {
467 			pkd = MALLOC(pkdlen);
468 			fmd_buf_read(fmep->hdl,
469 			    fmep->fmcase, tmpbuf, pkd, pkdlen);
470 			if (nvlist_xunpack(pkd,
471 			    pkdlen, &ep->nvp, &Eft_nv_hdl) != 0)
472 				out(O_DIE|O_SYS, "pack of observed nvl failed");
473 			FREE(pkd);
474 		}
475 
476 		if (ocnt == 0)
477 			fmep->e0 = ep;
478 
479 		FREE(estr);
480 		fmep->ecurrent = ep;
481 		ep->count++;
482 
483 		/* link it into list of observations seen */
484 		ep->observations = fmep->observations;
485 		fmep->observations = ep;
486 	}
487 
488 	if (ocnt == fmep->uniqobs) {
489 		(void) fme_ready(fmep);
490 		return (0);
491 	}
492 
493 	return (1);
494 }
495 
496 /*
497  * restart_fme -- called during eft initialization.  Reconstitutes
498  *	an in-progress fme.
499  */
500 void
501 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress)
502 {
503 	nvlist_t *defect;
504 	struct case_list *bad;
505 	struct fme *fmep;
506 	struct cfgdata *cfgdata = NULL;
507 	size_t rawsz;
508 
509 	fmep = alloc_fme();
510 	fmep->fmcase = inprogress;
511 	fmep->hdl = hdl;
512 
513 	if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) {
514 		out(O_ALTFP, "restart_fme: No config data");
515 		Undiag_reason = UD_MISSINGINFO;
516 		goto badcase;
517 	}
518 	fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz,
519 	    sizeof (size_t));
520 
521 	if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) {
522 		out(O_ALTFP, "restart_fme: No event zero");
523 		Undiag_reason = UD_MISSINGZERO;
524 		goto badcase;
525 	}
526 
527 	cfgdata = MALLOC(sizeof (struct cfgdata));
528 	cfgdata->cooked = NULL;
529 	cfgdata->devcache = NULL;
530 	cfgdata->cpucache = NULL;
531 	cfgdata->refcnt = 1;
532 
533 	if (rawsz > 0) {
534 		if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) {
535 			out(O_ALTFP, "restart_fme: Config data size mismatch");
536 			Undiag_reason = UD_CFGMISMATCH;
537 			goto badcase;
538 		}
539 		cfgdata->begin = MALLOC(rawsz);
540 		cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz;
541 		fmd_buf_read(hdl,
542 		    inprogress, WOBUF_CFG, cfgdata->begin, rawsz);
543 	} else {
544 		cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL;
545 	}
546 	fmep->cfgdata = cfgdata;
547 
548 	config_cook(cfgdata);
549 	if ((fmep->eventtree = itree_create(cfgdata->cooked)) == NULL) {
550 		/* case not properly saved or irretrievable */
551 		out(O_ALTFP, "restart_fme: NULL instance tree");
552 		Undiag_reason = UD_INSTFAIL;
553 		goto badcase;
554 	}
555 
556 	itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree);
557 
558 	if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) {
559 		out(O_ALTFP, "restart_fme: no saved wait time");
560 		Undiag_reason = UD_MISSINGINFO;
561 		goto badcase;
562 	} else {
563 		fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull,
564 		    sizeof (fmep->pull));
565 	}
566 
567 	if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) {
568 		out(O_ALTFP, "restart_fme: no saved posted status");
569 		Undiag_reason = UD_MISSINGINFO;
570 		goto badcase;
571 	} else {
572 		fmd_buf_read(hdl, inprogress, WOBUF_POSTD,
573 		    (void *)&fmep->posted_suspects,
574 		    sizeof (fmep->posted_suspects));
575 	}
576 
577 	if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) {
578 		out(O_ALTFP, "restart_fme: no saved id");
579 		Undiag_reason = UD_MISSINGINFO;
580 		goto badcase;
581 	} else {
582 		fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id,
583 		    sizeof (fmep->id));
584 	}
585 	if (Nextid <= fmep->id)
586 		Nextid = fmep->id + 1;
587 
588 	if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) {
589 		out(O_ALTFP, "restart_fme: no count of observations");
590 		Undiag_reason = UD_MISSINGINFO;
591 		goto badcase;
592 	} else {
593 		fmd_buf_read(hdl, inprogress, WOBUF_NOBS,
594 		    (void *)&fmep->uniqobs, sizeof (fmep->uniqobs));
595 	}
596 
597 	if (reconstitute_observations(fmep) != 0)
598 		goto badcase;
599 
600 	Open_fme_count++;
601 
602 	/* give the diagnosis algorithm a shot at the new FME state */
603 	fme_eval(fmep, NULL);
604 	return;
605 
606 badcase:
607 	if (fmep->eventtree != NULL)
608 		itree_free(fmep->eventtree);
609 	config_free(cfgdata);
610 	destroy_fme_bufs(fmep);
611 	FREE(fmep);
612 
613 	/*
614 	 * Since we're unable to restart the case, add it to the undiagable
615 	 * list and solve and close it as appropriate.
616 	 */
617 	bad = MALLOC(sizeof (struct case_list));
618 	bad->next = NULL;
619 
620 	if (Undiagablecaselist != NULL)
621 		bad->next = Undiagablecaselist;
622 	Undiagablecaselist = bad;
623 	bad->fmcase = inprogress;
624 
625 	out(O_ALTFP, "[case %s (unable to restart), ",
626 	    fmd_case_uuid(hdl, bad->fmcase));
627 
628 	if (fmd_case_solved(hdl, bad->fmcase)) {
629 		out(O_ALTFP, "already solved, ");
630 	} else {
631 		out(O_ALTFP, "solving, ");
632 		defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
633 		    NULL, NULL, NULL);
634 		if (Undiag_reason != NULL)
635 			(void) nvlist_add_string(defect,
636 			    UNDIAG_REASON, Undiag_reason);
637 		fmd_case_add_suspect(hdl, bad->fmcase, defect);
638 		fmd_case_solve(hdl, bad->fmcase);
639 	}
640 
641 	if (fmd_case_closed(hdl, bad->fmcase)) {
642 		out(O_ALTFP, "already closed ]");
643 	} else {
644 		out(O_ALTFP, "closing ]");
645 		fmd_case_close(hdl, bad->fmcase);
646 	}
647 }
648 
649 void
650 destroy_fme(struct fme *f)
651 {
652 	stats_delete(f->Rcount);
653 	stats_delete(f->Hcallcount);
654 	stats_delete(f->Rcallcount);
655 	stats_delete(f->Ccallcount);
656 	stats_delete(f->Ecallcount);
657 	stats_delete(f->Tcallcount);
658 	stats_delete(f->Marrowcount);
659 	stats_delete(f->diags);
660 
661 	itree_free(f->eventtree);
662 	config_free(f->cfgdata);
663 	FREE(f);
664 }
665 
666 static const char *
667 fme_state2str(enum fme_state s)
668 {
669 	switch (s) {
670 	case FME_NOTHING:	return ("NOTHING");
671 	case FME_WAIT:		return ("WAIT");
672 	case FME_CREDIBLE:	return ("CREDIBLE");
673 	case FME_DISPROVED:	return ("DISPROVED");
674 	default:		return ("UNKNOWN");
675 	}
676 }
677 
678 static int
679 is_problem(enum nametype t)
680 {
681 	return (t == N_FAULT || t == N_DEFECT || t == N_UPSET);
682 }
683 
684 static int
685 is_fault(enum nametype t)
686 {
687 	return (t == N_FAULT);
688 }
689 
690 static int
691 is_defect(enum nametype t)
692 {
693 	return (t == N_DEFECT);
694 }
695 
696 static int
697 is_upset(enum nametype t)
698 {
699 	return (t == N_UPSET);
700 }
701 
702 /*ARGSUSED*/
703 static void
704 clear_causes_tested(struct event *lhs, struct event *ep, void *arg)
705 {
706 	struct bubble *bp;
707 	struct arrowlist *ap;
708 
709 	for (bp = itree_next_bubble(ep, NULL); bp;
710 	    bp = itree_next_bubble(ep, bp)) {
711 		if (bp->t != B_FROM)
712 			continue;
713 		for (ap = itree_next_arrow(bp, NULL); ap;
714 		    ap = itree_next_arrow(bp, ap))
715 			ap->arrowp->causes_tested = 0;
716 	}
717 }
718 
719 /*
720  * call this function with initcode set to 0 to initialize cycle tracking
721  */
722 static void
723 initialize_cycles(struct fme *fmep)
724 {
725 	lut_walk(fmep->eventtree, (lut_cb)clear_causes_tested, NULL);
726 }
727 
728 static void
729 fme_print(int flags, struct fme *fmep)
730 {
731 	struct event *ep;
732 
733 	out(flags, "Fault Management Exercise %d", fmep->id);
734 	out(flags, "\t       State: %s", fme_state2str(fmep->state));
735 	out(flags|O_NONL, "\t  Start time: ");
736 	ptree_timeval(flags|O_NONL, &fmep->ull);
737 	out(flags, NULL);
738 	if (fmep->wull) {
739 		out(flags|O_NONL, "\t   Wait time: ");
740 		ptree_timeval(flags|O_NONL, &fmep->wull);
741 		out(flags, NULL);
742 	}
743 	out(flags|O_NONL, "\t          E0: ");
744 	if (fmep->e0)
745 		itree_pevent_brief(flags|O_NONL, fmep->e0);
746 	else
747 		out(flags|O_NONL, "NULL");
748 	out(flags, NULL);
749 	out(flags|O_NONL, "\tObservations:");
750 	for (ep = fmep->observations; ep; ep = ep->observations) {
751 		out(flags|O_NONL, " ");
752 		itree_pevent_brief(flags|O_NONL, ep);
753 	}
754 	out(flags, NULL);
755 	out(flags|O_NONL, "\tSuspect list:");
756 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
757 		out(flags|O_NONL, " ");
758 		itree_pevent_brief(flags|O_NONL, ep);
759 	}
760 	out(flags, NULL);
761 	out(flags|O_VERB2, "\t        Tree:");
762 	itree_ptree(flags|O_VERB2, fmep->eventtree);
763 }
764 
765 static struct node *
766 pathstring2epnamenp(char *path)
767 {
768 	char *sep = "/";
769 	struct node *ret;
770 	char *ptr;
771 
772 	if ((ptr = strtok(path, sep)) == NULL)
773 		out(O_DIE, "pathstring2epnamenp: invalid empty class");
774 
775 	ret = tree_iname(stable(ptr), NULL, 0);
776 
777 	while ((ptr = strtok(NULL, sep)) != NULL)
778 		ret = tree_name_append(ret,
779 		    tree_iname(stable(ptr), NULL, 0));
780 
781 	return (ret);
782 }
783 
784 /*
785  * for a given upset sp, increment the corresponding SERD engine.  if the
786  * SERD engine trips, return the ename and ipp of the resulting ereport.
787  * returns true if engine tripped and *enamep and *ippp were filled in.
788  */
789 static int
790 serd_eval(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase,
791 	struct event *sp, const char **enamep, const struct ipath **ippp)
792 {
793 	struct node *serdinst;
794 	char *serdname;
795 
796 	ASSERT(sp->t == N_UPSET);
797 	ASSERT(ffep != NULL);
798 
799 	/*
800 	 * obtain instanced SERD engine from the upset sp.  from this
801 	 * derive serdname, the string used to identify the SERD engine.
802 	 */
803 	serdinst = eventprop_lookup(sp, L_engine);
804 
805 	if (serdinst == NULL)
806 		return (NULL);
807 
808 	serdname = ipath2str(serdinst->u.stmt.np->u.event.ename->u.name.s,
809 	    ipath(serdinst->u.stmt.np->u.event.epname));
810 
811 	if (!fmd_serd_exists(hdl, serdname)) {
812 		struct node *nN, *nT;
813 
814 		/* no SERD engine yet, so create it */
815 		nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N, NULL);
816 		nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T, NULL);
817 
818 		ASSERT(nN->t == T_NUM);
819 		ASSERT(nT->t == T_TIMEVAL);
820 
821 		fmd_serd_create(hdl, serdname, (uint_t)nN->u.ull,
822 		    (hrtime_t)nT->u.ull);
823 	}
824 
825 
826 	/*
827 	 * increment SERD engine.  if engine fires, reset serd
828 	 * engine and return trip_strcode
829 	 */
830 	if (fmd_serd_record(hdl, serdname, ffep)) {
831 		struct node *tripinst = lut_lookup(serdinst->u.stmt.lutp,
832 		    (void *)L_trip, NULL);
833 
834 		ASSERT(tripinst != NULL);
835 
836 		*enamep = tripinst->u.event.ename->u.name.s;
837 		*ippp = ipath(tripinst->u.event.epname);
838 
839 		fmd_case_add_serd(hdl, fmcase, serdname);
840 		fmd_serd_reset(hdl, serdname);
841 		out(O_ALTFP|O_NONL, "[engine fired: %s, sending: ", serdname);
842 		ipath_print(O_ALTFP|O_NONL, *enamep, *ippp);
843 		out(O_ALTFP, "]");
844 
845 		FREE(serdname);
846 		return (1);
847 	}
848 
849 	FREE(serdname);
850 	return (0);
851 }
852 
853 /*
854  * search a suspect list for upsets.  feed each upset to serd_eval() and
855  * build up tripped[], an array of ereports produced by the firing of
856  * any SERD engines.  then feed each ereport back into
857  * fme_receive_report().
858  *
859  * returns ntrip, the number of these ereports produced.
860  */
861 static int
862 upsets_eval(struct fme *fmep, fmd_event_t *ffep)
863 {
864 	/* we build an array of tripped ereports that we send ourselves */
865 	struct {
866 		const char *ename;
867 		const struct ipath *ipp;
868 	} *tripped;
869 	struct event *sp;
870 	int ntrip, nupset, i;
871 
872 	/*
873 	 * we avoid recursion by calling fme_receive_report() at the end of
874 	 * this function with a NULL ffep
875 	 */
876 	if (ffep == NULL)
877 		return (0);
878 
879 	/*
880 	 * count the number of upsets to determine the upper limit on
881 	 * expected trip ereport strings.  remember that one upset can
882 	 * lead to at most one ereport.
883 	 */
884 	nupset = 0;
885 	for (sp = fmep->suspects; sp; sp = sp->suspects) {
886 		if (sp->t == N_UPSET)
887 			nupset++;
888 	}
889 
890 	if (nupset == 0)
891 		return (0);
892 
893 	/*
894 	 * get to this point if we have upsets and expect some trip
895 	 * ereports
896 	 */
897 	tripped = alloca(sizeof (*tripped) * nupset);
898 	bzero((void *)tripped, sizeof (*tripped) * nupset);
899 
900 	ntrip = 0;
901 	for (sp = fmep->suspects; sp; sp = sp->suspects)
902 		if (sp->t == N_UPSET &&
903 		    serd_eval(fmep->hdl, ffep, fmep->fmcase, sp,
904 			    &tripped[ntrip].ename, &tripped[ntrip].ipp))
905 			ntrip++;
906 
907 	for (i = 0; i < ntrip; i++)
908 		fme_receive_report(fmep->hdl, NULL,
909 		    tripped[i].ename, tripped[i].ipp, NULL);
910 
911 	return (ntrip);
912 }
913 
914 /*
915  * fme_receive_external_report -- call when an external ereport comes in
916  *
917  * this routine just converts the relevant information from the ereport
918  * into a format used internally and passes it on to fme_receive_report().
919  */
920 void
921 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
922     const char *eventstring)
923 {
924 	struct node *epnamenp = platform_getpath(nvl);
925 	const struct ipath *ipp;
926 
927 	/*
928 	 * XFILE: If we ended up without a path, it's an X-file.
929 	 * For now, use our undiagnosable interface.
930 	 */
931 	if (epnamenp == NULL) {
932 		out(O_ALTFP, "XFILE: Unable to get path from ereport");
933 		Undiag_reason = UD_NOPATH;
934 		publish_undiagnosable(hdl, ffep);
935 		return;
936 	}
937 
938 	ipp = ipath(epnamenp);
939 	tree_free(epnamenp);
940 	fme_receive_report(hdl, ffep, stable(eventstring), ipp, nvl);
941 }
942 
943 static void
944 fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
945     const char *eventstring, const struct ipath *ipp, nvlist_t *nvl)
946 {
947 	struct event *ep;
948 	struct fme *fmep = NULL;
949 	struct fme *ofmep = NULL;
950 	struct fme *cfmep, *svfmep;
951 	int matched = 0;
952 	nvlist_t *defect;
953 
954 	out(O_ALTFP|O_NONL, "fme_receive_report: ");
955 	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
956 	out(O_ALTFP|O_STAMP, NULL);
957 
958 	/* decide which FME it goes to */
959 	for (fmep = FMElist; fmep; fmep = fmep->next) {
960 		int prev_verbose;
961 		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
962 		enum fme_state state;
963 
964 		if (fmep->overflow) {
965 			if (!(fmd_case_closed(fmep->hdl, fmep->fmcase)))
966 				ofmep = fmep;
967 
968 			continue;
969 		}
970 
971 		/* look up event in event tree for this FME */
972 		if ((ep = itree_lookup(fmep->eventtree,
973 		    eventstring, ipp)) == NULL)
974 			continue;
975 
976 		/* note observation */
977 		fmep->ecurrent = ep;
978 		if (ep->count++ == 0) {
979 			/* link it into list of observations seen */
980 			ep->observations = fmep->observations;
981 			fmep->observations = ep;
982 			ep->nvp = evnv_dupnvl(nvl);
983 		}
984 
985 		/* tell hypothesise() not to mess with suspect list */
986 		fmep->peek = 1;
987 
988 		/* don't want this to be verbose (unless Debug is set) */
989 		prev_verbose = Verbose;
990 		if (Debug == 0)
991 			Verbose = 0;
992 
993 		initialize_cycles(fmep);
994 		state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay, NULL);
995 
996 		fmep->peek = 0;
997 
998 		/* put verbose flag back */
999 		Verbose = prev_verbose;
1000 
1001 		if (state != FME_DISPROVED) {
1002 			/* found an FME that explains the ereport */
1003 			matched++;
1004 			out(O_ALTFP|O_NONL, "[");
1005 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1006 			out(O_ALTFP, " explained by FME%d]", fmep->id);
1007 
1008 			if (ep->count == 1)
1009 				serialize_observation(fmep, eventstring, ipp);
1010 
1011 			if (ffep)
1012 				fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1013 
1014 			stats_counter_bump(fmep->Rcount);
1015 
1016 			/* re-eval FME */
1017 			fme_eval(fmep, ffep);
1018 		} else {
1019 
1020 			/* not a match, undo noting of observation */
1021 			fmep->ecurrent = NULL;
1022 			if (--ep->count == 0) {
1023 				/* unlink it from observations */
1024 				fmep->observations = ep->observations;
1025 				ep->observations = NULL;
1026 				nvlist_free(ep->nvp);
1027 				ep->nvp = NULL;
1028 			}
1029 		}
1030 	}
1031 
1032 	if (matched)
1033 		return;	/* explained by at least one existing FME */
1034 
1035 	/* clean up closed fmes */
1036 	cfmep = ClosedFMEs;
1037 	while (cfmep != NULL) {
1038 		svfmep = cfmep->next;
1039 		destroy_fme(cfmep);
1040 		cfmep = svfmep;
1041 	}
1042 	ClosedFMEs = NULL;
1043 
1044 	if (ofmep) {
1045 		out(O_ALTFP|O_NONL, "[");
1046 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1047 		out(O_ALTFP, " ADDING TO OVERFLOW FME]");
1048 		if (ffep)
1049 			fmd_case_add_ereport(hdl, ofmep->fmcase, ffep);
1050 
1051 		return;
1052 
1053 	} else if (Max_fme && (Open_fme_count >= Max_fme)) {
1054 		out(O_ALTFP|O_NONL, "[");
1055 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1056 		out(O_ALTFP, " MAX OPEN FME REACHED]");
1057 		/* Create overflow fme */
1058 		if ((fmep = newfme(eventstring, ipp)) == NULL) {
1059 			out(O_ALTFP|O_NONL, "[");
1060 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1061 			out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]");
1062 			publish_undiagnosable(hdl, ffep);
1063 			return;
1064 		}
1065 
1066 		Open_fme_count++;
1067 
1068 		fmep->fmcase = fmd_case_open(hdl, NULL);
1069 		fmep->hdl = hdl;
1070 		init_fme_bufs(fmep);
1071 		fmep->overflow = B_TRUE;
1072 
1073 		if (ffep)
1074 			fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1075 
1076 		defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
1077 		    NULL, NULL, NULL);
1078 		(void) nvlist_add_string(defect, UNDIAG_REASON, UD_MAXFME);
1079 		fmd_case_add_suspect(hdl, fmep->fmcase, defect);
1080 		fmd_case_solve(hdl, fmep->fmcase);
1081 		return;
1082 	}
1083 
1084 	/* start a new FME */
1085 	if ((fmep = newfme(eventstring, ipp)) == NULL) {
1086 		out(O_ALTFP|O_NONL, "[");
1087 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1088 		out(O_ALTFP, " CANNOT DIAGNOSE]");
1089 		publish_undiagnosable(hdl, ffep);
1090 		return;
1091 	}
1092 
1093 	Open_fme_count++;
1094 
1095 	/* open a case */
1096 	fmep->fmcase = fmd_case_open(hdl, NULL);
1097 	fmep->hdl = hdl;
1098 	init_fme_bufs(fmep);
1099 
1100 	out(O_ALTFP|O_NONL, "[");
1101 	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1102 	out(O_ALTFP, " created FME%d, case %s]", fmep->id,
1103 	    fmd_case_uuid(hdl, fmep->fmcase));
1104 
1105 	ep = fmep->e0;
1106 	ASSERT(ep != NULL);
1107 
1108 	/* note observation */
1109 	fmep->ecurrent = ep;
1110 	if (ep->count++ == 0) {
1111 		/* link it into list of observations seen */
1112 		ep->observations = fmep->observations;
1113 		fmep->observations = ep;
1114 		ep->nvp = evnv_dupnvl(nvl);
1115 		serialize_observation(fmep, eventstring, ipp);
1116 	}
1117 
1118 	stats_counter_bump(fmep->Rcount);
1119 
1120 	if (ffep) {
1121 		fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1122 		fmd_case_setprincipal(hdl, fmep->fmcase, ffep);
1123 		fmep->e0r = ffep;
1124 	}
1125 
1126 	/* give the diagnosis algorithm a shot at the new FME state */
1127 	fme_eval(fmep, ffep);
1128 }
1129 
1130 void
1131 fme_status(int flags)
1132 {
1133 	struct fme *fmep;
1134 
1135 	if (FMElist == NULL) {
1136 		out(flags, "No fault management exercises underway.");
1137 		return;
1138 	}
1139 
1140 	for (fmep = FMElist; fmep; fmep = fmep->next)
1141 		fme_print(flags, fmep);
1142 }
1143 
1144 /*
1145  * "indent" routines used mostly for nicely formatted debug output, but also
1146  * for sanity checking for infinite recursion bugs.
1147  */
1148 
1149 #define	MAX_INDENT 1024
1150 static const char *indent_s[MAX_INDENT];
1151 static int current_indent;
1152 
1153 static void
1154 indent_push(const char *s)
1155 {
1156 	if (current_indent < MAX_INDENT)
1157 		indent_s[current_indent++] = s;
1158 	else
1159 		out(O_DIE, "unexpected recursion depth (%d)", current_indent);
1160 }
1161 
1162 static void
1163 indent_set(const char *s)
1164 {
1165 	current_indent = 0;
1166 	indent_push(s);
1167 }
1168 
1169 static void
1170 indent_pop(void)
1171 {
1172 	if (current_indent > 0)
1173 		current_indent--;
1174 	else
1175 		out(O_DIE, "recursion underflow");
1176 }
1177 
1178 static void
1179 indent(void)
1180 {
1181 	int i;
1182 	if (!Verbose)
1183 		return;
1184 	for (i = 0; i < current_indent; i++)
1185 		out(O_ALTFP|O_VERB|O_NONL, indent_s[i]);
1186 }
1187 
1188 static int
1189 suspects_changed(struct fme *fmep)
1190 {
1191 	struct event *suspects = fmep->suspects;
1192 	struct event *psuspects = fmep->psuspects;
1193 
1194 	while (suspects != NULL && psuspects != NULL) {
1195 		if (suspects != psuspects)
1196 			return (1);
1197 		suspects = suspects->suspects;
1198 		psuspects = psuspects->psuspects;
1199 	}
1200 
1201 	return (suspects != psuspects);
1202 }
1203 
1204 #define	SLNEW		1
1205 #define	SLCHANGED	2
1206 #define	SLWAIT		3
1207 #define	SLDISPROVED	4
1208 
1209 static void
1210 print_suspects(int circumstance, struct fme *fmep)
1211 {
1212 	struct event *ep;
1213 
1214 	out(O_ALTFP|O_NONL, "[");
1215 	if (circumstance == SLCHANGED) {
1216 		out(O_ALTFP|O_NONL, "FME%d diagnosis changed. state: %s, "
1217 		    "suspect list:", fmep->id, fme_state2str(fmep->state));
1218 	} else if (circumstance == SLWAIT) {
1219 		out(O_ALTFP|O_NONL, "FME%d set wait timer ", fmep->id);
1220 		ptree_timeval(O_ALTFP|O_NONL, &fmep->wull);
1221 	} else if (circumstance == SLDISPROVED) {
1222 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id);
1223 	} else {
1224 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id);
1225 	}
1226 
1227 	if (circumstance == SLWAIT || circumstance == SLDISPROVED) {
1228 		out(O_ALTFP, "]");
1229 		return;
1230 	}
1231 
1232 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
1233 		out(O_ALTFP|O_NONL, " ");
1234 		itree_pevent_brief(O_ALTFP|O_NONL, ep);
1235 	}
1236 	out(O_ALTFP, "]");
1237 }
1238 
1239 static struct node *
1240 eventprop_lookup(struct event *ep, const char *propname)
1241 {
1242 	return (lut_lookup(ep->props, (void *)propname, NULL));
1243 }
1244 
1245 #define	MAXDIGITIDX	23
1246 static char numbuf[MAXDIGITIDX + 1];
1247 
1248 static int
1249 node2uint(struct node *n, uint_t *valp)
1250 {
1251 	struct evalue value;
1252 	struct lut *globals = NULL;
1253 
1254 	if (n == NULL)
1255 		return (1);
1256 
1257 	/*
1258 	 * check value.v since we are being asked to convert an unsigned
1259 	 * long long int to an unsigned int
1260 	 */
1261 	if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) ||
1262 	    value.t != UINT64 || value.v > (1ULL << 32))
1263 		return (1);
1264 
1265 	*valp = (uint_t)value.v;
1266 
1267 	return (0);
1268 }
1269 
1270 static nvlist_t *
1271 node2fmri(struct node *n)
1272 {
1273 	nvlist_t **pa, *f, *p;
1274 	struct node *nc;
1275 	uint_t depth = 0;
1276 	char *numstr, *nullbyte;
1277 	char *failure;
1278 	int err, i;
1279 
1280 	/* XXX do we need to be able to handle a non-T_NAME node? */
1281 	if (n == NULL || n->t != T_NAME)
1282 		return (NULL);
1283 
1284 	for (nc = n; nc != NULL; nc = nc->u.name.next) {
1285 		if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM)
1286 			break;
1287 		depth++;
1288 	}
1289 
1290 	if (nc != NULL) {
1291 		/* We bailed early, something went wrong */
1292 		return (NULL);
1293 	}
1294 
1295 	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
1296 		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
1297 	pa = alloca(depth * sizeof (nvlist_t *));
1298 	for (i = 0; i < depth; i++)
1299 		pa[i] = NULL;
1300 
1301 	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
1302 	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
1303 	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
1304 	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
1305 	if (err != 0) {
1306 		failure = "basic construction of FMRI failed";
1307 		goto boom;
1308 	}
1309 
1310 	numbuf[MAXDIGITIDX] = '\0';
1311 	nullbyte = &numbuf[MAXDIGITIDX];
1312 	i = 0;
1313 
1314 	for (nc = n; nc != NULL; nc = nc->u.name.next) {
1315 		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
1316 		if (err != 0) {
1317 			failure = "alloc of an hc-pair failed";
1318 			goto boom;
1319 		}
1320 		err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s);
1321 		numstr = ulltostr(nc->u.name.child->u.ull, nullbyte);
1322 		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
1323 		if (err != 0) {
1324 			failure = "construction of an hc-pair failed";
1325 			goto boom;
1326 		}
1327 		pa[i++] = p;
1328 	}
1329 
1330 	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
1331 	if (err == 0) {
1332 		for (i = 0; i < depth; i++)
1333 			if (pa[i] != NULL)
1334 				nvlist_free(pa[i]);
1335 		return (f);
1336 	}
1337 	failure = "addition of hc-pair array to FMRI failed";
1338 
1339 boom:
1340 	for (i = 0; i < depth; i++)
1341 		if (pa[i] != NULL)
1342 			nvlist_free(pa[i]);
1343 	nvlist_free(f);
1344 	out(O_DIE, "%s", failure);
1345 	/*NOTREACHED*/
1346 }
1347 
/*
 * avg -- return sum / cnt rounded to the nearest integer (halves round
 *	up).  cnt must be non-zero.
 *
 *	The scaling by 10 is done in unsigned long long; multiplying in
 *	uint_t first would wrap for sums above UINT_MAX / 10.
 */
static uint_t
avg(uint_t sum, uint_t cnt)
{
	unsigned long long tenths = (unsigned long long)sum * 10 / cnt;

	return ((tenths / 10) + (((tenths % 10) >= 5) ? 1 : 0));
}
1355 
/*
 * percentof -- return part as a percentage of whole, rounded to the
 *	nearest integer (halves round up).  whole must be non-zero.
 *
 *	The scaling by 1000 is done in unsigned long long; multiplying
 *	in uint_t first would wrap for parts above UINT_MAX / 1000.
 */
static uint8_t
percentof(uint_t part, uint_t whole)
{
	unsigned long long permille = (unsigned long long)part * 1000 / whole;

	return ((permille / 10) + (((permille % 10) >= 5) ? 1 : 0));
}
1363 
1364 static struct rsl {
1365 	struct event *suspect;
1366 	nvlist_t *asru;
1367 	nvlist_t *fru;
1368 	nvlist_t *rsrc;
1369 };
1370 
1371 /*
1372  *  rslfree -- free internal members of struct rsl not expected to be
1373  *	freed elsewhere.
1374  */
1375 static void
1376 rslfree(struct rsl *freeme)
1377 {
1378 	if (freeme->asru != NULL)
1379 		nvlist_free(freeme->asru);
1380 	if (freeme->fru != NULL)
1381 		nvlist_free(freeme->fru);
1382 	if (freeme->rsrc != NULL && freeme->rsrc != freeme->asru)
1383 		nvlist_free(freeme->rsrc);
1384 }
1385 
1386 /*
1387  *  rslcmp -- compare two rsl structures.  Use the following
1388  *	comparisons to establish cardinality:
1389  *
1390  *	1. Name of the suspect's class. (simple strcmp)
1391  *	2. Name of the suspect's ASRU. (trickier, since nvlist)
1392  *
1393  */
1394 static int
1395 rslcmp(const void *a, const void *b)
1396 {
1397 	struct rsl *r1 = (struct rsl *)a;
1398 	struct rsl *r2 = (struct rsl *)b;
1399 	int rv;
1400 
1401 	rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s,
1402 	    r2->suspect->enode->u.event.ename->u.name.s);
1403 	if (rv != 0)
1404 		return (rv);
1405 
1406 	if (r1->asru == NULL && r2->asru == NULL)
1407 		return (0);
1408 	if (r1->asru == NULL)
1409 		return (-1);
1410 	if (r2->asru == NULL)
1411 		return (1);
1412 	return (evnv_cmpnvl(r1->asru, r2->asru, 0));
1413 }
1414 
1415 /*
1416  *  rsluniq -- given an array of rsl structures, seek out and "remove"
1417  *	any duplicates.  Dups are "remove"d by NULLing the suspect pointer
1418  *	of the array element.  Removal also means updating the number of
1419  *	problems and the number of problems which are not faults.  User
1420  *	provides the first and last element pointers.
1421  */
1422 static void
1423 rsluniq(struct rsl *first, struct rsl *last, int *nprobs, int *nnonf)
1424 {
1425 	struct rsl *cr;
1426 
1427 	if (*nprobs == 1)
1428 		return;
1429 
1430 	/*
1431 	 *  At this point, we only expect duplicate defects.
1432 	 *  Eversholt's diagnosis algorithm prevents duplicate
1433 	 *  suspects, but we rewrite defects in the platform code after
1434 	 *  the diagnosis is made, and that can introduce new
1435 	 *  duplicates.
1436 	 */
1437 	while (first <= last) {
1438 		if (first->suspect == NULL || !is_defect(first->suspect->t)) {
1439 			first++;
1440 			continue;
1441 		}
1442 		cr = first + 1;
1443 		while (cr <= last) {
1444 			if (is_defect(first->suspect->t)) {
1445 				if (rslcmp(first, cr) == 0) {
1446 					cr->suspect = NULL;
1447 					rslfree(cr);
1448 					(*nprobs)--;
1449 					(*nnonf)--;
1450 				}
1451 			}
1452 			/*
1453 			 * assume all defects are in order after our
1454 			 * sort and short circuit here with "else break" ?
1455 			 */
1456 			cr++;
1457 		}
1458 		first++;
1459 	}
1460 }
1461 
1462 /*
1463  * get_resources -- for a given suspect, determine what ASRU, FRU and
1464  *     RSRC nvlists should be advertised in the final suspect list.
1465  */
1466 void
1467 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot)
1468 {
1469 	struct node *asrudef, *frudef;
1470 	nvlist_t *asru, *fru;
1471 	nvlist_t *rsrc = NULL;
1472 	char *pathstr;
1473 
1474 	/*
1475 	 * First find any ASRU and/or FRU defined in the
1476 	 * initial fault tree.
1477 	 */
1478 	asrudef = eventprop_lookup(sp, L_ASRU);
1479 	frudef = eventprop_lookup(sp, L_FRU);
1480 
1481 	/*
1482 	 * Create FMRIs based on those definitions
1483 	 */
1484 	asru = node2fmri(asrudef);
1485 	fru = node2fmri(frudef);
1486 	pathstr = ipath2str(NULL, sp->ipp);
1487 
1488 	/*
1489 	 * Allow for platform translations of the FMRIs
1490 	 */
1491 	platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc,
1492 	    pathstr);
1493 
1494 	FREE(pathstr);
1495 	rsrcs->suspect = sp;
1496 	rsrcs->asru = asru;
1497 	rsrcs->fru = fru;
1498 	rsrcs->rsrc = rsrc;
1499 }
1500 
1501 /*
1502  * trim_suspects -- prior to publishing, we may need to remove some
1503  *    suspects from the list.  If we're auto-closing upsets, we don't
1504  *    want any of those in the published list.  If the ASRUs for multiple
1505  *    defects resolve to the same ASRU (driver) we only want to publish
1506  *    that as a single suspect.
1507  */
1508 static void
1509 trim_suspects(struct fme *fmep, boolean_t no_upsets, struct rsl **begin,
1510     struct rsl **end)
1511 {
1512 	struct event *ep;
1513 	struct rsl *rp;
1514 	int rpcnt;
1515 
1516 	/*
1517 	 * First save the suspects in the psuspects, then copy back
1518 	 * only the ones we wish to retain.  This resets nsuspects to
1519 	 * zero.
1520 	 */
1521 	rpcnt = fmep->nsuspects;
1522 	save_suspects(fmep);
1523 
1524 	/*
1525 	 * allocate an array of resource pointers for the suspects.
1526 	 * We may end up using less than the full allocation, but this
1527 	 * is a very short-lived array.  publish_suspects() will free
1528 	 * this array when it's done using it.
1529 	 */
1530 	rp = *begin = MALLOC(rpcnt * sizeof (struct rsl));
1531 	bzero(rp, rpcnt * sizeof (struct rsl));
1532 
1533 	/* first pass, remove any unwanted upsets and populate our array */
1534 	for (ep = fmep->psuspects; ep; ep = ep->psuspects) {
1535 		if (no_upsets && is_upset(ep->t))
1536 			continue;
1537 		get_resources(ep, rp, fmep->cfgdata->cooked);
1538 		rp++;
1539 		fmep->nsuspects++;
1540 		if (!is_fault(ep->t))
1541 			fmep->nonfault++;
1542 	}
1543 
1544 	/* if all we had was unwanted upsets, we're done */
1545 	if (fmep->nsuspects == 0)
1546 		return;
1547 
1548 	*end = rp - 1;
1549 
1550 	/* sort the array */
1551 	qsort(*begin, fmep->nsuspects, sizeof (struct rsl), rslcmp);
1552 	rsluniq(*begin, *end, &fmep->nsuspects, &fmep->nonfault);
1553 }
1554 
1555 static void
1556 publish_suspects(struct fme *fmep)
1557 {
1558 	struct event *ep;
1559 	struct rsl *srl = NULL;
1560 	struct rsl *erl;
1561 	struct rsl *rp;
1562 	nvlist_t *fault;
1563 	uint8_t cert;
1564 	uint_t *frs;
1565 	uint_t fravg, frsum, fr;
1566 	int frcnt, fridx;
1567 	boolean_t no_upsets = B_FALSE;
1568 
1569 	stats_counter_bump(fmep->diags);
1570 
1571 	/*
1572 	 * The current fmd interfaces don't allow us to solve a case
1573 	 * that's already solved.  If we make a new case, what of the
1574 	 * ereports?  We don't appear to have an interface that allows
1575 	 * us to access the ereports attached to a case (if we wanted
1576 	 * to copy the original case's ereport attachments to the new
1577 	 * case) and it's also a bit unclear if there would be any
1578 	 * problems with having ereports attached to multiple cases
1579 	 * and/or attaching DIAGNOSED ereports to a case.  For now,
1580 	 * we'll just output a message.
1581 	 */
1582 	if (fmep->posted_suspects ||
1583 	    fmd_case_solved(fmep->hdl, fmep->fmcase)) {
1584 		out(O_ALTFP|O_NONL, "Revised diagnosis for case %s: ",
1585 		    fmd_case_uuid(fmep->hdl, fmep->fmcase));
1586 		for (ep = fmep->suspects; ep; ep = ep->suspects) {
1587 			out(O_ALTFP|O_NONL, " ");
1588 			itree_pevent_brief(O_ALTFP|O_NONL, ep);
1589 		}
1590 		out(O_ALTFP, NULL);
1591 		return;
1592 	}
1593 
1594 	/*
1595 	 * If we're auto-closing upsets, we don't want to include them
1596 	 * in any produced suspect lists or certainty accounting.
1597 	 */
1598 	if (Autoclose != NULL)
1599 		if (strcmp(Autoclose, "true") == 0 ||
1600 		    strcmp(Autoclose, "all") == 0 ||
1601 		    strcmp(Autoclose, "upsets") == 0)
1602 			no_upsets = B_TRUE;
1603 
1604 	trim_suspects(fmep, no_upsets, &srl, &erl);
1605 
1606 	/*
1607 	 * If the resulting suspect list has no members, we're
1608 	 * done.  Returning here will simply close the case.
1609 	 */
1610 	if (fmep->nsuspects == 0) {
1611 		out(O_ALTFP,
1612 		    "[FME%d, case %s (all suspects are upsets)]",
1613 		    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
1614 		FREE(srl);
1615 		restore_suspects(fmep);
1616 		return;
1617 	}
1618 
1619 	/*
1620 	 * If the suspect list is all faults, then for a given fault,
1621 	 * say X of N, X's certainty is computed via:
1622 	 *
1623 	 * fitrate(X) / (fitrate(1) + ... + fitrate(N)) * 100
1624 	 *
1625 	 * If none of the suspects are faults, and there are N suspects,
1626 	 * the certainty of a given suspect is 100/N.
1627 	 *
1628 	 * If there are are a mixture of faults and other problems in
1629 	 * the suspect list, we take an average of the faults'
1630 	 * FITrates and treat this average as the FITrate for any
1631 	 * non-faults.  The fitrate of any given suspect is then
1632 	 * computed per the first formula above.
1633 	 */
1634 	if (fmep->nonfault == fmep->nsuspects) {
1635 		/* NO faults in the suspect list */
1636 		cert = percentof(1, fmep->nsuspects);
1637 	} else {
1638 		/* sum the fitrates */
1639 		frs = alloca(fmep->nsuspects * sizeof (uint_t));
1640 		fridx = frcnt = frsum = 0;
1641 
1642 		for (rp = srl; rp <= erl; rp++) {
1643 			struct node *n;
1644 
1645 			if (rp->suspect == NULL)
1646 				continue;
1647 			if (!is_fault(rp->suspect->t)) {
1648 				frs[fridx++] = 0;
1649 				continue;
1650 			}
1651 			n = eventprop_lookup(rp->suspect, L_FITrate);
1652 			if (node2uint(n, &fr) != 0) {
1653 				out(O_DEBUG|O_NONL, "event ");
1654 				ipath_print(O_DEBUG|O_NONL,
1655 				    ep->enode->u.event.ename->u.name.s,
1656 				    ep->ipp);
1657 				out(O_DEBUG, " has no FITrate (using 1)");
1658 				fr = 1;
1659 			} else if (fr == 0) {
1660 				out(O_DEBUG|O_NONL, "event ");
1661 				ipath_print(O_DEBUG|O_NONL,
1662 				    ep->enode->u.event.ename->u.name.s,
1663 				    ep->ipp);
1664 				out(O_DEBUG, " has zero FITrate (using 1)");
1665 				fr = 1;
1666 			}
1667 
1668 			frs[fridx++] = fr;
1669 			frsum += fr;
1670 			frcnt++;
1671 		}
1672 		fravg = avg(frsum, frcnt);
1673 		for (fridx = 0; fridx < fmep->nsuspects; fridx++)
1674 			if (frs[fridx] == 0) {
1675 				frs[fridx] = fravg;
1676 				frsum += fravg;
1677 			}
1678 	}
1679 
1680 	/* Add them in reverse order of our sort, as fmd reverses order */
1681 	for (rp = erl; rp >= srl; rp--) {
1682 		if (rp->suspect == NULL)
1683 			continue;
1684 		if (fmep->nonfault != fmep->nsuspects)
1685 			cert = percentof(frs[--fridx], frsum);
1686 		fault = fmd_nvl_create_fault(fmep->hdl,
1687 		    rp->suspect->enode->u.event.ename->u.name.s,
1688 		    cert,
1689 		    rp->asru,
1690 		    rp->fru,
1691 		    rp->rsrc);
1692 		if (fault == NULL)
1693 			out(O_DIE, "fault creation failed");
1694 		fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault);
1695 		rp->suspect->fault = fault;
1696 		rslfree(rp);
1697 	}
1698 	fmd_case_solve(fmep->hdl, fmep->fmcase);
1699 	out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
1700 	    fmd_case_uuid(fmep->hdl, fmep->fmcase));
1701 
1702 	if (Autoconvict) {
1703 		for (rp = srl; rp <= erl; rp++) {
1704 			if (rp->suspect == NULL)
1705 				continue;
1706 			fmd_case_convict(fmep->hdl,
1707 			    fmep->fmcase, rp->suspect->fault);
1708 		}
1709 		out(O_ALTFP, "[convicting FME%d, case %s]", fmep->id,
1710 		    fmd_case_uuid(fmep->hdl, fmep->fmcase));
1711 	}
1712 
1713 	/*
1714 	 * revert to the original suspect list
1715 	 */
1716 	FREE(srl);
1717 	restore_suspects(fmep);
1718 }
1719 
1720 static void
1721 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep)
1722 {
1723 	struct case_list *newcase;
1724 	nvlist_t *defect;
1725 
1726 	out(O_ALTFP,
1727 	    "[undiagnosable ereport received, "
1728 	    "creating and closing a new case (%s)]",
1729 	    Undiag_reason ? Undiag_reason : "reason not provided");
1730 
1731 	newcase = MALLOC(sizeof (struct case_list));
1732 	newcase->next = NULL;
1733 
1734 	newcase->fmcase = fmd_case_open(hdl, NULL);
1735 	if (Undiagablecaselist != NULL)
1736 		newcase->next = Undiagablecaselist;
1737 	Undiagablecaselist = newcase;
1738 
1739 	if (ffep != NULL)
1740 		fmd_case_add_ereport(hdl, newcase->fmcase, ffep);
1741 
1742 	defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
1743 	    NULL, NULL, NULL);
1744 	if (Undiag_reason != NULL)
1745 		(void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason);
1746 	fmd_case_add_suspect(hdl, newcase->fmcase, defect);
1747 
1748 	fmd_case_solve(hdl, newcase->fmcase);
1749 	fmd_case_close(hdl, newcase->fmcase);
1750 }
1751 
1752 static void
1753 fme_undiagnosable(struct fme *f)
1754 {
1755 	nvlist_t *defect;
1756 
1757 	out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]",
1758 	    f->id, fmd_case_uuid(f->hdl, f->fmcase),
1759 	    Undiag_reason ? Undiag_reason : "undiagnosable");
1760 
1761 	defect = fmd_nvl_create_fault(f->hdl, UNDIAGNOSABLE_DEFECT, 100,
1762 	    NULL, NULL, NULL);
1763 	if (Undiag_reason != NULL)
1764 		(void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason);
1765 	fmd_case_add_suspect(f->hdl, f->fmcase, defect);
1766 	fmd_case_solve(f->hdl, f->fmcase);
1767 	destroy_fme_bufs(f);
1768 	fmd_case_close(f->hdl, f->fmcase);
1769 }
1770 
1771 /*
1772  * fme_close_case
1773  *
1774  *	Find the requested case amongst our fmes and close it.  Free up
1775  *	the related fme.
1776  */
1777 void
1778 fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase)
1779 {
1780 	struct case_list *ucasep, *prevcasep = NULL;
1781 	struct fme *prev = NULL;
1782 	struct fme *fmep;
1783 
1784 	for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) {
1785 		if (fmcase != ucasep->fmcase) {
1786 			prevcasep = ucasep;
1787 			continue;
1788 		}
1789 
1790 		if (prevcasep == NULL)
1791 			Undiagablecaselist = Undiagablecaselist->next;
1792 		else
1793 			prevcasep->next = ucasep->next;
1794 
1795 		FREE(ucasep);
1796 		return;
1797 	}
1798 
1799 	for (fmep = FMElist; fmep; fmep = fmep->next) {
1800 		if (fmep->hdl == hdl && fmep->fmcase == fmcase)
1801 			break;
1802 		prev = fmep;
1803 	}
1804 
1805 	if (fmep == NULL) {
1806 		out(O_WARN, "Eft asked to close unrecognized case [%s].",
1807 		    fmd_case_uuid(hdl, fmcase));
1808 		return;
1809 	}
1810 
1811 	if (EFMElist == fmep)
1812 		EFMElist = prev;
1813 
1814 	if (prev == NULL)
1815 		FMElist = FMElist->next;
1816 	else
1817 		prev->next = fmep->next;
1818 
1819 	fmep->next = NULL;
1820 
1821 	/* Get rid of any timer this fme has set */
1822 	if (fmep->wull != 0)
1823 		fmd_timer_remove(fmep->hdl, fmep->timer);
1824 
1825 	if (ClosedFMEs == NULL) {
1826 		ClosedFMEs = fmep;
1827 	} else {
1828 		fmep->next = ClosedFMEs;
1829 		ClosedFMEs = fmep;
1830 	}
1831 
1832 	Open_fme_count--;
1833 
1834 	/* See if we can close the overflow FME */
1835 	if (Open_fme_count <= Max_fme) {
1836 		for (fmep = FMElist; fmep; fmep = fmep->next) {
1837 			if (fmep->overflow && !(fmd_case_closed(fmep->hdl,
1838 			    fmep->fmcase)))
1839 				break;
1840 		}
1841 
1842 		if (fmep != NULL)
1843 			fmd_case_close(fmep->hdl, fmep->fmcase);
1844 	}
1845 }
1846 
1847 /*
1848  * fme_set_timer()
1849  *	If the time we need to wait for the given FME is less than the
1850  *	current timer, kick that old timer out and establish a new one.
1851  */
1852 static void
1853 fme_set_timer(struct fme *fmep, unsigned long long wull)
1854 {
1855 	out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait ");
1856 	ptree_timeval(O_ALTFP|O_VERB, &wull);
1857 
1858 	if (wull <= fmep->pull) {
1859 		out(O_ALTFP|O_VERB|O_NONL, "already have waited at least ");
1860 		ptree_timeval(O_ALTFP|O_VERB, &fmep->pull);
1861 		out(O_ALTFP|O_VERB, NULL);
1862 		/* we've waited at least wull already, don't need timer */
1863 		return;
1864 	}
1865 
1866 	out(O_ALTFP|O_VERB|O_NONL, " currently ");
1867 	if (fmep->wull != 0) {
1868 		out(O_ALTFP|O_VERB|O_NONL, "waiting ");
1869 		ptree_timeval(O_ALTFP|O_VERB, &fmep->wull);
1870 		out(O_ALTFP|O_VERB, NULL);
1871 	} else {
1872 		out(O_ALTFP|O_VERB|O_NONL, "not waiting");
1873 		out(O_ALTFP|O_VERB, NULL);
1874 	}
1875 
1876 	if (fmep->wull != 0)
1877 		if (wull >= fmep->wull)
1878 			/* New timer would fire later than established timer */
1879 			return;
1880 
1881 	if (fmep->wull != 0)
1882 		fmd_timer_remove(fmep->hdl, fmep->timer);
1883 
1884 	fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep,
1885 	    fmep->e0r, wull);
1886 	out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer);
1887 	fmep->wull = wull;
1888 }
1889 
1890 void
1891 fme_timer_fired(struct fme *fmep, id_t tid)
1892 {
1893 	struct fme *ffmep = NULL;
1894 
1895 	for (ffmep = FMElist; ffmep; ffmep = ffmep->next)
1896 		if (ffmep == fmep)
1897 			break;
1898 
1899 	if (ffmep == NULL) {
1900 		out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.",
1901 		    (void *)fmep);
1902 		return;
1903 	}
1904 
1905 	if (tid != fmep->htid) {
1906 		/*
1907 		 * normal timer (not the hesitation timer
1908 		 */
1909 		fmep->pull = fmep->wull;
1910 		fmep->wull = 0;
1911 		fmd_buf_write(fmep->hdl, fmep->fmcase,
1912 		    WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull));
1913 	} else {
1914 		fmep->hesitated = 1;
1915 	}
1916 	fme_eval(fmep, NULL);
1917 }
1918 
1919 /*
1920  * Preserve the fme's suspect list in its psuspects list, NULLing the
1921  * suspects list in the meantime.
1922  */
1923 static void
1924 save_suspects(struct fme *fmep)
1925 {
1926 	struct event *ep;
1927 	struct event *nextep;
1928 
1929 	/* zero out the previous suspect list */
1930 	for (ep = fmep->psuspects; ep; ep = nextep) {
1931 		nextep = ep->psuspects;
1932 		ep->psuspects = NULL;
1933 	}
1934 	fmep->psuspects = NULL;
1935 
1936 	/* zero out the suspect list, copying it to previous suspect list */
1937 	fmep->psuspects = fmep->suspects;
1938 	for (ep = fmep->suspects; ep; ep = nextep) {
1939 		nextep = ep->suspects;
1940 		ep->psuspects = ep->suspects;
1941 		ep->suspects = NULL;
1942 		ep->is_suspect = 0;
1943 	}
1944 	fmep->suspects = NULL;
1945 	fmep->nsuspects = 0;
1946 	fmep->nonfault = 0;
1947 }
1948 
1949 /*
1950  * Retrieve the fme's suspect list from its psuspects list.
1951  */
1952 static void
1953 restore_suspects(struct fme *fmep)
1954 {
1955 	struct event *ep;
1956 	struct event *nextep;
1957 
1958 	fmep->nsuspects = fmep->nonfault = 0;
1959 	fmep->suspects = fmep->psuspects;
1960 	for (ep = fmep->psuspects; ep; ep = nextep) {
1961 		fmep->nsuspects++;
1962 		if (!is_fault(ep->t))
1963 			fmep->nonfault++;
1964 		nextep = ep->psuspects;
1965 		ep->suspects = ep->psuspects;
1966 	}
1967 }
1968 
1969 /*
1970  * this is what we use to call the Emrys prototype code instead of main()
1971  */
1972 static void
1973 fme_eval(struct fme *fmep, fmd_event_t *ffep)
1974 {
1975 	struct event *ep;
1976 	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
1977 
1978 	save_suspects(fmep);
1979 
1980 	out(O_ALTFP|O_VERB, "Evaluate FME %d", fmep->id);
1981 	indent_set("  ");
1982 
1983 	initialize_cycles(fmep);
1984 	fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay, NULL);
1985 
1986 	out(O_ALTFP|O_VERB|O_NONL, "FME%d state: %s, suspect list:", fmep->id,
1987 	    fme_state2str(fmep->state));
1988 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
1989 		out(O_ALTFP|O_VERB|O_NONL, " ");
1990 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
1991 	}
1992 	out(O_ALTFP|O_VERB, NULL);
1993 
1994 	if (fmep->posted_suspects) {
1995 		/*
1996 		 * this FME has already posted a diagnosis, so see if
1997 		 * the event changed the diagnosis and print a warning
1998 		 * if it did.
1999 		 *
2000 		 */
2001 		if (suspects_changed(fmep)) {
2002 			print_suspects(SLCHANGED, fmep);
2003 			publish_suspects(fmep);
2004 		}
2005 	} else {
2006 		switch (fmep->state) {
2007 		case FME_CREDIBLE:
2008 			/*
2009 			 * if the suspect list contains any upsets, we
2010 			 * turn off the hesitation logic (by setting
2011 			 * the hesitate flag which normally indicates
2012 			 * we've already done the hesitate logic).
2013 			 * this is done because hesitating with upsets
2014 			 * causes us to explain away additional soft errors
2015 			 * while the upset FME stays open.
2016 			 */
2017 			if (fmep->hesitated == 0) {
2018 				struct event *s;
2019 
2020 				for (s = fmep->suspects; s; s = s->suspects) {
2021 					if (s->t == N_UPSET) {
2022 						fmep->hesitated = 1;
2023 						break;
2024 					}
2025 				}
2026 			}
2027 
2028 			if (Hesitate &&
2029 			    fmep->suspects != NULL &&
2030 			    fmep->suspects->suspects != NULL &&
2031 			    fmep->hesitated == 0) {
2032 				/*
2033 				 * about to publish multi-entry suspect list,
2034 				 * set the hesitation timer if not already set.
2035 				 */
2036 				if (fmep->htid == 0) {
2037 					out(O_ALTFP|O_NONL,
2038 					    "[hesitate FME%d, case %s ",
2039 					    fmep->id,
2040 					    fmd_case_uuid(fmep->hdl,
2041 					    fmep->fmcase));
2042 					ptree_timeval(O_ALTFP|O_NONL,
2043 					    (unsigned long long *)&Hesitate);
2044 					out(O_ALTFP, "]");
2045 					fme_set_timer(fmep, my_delay);
2046 					fmep->htid =
2047 					    fmd_timer_install(fmep->hdl,
2048 					    (void *)fmep, NULL, Hesitate);
2049 				} else {
2050 					out(O_ALTFP,
2051 					    "[still hesitating FME%d, case %s]",
2052 					    fmep->id,
2053 					    fmd_case_uuid(fmep->hdl,
2054 					    fmep->fmcase));
2055 				}
2056 			} else {
2057 				print_suspects(SLNEW, fmep);
2058 				(void) upsets_eval(fmep, ffep);
2059 				publish_suspects(fmep);
2060 				fmep->posted_suspects = 1;
2061 				fmd_buf_write(fmep->hdl, fmep->fmcase,
2062 				    WOBUF_POSTD,
2063 				    (void *)&fmep->posted_suspects,
2064 				    sizeof (fmep->posted_suspects));
2065 			}
2066 			break;
2067 
2068 		case FME_WAIT:
2069 			/*
2070 			 * singleton suspect list implies
2071 			 * no point in waiting
2072 			 */
2073 			if (fmep->suspects &&
2074 			    fmep->suspects->suspects == NULL) {
2075 				print_suspects(SLNEW, fmep);
2076 				(void) upsets_eval(fmep, ffep);
2077 				publish_suspects(fmep);
2078 				fmep->posted_suspects = 1;
2079 				fmd_buf_write(fmep->hdl, fmep->fmcase,
2080 				    WOBUF_POSTD,
2081 				    (void *)&fmep->posted_suspects,
2082 				    sizeof (fmep->posted_suspects));
2083 				fmep->state = FME_CREDIBLE;
2084 			} else {
2085 				ASSERT(my_delay > fmep->ull);
2086 				fme_set_timer(fmep, my_delay);
2087 				print_suspects(SLWAIT, fmep);
2088 			}
2089 			break;
2090 
2091 		case FME_DISPROVED:
2092 			print_suspects(SLDISPROVED, fmep);
2093 			Undiag_reason = UD_UNSOLVD;
2094 			fme_undiagnosable(fmep);
2095 			break;
2096 		}
2097 	}
2098 
2099 	if (fmep->posted_suspects == 1 && Autoclose != NULL) {
2100 		int doclose = 0;
2101 
2102 		if (strcmp(Autoclose, "true") == 0 ||
2103 		    strcmp(Autoclose, "all") == 0)
2104 			doclose = 1;
2105 
2106 		if (strcmp(Autoclose, "upsets") == 0) {
2107 			doclose = 1;
2108 			for (ep = fmep->suspects; ep; ep = ep->suspects) {
2109 				if (ep->t != N_UPSET) {
2110 					doclose = 0;
2111 					break;
2112 				}
2113 			}
2114 		}
2115 
2116 		if (doclose) {
2117 			out(O_ALTFP, "[closing FME%d, case %s (autoclose)]",
2118 			    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
2119 
2120 			destroy_fme_bufs(fmep);
2121 			fmd_case_close(fmep->hdl, fmep->fmcase);
2122 		}
2123 	}
2124 }
2125 
2126 /*
2127  * below here is the code derived from the Emrys prototype
2128  */
2129 
/*
 * forward declarations: indent() is called ahead of each trace line
 * by the routines below; the remaining prototypes cover the static
 * diagnosis routines defined in this part of the file.
 */
static void indent(void);
static int triggered(struct fme *fmep, struct event *ep, int mark);
static void mark_arrows(struct fme *fmep, struct event *ep, int mark);
static enum fme_state effects_test(struct fme *fmep,
    struct event *fault_event);
static enum fme_state requirements_test(struct fme *fmep,
    struct event *ep, unsigned long long at_latest_by,
    unsigned long long *pdelay, struct arrow *arrowp);
static enum fme_state causes_test(struct fme *fmep,
    struct event *ep, unsigned long long at_latest_by,
    unsigned long long *pdelay);
2140 
2141 static int
2142 triggered(struct fme *fmep, struct event *ep, int mark)
2143 {
2144 	struct bubble *bp;
2145 	struct arrowlist *ap;
2146 	int count = 0;
2147 
2148 	stats_counter_bump(fmep->Tcallcount);
2149 	for (bp = itree_next_bubble(ep, NULL); bp;
2150 	    bp = itree_next_bubble(ep, bp)) {
2151 		if (bp->t != B_TO)
2152 			continue;
2153 		for (ap = itree_next_arrow(bp, NULL); ap;
2154 		    ap = itree_next_arrow(bp, ap)) {
2155 			/* check count of marks against K in the bubble */
2156 			if (ap->arrowp->tail->mark == mark &&
2157 			    ++count >= bp->nork)
2158 				return (1);
2159 		}
2160 	}
2161 	return (0);
2162 }
2163 
2164 static void
2165 mark_arrows(struct fme *fmep, struct event *ep, int mark)
2166 {
2167 	struct bubble *bp;
2168 	struct arrowlist *ap;
2169 
2170 	for (bp = itree_next_bubble(ep, NULL); bp;
2171 	    bp = itree_next_bubble(ep, bp)) {
2172 		if (bp->t != B_FROM)
2173 			continue;
2174 		if (bp->mark != mark) {
2175 			stats_counter_bump(fmep->Marrowcount);
2176 			bp->mark = mark;
2177 			for (ap = itree_next_arrow(bp, NULL); ap;
2178 			    ap = itree_next_arrow(bp, ap)) {
2179 				struct constraintlist *ctp;
2180 				struct evalue value;
2181 				int do_not_follow = 0;
2182 				/*
2183 				 * see if false constraint prevents us
2184 				 * from traversing this arrow, but don't
2185 				 * bother if the event is an ereport we
2186 				 * haven't seen
2187 				 */
2188 				if (ap->arrowp->head->myevent->t != N_EREPORT ||
2189 				    ap->arrowp->head->myevent->count != 0) {
2190 					platform_set_payloadnvp(
2191 					    ap->arrowp->head->myevent->nvp);
2192 					for (ctp = ap->arrowp->constraints;
2193 					    ctp != NULL; ctp = ctp->next) {
2194 						if (eval_expr(ctp->cnode,
2195 						    NULL, NULL,
2196 						    &fmep->globals,
2197 						    fmep->cfgdata->cooked,
2198 						    ap->arrowp, 0,
2199 						    &value) == 0 ||
2200 						    value.t == UNDEFINED ||
2201 						    value.v == 0) {
2202 							do_not_follow = 1;
2203 							break;
2204 						}
2205 					}
2206 					platform_set_payloadnvp(NULL);
2207 				}
2208 
2209 				if (do_not_follow) {
2210 					indent();
2211 					out(O_ALTFP|O_VERB|O_NONL,
2212 					    "  False arrow to ");
2213 					itree_pevent_brief(
2214 					    O_ALTFP|O_VERB|O_NONL,
2215 					    ap->arrowp->head->myevent);
2216 					out(O_ALTFP|O_VERB|O_NONL, " ");
2217 					ptree(O_ALTFP|O_VERB|O_NONL,
2218 					    ctp->cnode, 1, 0);
2219 					out(O_ALTFP|O_VERB, NULL);
2220 					continue;
2221 				}
2222 
2223 				if (triggered(fmep, ap->arrowp->head->myevent,
2224 				    mark))
2225 					mark_arrows(fmep,
2226 					    ap->arrowp->head->myevent, mark);
2227 			}
2228 		}
2229 	}
2230 }
2231 
2232 static enum fme_state
2233 effects_test(struct fme *fmep, struct event *fault_event)
2234 {
2235 	struct event *error_event;
2236 	enum fme_state return_value = FME_CREDIBLE;
2237 
2238 	stats_counter_bump(fmep->Ecallcount);
2239 	indent_push("  E");
2240 	indent();
2241 	out(O_ALTFP|O_VERB|O_NONL, "->");
2242 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
2243 	out(O_ALTFP|O_VERB, NULL);
2244 
2245 	mark_arrows(fmep, fault_event, 1);
2246 	for (error_event = fmep->observations;
2247 	    error_event; error_event = error_event->observations) {
2248 		indent();
2249 		out(O_ALTFP|O_VERB|O_NONL, " ");
2250 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event);
2251 		if (!triggered(fmep, error_event, 1)) {
2252 			return_value = FME_DISPROVED;
2253 			out(O_ALTFP|O_VERB, " NOT triggered");
2254 			break;
2255 		} else {
2256 			out(O_ALTFP|O_VERB, " triggered");
2257 		}
2258 	}
2259 	mark_arrows(fmep, fault_event, 0);
2260 
2261 	indent();
2262 	out(O_ALTFP|O_VERB|O_NONL, "<-%s ", fme_state2str(return_value));
2263 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
2264 	out(O_ALTFP|O_VERB, NULL);
2265 	indent_pop();
2266 	return (return_value);
2267 }
2268 
2269 static enum fme_state
2270 requirements_test(struct fme *fmep, struct event *ep,
2271     unsigned long long at_latest_by, unsigned long long *pdelay,
2272     struct arrow *arrowp)
2273 {
2274 	int waiting_events;
2275 	int credible_events;
2276 	enum fme_state return_value = FME_CREDIBLE;
2277 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
2278 	unsigned long long arrow_delay;
2279 	unsigned long long my_delay;
2280 	struct event *ep2;
2281 	struct bubble *bp;
2282 	struct arrowlist *ap;
2283 
2284 	stats_counter_bump(fmep->Rcallcount);
2285 	indent_push("  R");
2286 	indent();
2287 	out(O_ALTFP|O_VERB|O_NONL, "->");
2288 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2289 	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
2290 	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
2291 	out(O_ALTFP|O_VERB, NULL);
2292 
2293 	if (ep->t == N_EREPORT) {
2294 		if (ep->count == 0) {
2295 			if (fmep->pull >= at_latest_by) {
2296 				return_value = FME_DISPROVED;
2297 			} else {
2298 				*pdelay = at_latest_by;
2299 				return_value = FME_WAIT;
2300 			}
2301 		} else if (arrowp != NULL) {
2302 			/*
2303 			 * evaluate constraints only for current observation
2304 			 */
2305 			struct constraintlist *ctp;
2306 			struct evalue value;
2307 
2308 			platform_set_payloadnvp(ep->nvp);
2309 			for (ctp = arrowp->constraints; ctp != NULL;
2310 				ctp = ctp->next) {
2311 				if (eval_expr(ctp->cnode, NULL, NULL,
2312 				    &fmep->globals, fmep->cfgdata->cooked,
2313 				    arrowp, 0, &value) == 0 ||
2314 				    value.t == UNDEFINED || value.v == 0) {
2315 					indent();
2316 					out(O_ALTFP|O_VERB|O_NONL,
2317 					    "  False constraint ");
2318 					out(O_ALTFP|O_VERB|O_NONL, " ");
2319 					ptree(O_ALTFP|O_VERB|O_NONL,
2320 					    ctp->cnode, 1, 0);
2321 					out(O_ALTFP|O_VERB, NULL);
2322 					return_value = FME_DISPROVED;
2323 					break;
2324 				}
2325 			}
2326 			platform_set_payloadnvp(NULL);
2327 		}
2328 
2329 		indent();
2330 		switch (return_value) {
2331 		case FME_CREDIBLE:
2332 			out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE ");
2333 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2334 			break;
2335 		case FME_DISPROVED:
2336 			out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
2337 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2338 			break;
2339 		case FME_WAIT:
2340 			out(O_ALTFP|O_VERB|O_NONL, "<-WAIT ");
2341 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2342 			out(O_ALTFP|O_VERB|O_NONL, " to ");
2343 			ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
2344 			break;
2345 		default:
2346 			out(O_DIE, "requirements_test: unexpected fme_state");
2347 			break;
2348 		}
2349 		out(O_ALTFP|O_VERB, NULL);
2350 		indent_pop();
2351 
2352 		return (return_value);
2353 	}
2354 
2355 	/* this event is not a report, descend the tree */
2356 	for (bp = itree_next_bubble(ep, NULL); bp;
2357 	    bp = itree_next_bubble(ep, bp)) {
2358 		if (bp->t != B_FROM)
2359 			continue;
2360 		if (bp->mark == 0) {
2361 			int n = bp->nork;
2362 
2363 			bp->mark = 1;
2364 			credible_events = 0;
2365 			waiting_events = 0;
2366 			arrow_delay = TIMEVAL_EVENTUALLY;
2367 			/*
2368 			 * n is -1 for 'A' so adjust it.
2369 			 * XXX just count up the arrows for now.
2370 			 */
2371 			if (n < 0) {
2372 				n = 0;
2373 				for (ap = itree_next_arrow(bp, NULL); ap;
2374 				    ap = itree_next_arrow(bp, ap))
2375 					n++;
2376 				indent();
2377 				out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n);
2378 			} else {
2379 				indent();
2380 				out(O_ALTFP|O_VERB, " Bubble N=%d", n);
2381 			}
2382 
2383 			for (ap = itree_next_arrow(bp, NULL); ap;
2384 			    ap = itree_next_arrow(bp, ap)) {
2385 				ep2 = ap->arrowp->head->myevent;
2386 				if (n <= credible_events)
2387 					break;
2388 
2389 				if (triggered(fmep, ep2, 1))
2390 					/* XXX adding max timevals! */
2391 					switch (requirements_test(fmep, ep2,
2392 					    at_latest_by + ap->arrowp->maxdelay,
2393 					    &my_delay, ap->arrowp)) {
2394 					case FME_CREDIBLE:
2395 						credible_events++;
2396 						break;
2397 					case FME_DISPROVED:
2398 						break;
2399 					case FME_WAIT:
2400 						if (my_delay < arrow_delay)
2401 							arrow_delay = my_delay;
2402 						waiting_events++;
2403 						break;
2404 					default:
2405 						out(O_DIE,
2406 						"Bug in requirements_test.");
2407 					}
2408 				else
2409 					credible_events++;
2410 			}
2411 			indent();
2412 			out(O_ALTFP|O_VERB, " Credible: %d Waiting %d",
2413 			    credible_events, waiting_events);
2414 			if (credible_events + waiting_events < n) {
2415 				/* Can never meet requirements */
2416 				indent();
2417 				out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
2418 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2419 				out(O_ALTFP|O_VERB, NULL);
2420 				indent_pop();
2421 				return (FME_DISPROVED);
2422 			}
2423 			if (credible_events < n) { /* will have to wait */
2424 				/* wait time is shortest known */
2425 				if (arrow_delay < overall_delay)
2426 					overall_delay = arrow_delay;
2427 				return_value = FME_WAIT;
2428 			}
2429 		} else {
2430 			indent();
2431 			out(O_ALTFP|O_VERB|O_NONL, " Mark was set: ");
2432 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2433 			out(O_ALTFP|O_VERB|O_NONL, " to");
2434 			for (ap = itree_next_arrow(bp, NULL); ap;
2435 			    ap = itree_next_arrow(bp, ap)) {
2436 				out(O_ALTFP|O_VERB|O_NONL, " ");
2437 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
2438 				    ap->arrowp->head->myevent);
2439 			}
2440 			out(O_ALTFP|O_VERB, NULL);
2441 		}
2442 	}
2443 
2444 	/*
2445 	 * evaluate constraints for ctlist, which is the list of
2446 	 * constraints for the arrow pointing into this node of the tree
2447 	 */
2448 	if (return_value == FME_CREDIBLE && arrowp != NULL) {
2449 		struct constraintlist *ctp;
2450 		struct evalue value;
2451 
2452 		platform_set_payloadnvp(ep->nvp);
2453 		for (ctp = arrowp->constraints; ctp != NULL;
2454 			ctp = ctp->next) {
2455 			if (eval_expr(ctp->cnode, NULL,	NULL, &fmep->globals,
2456 			    fmep->cfgdata->cooked, arrowp, 0, &value) == 0 ||
2457 			    value.t == UNDEFINED || value.v == 0) {
2458 				indent();
2459 				out(O_ALTFP|O_VERB|O_NONL,
2460 				    "  False constraint ");
2461 				out(O_ALTFP|O_VERB|O_NONL, " ");
2462 				ptree(O_ALTFP|O_VERB|O_NONL,
2463 				    ctp->cnode, 1, 0);
2464 				out(O_ALTFP|O_VERB, NULL);
2465 				return_value = FME_DISPROVED;
2466 				break;
2467 			}
2468 		}
2469 		platform_set_payloadnvp(NULL);
2470 	}
2471 
2472 	if (return_value == FME_WAIT)
2473 		*pdelay = overall_delay;
2474 	indent();
2475 	out(O_ALTFP|O_VERB|O_NONL, "<-%s ", fme_state2str(return_value));
2476 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2477 	out(O_ALTFP|O_VERB, NULL);
2478 	indent_pop();
2479 	return (return_value);
2480 }
2481 
2482 static enum fme_state
2483 causes_test(struct fme *fmep, struct event *ep,
2484     unsigned long long at_latest_by, unsigned long long *pdelay)
2485 {
2486 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
2487 	unsigned long long my_delay;
2488 	int credible_results = 0;
2489 	int waiting_results = 0;
2490 	enum fme_state fstate;
2491 	struct event *tail_event;
2492 	struct bubble *bp;
2493 	struct arrowlist *ap;
2494 	int k = 1;
2495 
2496 	stats_counter_bump(fmep->Ccallcount);
2497 	indent_push("  C");
2498 	indent();
2499 	out(O_ALTFP|O_VERB|O_NONL, "->");
2500 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2501 	out(O_ALTFP|O_VERB, NULL);
2502 
2503 	for (bp = itree_next_bubble(ep, NULL); bp;
2504 	    bp = itree_next_bubble(ep, bp)) {
2505 		if (bp->t != B_TO)
2506 			continue;
2507 		k = bp->nork;	/* remember the K value */
2508 		for (ap = itree_next_arrow(bp, NULL); ap;
2509 		    ap = itree_next_arrow(bp, ap)) {
2510 			struct constraintlist *ctp;
2511 			struct evalue value;
2512 			int do_not_follow = 0;
2513 			/*
2514 			 * see if false constraint prevents us
2515 			 * from traversing this arrow
2516 			 */
2517 			platform_set_payloadnvp(ep->nvp);
2518 			for (ctp = ap->arrowp->constraints;
2519 			    ctp != NULL; ctp = ctp->next) {
2520 				if (eval_expr(ctp->cnode, NULL, NULL,
2521 				    &fmep->globals,
2522 				    fmep->cfgdata->cooked,
2523 				    ap->arrowp, 0,
2524 				    &value) == 0 ||
2525 				    value.t == UNDEFINED ||
2526 				    value.v == 0) {
2527 					do_not_follow = 1;
2528 					break;
2529 				}
2530 			}
2531 			platform_set_payloadnvp(NULL);
2532 			if (do_not_follow) {
2533 				indent();
2534 				out(O_ALTFP|O_VERB|O_NONL,
2535 				    "  False arrow from ");
2536 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
2537 				    ap->arrowp->tail->myevent);
2538 				out(O_ALTFP|O_VERB|O_NONL, " ");
2539 				ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
2540 				out(O_ALTFP|O_VERB, NULL);
2541 				continue;
2542 			}
2543 
2544 			if (ap->arrowp->causes_tested++ > 0) {
2545 				/*
2546 				 * get to this point if this is not the
2547 				 * first time we're going through this
2548 				 * arrow in the causes test.  consider this
2549 				 * branch to be credible and let the
2550 				 * credible/noncredible outcome depend on
2551 				 * the other branches in this cycle.
2552 				 */
2553 				fstate = FME_CREDIBLE;
2554 			} else {
2555 				/*
2556 				 * get to this point if this is the first
2557 				 * time we're going through this arrow.
2558 				 */
2559 				tail_event = ap->arrowp->tail->myevent;
2560 				fstate = hypothesise(fmep, tail_event,
2561 						    at_latest_by,
2562 						    &my_delay, ap->arrowp);
2563 			}
2564 
2565 			switch (fstate) {
2566 			case FME_WAIT:
2567 				if (my_delay < overall_delay)
2568 					overall_delay = my_delay;
2569 				waiting_results++;
2570 				break;
2571 			case FME_CREDIBLE:
2572 				credible_results++;
2573 				break;
2574 			case FME_DISPROVED:
2575 				break;
2576 			default:
2577 				out(O_DIE, "Bug in causes_test");
2578 			}
2579 
2580 			ap->arrowp->causes_tested--;
2581 			ASSERT(ap->arrowp->causes_tested >= 0);
2582 		}
2583 	}
2584 	/* compare against K */
2585 	if (credible_results + waiting_results < k) {
2586 		indent();
2587 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
2588 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2589 		out(O_ALTFP|O_VERB, NULL);
2590 		indent_pop();
2591 		return (FME_DISPROVED);
2592 	}
2593 	if (waiting_results != 0) {
2594 		*pdelay = overall_delay;
2595 		indent();
2596 		out(O_ALTFP|O_VERB|O_NONL, "<-WAIT ");
2597 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2598 		out(O_ALTFP|O_VERB|O_NONL, " to ");
2599 		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
2600 		out(O_ALTFP|O_VERB, NULL);
2601 		indent_pop();
2602 		return (FME_WAIT);
2603 	}
2604 	indent();
2605 	out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE ");
2606 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2607 	out(O_ALTFP|O_VERB, NULL);
2608 	indent_pop();
2609 	return (FME_CREDIBLE);
2610 }
2611 
2612 static enum fme_state
2613 hypothesise(struct fme *fmep, struct event *ep,
2614 	unsigned long long at_latest_by, unsigned long long *pdelay,
2615 	struct arrow *arrowp)
2616 {
2617 	enum fme_state rtr, otr;
2618 	unsigned long long my_delay;
2619 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
2620 
2621 	stats_counter_bump(fmep->Hcallcount);
2622 	indent_push("  H");
2623 	indent();
2624 	out(O_ALTFP|O_VERB|O_NONL, "->");
2625 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2626 	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
2627 	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
2628 	out(O_ALTFP|O_VERB, NULL);
2629 
2630 	rtr = requirements_test(fmep, ep, at_latest_by, &my_delay, arrowp);
2631 	mark_arrows(fmep, ep, 0); /* clean up after requirements test */
2632 	if ((rtr == FME_WAIT) && (my_delay < overall_delay))
2633 		overall_delay = my_delay;
2634 	if (rtr != FME_DISPROVED) {
2635 		if (is_problem(ep->t)) {
2636 			otr = effects_test(fmep, ep);
2637 			if (otr != FME_DISPROVED) {
2638 				if (fmep->peek == 0 && ep->is_suspect++ == 0) {
2639 					ep->suspects = fmep->suspects;
2640 					fmep->suspects = ep;
2641 					fmep->nsuspects++;
2642 					if (!is_fault(ep->t))
2643 						fmep->nonfault++;
2644 				}
2645 			}
2646 		} else
2647 			otr = causes_test(fmep, ep, at_latest_by, &my_delay);
2648 		if ((otr == FME_WAIT) && (my_delay < overall_delay))
2649 			overall_delay = my_delay;
2650 		if ((otr != FME_DISPROVED) &&
2651 		    ((rtr == FME_WAIT) || (otr == FME_WAIT)))
2652 			*pdelay = overall_delay;
2653 	}
2654 	if (rtr == FME_DISPROVED) {
2655 		indent();
2656 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
2657 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2658 		out(O_ALTFP|O_VERB, " (doesn't meet requirements)");
2659 		indent_pop();
2660 		return (FME_DISPROVED);
2661 	}
2662 	if ((otr == FME_DISPROVED) && is_problem(ep->t)) {
2663 		indent();
2664 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
2665 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2666 		out(O_ALTFP|O_VERB, " (doesn't explain all reports)");
2667 		indent_pop();
2668 		return (FME_DISPROVED);
2669 	}
2670 	if (otr == FME_DISPROVED) {
2671 		indent();
2672 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
2673 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2674 		out(O_ALTFP|O_VERB, " (causes are not credible)");
2675 		indent_pop();
2676 		return (FME_DISPROVED);
2677 	}
2678 	if ((rtr == FME_WAIT) || (otr == FME_WAIT)) {
2679 		indent();
2680 		out(O_ALTFP|O_VERB|O_NONL, "<-WAIT ");
2681 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2682 		out(O_ALTFP|O_VERB|O_NONL, " to ");
2683 		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay);
2684 		out(O_ALTFP|O_VERB, NULL);
2685 		indent_pop();
2686 		return (FME_WAIT);
2687 	}
2688 	indent();
2689 	out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE ");
2690 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2691 	out(O_ALTFP|O_VERB, NULL);
2692 	indent_pop();
2693 	return (FME_CREDIBLE);
2694 }
2695