xref: /illumos-gate/usr/src/cmd/fm/modules/common/eversholt/fme.c (revision 7f7322febbcfe774b7270abc3b191c094bfcc517)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  *
26  * fme.c -- fault management exercise module
27  *
28  * this module provides the simulated fault management exercise.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <strings.h>
37 #include <ctype.h>
38 #include <alloca.h>
39 #include <libnvpair.h>
40 #include <sys/fm/protocol.h>
41 #include <fm/fmd_api.h>
42 #include "alloc.h"
43 #include "out.h"
44 #include "stats.h"
45 #include "stable.h"
46 #include "literals.h"
47 #include "lut.h"
48 #include "tree.h"
49 #include "ptree.h"
50 #include "itree.h"
51 #include "ipath.h"
52 #include "fme.h"
53 #include "evnv.h"
54 #include "eval.h"
55 #include "config.h"
56 #include "platform.h"
57 
58 /* imported from eft.c... */
59 extern char *Autoclose;
60 extern hrtime_t Hesitate;
61 extern nv_alloc_t Eft_nv_hdl;
62 extern int Max_fme;
63 
/* fme under construction is global so we can free it on module abort */
static struct fme *Nfmep;

/*
 * reason code (one of the UD_* strings) recorded just before a report or
 * case is declared undiagnosable; fme_restart() attaches it to the
 * undiagnosable defect it publishes
 */
static const char *Undiag_reason;

/* id handed to the next FME; kept above every id seen by fme_restart() */
static int Nextid = 0;

static int Open_fme_count = 0;	/* Count of open FMEs */
72 
/*
 * list of fault management exercises underway
 *
 * FMElist is the head and EFMElist the tail of the list that fme_ready()
 * appends to.  ClosedFMEs is a separate list whose members are torn down
 * with destroy_fme() by fme_fini() and fme_receive_report().
 */
static struct fme {
	struct fme *next;		/* next exercise */
	unsigned long long ull;		/* time when fme was created */
	int id;				/* FME id */
	struct cfgdata *cfgdata;	/* full configuration data */
	struct lut *eventtree;		/* propagation tree for this FME */
	/*
	 * The initial error report that created this FME is kept in
	 * two forms.  e0 points to the instance tree node and is used
	 * by fme_eval() as the starting point for the inference
	 * algorithm.  e0r is the event handle FMD passed to us when
	 * the ereport first arrived and is used when setting timers,
	 * which are always relative to the time of this initial
	 * report.
	 */
	struct event *e0;
	fmd_event_t *e0r;

	id_t    timer;			/* for setting an fmd time-out */
	id_t	htid;			/* for setting hesitation timer */

	struct event *ecurrent;		/* ereport under consideration */
	struct event *suspects;		/* current suspect list */
	struct event *psuspects;	/* previous suspect list */
	int nsuspects;			/* count of suspects */
	int nonfault;			/* zero if all suspects T_FAULT */
	int posted_suspects;		/* true if we've posted a diagnosis */
	int hesitated;			/* true if we hesitated */
	int uniqobs;			/* number of unique events observed */
	int peek;			/* just peeking, don't track suspects */
	int overflow;			/* true if overflow FME */
	/* result of evaluating the FME; values start at 5000 */
	enum fme_state {
		FME_NOTHING = 5000,	/* not evaluated yet */
		FME_WAIT,		/* need to wait for more info */
		FME_CREDIBLE,		/* suspect list is credible */
		FME_DISPROVED		/* no valid suspects found */
	} state;

	unsigned long long pull;	/* time passed since created */
	unsigned long long wull;	/* wait until this time for re-eval */
	struct event *observations;	/* observation list */
	struct lut *globals;		/* values of global variables */
	/* fmd interfacing */
	fmd_hdl_t *hdl;			/* handle for talking with fmd */
	fmd_case_t *fmcase;		/* what fmd 'case' we associate with */
	/* stats -- created by fme_ready(), deleted by destroy_fme() */
	struct stats *Rcount;
	struct stats *Hcallcount;
	struct stats *Rcallcount;
	struct stats *Ccallcount;
	struct stats *Ecallcount;
	struct stats *Tcallcount;
	struct stats *Marrowcount;
	struct stats *diags;
} *FMElist, *EFMElist, *ClosedFMEs;
129 
/*
 * singly-linked list of fmd cases that fme_restart() could not resume;
 * the list nodes are released by fme_fini()
 */
static struct case_list {
	fmd_case_t *fmcase;
	struct case_list *next;
} *Undiagablecaselist;
134 
/* prototypes for file-local functions that are referenced before definition */
static void fme_eval(struct fme *fmep, fmd_event_t *ffep);
static enum fme_state hypothesise(struct fme *fmep, struct event *ep,
	unsigned long long at_latest_by, unsigned long long *pdelay,
	struct arrow *arrowp);
static struct node *eventprop_lookup(struct event *ep, const char *propname);
static struct node *pathstring2epnamenp(char *path);
static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep);
static void restore_suspects(struct fme *fmep);
static void save_suspects(struct fme *fmep);
static void destroy_fme(struct fme *f);
static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
    const char *eventstring, const struct ipath *ipp, nvlist_t *nvl);
147 
148 static struct fme *
149 alloc_fme(void)
150 {
151 	struct fme *fmep;
152 
153 	fmep = MALLOC(sizeof (*fmep));
154 	bzero(fmep, sizeof (*fmep));
155 	return (fmep);
156 }
157 
158 /*
159  * fme_ready -- called when all initialization of the FME (except for
160  *	stats) has completed successfully.  Adds the fme to global lists
161  *	and establishes its stats.
162  */
163 static struct fme *
164 fme_ready(struct fme *fmep)
165 {
166 	char nbuf[100];
167 
168 	Nfmep = NULL;	/* don't need to free this on module abort now */
169 
170 	if (EFMElist) {
171 		EFMElist->next = fmep;
172 		EFMElist = fmep;
173 	} else
174 		FMElist = EFMElist = fmep;
175 
176 	(void) sprintf(nbuf, "fme%d.Rcount", fmep->id);
177 	fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
178 	(void) sprintf(nbuf, "fme%d.Hcall", fmep->id);
179 	fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1);
180 	(void) sprintf(nbuf, "fme%d.Rcall", fmep->id);
181 	fmep->Rcallcount = stats_new_counter(nbuf,
182 	    "calls to requirements_test()", 1);
183 	(void) sprintf(nbuf, "fme%d.Ccall", fmep->id);
184 	fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1);
185 	(void) sprintf(nbuf, "fme%d.Ecall", fmep->id);
186 	fmep->Ecallcount =
187 	    stats_new_counter(nbuf, "calls to effects_test()", 1);
188 	(void) sprintf(nbuf, "fme%d.Tcall", fmep->id);
189 	fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
190 	(void) sprintf(nbuf, "fme%d.Marrow", fmep->id);
191 	fmep->Marrowcount = stats_new_counter(nbuf,
192 	    "arrows marked by mark_arrows()", 1);
193 	(void) sprintf(nbuf, "fme%d.diags", fmep->id);
194 	fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);
195 
196 	out(O_ALTFP|O_VERB2, "newfme: config snapshot contains...");
197 	config_print(O_ALTFP|O_VERB2, fmep->cfgdata->cooked);
198 
199 	return (fmep);
200 }
201 
202 static struct fme *
203 newfme(const char *e0class, const struct ipath *e0ipp)
204 {
205 	struct cfgdata *cfgdata;
206 
207 	if ((cfgdata = config_snapshot()) == NULL) {
208 		out(O_ALTFP, "newfme: NULL configuration");
209 		Undiag_reason = UD_NOCONF;
210 		return (NULL);
211 	}
212 
213 	Nfmep = alloc_fme();
214 
215 	Nfmep->id = Nextid++;
216 	Nfmep->cfgdata = cfgdata;
217 	Nfmep->posted_suspects = 0;
218 	Nfmep->uniqobs = 0;
219 	Nfmep->state = FME_NOTHING;
220 	Nfmep->pull = 0ULL;
221 	Nfmep->overflow = 0;
222 
223 	Nfmep->fmcase = NULL;
224 	Nfmep->hdl = NULL;
225 
226 	if ((Nfmep->eventtree = itree_create(cfgdata->cooked)) == NULL) {
227 		out(O_ALTFP, "newfme: NULL instance tree");
228 		Undiag_reason = UD_INSTFAIL;
229 		config_free(cfgdata);
230 		FREE(Nfmep);
231 		Nfmep = NULL;
232 		return (NULL);
233 	}
234 
235 	itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree);
236 
237 	if ((Nfmep->e0 =
238 	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
239 		out(O_ALTFP, "newfme: e0 not in instance tree");
240 		Undiag_reason = UD_BADEVENTI;
241 		itree_free(Nfmep->eventtree);
242 		config_free(cfgdata);
243 		FREE(Nfmep);
244 		Nfmep = NULL;
245 		return (NULL);
246 	}
247 
248 	return (fme_ready(Nfmep));
249 }
250 
251 void
252 fme_fini(void)
253 {
254 	struct fme *sfp, *fp;
255 	struct case_list *ucasep, *nextcasep;
256 
257 	ucasep = Undiagablecaselist;
258 	while (ucasep != NULL) {
259 		nextcasep = ucasep->next;
260 		FREE(ucasep);
261 		ucasep = nextcasep;
262 	}
263 	Undiagablecaselist = NULL;
264 
265 	/* clean up closed fmes */
266 	fp = ClosedFMEs;
267 	while (fp != NULL) {
268 		sfp = fp->next;
269 		destroy_fme(fp);
270 		fp = sfp;
271 	}
272 	ClosedFMEs = NULL;
273 
274 	fp = FMElist;
275 	while (fp != NULL) {
276 		sfp = fp->next;
277 		destroy_fme(fp);
278 		fp = sfp;
279 	}
280 	FMElist = EFMElist = NULL;
281 
282 	/* if we were in the middle of creating an fme, free it now */
283 	if (Nfmep) {
284 		destroy_fme(Nfmep);
285 		Nfmep = NULL;
286 	}
287 }
288 
/*
 * Size of the scratch buffer used to build per-observation buffer names.
 * The longest name is "observed%d.nvp", so 20 bytes (including the
 * terminating NUL) allows for a ridiculous 9,999,999 unique observations.
 */
293 #define	OBBUFNMSZ 20
294 
295 /*
296  *  serialize_observation
297  *
298  *  Create a recoverable version of the current observation
299  *  (f->ecurrent).  We keep a serialized version of each unique
300  *  observation in order that we may resume correctly the fme in the
301  *  correct state if eft or fmd crashes and we're restarted.
302  */
303 static void
304 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp)
305 {
306 	size_t pkdlen;
307 	char tmpbuf[OBBUFNMSZ];
308 	char *pkd = NULL;
309 	char *estr;
310 
311 	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs);
312 	estr = ipath2str(cls, ipp);
313 	fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1);
314 	fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr,
315 	    strlen(estr) + 1);
316 	FREE(estr);
317 
318 	if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) {
319 		(void) snprintf(tmpbuf,
320 		    OBBUFNMSZ, "observed%d.nvp", fp->uniqobs);
321 		if (nvlist_xpack(fp->ecurrent->nvp,
322 		    &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0)
323 			out(O_DIE|O_SYS, "pack of observed nvl failed");
324 		fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen);
325 		fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen);
326 		FREE(pkd);
327 	}
328 
329 	fp->uniqobs++;
330 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
331 	    sizeof (fp->uniqobs));
332 }
333 
334 /*
335  *  init_fme_bufs -- We keep several bits of state about an fme for
336  *	use if eft or fmd crashes and we're restarted.
337  */
338 static void
339 init_fme_bufs(struct fme *fp)
340 {
341 	size_t cfglen = fp->cfgdata->nextfree - fp->cfgdata->begin;
342 
343 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_CFGLEN, sizeof (cfglen));
344 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_CFGLEN, (void *)&cfglen,
345 	    sizeof (cfglen));
346 	if (cfglen != 0) {
347 		fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_CFG, cfglen);
348 		fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_CFG,
349 		    fp->cfgdata->begin, cfglen);
350 	}
351 
352 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull));
353 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull,
354 	    sizeof (fp->pull));
355 
356 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id));
357 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id,
358 	    sizeof (fp->id));
359 
360 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs));
361 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
362 	    sizeof (fp->uniqobs));
363 
364 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD,
365 	    sizeof (fp->posted_suspects));
366 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD,
367 	    (void *)&fp->posted_suspects, sizeof (fp->posted_suspects));
368 }
369 
370 static void
371 destroy_fme_bufs(struct fme *fp)
372 {
373 	char tmpbuf[OBBUFNMSZ];
374 	int o;
375 
376 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN);
377 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG);
378 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL);
379 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID);
380 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD);
381 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS);
382 
383 	for (o = 0; o < fp->uniqobs; o++) {
384 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o);
385 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
386 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o);
387 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
388 	}
389 }
390 
391 /*
392  * reconstitute_observations -- convert a case's serialized observations
393  *	back into struct events.  Returns zero if all observations are
394  *	successfully reconstituted.
395  */
396 static int
397 reconstitute_observations(struct fme *fmep)
398 {
399 	struct event *ep;
400 	struct node *epnamenp = NULL;
401 	size_t pkdlen;
402 	char *pkd = NULL;
403 	char *tmpbuf = alloca(OBBUFNMSZ);
404 	char *sepptr;
405 	char *estr;
406 	int ocnt;
407 	int elen;
408 
409 	for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) {
410 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt);
411 		elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
412 		if (elen == 0) {
413 			out(O_ALTFP,
414 			    "reconstitute_observation: no %s buffer found.",
415 			    tmpbuf);
416 			Undiag_reason = UD_MISSINGOBS;
417 			break;
418 		}
419 
420 		estr = MALLOC(elen);
421 		fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
422 		sepptr = strchr(estr, '@');
423 		if (sepptr == NULL) {
424 			out(O_ALTFP,
425 			    "reconstitute_observation: %s: "
426 			    "missing @ separator in %s.",
427 			    tmpbuf, estr);
428 			Undiag_reason = UD_MISSINGPATH;
429 			FREE(estr);
430 			break;
431 		}
432 
433 		*sepptr = '\0';
434 		if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
435 			out(O_ALTFP,
436 			    "reconstitute_observation: %s: "
437 			    "trouble converting path string \"%s\" "
438 			    "to internal representation.",
439 			    tmpbuf, sepptr + 1);
440 			Undiag_reason = UD_MISSINGPATH;
441 			FREE(estr);
442 			break;
443 		}
444 
445 		/* construct the event */
446 		ep = itree_lookup(fmep->eventtree,
447 		    stable(estr), ipath(epnamenp));
448 		if (ep == NULL) {
449 			out(O_ALTFP,
450 			    "reconstitute_observation: %s: "
451 			    "lookup of  \"%s\" in itree failed.",
452 			    tmpbuf, ipath2str(estr, ipath(epnamenp)));
453 			Undiag_reason = UD_BADOBS;
454 			tree_free(epnamenp);
455 			FREE(estr);
456 			break;
457 		}
458 		tree_free(epnamenp);
459 
460 		/*
461 		 * We may or may not have a saved nvlist for the observation
462 		 */
463 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt);
464 		pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
465 		if (pkdlen != 0) {
466 			pkd = MALLOC(pkdlen);
467 			fmd_buf_read(fmep->hdl,
468 			    fmep->fmcase, tmpbuf, pkd, pkdlen);
469 			if (nvlist_xunpack(pkd,
470 			    pkdlen, &ep->nvp, &Eft_nv_hdl) != 0)
471 				out(O_DIE|O_SYS, "pack of observed nvl failed");
472 			FREE(pkd);
473 		}
474 
475 		if (ocnt == 0)
476 			fmep->e0 = ep;
477 
478 		FREE(estr);
479 		fmep->ecurrent = ep;
480 		ep->count++;
481 
482 		/* link it into list of observations seen */
483 		ep->observations = fmep->observations;
484 		fmep->observations = ep;
485 	}
486 
487 	if (ocnt == fmep->uniqobs) {
488 		(void) fme_ready(fmep);
489 		return (0);
490 	}
491 
492 	return (1);
493 }
494 
495 /*
496  * restart_fme -- called during eft initialization.  Reconstitutes
497  *	an in-progress fme.
498  */
499 void
500 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress)
501 {
502 	nvlist_t *defect;
503 	struct case_list *bad;
504 	struct fme *fmep;
505 	struct cfgdata *cfgdata = NULL;
506 	size_t rawsz;
507 
508 	fmep = alloc_fme();
509 	fmep->fmcase = inprogress;
510 	fmep->hdl = hdl;
511 
512 	if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) {
513 		out(O_ALTFP, "restart_fme: No config data");
514 		Undiag_reason = UD_MISSINGINFO;
515 		goto badcase;
516 	}
517 	fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz,
518 	    sizeof (size_t));
519 
520 	if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) {
521 		out(O_ALTFP, "restart_fme: No event zero");
522 		Undiag_reason = UD_MISSINGZERO;
523 		goto badcase;
524 	}
525 
526 	cfgdata = MALLOC(sizeof (struct cfgdata));
527 	cfgdata->cooked = NULL;
528 	cfgdata->devcache = NULL;
529 	cfgdata->cpucache = NULL;
530 	cfgdata->refcnt = 1;
531 
532 	if (rawsz > 0) {
533 		if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) {
534 			out(O_ALTFP, "restart_fme: Config data size mismatch");
535 			Undiag_reason = UD_CFGMISMATCH;
536 			goto badcase;
537 		}
538 		cfgdata->begin = MALLOC(rawsz);
539 		cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz;
540 		fmd_buf_read(hdl,
541 		    inprogress, WOBUF_CFG, cfgdata->begin, rawsz);
542 	} else {
543 		cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL;
544 	}
545 	fmep->cfgdata = cfgdata;
546 
547 	config_cook(cfgdata);
548 	if ((fmep->eventtree = itree_create(cfgdata->cooked)) == NULL) {
549 		/* case not properly saved or irretrievable */
550 		out(O_ALTFP, "restart_fme: NULL instance tree");
551 		Undiag_reason = UD_INSTFAIL;
552 		goto badcase;
553 	}
554 
555 	itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree);
556 
557 	if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) {
558 		out(O_ALTFP, "restart_fme: no saved wait time");
559 		Undiag_reason = UD_MISSINGINFO;
560 		goto badcase;
561 	} else {
562 		fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull,
563 		    sizeof (fmep->pull));
564 	}
565 
566 	if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) {
567 		out(O_ALTFP, "restart_fme: no saved posted status");
568 		Undiag_reason = UD_MISSINGINFO;
569 		goto badcase;
570 	} else {
571 		fmd_buf_read(hdl, inprogress, WOBUF_POSTD,
572 		    (void *)&fmep->posted_suspects,
573 		    sizeof (fmep->posted_suspects));
574 	}
575 
576 	if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) {
577 		out(O_ALTFP, "restart_fme: no saved id");
578 		Undiag_reason = UD_MISSINGINFO;
579 		goto badcase;
580 	} else {
581 		fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id,
582 		    sizeof (fmep->id));
583 	}
584 	if (Nextid <= fmep->id)
585 		Nextid = fmep->id + 1;
586 
587 	if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) {
588 		out(O_ALTFP, "restart_fme: no count of observations");
589 		Undiag_reason = UD_MISSINGINFO;
590 		goto badcase;
591 	} else {
592 		fmd_buf_read(hdl, inprogress, WOBUF_NOBS,
593 		    (void *)&fmep->uniqobs, sizeof (fmep->uniqobs));
594 	}
595 
596 	if (reconstitute_observations(fmep) != 0)
597 		goto badcase;
598 
599 	Open_fme_count++;
600 
601 	/* give the diagnosis algorithm a shot at the new FME state */
602 	fme_eval(fmep, NULL);
603 	return;
604 
605 badcase:
606 	if (fmep->eventtree != NULL)
607 		itree_free(fmep->eventtree);
608 	config_free(cfgdata);
609 	destroy_fme_bufs(fmep);
610 	FREE(fmep);
611 
612 	/*
613 	 * Since we're unable to restart the case, add it to the undiagable
614 	 * list and solve and close it as appropriate.
615 	 */
616 	bad = MALLOC(sizeof (struct case_list));
617 	bad->next = NULL;
618 
619 	if (Undiagablecaselist != NULL)
620 		bad->next = Undiagablecaselist;
621 	Undiagablecaselist = bad;
622 	bad->fmcase = inprogress;
623 
624 	out(O_ALTFP, "[case %s (unable to restart), ",
625 	    fmd_case_uuid(hdl, bad->fmcase));
626 
627 	if (fmd_case_solved(hdl, bad->fmcase)) {
628 		out(O_ALTFP, "already solved, ");
629 	} else {
630 		out(O_ALTFP, "solving, ");
631 		defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
632 		    NULL, NULL, NULL);
633 		if (Undiag_reason != NULL)
634 			(void) nvlist_add_string(defect,
635 			    UNDIAG_REASON, Undiag_reason);
636 		fmd_case_add_suspect(hdl, bad->fmcase, defect);
637 		fmd_case_solve(hdl, bad->fmcase);
638 	}
639 
640 	if (fmd_case_closed(hdl, bad->fmcase)) {
641 		out(O_ALTFP, "already closed ]");
642 	} else {
643 		out(O_ALTFP, "closing ]");
644 		fmd_case_close(hdl, bad->fmcase);
645 	}
646 }
647 
648 void
649 destroy_fme(struct fme *f)
650 {
651 	stats_delete(f->Rcount);
652 	stats_delete(f->Hcallcount);
653 	stats_delete(f->Rcallcount);
654 	stats_delete(f->Ccallcount);
655 	stats_delete(f->Ecallcount);
656 	stats_delete(f->Tcallcount);
657 	stats_delete(f->Marrowcount);
658 	stats_delete(f->diags);
659 
660 	itree_free(f->eventtree);
661 	config_free(f->cfgdata);
662 	FREE(f);
663 }
664 
665 static const char *
666 fme_state2str(enum fme_state s)
667 {
668 	switch (s) {
669 	case FME_NOTHING:	return ("NOTHING");
670 	case FME_WAIT:		return ("WAIT");
671 	case FME_CREDIBLE:	return ("CREDIBLE");
672 	case FME_DISPROVED:	return ("DISPROVED");
673 	default:		return ("UNKNOWN");
674 	}
675 }
676 
677 static int
678 is_problem(enum nametype t)
679 {
680 	return (t == N_FAULT || t == N_DEFECT || t == N_UPSET);
681 }
682 
683 static int
684 is_fault(enum nametype t)
685 {
686 	return (t == N_FAULT);
687 }
688 
689 static int
690 is_defect(enum nametype t)
691 {
692 	return (t == N_DEFECT);
693 }
694 
695 static int
696 is_upset(enum nametype t)
697 {
698 	return (t == N_UPSET);
699 }
700 
701 /*ARGSUSED*/
702 static void
703 clear_causes_tested(struct event *lhs, struct event *ep, void *arg)
704 {
705 	struct bubble *bp;
706 	struct arrowlist *ap;
707 
708 	for (bp = itree_next_bubble(ep, NULL); bp;
709 	    bp = itree_next_bubble(ep, bp)) {
710 		if (bp->t != B_FROM)
711 			continue;
712 		for (ap = itree_next_arrow(bp, NULL); ap;
713 		    ap = itree_next_arrow(bp, ap))
714 			ap->arrowp->causes_tested = 0;
715 	}
716 }
717 
/*
 * initialize_cycles -- reset cycle tracking for an FME by walking its
 *	event tree and clearing causes_tested on the arrows (see
 *	clear_causes_tested()).
 */
static void
initialize_cycles(struct fme *fmep)
{
	lut_walk(fmep->eventtree, (lut_cb)clear_causes_tested, NULL);
}
726 
727 static void
728 fme_print(int flags, struct fme *fmep)
729 {
730 	struct event *ep;
731 
732 	out(flags, "Fault Management Exercise %d", fmep->id);
733 	out(flags, "\t       State: %s", fme_state2str(fmep->state));
734 	out(flags|O_NONL, "\t  Start time: ");
735 	ptree_timeval(flags|O_NONL, &fmep->ull);
736 	out(flags, NULL);
737 	if (fmep->wull) {
738 		out(flags|O_NONL, "\t   Wait time: ");
739 		ptree_timeval(flags|O_NONL, &fmep->wull);
740 		out(flags, NULL);
741 	}
742 	out(flags|O_NONL, "\t          E0: ");
743 	if (fmep->e0)
744 		itree_pevent_brief(flags|O_NONL, fmep->e0);
745 	else
746 		out(flags|O_NONL, "NULL");
747 	out(flags, NULL);
748 	out(flags|O_NONL, "\tObservations:");
749 	for (ep = fmep->observations; ep; ep = ep->observations) {
750 		out(flags|O_NONL, " ");
751 		itree_pevent_brief(flags|O_NONL, ep);
752 	}
753 	out(flags, NULL);
754 	out(flags|O_NONL, "\tSuspect list:");
755 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
756 		out(flags|O_NONL, " ");
757 		itree_pevent_brief(flags|O_NONL, ep);
758 	}
759 	out(flags, NULL);
760 	out(flags|O_VERB2, "\t        Tree:");
761 	itree_ptree(flags|O_VERB2, fmep->eventtree);
762 }
763 
764 static struct node *
765 pathstring2epnamenp(char *path)
766 {
767 	char *sep = "/";
768 	struct node *ret;
769 	char *ptr;
770 
771 	if ((ptr = strtok(path, sep)) == NULL)
772 		out(O_DIE, "pathstring2epnamenp: invalid empty class");
773 
774 	ret = tree_iname(stable(ptr), NULL, 0);
775 
776 	while ((ptr = strtok(NULL, sep)) != NULL)
777 		ret = tree_name_append(ret,
778 		    tree_iname(stable(ptr), NULL, 0));
779 
780 	return (ret);
781 }
782 
783 /*
784  * for a given upset sp, increment the corresponding SERD engine.  if the
785  * SERD engine trips, return the ename and ipp of the resulting ereport.
786  * returns true if engine tripped and *enamep and *ippp were filled in.
787  */
788 static int
789 serd_eval(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase,
790 	struct event *sp, const char **enamep, const struct ipath **ippp)
791 {
792 	struct node *serdinst;
793 	char *serdname;
794 
795 	ASSERT(sp->t == N_UPSET);
796 	ASSERT(ffep != NULL);
797 
798 	/*
799 	 * obtain instanced SERD engine from the upset sp.  from this
800 	 * derive serdname, the string used to identify the SERD engine.
801 	 */
802 	serdinst = eventprop_lookup(sp, L_engine);
803 
804 	if (serdinst == NULL)
805 		return (NULL);
806 
807 	serdname = ipath2str(serdinst->u.stmt.np->u.event.ename->u.name.s,
808 	    ipath(serdinst->u.stmt.np->u.event.epname));
809 
810 	if (!fmd_serd_exists(hdl, serdname)) {
811 		struct node *nN, *nT;
812 
813 		/* no SERD engine yet, so create it */
814 		nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N, NULL);
815 		nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T, NULL);
816 
817 		ASSERT(nN->t == T_NUM);
818 		ASSERT(nT->t == T_TIMEVAL);
819 
820 		fmd_serd_create(hdl, serdname, (uint_t)nN->u.ull,
821 		    (hrtime_t)nT->u.ull);
822 	}
823 
824 
825 	/*
826 	 * increment SERD engine.  if engine fires, reset serd
827 	 * engine and return trip_strcode
828 	 */
829 	if (fmd_serd_record(hdl, serdname, ffep)) {
830 		struct node *tripinst = lut_lookup(serdinst->u.stmt.lutp,
831 		    (void *)L_trip, NULL);
832 
833 		ASSERT(tripinst != NULL);
834 
835 		*enamep = tripinst->u.event.ename->u.name.s;
836 		*ippp = ipath(tripinst->u.event.epname);
837 
838 		fmd_case_add_serd(hdl, fmcase, serdname);
839 		fmd_serd_reset(hdl, serdname);
840 		out(O_ALTFP|O_NONL, "[engine fired: %s, sending: ", serdname);
841 		ipath_print(O_ALTFP|O_NONL, *enamep, *ippp);
842 		out(O_ALTFP, "]");
843 
844 		FREE(serdname);
845 		return (1);
846 	}
847 
848 	FREE(serdname);
849 	return (0);
850 }
851 
852 /*
853  * search a suspect list for upsets.  feed each upset to serd_eval() and
854  * build up tripped[], an array of ereports produced by the firing of
855  * any SERD engines.  then feed each ereport back into
856  * fme_receive_report().
857  *
858  * returns ntrip, the number of these ereports produced.
859  */
860 static int
861 upsets_eval(struct fme *fmep, fmd_event_t *ffep)
862 {
863 	/* we build an array of tripped ereports that we send ourselves */
864 	struct {
865 		const char *ename;
866 		const struct ipath *ipp;
867 	} *tripped;
868 	struct event *sp;
869 	int ntrip, nupset, i;
870 
871 	/*
872 	 * we avoid recursion by calling fme_receive_report() at the end of
873 	 * this function with a NULL ffep
874 	 */
875 	if (ffep == NULL)
876 		return (0);
877 
878 	/*
879 	 * count the number of upsets to determine the upper limit on
880 	 * expected trip ereport strings.  remember that one upset can
881 	 * lead to at most one ereport.
882 	 */
883 	nupset = 0;
884 	for (sp = fmep->suspects; sp; sp = sp->suspects) {
885 		if (sp->t == N_UPSET)
886 			nupset++;
887 	}
888 
889 	if (nupset == 0)
890 		return (0);
891 
892 	/*
893 	 * get to this point if we have upsets and expect some trip
894 	 * ereports
895 	 */
896 	tripped = alloca(sizeof (*tripped) * nupset);
897 	bzero((void *)tripped, sizeof (*tripped) * nupset);
898 
899 	ntrip = 0;
900 	for (sp = fmep->suspects; sp; sp = sp->suspects)
901 		if (sp->t == N_UPSET &&
902 		    serd_eval(fmep->hdl, ffep, fmep->fmcase, sp,
903 			    &tripped[ntrip].ename, &tripped[ntrip].ipp))
904 			ntrip++;
905 
906 	for (i = 0; i < ntrip; i++)
907 		fme_receive_report(fmep->hdl, NULL,
908 		    tripped[i].ename, tripped[i].ipp, NULL);
909 
910 	return (ntrip);
911 }
912 
913 /*
914  * fme_receive_external_report -- call when an external ereport comes in
915  *
916  * this routine just converts the relevant information from the ereport
917  * into a format used internally and passes it on to fme_receive_report().
918  */
919 void
920 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
921     const char *eventstring)
922 {
923 	struct node *epnamenp = platform_getpath(nvl);
924 	const struct ipath *ipp;
925 
926 	/*
927 	 * XFILE: If we ended up without a path, it's an X-file.
928 	 * For now, use our undiagnosable interface.
929 	 */
930 	if (epnamenp == NULL) {
931 		out(O_ALTFP, "XFILE: Unable to get path from ereport");
932 		Undiag_reason = UD_NOPATH;
933 		publish_undiagnosable(hdl, ffep);
934 		return;
935 	}
936 
937 	ipp = ipath(epnamenp);
938 	tree_free(epnamenp);
939 	fme_receive_report(hdl, ffep, stable(eventstring), ipp, nvl);
940 }
941 
942 static void
943 fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
944     const char *eventstring, const struct ipath *ipp, nvlist_t *nvl)
945 {
946 	struct event *ep;
947 	struct fme *fmep = NULL;
948 	struct fme *ofmep = NULL;
949 	struct fme *cfmep, *svfmep;
950 	int matched = 0;
951 	nvlist_t *defect;
952 
953 	out(O_ALTFP|O_NONL, "fme_receive_report: ");
954 	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
955 	out(O_ALTFP|O_STAMP, NULL);
956 
957 	/* decide which FME it goes to */
958 	for (fmep = FMElist; fmep; fmep = fmep->next) {
959 		int prev_verbose;
960 		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
961 		enum fme_state state;
962 
963 		if (fmep->overflow) {
964 			if (!(fmd_case_closed(fmep->hdl, fmep->fmcase)))
965 				ofmep = fmep;
966 
967 			continue;
968 		}
969 
970 		/* look up event in event tree for this FME */
971 		if ((ep = itree_lookup(fmep->eventtree,
972 		    eventstring, ipp)) == NULL)
973 			continue;
974 
975 		/* note observation */
976 		fmep->ecurrent = ep;
977 		if (ep->count++ == 0) {
978 			/* link it into list of observations seen */
979 			ep->observations = fmep->observations;
980 			fmep->observations = ep;
981 			ep->nvp = evnv_dupnvl(nvl);
982 		}
983 
984 		/* tell hypothesise() not to mess with suspect list */
985 		fmep->peek = 1;
986 
987 		/* don't want this to be verbose (unless Debug is set) */
988 		prev_verbose = Verbose;
989 		if (Debug == 0)
990 			Verbose = 0;
991 
992 		initialize_cycles(fmep);
993 		state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay, NULL);
994 
995 		fmep->peek = 0;
996 
997 		/* put verbose flag back */
998 		Verbose = prev_verbose;
999 
1000 		if (state != FME_DISPROVED) {
1001 			/* found an FME that explains the ereport */
1002 			matched++;
1003 			out(O_ALTFP|O_NONL, "[");
1004 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1005 			out(O_ALTFP, " explained by FME%d]", fmep->id);
1006 
1007 			if (ep->count == 1)
1008 				serialize_observation(fmep, eventstring, ipp);
1009 
1010 			if (ffep)
1011 				fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1012 
1013 			stats_counter_bump(fmep->Rcount);
1014 
1015 			/* re-eval FME */
1016 			fme_eval(fmep, ffep);
1017 		} else {
1018 
1019 			/* not a match, undo noting of observation */
1020 			fmep->ecurrent = NULL;
1021 			if (--ep->count == 0) {
1022 				/* unlink it from observations */
1023 				fmep->observations = ep->observations;
1024 				ep->observations = NULL;
1025 				nvlist_free(ep->nvp);
1026 				ep->nvp = NULL;
1027 			}
1028 		}
1029 	}
1030 
1031 	if (matched)
1032 		return;	/* explained by at least one existing FME */
1033 
1034 	/* clean up closed fmes */
1035 	cfmep = ClosedFMEs;
1036 	while (cfmep != NULL) {
1037 		svfmep = cfmep->next;
1038 		destroy_fme(cfmep);
1039 		cfmep = svfmep;
1040 	}
1041 	ClosedFMEs = NULL;
1042 
1043 	if (ofmep) {
1044 		out(O_ALTFP|O_NONL, "[");
1045 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1046 		out(O_ALTFP, " ADDING TO OVERFLOW FME]");
1047 		if (ffep)
1048 			fmd_case_add_ereport(hdl, ofmep->fmcase, ffep);
1049 
1050 		return;
1051 
1052 	} else if (Max_fme && (Open_fme_count >= Max_fme)) {
1053 		out(O_ALTFP|O_NONL, "[");
1054 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1055 		out(O_ALTFP, " MAX OPEN FME REACHED]");
1056 		/* Create overflow fme */
1057 		if ((fmep = newfme(eventstring, ipp)) == NULL) {
1058 			out(O_ALTFP|O_NONL, "[");
1059 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1060 			out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]");
1061 			publish_undiagnosable(hdl, ffep);
1062 			return;
1063 		}
1064 
1065 		Open_fme_count++;
1066 
1067 		fmep->fmcase = fmd_case_open(hdl, NULL);
1068 		fmep->hdl = hdl;
1069 		init_fme_bufs(fmep);
1070 		fmep->overflow = B_TRUE;
1071 
1072 		if (ffep)
1073 			fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1074 
1075 		defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
1076 		    NULL, NULL, NULL);
1077 		(void) nvlist_add_string(defect, UNDIAG_REASON, UD_MAXFME);
1078 		fmd_case_add_suspect(hdl, fmep->fmcase, defect);
1079 		fmd_case_solve(hdl, fmep->fmcase);
1080 		return;
1081 	}
1082 
1083 	/* start a new FME */
1084 	if ((fmep = newfme(eventstring, ipp)) == NULL) {
1085 		out(O_ALTFP|O_NONL, "[");
1086 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1087 		out(O_ALTFP, " CANNOT DIAGNOSE]");
1088 		publish_undiagnosable(hdl, ffep);
1089 		return;
1090 	}
1091 
1092 	Open_fme_count++;
1093 
1094 	/* open a case */
1095 	fmep->fmcase = fmd_case_open(hdl, NULL);
1096 	fmep->hdl = hdl;
1097 	init_fme_bufs(fmep);
1098 
1099 	out(O_ALTFP|O_NONL, "[");
1100 	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1101 	out(O_ALTFP, " created FME%d, case %s]", fmep->id,
1102 	    fmd_case_uuid(hdl, fmep->fmcase));
1103 
1104 	ep = fmep->e0;
1105 	ASSERT(ep != NULL);
1106 
1107 	/* note observation */
1108 	fmep->ecurrent = ep;
1109 	if (ep->count++ == 0) {
1110 		/* link it into list of observations seen */
1111 		ep->observations = fmep->observations;
1112 		fmep->observations = ep;
1113 		ep->nvp = evnv_dupnvl(nvl);
1114 		serialize_observation(fmep, eventstring, ipp);
1115 	}
1116 
1117 	stats_counter_bump(fmep->Rcount);
1118 
1119 	if (ffep) {
1120 		fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1121 		fmd_case_setprincipal(hdl, fmep->fmcase, ffep);
1122 		fmep->e0r = ffep;
1123 	}
1124 
1125 	/* give the diagnosis algorithm a shot at the new FME state */
1126 	fme_eval(fmep, ffep);
1127 }
1128 
1129 void
1130 fme_status(int flags)
1131 {
1132 	struct fme *fmep;
1133 
1134 	if (FMElist == NULL) {
1135 		out(flags, "No fault management exercises underway.");
1136 		return;
1137 	}
1138 
1139 	for (fmep = FMElist; fmep; fmep = fmep->next)
1140 		fme_print(flags, fmep);
1141 }
1142 
1143 /*
1144  * "indent" routines used mostly for nicely formatted debug output, but also
1145  * for sanity checking for infinite recursion bugs.
1146  */
1147 
1148 #define	MAX_INDENT 1024
1149 static const char *indent_s[MAX_INDENT];
1150 static int current_indent;
1151 
1152 static void
1153 indent_push(const char *s)
1154 {
1155 	if (current_indent < MAX_INDENT)
1156 		indent_s[current_indent++] = s;
1157 	else
1158 		out(O_DIE, "unexpected recursion depth (%d)", current_indent);
1159 }
1160 
1161 static void
1162 indent_set(const char *s)
1163 {
1164 	current_indent = 0;
1165 	indent_push(s);
1166 }
1167 
1168 static void
1169 indent_pop(void)
1170 {
1171 	if (current_indent > 0)
1172 		current_indent--;
1173 	else
1174 		out(O_DIE, "recursion underflow");
1175 }
1176 
1177 static void
1178 indent(void)
1179 {
1180 	int i;
1181 	if (!Verbose)
1182 		return;
1183 	for (i = 0; i < current_indent; i++)
1184 		out(O_ALTFP|O_VERB|O_NONL, indent_s[i]);
1185 }
1186 
1187 static int
1188 suspects_changed(struct fme *fmep)
1189 {
1190 	struct event *suspects = fmep->suspects;
1191 	struct event *psuspects = fmep->psuspects;
1192 
1193 	while (suspects != NULL && psuspects != NULL) {
1194 		if (suspects != psuspects)
1195 			return (1);
1196 		suspects = suspects->suspects;
1197 		psuspects = psuspects->psuspects;
1198 	}
1199 
1200 	return (suspects != psuspects);
1201 }
1202 
1203 #define	SLNEW		1
1204 #define	SLCHANGED	2
1205 #define	SLWAIT		3
1206 #define	SLDISPROVED	4
1207 
1208 static void
1209 print_suspects(int circumstance, struct fme *fmep)
1210 {
1211 	struct event *ep;
1212 
1213 	out(O_ALTFP|O_NONL, "[");
1214 	if (circumstance == SLCHANGED) {
1215 		out(O_ALTFP|O_NONL, "FME%d diagnosis changed. state: %s, "
1216 		    "suspect list:", fmep->id, fme_state2str(fmep->state));
1217 	} else if (circumstance == SLWAIT) {
1218 		out(O_ALTFP|O_NONL, "FME%d set wait timer ", fmep->id);
1219 		ptree_timeval(O_ALTFP|O_NONL, &fmep->wull);
1220 	} else if (circumstance == SLDISPROVED) {
1221 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id);
1222 	} else {
1223 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id);
1224 	}
1225 
1226 	if (circumstance == SLWAIT || circumstance == SLDISPROVED) {
1227 		out(O_ALTFP, "]");
1228 		return;
1229 	}
1230 
1231 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
1232 		out(O_ALTFP|O_NONL, " ");
1233 		itree_pevent_brief(O_ALTFP|O_NONL, ep);
1234 	}
1235 	out(O_ALTFP, "]");
1236 }
1237 
1238 static struct node *
1239 eventprop_lookup(struct event *ep, const char *propname)
1240 {
1241 	return (lut_lookup(ep->props, (void *)propname, NULL));
1242 }
1243 
/*
 * Scratch buffer used by node2fmri() when turning instance numbers into
 * strings: node2fmri() stores the terminating NUL at index MAXDIGITIDX
 * and passes the address of that byte to ulltostr().
 */
#define	MAXDIGITIDX	23
static char numbuf[MAXDIGITIDX + 1];
1246 
1247 static int
1248 node2uint(struct node *n, uint_t *valp)
1249 {
1250 	struct evalue value;
1251 	struct lut *globals = NULL;
1252 
1253 	if (n == NULL)
1254 		return (1);
1255 
1256 	/*
1257 	 * check value.v since we are being asked to convert an unsigned
1258 	 * long long int to an unsigned int
1259 	 */
1260 	if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) ||
1261 	    value.t != UINT64 || value.v > (1ULL << 32))
1262 		return (1);
1263 
1264 	*valp = (uint_t)value.v;
1265 
1266 	return (0);
1267 }
1268 
1269 static nvlist_t *
1270 node2fmri(struct node *n)
1271 {
1272 	nvlist_t **pa, *f, *p;
1273 	struct node *nc;
1274 	uint_t depth = 0;
1275 	char *numstr, *nullbyte;
1276 	char *failure;
1277 	int err, i;
1278 
1279 	/* XXX do we need to be able to handle a non-T_NAME node? */
1280 	if (n == NULL || n->t != T_NAME)
1281 		return (NULL);
1282 
1283 	for (nc = n; nc != NULL; nc = nc->u.name.next) {
1284 		if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM)
1285 			break;
1286 		depth++;
1287 	}
1288 
1289 	if (nc != NULL) {
1290 		/* We bailed early, something went wrong */
1291 		return (NULL);
1292 	}
1293 
1294 	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
1295 		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
1296 	pa = alloca(depth * sizeof (nvlist_t *));
1297 	for (i = 0; i < depth; i++)
1298 		pa[i] = NULL;
1299 
1300 	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
1301 	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
1302 	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
1303 	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
1304 	if (err != 0) {
1305 		failure = "basic construction of FMRI failed";
1306 		goto boom;
1307 	}
1308 
1309 	numbuf[MAXDIGITIDX] = '\0';
1310 	nullbyte = &numbuf[MAXDIGITIDX];
1311 	i = 0;
1312 
1313 	for (nc = n; nc != NULL; nc = nc->u.name.next) {
1314 		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
1315 		if (err != 0) {
1316 			failure = "alloc of an hc-pair failed";
1317 			goto boom;
1318 		}
1319 		err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s);
1320 		numstr = ulltostr(nc->u.name.child->u.ull, nullbyte);
1321 		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
1322 		if (err != 0) {
1323 			failure = "construction of an hc-pair failed";
1324 			goto boom;
1325 		}
1326 		pa[i++] = p;
1327 	}
1328 
1329 	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
1330 	if (err == 0) {
1331 		for (i = 0; i < depth; i++)
1332 			if (pa[i] != NULL)
1333 				nvlist_free(pa[i]);
1334 		return (f);
1335 	}
1336 	failure = "addition of hc-pair array to FMRI failed";
1337 
1338 boom:
1339 	for (i = 0; i < depth; i++)
1340 		if (pa[i] != NULL)
1341 			nvlist_free(pa[i]);
1342 	nvlist_free(f);
1343 	out(O_DIE, "%s", failure);
1344 	/*NOTREACHED*/
1345 }
1346 
/*
 * avg -- return sum / cnt rounded to the nearest integer (halves round
 *	up).  Returns 0 when cnt is 0 rather than dividing by zero.
 */
static uint_t
avg(uint_t sum, uint_t cnt)
{
	unsigned long long s;

	if (cnt == 0)
		return (0);

	/*
	 * Widen before scaling: multiplying in uint_t would wrap for
	 * sums above UINT_MAX / 10.
	 */
	s = (unsigned long long)sum * 10;

	return ((s / cnt / 10) + (((s / cnt % 10) >= 5) ? 1 : 0));
}
1354 
1355 static uint8_t
1356 percentof(uint_t part, uint_t whole)
1357 {
1358 	unsigned long long p = part * 1000;
1359 
1360 	return ((p / whole / 10) + (((p / whole % 10) >= 5) ? 1 : 0));
1361 }
1362 
1363 static struct rsl {
1364 	struct event *suspect;
1365 	nvlist_t *asru;
1366 	nvlist_t *fru;
1367 	nvlist_t *rsrc;
1368 };
1369 
1370 /*
1371  *  rslfree -- free internal members of struct rsl not expected to be
1372  *	freed elsewhere.
1373  */
1374 static void
1375 rslfree(struct rsl *freeme)
1376 {
1377 	if (freeme->asru != NULL)
1378 		nvlist_free(freeme->asru);
1379 	if (freeme->fru != NULL)
1380 		nvlist_free(freeme->fru);
1381 	if (freeme->rsrc != NULL && freeme->rsrc != freeme->asru)
1382 		nvlist_free(freeme->rsrc);
1383 }
1384 
1385 /*
1386  *  rslcmp -- compare two rsl structures.  Use the following
1387  *	comparisons to establish cardinality:
1388  *
1389  *	1. Name of the suspect's class. (simple strcmp)
1390  *	2. Name of the suspect's ASRU. (trickier, since nvlist)
1391  *
1392  */
1393 static int
1394 rslcmp(const void *a, const void *b)
1395 {
1396 	struct rsl *r1 = (struct rsl *)a;
1397 	struct rsl *r2 = (struct rsl *)b;
1398 	int rv;
1399 
1400 	rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s,
1401 	    r2->suspect->enode->u.event.ename->u.name.s);
1402 	if (rv != 0)
1403 		return (rv);
1404 
1405 	if (r1->asru == NULL && r2->asru == NULL)
1406 		return (0);
1407 	if (r1->asru == NULL)
1408 		return (-1);
1409 	if (r2->asru == NULL)
1410 		return (1);
1411 	return (evnv_cmpnvl(r1->asru, r2->asru, 0));
1412 }
1413 
1414 /*
1415  *  rsluniq -- given an array of rsl structures, seek out and "remove"
1416  *	any duplicates.  Dups are "remove"d by NULLing the suspect pointer
1417  *	of the array element.  Removal also means updating the number of
1418  *	problems and the number of problems which are not faults.  User
1419  *	provides the first and last element pointers.
1420  */
1421 static void
1422 rsluniq(struct rsl *first, struct rsl *last, int *nprobs, int *nnonf)
1423 {
1424 	struct rsl *cr;
1425 
1426 	if (*nprobs == 1)
1427 		return;
1428 
1429 	/*
1430 	 *  At this point, we only expect duplicate defects.
1431 	 *  Eversholt's diagnosis algorithm prevents duplicate
1432 	 *  suspects, but we rewrite defects in the platform code after
1433 	 *  the diagnosis is made, and that can introduce new
1434 	 *  duplicates.
1435 	 */
1436 	while (first <= last) {
1437 		if (first->suspect == NULL || !is_defect(first->suspect->t)) {
1438 			first++;
1439 			continue;
1440 		}
1441 		cr = first + 1;
1442 		while (cr <= last) {
1443 			if (is_defect(first->suspect->t)) {
1444 				if (rslcmp(first, cr) == 0) {
1445 					cr->suspect = NULL;
1446 					rslfree(cr);
1447 					(*nprobs)--;
1448 					(*nnonf)--;
1449 				}
1450 			}
1451 			/*
1452 			 * assume all defects are in order after our
1453 			 * sort and short circuit here with "else break" ?
1454 			 */
1455 			cr++;
1456 		}
1457 		first++;
1458 	}
1459 }
1460 
1461 /*
1462  * get_resources -- for a given suspect, determine what ASRU, FRU and
1463  *     RSRC nvlists should be advertised in the final suspect list.
1464  */
1465 void
1466 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot)
1467 {
1468 	struct node *asrudef, *frudef;
1469 	nvlist_t *asru, *fru;
1470 	nvlist_t *rsrc = NULL;
1471 	char *pathstr;
1472 
1473 	/*
1474 	 * First find any ASRU and/or FRU defined in the
1475 	 * initial fault tree.
1476 	 */
1477 	asrudef = eventprop_lookup(sp, L_ASRU);
1478 	frudef = eventprop_lookup(sp, L_FRU);
1479 
1480 	/*
1481 	 * Create FMRIs based on those definitions
1482 	 */
1483 	asru = node2fmri(asrudef);
1484 	fru = node2fmri(frudef);
1485 	pathstr = ipath2str(NULL, sp->ipp);
1486 
1487 	/*
1488 	 * Allow for platform translations of the FMRIs
1489 	 */
1490 	platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc,
1491 	    pathstr);
1492 
1493 	FREE(pathstr);
1494 	rsrcs->suspect = sp;
1495 	rsrcs->asru = asru;
1496 	rsrcs->fru = fru;
1497 	rsrcs->rsrc = rsrc;
1498 }
1499 
1500 /*
1501  * trim_suspects -- prior to publishing, we may need to remove some
1502  *    suspects from the list.  If we're auto-closing upsets, we don't
1503  *    want any of those in the published list.  If the ASRUs for multiple
1504  *    defects resolve to the same ASRU (driver) we only want to publish
1505  *    that as a single suspect.
1506  */
1507 static void
1508 trim_suspects(struct fme *fmep, boolean_t no_upsets, struct rsl **begin,
1509     struct rsl **end)
1510 {
1511 	struct event *ep;
1512 	struct rsl *rp;
1513 	int rpcnt;
1514 
1515 	/*
1516 	 * First save the suspects in the psuspects, then copy back
1517 	 * only the ones we wish to retain.  This resets nsuspects to
1518 	 * zero.
1519 	 */
1520 	rpcnt = fmep->nsuspects;
1521 	save_suspects(fmep);
1522 
1523 	/*
1524 	 * allocate an array of resource pointers for the suspects.
1525 	 * We may end up using less than the full allocation, but this
1526 	 * is a very short-lived array.  publish_suspects() will free
1527 	 * this array when it's done using it.
1528 	 */
1529 	rp = *begin = MALLOC(rpcnt * sizeof (struct rsl));
1530 	bzero(rp, rpcnt * sizeof (struct rsl));
1531 
1532 	/* first pass, remove any unwanted upsets and populate our array */
1533 	for (ep = fmep->psuspects; ep; ep = ep->psuspects) {
1534 		if (no_upsets && is_upset(ep->t))
1535 			continue;
1536 		get_resources(ep, rp, fmep->cfgdata->cooked);
1537 		rp++;
1538 		fmep->nsuspects++;
1539 		if (!is_fault(ep->t))
1540 			fmep->nonfault++;
1541 	}
1542 
1543 	/* if all we had was unwanted upsets, we're done */
1544 	if (fmep->nsuspects == 0)
1545 		return;
1546 
1547 	*end = rp - 1;
1548 
1549 	/* sort the array */
1550 	qsort(*begin, fmep->nsuspects, sizeof (struct rsl), rslcmp);
1551 	rsluniq(*begin, *end, &fmep->nsuspects, &fmep->nonfault);
1552 }
1553 
1554 static void
1555 publish_suspects(struct fme *fmep)
1556 {
1557 	struct event *ep;
1558 	struct rsl *srl = NULL;
1559 	struct rsl *erl;
1560 	struct rsl *rp;
1561 	nvlist_t *fault;
1562 	uint8_t cert;
1563 	uint_t *frs;
1564 	uint_t fravg, frsum, fr;
1565 	int frcnt, fridx;
1566 	boolean_t no_upsets = B_FALSE;
1567 
1568 	stats_counter_bump(fmep->diags);
1569 
1570 	/*
1571 	 * The current fmd interfaces don't allow us to solve a case
1572 	 * that's already solved.  If we make a new case, what of the
1573 	 * ereports?  We don't appear to have an interface that allows
1574 	 * us to access the ereports attached to a case (if we wanted
1575 	 * to copy the original case's ereport attachments to the new
1576 	 * case) and it's also a bit unclear if there would be any
1577 	 * problems with having ereports attached to multiple cases
1578 	 * and/or attaching DIAGNOSED ereports to a case.  For now,
1579 	 * we'll just output a message.
1580 	 */
1581 	if (fmep->posted_suspects ||
1582 	    fmd_case_solved(fmep->hdl, fmep->fmcase)) {
1583 		out(O_ALTFP|O_NONL, "Revised diagnosis for case %s: ",
1584 		    fmd_case_uuid(fmep->hdl, fmep->fmcase));
1585 		for (ep = fmep->suspects; ep; ep = ep->suspects) {
1586 			out(O_ALTFP|O_NONL, " ");
1587 			itree_pevent_brief(O_ALTFP|O_NONL, ep);
1588 		}
1589 		out(O_ALTFP, NULL);
1590 		return;
1591 	}
1592 
1593 	/*
1594 	 * If we're auto-closing upsets, we don't want to include them
1595 	 * in any produced suspect lists or certainty accounting.
1596 	 */
1597 	if (Autoclose != NULL)
1598 		if (strcmp(Autoclose, "true") == 0 ||
1599 		    strcmp(Autoclose, "all") == 0 ||
1600 		    strcmp(Autoclose, "upsets") == 0)
1601 			no_upsets = B_TRUE;
1602 
1603 	trim_suspects(fmep, no_upsets, &srl, &erl);
1604 
1605 	/*
1606 	 * If the resulting suspect list has no members, we're
1607 	 * done.  Returning here will simply close the case.
1608 	 */
1609 	if (fmep->nsuspects == 0) {
1610 		out(O_ALTFP,
1611 		    "[FME%d, case %s (all suspects are upsets)]",
1612 		    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
1613 		FREE(srl);
1614 		restore_suspects(fmep);
1615 		return;
1616 	}
1617 
1618 	/*
1619 	 * If the suspect list is all faults, then for a given fault,
1620 	 * say X of N, X's certainty is computed via:
1621 	 *
1622 	 * fitrate(X) / (fitrate(1) + ... + fitrate(N)) * 100
1623 	 *
1624 	 * If none of the suspects are faults, and there are N suspects,
1625 	 * the certainty of a given suspect is 100/N.
1626 	 *
1627 	 * If there are are a mixture of faults and other problems in
1628 	 * the suspect list, we take an average of the faults'
1629 	 * FITrates and treat this average as the FITrate for any
1630 	 * non-faults.  The fitrate of any given suspect is then
1631 	 * computed per the first formula above.
1632 	 */
1633 	if (fmep->nonfault == fmep->nsuspects) {
1634 		/* NO faults in the suspect list */
1635 		cert = percentof(1, fmep->nsuspects);
1636 	} else {
1637 		/* sum the fitrates */
1638 		frs = alloca(fmep->nsuspects * sizeof (uint_t));
1639 		fridx = frcnt = frsum = 0;
1640 
1641 		for (rp = srl; rp <= erl; rp++) {
1642 			struct node *n;
1643 
1644 			if (rp->suspect == NULL)
1645 				continue;
1646 			if (!is_fault(rp->suspect->t)) {
1647 				frs[fridx++] = 0;
1648 				continue;
1649 			}
1650 			n = eventprop_lookup(rp->suspect, L_FITrate);
1651 			if (node2uint(n, &fr) != 0) {
1652 				out(O_DEBUG|O_NONL, "event ");
1653 				ipath_print(O_DEBUG|O_NONL,
1654 				    ep->enode->u.event.ename->u.name.s,
1655 				    ep->ipp);
1656 				out(O_DEBUG, " has no FITrate (using 1)");
1657 				fr = 1;
1658 			} else if (fr == 0) {
1659 				out(O_DEBUG|O_NONL, "event ");
1660 				ipath_print(O_DEBUG|O_NONL,
1661 				    ep->enode->u.event.ename->u.name.s,
1662 				    ep->ipp);
1663 				out(O_DEBUG, " has zero FITrate (using 1)");
1664 				fr = 1;
1665 			}
1666 
1667 			frs[fridx++] = fr;
1668 			frsum += fr;
1669 			frcnt++;
1670 		}
1671 		fravg = avg(frsum, frcnt);
1672 		for (fridx = 0; fridx < fmep->nsuspects; fridx++)
1673 			if (frs[fridx] == 0) {
1674 				frs[fridx] = fravg;
1675 				frsum += fravg;
1676 			}
1677 	}
1678 
1679 	/* Add them in reverse order of our sort, as fmd reverses order */
1680 	for (rp = erl; rp >= srl; rp--) {
1681 		if (rp->suspect == NULL)
1682 			continue;
1683 		if (fmep->nonfault != fmep->nsuspects)
1684 			cert = percentof(frs[--fridx], frsum);
1685 		fault = fmd_nvl_create_fault(fmep->hdl,
1686 		    rp->suspect->enode->u.event.ename->u.name.s,
1687 		    cert,
1688 		    rp->asru,
1689 		    rp->fru,
1690 		    rp->rsrc);
1691 		if (fault == NULL)
1692 			out(O_DIE, "fault creation failed");
1693 		fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault);
1694 		rp->suspect->fault = fault;
1695 		rslfree(rp);
1696 	}
1697 	fmd_case_solve(fmep->hdl, fmep->fmcase);
1698 	out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
1699 	    fmd_case_uuid(fmep->hdl, fmep->fmcase));
1700 
1701 	/*
1702 	 * revert to the original suspect list
1703 	 */
1704 	FREE(srl);
1705 	restore_suspects(fmep);
1706 }
1707 
1708 static void
1709 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep)
1710 {
1711 	struct case_list *newcase;
1712 	nvlist_t *defect;
1713 
1714 	out(O_ALTFP,
1715 	    "[undiagnosable ereport received, "
1716 	    "creating and closing a new case (%s)]",
1717 	    Undiag_reason ? Undiag_reason : "reason not provided");
1718 
1719 	newcase = MALLOC(sizeof (struct case_list));
1720 	newcase->next = NULL;
1721 
1722 	newcase->fmcase = fmd_case_open(hdl, NULL);
1723 	if (Undiagablecaselist != NULL)
1724 		newcase->next = Undiagablecaselist;
1725 	Undiagablecaselist = newcase;
1726 
1727 	if (ffep != NULL)
1728 		fmd_case_add_ereport(hdl, newcase->fmcase, ffep);
1729 
1730 	defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
1731 	    NULL, NULL, NULL);
1732 	if (Undiag_reason != NULL)
1733 		(void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason);
1734 	fmd_case_add_suspect(hdl, newcase->fmcase, defect);
1735 
1736 	fmd_case_solve(hdl, newcase->fmcase);
1737 	fmd_case_close(hdl, newcase->fmcase);
1738 }
1739 
1740 static void
1741 fme_undiagnosable(struct fme *f)
1742 {
1743 	nvlist_t *defect;
1744 
1745 	out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]",
1746 	    f->id, fmd_case_uuid(f->hdl, f->fmcase),
1747 	    Undiag_reason ? Undiag_reason : "undiagnosable");
1748 
1749 	defect = fmd_nvl_create_fault(f->hdl, UNDIAGNOSABLE_DEFECT, 100,
1750 	    NULL, NULL, NULL);
1751 	if (Undiag_reason != NULL)
1752 		(void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason);
1753 	fmd_case_add_suspect(f->hdl, f->fmcase, defect);
1754 	fmd_case_solve(f->hdl, f->fmcase);
1755 	destroy_fme_bufs(f);
1756 	fmd_case_close(f->hdl, f->fmcase);
1757 }
1758 
1759 /*
1760  * fme_close_case
1761  *
1762  *	Find the requested case amongst our fmes and close it.  Free up
1763  *	the related fme.
1764  */
1765 void
1766 fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase)
1767 {
1768 	struct case_list *ucasep, *prevcasep = NULL;
1769 	struct fme *prev = NULL;
1770 	struct fme *fmep;
1771 
1772 	for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) {
1773 		if (fmcase != ucasep->fmcase) {
1774 			prevcasep = ucasep;
1775 			continue;
1776 		}
1777 
1778 		if (prevcasep == NULL)
1779 			Undiagablecaselist = Undiagablecaselist->next;
1780 		else
1781 			prevcasep->next = ucasep->next;
1782 
1783 		FREE(ucasep);
1784 		return;
1785 	}
1786 
1787 	for (fmep = FMElist; fmep; fmep = fmep->next) {
1788 		if (fmep->hdl == hdl && fmep->fmcase == fmcase)
1789 			break;
1790 		prev = fmep;
1791 	}
1792 
1793 	if (fmep == NULL) {
1794 		out(O_WARN, "Eft asked to close unrecognized case [%s].",
1795 		    fmd_case_uuid(hdl, fmcase));
1796 		return;
1797 	}
1798 
1799 	if (EFMElist == fmep)
1800 		EFMElist = prev;
1801 
1802 	if (prev == NULL)
1803 		FMElist = FMElist->next;
1804 	else
1805 		prev->next = fmep->next;
1806 
1807 	fmep->next = NULL;
1808 
1809 	/* Get rid of any timer this fme has set */
1810 	if (fmep->wull != 0)
1811 		fmd_timer_remove(fmep->hdl, fmep->timer);
1812 
1813 	if (ClosedFMEs == NULL) {
1814 		ClosedFMEs = fmep;
1815 	} else {
1816 		fmep->next = ClosedFMEs;
1817 		ClosedFMEs = fmep;
1818 	}
1819 
1820 	Open_fme_count--;
1821 
1822 	/* See if we can close the overflow FME */
1823 	if (Open_fme_count <= Max_fme) {
1824 		for (fmep = FMElist; fmep; fmep = fmep->next) {
1825 			if (fmep->overflow && !(fmd_case_closed(fmep->hdl,
1826 			    fmep->fmcase)))
1827 				break;
1828 		}
1829 
1830 		if (fmep != NULL)
1831 			fmd_case_close(fmep->hdl, fmep->fmcase);
1832 	}
1833 }
1834 
1835 /*
1836  * fme_set_timer()
1837  *	If the time we need to wait for the given FME is less than the
1838  *	current timer, kick that old timer out and establish a new one.
1839  */
1840 static void
1841 fme_set_timer(struct fme *fmep, unsigned long long wull)
1842 {
1843 	out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait ");
1844 	ptree_timeval(O_ALTFP|O_VERB, &wull);
1845 
1846 	if (wull <= fmep->pull) {
1847 		out(O_ALTFP|O_VERB|O_NONL, "already have waited at least ");
1848 		ptree_timeval(O_ALTFP|O_VERB, &fmep->pull);
1849 		out(O_ALTFP|O_VERB, NULL);
1850 		/* we've waited at least wull already, don't need timer */
1851 		return;
1852 	}
1853 
1854 	out(O_ALTFP|O_VERB|O_NONL, " currently ");
1855 	if (fmep->wull != 0) {
1856 		out(O_ALTFP|O_VERB|O_NONL, "waiting ");
1857 		ptree_timeval(O_ALTFP|O_VERB, &fmep->wull);
1858 		out(O_ALTFP|O_VERB, NULL);
1859 	} else {
1860 		out(O_ALTFP|O_VERB|O_NONL, "not waiting");
1861 		out(O_ALTFP|O_VERB, NULL);
1862 	}
1863 
1864 	if (fmep->wull != 0)
1865 		if (wull >= fmep->wull)
1866 			/* New timer would fire later than established timer */
1867 			return;
1868 
1869 	if (fmep->wull != 0)
1870 		fmd_timer_remove(fmep->hdl, fmep->timer);
1871 
1872 	fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep,
1873 	    fmep->e0r, wull);
1874 	out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer);
1875 	fmep->wull = wull;
1876 }
1877 
1878 void
1879 fme_timer_fired(struct fme *fmep, id_t tid)
1880 {
1881 	struct fme *ffmep = NULL;
1882 
1883 	for (ffmep = FMElist; ffmep; ffmep = ffmep->next)
1884 		if (ffmep == fmep)
1885 			break;
1886 
1887 	if (ffmep == NULL) {
1888 		out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.",
1889 		    (void *)fmep);
1890 		return;
1891 	}
1892 
1893 	if (tid != fmep->htid) {
1894 		/*
1895 		 * normal timer (not the hesitation timer
1896 		 */
1897 		fmep->pull = fmep->wull;
1898 		fmep->wull = 0;
1899 		fmd_buf_write(fmep->hdl, fmep->fmcase,
1900 		    WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull));
1901 	} else {
1902 		fmep->hesitated = 1;
1903 	}
1904 	fme_eval(fmep, NULL);
1905 }
1906 
1907 /*
1908  * Preserve the fme's suspect list in its psuspects list, NULLing the
1909  * suspects list in the meantime.
1910  */
1911 static void
1912 save_suspects(struct fme *fmep)
1913 {
1914 	struct event *ep;
1915 	struct event *nextep;
1916 
1917 	/* zero out the previous suspect list */
1918 	for (ep = fmep->psuspects; ep; ep = nextep) {
1919 		nextep = ep->psuspects;
1920 		ep->psuspects = NULL;
1921 	}
1922 	fmep->psuspects = NULL;
1923 
1924 	/* zero out the suspect list, copying it to previous suspect list */
1925 	fmep->psuspects = fmep->suspects;
1926 	for (ep = fmep->suspects; ep; ep = nextep) {
1927 		nextep = ep->suspects;
1928 		ep->psuspects = ep->suspects;
1929 		ep->suspects = NULL;
1930 		ep->is_suspect = 0;
1931 	}
1932 	fmep->suspects = NULL;
1933 	fmep->nsuspects = 0;
1934 	fmep->nonfault = 0;
1935 }
1936 
1937 /*
1938  * Retrieve the fme's suspect list from its psuspects list.
1939  */
1940 static void
1941 restore_suspects(struct fme *fmep)
1942 {
1943 	struct event *ep;
1944 	struct event *nextep;
1945 
1946 	fmep->nsuspects = fmep->nonfault = 0;
1947 	fmep->suspects = fmep->psuspects;
1948 	for (ep = fmep->psuspects; ep; ep = nextep) {
1949 		fmep->nsuspects++;
1950 		if (!is_fault(ep->t))
1951 			fmep->nonfault++;
1952 		nextep = ep->psuspects;
1953 		ep->suspects = ep->psuspects;
1954 	}
1955 }
1956 
1957 /*
1958  * this is what we use to call the Emrys prototype code instead of main()
1959  */
1960 static void
1961 fme_eval(struct fme *fmep, fmd_event_t *ffep)
1962 {
1963 	struct event *ep;
1964 	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
1965 
1966 	save_suspects(fmep);
1967 
1968 	out(O_ALTFP|O_VERB, "Evaluate FME %d", fmep->id);
1969 	indent_set("  ");
1970 
1971 	initialize_cycles(fmep);
1972 	fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay, NULL);
1973 
1974 	out(O_ALTFP|O_VERB|O_NONL, "FME%d state: %s, suspect list:", fmep->id,
1975 	    fme_state2str(fmep->state));
1976 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
1977 		out(O_ALTFP|O_VERB|O_NONL, " ");
1978 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
1979 	}
1980 	out(O_ALTFP|O_VERB, NULL);
1981 
1982 	if (fmep->posted_suspects) {
1983 		/*
1984 		 * this FME has already posted a diagnosis, so see if
1985 		 * the event changed the diagnosis and print a warning
1986 		 * if it did.
1987 		 *
1988 		 */
1989 		if (suspects_changed(fmep)) {
1990 			print_suspects(SLCHANGED, fmep);
1991 			publish_suspects(fmep);
1992 		}
1993 	} else {
1994 		switch (fmep->state) {
1995 		case FME_CREDIBLE:
1996 			/*
1997 			 * if the suspect list contains any upsets, we
1998 			 * turn off the hesitation logic (by setting
1999 			 * the hesitate flag which normally indicates
2000 			 * we've already done the hesitate logic).
2001 			 * this is done because hesitating with upsets
2002 			 * causes us to explain away additional soft errors
2003 			 * while the upset FME stays open.
2004 			 */
2005 			if (fmep->hesitated == 0) {
2006 				struct event *s;
2007 
2008 				for (s = fmep->suspects; s; s = s->suspects) {
2009 					if (s->t == N_UPSET) {
2010 						fmep->hesitated = 1;
2011 						break;
2012 					}
2013 				}
2014 			}
2015 
2016 			if (Hesitate &&
2017 			    fmep->suspects != NULL &&
2018 			    fmep->suspects->suspects != NULL &&
2019 			    fmep->hesitated == 0) {
2020 				/*
2021 				 * about to publish multi-entry suspect list,
2022 				 * set the hesitation timer if not already set.
2023 				 */
2024 				if (fmep->htid == 0) {
2025 					out(O_ALTFP|O_NONL,
2026 					    "[hesitate FME%d, case %s ",
2027 					    fmep->id,
2028 					    fmd_case_uuid(fmep->hdl,
2029 					    fmep->fmcase));
2030 					ptree_timeval(O_ALTFP|O_NONL,
2031 					    (unsigned long long *)&Hesitate);
2032 					out(O_ALTFP, "]");
2033 					fme_set_timer(fmep, my_delay);
2034 					fmep->htid =
2035 					    fmd_timer_install(fmep->hdl,
2036 					    (void *)fmep, NULL, Hesitate);
2037 				} else {
2038 					out(O_ALTFP,
2039 					    "[still hesitating FME%d, case %s]",
2040 					    fmep->id,
2041 					    fmd_case_uuid(fmep->hdl,
2042 					    fmep->fmcase));
2043 				}
2044 			} else {
2045 				print_suspects(SLNEW, fmep);
2046 				(void) upsets_eval(fmep, ffep);
2047 				publish_suspects(fmep);
2048 				fmep->posted_suspects = 1;
2049 				fmd_buf_write(fmep->hdl, fmep->fmcase,
2050 				    WOBUF_POSTD,
2051 				    (void *)&fmep->posted_suspects,
2052 				    sizeof (fmep->posted_suspects));
2053 			}
2054 			break;
2055 
2056 		case FME_WAIT:
2057 			/*
2058 			 * singleton suspect list implies
2059 			 * no point in waiting
2060 			 */
2061 			if (fmep->suspects &&
2062 			    fmep->suspects->suspects == NULL) {
2063 				print_suspects(SLNEW, fmep);
2064 				(void) upsets_eval(fmep, ffep);
2065 				publish_suspects(fmep);
2066 				fmep->posted_suspects = 1;
2067 				fmd_buf_write(fmep->hdl, fmep->fmcase,
2068 				    WOBUF_POSTD,
2069 				    (void *)&fmep->posted_suspects,
2070 				    sizeof (fmep->posted_suspects));
2071 				fmep->state = FME_CREDIBLE;
2072 			} else {
2073 				ASSERT(my_delay > fmep->ull);
2074 				fme_set_timer(fmep, my_delay);
2075 				print_suspects(SLWAIT, fmep);
2076 			}
2077 			break;
2078 
2079 		case FME_DISPROVED:
2080 			print_suspects(SLDISPROVED, fmep);
2081 			Undiag_reason = UD_UNSOLVD;
2082 			fme_undiagnosable(fmep);
2083 			break;
2084 		}
2085 	}
2086 
2087 	if (fmep->posted_suspects == 1 && Autoclose != NULL) {
2088 		int doclose = 0;
2089 
2090 		if (strcmp(Autoclose, "true") == 0 ||
2091 		    strcmp(Autoclose, "all") == 0)
2092 			doclose = 1;
2093 
2094 		if (strcmp(Autoclose, "upsets") == 0) {
2095 			doclose = 1;
2096 			for (ep = fmep->suspects; ep; ep = ep->suspects) {
2097 				if (ep->t != N_UPSET) {
2098 					doclose = 0;
2099 					break;
2100 				}
2101 			}
2102 		}
2103 
2104 		if (doclose) {
2105 			out(O_ALTFP, "[closing FME%d, case %s (autoclose)]",
2106 			    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
2107 
2108 			destroy_fme_bufs(fmep);
2109 			fmd_case_close(fmep->hdl, fmep->fmcase);
2110 		}
2111 	}
2112 }
2113 
2114 /*
2115  * below here is the code derived from the Emrys prototype
2116  */
2117 
/*
 * Forward declarations of file-local routines used by the code below.
 * indent() is already defined earlier in this file, so its declaration
 * here is redundant but harmless.
 */
static void indent(void);
static int triggered(struct fme *fmep, struct event *ep, int mark);
static void mark_arrows(struct fme *fmep, struct event *ep, int mark);
static enum fme_state effects_test(struct fme *fmep,
    struct event *fault_event);
static enum fme_state requirements_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay,
    struct arrow *arrowp);
static enum fme_state causes_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay);
2128 
2129 static int
2130 triggered(struct fme *fmep, struct event *ep, int mark)
2131 {
2132 	struct bubble *bp;
2133 	struct arrowlist *ap;
2134 	int count = 0;
2135 
2136 	stats_counter_bump(fmep->Tcallcount);
2137 	for (bp = itree_next_bubble(ep, NULL); bp;
2138 	    bp = itree_next_bubble(ep, bp)) {
2139 		if (bp->t != B_TO)
2140 			continue;
2141 		for (ap = itree_next_arrow(bp, NULL); ap;
2142 		    ap = itree_next_arrow(bp, ap)) {
2143 			/* check count of marks against K in the bubble */
2144 			if (ap->arrowp->tail->mark == mark &&
2145 			    ++count >= bp->nork)
2146 				return (1);
2147 		}
2148 	}
2149 	return (0);
2150 }
2151 
2152 static void
2153 mark_arrows(struct fme *fmep, struct event *ep, int mark)
2154 {
2155 	struct bubble *bp;
2156 	struct arrowlist *ap;
2157 
2158 	for (bp = itree_next_bubble(ep, NULL); bp;
2159 	    bp = itree_next_bubble(ep, bp)) {
2160 		if (bp->t != B_FROM)
2161 			continue;
2162 		if (bp->mark != mark) {
2163 			stats_counter_bump(fmep->Marrowcount);
2164 			bp->mark = mark;
2165 			for (ap = itree_next_arrow(bp, NULL); ap;
2166 			    ap = itree_next_arrow(bp, ap)) {
2167 				struct constraintlist *ctp;
2168 				struct evalue value;
2169 				int do_not_follow = 0;
2170 				/*
2171 				 * see if false constraint prevents us
2172 				 * from traversing this arrow, but don't
2173 				 * bother if the event is an ereport we
2174 				 * haven't seen
2175 				 */
2176 				if (ap->arrowp->head->myevent->t != N_EREPORT ||
2177 				    ap->arrowp->head->myevent->count != 0) {
2178 					platform_set_payloadnvp(
2179 					    ap->arrowp->head->myevent->nvp);
2180 					for (ctp = ap->arrowp->constraints;
2181 					    ctp != NULL; ctp = ctp->next) {
2182 						if (eval_expr(ctp->cnode,
2183 						    NULL, NULL,
2184 						    &fmep->globals,
2185 						    fmep->cfgdata->cooked,
2186 						    ap->arrowp, 0,
2187 						    &value) == 0 ||
2188 						    value.t == UNDEFINED ||
2189 						    value.v == 0) {
2190 							do_not_follow = 1;
2191 							break;
2192 						}
2193 					}
2194 					platform_set_payloadnvp(NULL);
2195 				}
2196 
2197 				if (do_not_follow) {
2198 					indent();
2199 					out(O_ALTFP|O_VERB|O_NONL,
2200 					    "  False arrow to ");
2201 					itree_pevent_brief(
2202 					    O_ALTFP|O_VERB|O_NONL,
2203 					    ap->arrowp->head->myevent);
2204 					out(O_ALTFP|O_VERB|O_NONL, " ");
2205 					ptree(O_ALTFP|O_VERB|O_NONL,
2206 					    ctp->cnode, 1, 0);
2207 					out(O_ALTFP|O_VERB, NULL);
2208 					continue;
2209 				}
2210 
2211 				if (triggered(fmep, ap->arrowp->head->myevent,
2212 				    mark))
2213 					mark_arrows(fmep,
2214 					    ap->arrowp->head->myevent, mark);
2215 			}
2216 		}
2217 	}
2218 }
2219 
2220 static enum fme_state
2221 effects_test(struct fme *fmep, struct event *fault_event)
2222 {
2223 	struct event *error_event;
2224 	enum fme_state return_value = FME_CREDIBLE;
2225 
2226 	stats_counter_bump(fmep->Ecallcount);
2227 	indent_push("  E");
2228 	indent();
2229 	out(O_ALTFP|O_VERB|O_NONL, "->");
2230 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
2231 	out(O_ALTFP|O_VERB, NULL);
2232 
2233 	mark_arrows(fmep, fault_event, 1);
2234 	for (error_event = fmep->observations;
2235 	    error_event; error_event = error_event->observations) {
2236 		indent();
2237 		out(O_ALTFP|O_VERB|O_NONL, " ");
2238 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event);
2239 		if (!triggered(fmep, error_event, 1)) {
2240 			return_value = FME_DISPROVED;
2241 			out(O_ALTFP|O_VERB, " NOT triggered");
2242 			break;
2243 		} else {
2244 			out(O_ALTFP|O_VERB, " triggered");
2245 		}
2246 	}
2247 	mark_arrows(fmep, fault_event, 0);
2248 
2249 	indent();
2250 	out(O_ALTFP|O_VERB|O_NONL, "<-%s ", fme_state2str(return_value));
2251 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
2252 	out(O_ALTFP|O_VERB, NULL);
2253 	indent_pop();
2254 	return (return_value);
2255 }
2256 
2257 static enum fme_state
2258 requirements_test(struct fme *fmep, struct event *ep,
2259     unsigned long long at_latest_by, unsigned long long *pdelay,
2260     struct arrow *arrowp)
2261 {
2262 	int waiting_events;
2263 	int credible_events;
2264 	enum fme_state return_value = FME_CREDIBLE;
2265 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
2266 	unsigned long long arrow_delay;
2267 	unsigned long long my_delay;
2268 	struct event *ep2;
2269 	struct bubble *bp;
2270 	struct arrowlist *ap;
2271 
2272 	stats_counter_bump(fmep->Rcallcount);
2273 	indent_push("  R");
2274 	indent();
2275 	out(O_ALTFP|O_VERB|O_NONL, "->");
2276 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2277 	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
2278 	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
2279 	out(O_ALTFP|O_VERB, NULL);
2280 
2281 	if (ep->t == N_EREPORT) {
2282 		if (ep->count == 0) {
2283 			if (fmep->pull >= at_latest_by) {
2284 				return_value = FME_DISPROVED;
2285 			} else {
2286 				*pdelay = at_latest_by;
2287 				return_value = FME_WAIT;
2288 			}
2289 		} else if (arrowp != NULL) {
2290 			/*
2291 			 * evaluate constraints only for current observation
2292 			 */
2293 			struct constraintlist *ctp;
2294 			struct evalue value;
2295 
2296 			platform_set_payloadnvp(ep->nvp);
2297 			for (ctp = arrowp->constraints; ctp != NULL;
2298 				ctp = ctp->next) {
2299 				if (eval_expr(ctp->cnode, NULL, NULL,
2300 				    &fmep->globals, fmep->cfgdata->cooked,
2301 				    arrowp, 0, &value) == 0 ||
2302 				    value.t == UNDEFINED || value.v == 0) {
2303 					indent();
2304 					out(O_ALTFP|O_VERB|O_NONL,
2305 					    "  False constraint ");
2306 					out(O_ALTFP|O_VERB|O_NONL, " ");
2307 					ptree(O_ALTFP|O_VERB|O_NONL,
2308 					    ctp->cnode, 1, 0);
2309 					out(O_ALTFP|O_VERB, NULL);
2310 					return_value = FME_DISPROVED;
2311 					break;
2312 				}
2313 			}
2314 			platform_set_payloadnvp(NULL);
2315 		}
2316 
2317 		indent();
2318 		switch (return_value) {
2319 		case FME_CREDIBLE:
2320 			out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE ");
2321 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2322 			break;
2323 		case FME_DISPROVED:
2324 			out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
2325 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2326 			break;
2327 		case FME_WAIT:
2328 			out(O_ALTFP|O_VERB|O_NONL, "<-WAIT ");
2329 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2330 			out(O_ALTFP|O_VERB|O_NONL, " to ");
2331 			ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
2332 			break;
2333 		default:
2334 			out(O_DIE, "requirements_test: unexpected fme_state");
2335 			break;
2336 		}
2337 		out(O_ALTFP|O_VERB, NULL);
2338 		indent_pop();
2339 
2340 		return (return_value);
2341 	}
2342 
2343 	/* this event is not a report, descend the tree */
2344 	for (bp = itree_next_bubble(ep, NULL); bp;
2345 	    bp = itree_next_bubble(ep, bp)) {
2346 		if (bp->t != B_FROM)
2347 			continue;
2348 		if (bp->mark == 0) {
2349 			int n = bp->nork;
2350 
2351 			bp->mark = 1;
2352 			credible_events = 0;
2353 			waiting_events = 0;
2354 			arrow_delay = TIMEVAL_EVENTUALLY;
2355 			/*
2356 			 * n is -1 for 'A' so adjust it.
2357 			 * XXX just count up the arrows for now.
2358 			 */
2359 			if (n < 0) {
2360 				n = 0;
2361 				for (ap = itree_next_arrow(bp, NULL); ap;
2362 				    ap = itree_next_arrow(bp, ap))
2363 					n++;
2364 				indent();
2365 				out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n);
2366 			} else {
2367 				indent();
2368 				out(O_ALTFP|O_VERB, " Bubble N=%d", n);
2369 			}
2370 
2371 			for (ap = itree_next_arrow(bp, NULL); ap;
2372 			    ap = itree_next_arrow(bp, ap)) {
2373 				ep2 = ap->arrowp->head->myevent;
2374 				if (n <= credible_events)
2375 					break;
2376 
2377 				if (triggered(fmep, ep2, 1))
2378 					/* XXX adding max timevals! */
2379 					switch (requirements_test(fmep, ep2,
2380 					    at_latest_by + ap->arrowp->maxdelay,
2381 					    &my_delay, ap->arrowp)) {
2382 					case FME_CREDIBLE:
2383 						credible_events++;
2384 						break;
2385 					case FME_DISPROVED:
2386 						break;
2387 					case FME_WAIT:
2388 						if (my_delay < arrow_delay)
2389 							arrow_delay = my_delay;
2390 						waiting_events++;
2391 						break;
2392 					default:
2393 						out(O_DIE,
2394 						"Bug in requirements_test.");
2395 					}
2396 				else
2397 					credible_events++;
2398 			}
2399 			indent();
2400 			out(O_ALTFP|O_VERB, " Credible: %d Waiting %d",
2401 			    credible_events, waiting_events);
2402 			if (credible_events + waiting_events < n) {
2403 				/* Can never meet requirements */
2404 				indent();
2405 				out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
2406 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2407 				out(O_ALTFP|O_VERB, NULL);
2408 				indent_pop();
2409 				return (FME_DISPROVED);
2410 			}
2411 			if (credible_events < n) { /* will have to wait */
2412 				/* wait time is shortest known */
2413 				if (arrow_delay < overall_delay)
2414 					overall_delay = arrow_delay;
2415 				return_value = FME_WAIT;
2416 			}
2417 		} else {
2418 			indent();
2419 			out(O_ALTFP|O_VERB|O_NONL, " Mark was set: ");
2420 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2421 			out(O_ALTFP|O_VERB|O_NONL, " to");
2422 			for (ap = itree_next_arrow(bp, NULL); ap;
2423 			    ap = itree_next_arrow(bp, ap)) {
2424 				out(O_ALTFP|O_VERB|O_NONL, " ");
2425 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
2426 				    ap->arrowp->head->myevent);
2427 			}
2428 			out(O_ALTFP|O_VERB, NULL);
2429 		}
2430 	}
2431 
2432 	/*
2433 	 * evaluate constraints for ctlist, which is the list of
2434 	 * constraints for the arrow pointing into this node of the tree
2435 	 */
2436 	if (return_value == FME_CREDIBLE && arrowp != NULL) {
2437 		struct constraintlist *ctp;
2438 		struct evalue value;
2439 
2440 		platform_set_payloadnvp(ep->nvp);
2441 		for (ctp = arrowp->constraints; ctp != NULL;
2442 			ctp = ctp->next) {
2443 			if (eval_expr(ctp->cnode, NULL,	NULL, &fmep->globals,
2444 			    fmep->cfgdata->cooked, arrowp, 0, &value) == 0 ||
2445 			    value.t == UNDEFINED || value.v == 0) {
2446 				indent();
2447 				out(O_ALTFP|O_VERB|O_NONL,
2448 				    "  False constraint ");
2449 				out(O_ALTFP|O_VERB|O_NONL, " ");
2450 				ptree(O_ALTFP|O_VERB|O_NONL,
2451 				    ctp->cnode, 1, 0);
2452 				out(O_ALTFP|O_VERB, NULL);
2453 				return_value = FME_DISPROVED;
2454 				break;
2455 			}
2456 		}
2457 		platform_set_payloadnvp(NULL);
2458 	}
2459 
2460 	if (return_value == FME_WAIT)
2461 		*pdelay = overall_delay;
2462 	indent();
2463 	out(O_ALTFP|O_VERB|O_NONL, "<-%s ", fme_state2str(return_value));
2464 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2465 	out(O_ALTFP|O_VERB, NULL);
2466 	indent_pop();
2467 	return (return_value);
2468 }
2469 
2470 static enum fme_state
2471 causes_test(struct fme *fmep, struct event *ep,
2472     unsigned long long at_latest_by, unsigned long long *pdelay)
2473 {
2474 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
2475 	unsigned long long my_delay;
2476 	int credible_results = 0;
2477 	int waiting_results = 0;
2478 	enum fme_state fstate;
2479 	struct event *tail_event;
2480 	struct bubble *bp;
2481 	struct arrowlist *ap;
2482 	int k = 1;
2483 
2484 	stats_counter_bump(fmep->Ccallcount);
2485 	indent_push("  C");
2486 	indent();
2487 	out(O_ALTFP|O_VERB|O_NONL, "->");
2488 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2489 	out(O_ALTFP|O_VERB, NULL);
2490 
2491 	for (bp = itree_next_bubble(ep, NULL); bp;
2492 	    bp = itree_next_bubble(ep, bp)) {
2493 		if (bp->t != B_TO)
2494 			continue;
2495 		k = bp->nork;	/* remember the K value */
2496 		for (ap = itree_next_arrow(bp, NULL); ap;
2497 		    ap = itree_next_arrow(bp, ap)) {
2498 			struct constraintlist *ctp;
2499 			struct evalue value;
2500 			int do_not_follow = 0;
2501 			/*
2502 			 * see if false constraint prevents us
2503 			 * from traversing this arrow
2504 			 */
2505 			platform_set_payloadnvp(ep->nvp);
2506 			for (ctp = ap->arrowp->constraints;
2507 			    ctp != NULL; ctp = ctp->next) {
2508 				if (eval_expr(ctp->cnode, NULL, NULL,
2509 				    &fmep->globals,
2510 				    fmep->cfgdata->cooked,
2511 				    ap->arrowp, 0,
2512 				    &value) == 0 ||
2513 				    value.t == UNDEFINED ||
2514 				    value.v == 0) {
2515 					do_not_follow = 1;
2516 					break;
2517 				}
2518 			}
2519 			platform_set_payloadnvp(NULL);
2520 			if (do_not_follow) {
2521 				indent();
2522 				out(O_ALTFP|O_VERB|O_NONL,
2523 				    "  False arrow from ");
2524 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
2525 				    ap->arrowp->tail->myevent);
2526 				out(O_ALTFP|O_VERB|O_NONL, " ");
2527 				ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
2528 				out(O_ALTFP|O_VERB, NULL);
2529 				continue;
2530 			}
2531 
2532 			if (ap->arrowp->causes_tested++ > 0) {
2533 				/*
2534 				 * get to this point if this is not the
2535 				 * first time we're going through this
2536 				 * arrow in the causes test.  consider this
2537 				 * branch to be credible and let the
2538 				 * credible/noncredible outcome depend on
2539 				 * the other branches in this cycle.
2540 				 */
2541 				fstate = FME_CREDIBLE;
2542 			} else {
2543 				/*
2544 				 * get to this point if this is the first
2545 				 * time we're going through this arrow.
2546 				 */
2547 				tail_event = ap->arrowp->tail->myevent;
2548 				fstate = hypothesise(fmep, tail_event,
2549 						    at_latest_by,
2550 						    &my_delay, ap->arrowp);
2551 			}
2552 
2553 			switch (fstate) {
2554 			case FME_WAIT:
2555 				if (my_delay < overall_delay)
2556 					overall_delay = my_delay;
2557 				waiting_results++;
2558 				break;
2559 			case FME_CREDIBLE:
2560 				credible_results++;
2561 				break;
2562 			case FME_DISPROVED:
2563 				break;
2564 			default:
2565 				out(O_DIE, "Bug in causes_test");
2566 			}
2567 
2568 			ap->arrowp->causes_tested--;
2569 			ASSERT(ap->arrowp->causes_tested >= 0);
2570 		}
2571 	}
2572 	/* compare against K */
2573 	if (credible_results + waiting_results < k) {
2574 		indent();
2575 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
2576 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2577 		out(O_ALTFP|O_VERB, NULL);
2578 		indent_pop();
2579 		return (FME_DISPROVED);
2580 	}
2581 	if (waiting_results != 0) {
2582 		*pdelay = overall_delay;
2583 		indent();
2584 		out(O_ALTFP|O_VERB|O_NONL, "<-WAIT ");
2585 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2586 		out(O_ALTFP|O_VERB|O_NONL, " to ");
2587 		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
2588 		out(O_ALTFP|O_VERB, NULL);
2589 		indent_pop();
2590 		return (FME_WAIT);
2591 	}
2592 	indent();
2593 	out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE ");
2594 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2595 	out(O_ALTFP|O_VERB, NULL);
2596 	indent_pop();
2597 	return (FME_CREDIBLE);
2598 }
2599 
2600 static enum fme_state
2601 hypothesise(struct fme *fmep, struct event *ep,
2602 	unsigned long long at_latest_by, unsigned long long *pdelay,
2603 	struct arrow *arrowp)
2604 {
2605 	enum fme_state rtr, otr;
2606 	unsigned long long my_delay;
2607 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
2608 
2609 	stats_counter_bump(fmep->Hcallcount);
2610 	indent_push("  H");
2611 	indent();
2612 	out(O_ALTFP|O_VERB|O_NONL, "->");
2613 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2614 	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
2615 	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
2616 	out(O_ALTFP|O_VERB, NULL);
2617 
2618 	rtr = requirements_test(fmep, ep, at_latest_by, &my_delay, arrowp);
2619 	mark_arrows(fmep, ep, 0); /* clean up after requirements test */
2620 	if ((rtr == FME_WAIT) && (my_delay < overall_delay))
2621 		overall_delay = my_delay;
2622 	if (rtr != FME_DISPROVED) {
2623 		if (is_problem(ep->t)) {
2624 			otr = effects_test(fmep, ep);
2625 			if (otr != FME_DISPROVED) {
2626 				if (fmep->peek == 0 && ep->is_suspect++ == 0) {
2627 					ep->suspects = fmep->suspects;
2628 					fmep->suspects = ep;
2629 					fmep->nsuspects++;
2630 					if (!is_fault(ep->t))
2631 						fmep->nonfault++;
2632 				}
2633 			}
2634 		} else
2635 			otr = causes_test(fmep, ep, at_latest_by, &my_delay);
2636 		if ((otr == FME_WAIT) && (my_delay < overall_delay))
2637 			overall_delay = my_delay;
2638 		if ((otr != FME_DISPROVED) &&
2639 		    ((rtr == FME_WAIT) || (otr == FME_WAIT)))
2640 			*pdelay = overall_delay;
2641 	}
2642 	if (rtr == FME_DISPROVED) {
2643 		indent();
2644 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
2645 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2646 		out(O_ALTFP|O_VERB, " (doesn't meet requirements)");
2647 		indent_pop();
2648 		return (FME_DISPROVED);
2649 	}
2650 	if ((otr == FME_DISPROVED) && is_problem(ep->t)) {
2651 		indent();
2652 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
2653 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2654 		out(O_ALTFP|O_VERB, " (doesn't explain all reports)");
2655 		indent_pop();
2656 		return (FME_DISPROVED);
2657 	}
2658 	if (otr == FME_DISPROVED) {
2659 		indent();
2660 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
2661 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2662 		out(O_ALTFP|O_VERB, " (causes are not credible)");
2663 		indent_pop();
2664 		return (FME_DISPROVED);
2665 	}
2666 	if ((rtr == FME_WAIT) || (otr == FME_WAIT)) {
2667 		indent();
2668 		out(O_ALTFP|O_VERB|O_NONL, "<-WAIT ");
2669 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2670 		out(O_ALTFP|O_VERB|O_NONL, " to ");
2671 		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay);
2672 		out(O_ALTFP|O_VERB, NULL);
2673 		indent_pop();
2674 		return (FME_WAIT);
2675 	}
2676 	indent();
2677 	out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE ");
2678 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
2679 	out(O_ALTFP|O_VERB, NULL);
2680 	indent_pop();
2681 	return (FME_CREDIBLE);
2682 }
2683