xref: /titanic_41/usr/src/cmd/fm/modules/common/eversholt/fme.c (revision ba2be53024c0b999e74ba9adcd7d80fec5df8c57)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  *
26  * fme.c -- fault management exercise module
27  *
28  * this module provides the simulated fault management exercise.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <strings.h>
37 #include <ctype.h>
38 #include <alloca.h>
39 #include <libnvpair.h>
40 #include <sys/fm/protocol.h>
41 #include <fm/fmd_api.h>
42 #include "alloc.h"
43 #include "out.h"
44 #include "stats.h"
45 #include "stable.h"
46 #include "literals.h"
47 #include "lut.h"
48 #include "tree.h"
49 #include "ptree.h"
50 #include "itree.h"
51 #include "ipath.h"
52 #include "fme.h"
53 #include "evnv.h"
54 #include "eval.h"
55 #include "config.h"
56 #include "platform.h"
57 #include "esclex.h"
58 
59 /* imported from eft.c... */
60 extern char *Autoclose;
61 extern int Dupclose;
62 extern hrtime_t Hesitate;
63 extern char *Serd_Override;
64 extern nv_alloc_t Eft_nv_hdl;
65 extern int Max_fme;
66 extern fmd_hdl_t *Hdl;
67 
68 static int Istat_need_save;
69 static int Serd_need_save;
70 void istat_save(void);
71 void serd_save(void);
72 
73 /* fme under construction is global so we can free it on module abort */
74 static struct fme *Nfmep;
75 
76 static const char *Undiag_reason;
77 
78 static int Nextid = 0;
79 
80 static int Open_fme_count = 0;	/* Count of open FMEs */
81 
/* list of fault management exercises underway */
static struct fme {
	struct fme *next;		/* next exercise */
	unsigned long long ull;		/* time when fme was created */
	int id;				/* FME id */
	struct config *config;		/* cooked configuration data */
	struct lut *eventtree;		/* propagation tree for this FME */
	/*
	 * The initial error report that created this FME is kept in
	 * two forms.  e0 points to the instance tree node and is used
	 * by fme_eval() as the starting point for the inference
	 * algorithm.  e0r is the event handle FMD passed to us when
	 * the ereport first arrived and is used when setting timers,
	 * which are always relative to the time of this initial
	 * report.
	 */
	struct event *e0;
	fmd_event_t *e0r;

	id_t    timer;			/* for setting an fmd time-out */

	struct event *ecurrent;		/* ereport under consideration */
	struct event *suspects;		/* current suspect list */
	struct event *psuspects;	/* previous suspect list */
	int nsuspects;			/* count of suspects */
	int nonfault;			/* zero if all suspects T_FAULT */
	int posted_suspects;		/* true if we've posted a diagnosis */
	int uniqobs;			/* number of unique events observed */
	int peek;			/* just peeking, don't track suspects */
	int overflow;			/* true if overflow FME */
	enum fme_state {
		FME_NOTHING = 5000,	/* not evaluated yet */
		FME_WAIT,		/* need to wait for more info */
		FME_CREDIBLE,		/* suspect list is credible */
		FME_DISPROVED,		/* no valid suspects found */
		FME_DEFERRED		/* don't know yet (k-count not met) */
	} state;

	unsigned long long pull;	/* time passed since created */
	unsigned long long wull;	/* wait until this time for re-eval */
	struct event *observations;	/* observation list */
	struct lut *globals;		/* values of global variables */
	/* fmd interfacing */
	fmd_hdl_t *hdl;			/* handle for talking with fmd */
	fmd_case_t *fmcase;		/* what fmd 'case' we associate with */
	/* stats (see fme_ready() for the description of each counter) */
	struct stats *Rcount;		/* ereports received */
	struct stats *Hcallcount;	/* calls to hypothesise() */
	struct stats *Rcallcount;	/* calls to requirements_test() */
	struct stats *Ccallcount;	/* calls to causes_test() */
	struct stats *Ecallcount;	/* calls to effects_test() */
	struct stats *Tcallcount;	/* calls to triggered() */
	struct stats *Marrowcount;	/* arrows marked by mark_arrows() */
	struct stats *diags;		/* suspect lists diagnosed */
} *FMElist, *EFMElist, *ClosedFMEs;	/* active head, active tail, closed */

/* cases found unrestartable at init; solved/closed as undiagnosable */
static struct case_list {
	fmd_case_t *fmcase;		/* the fmd case we gave up on */
	struct case_list *next;
} *Undiagablecaselist;
142 
143 static void fme_eval(struct fme *fmep, fmd_event_t *ffep);
144 static enum fme_state hypothesise(struct fme *fmep, struct event *ep,
145 	unsigned long long at_latest_by, unsigned long long *pdelay);
146 static struct node *eventprop_lookup(struct event *ep, const char *propname);
147 static struct node *pathstring2epnamenp(char *path);
148 static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep,
149 	fmd_case_t *fmcase);
150 static void restore_suspects(struct fme *fmep);
151 static void save_suspects(struct fme *fmep);
152 static void destroy_fme(struct fme *f);
153 static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
154     const char *eventstring, const struct ipath *ipp, nvlist_t *nvl);
155 static void istat_counter_reset_cb(struct istat_entry *entp,
156     struct stats *statp, const struct ipath *ipp);
157 static void istat_counter_topo_chg_cb(struct istat_entry *entp,
158     struct stats *statp, void *unused);
159 static void serd_reset_cb(struct serd_entry *entp, void *unused,
160     const struct ipath *ipp);
161 static void serd_topo_chg_cb(struct serd_entry *entp, void *unused,
162     void *unused2);
163 static void destroy_fme_bufs(struct fme *fp);
164 
165 static struct fme *
166 alloc_fme(void)
167 {
168 	struct fme *fmep;
169 
170 	fmep = MALLOC(sizeof (*fmep));
171 	bzero(fmep, sizeof (*fmep));
172 	return (fmep);
173 }
174 
175 /*
176  * fme_ready -- called when all initialization of the FME (except for
177  *	stats) has completed successfully.  Adds the fme to global lists
178  *	and establishes its stats.
179  */
180 static struct fme *
181 fme_ready(struct fme *fmep)
182 {
183 	char nbuf[100];
184 
185 	Nfmep = NULL;	/* don't need to free this on module abort now */
186 
187 	if (EFMElist) {
188 		EFMElist->next = fmep;
189 		EFMElist = fmep;
190 	} else
191 		FMElist = EFMElist = fmep;
192 
193 	(void) sprintf(nbuf, "fme%d.Rcount", fmep->id);
194 	fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
195 	(void) sprintf(nbuf, "fme%d.Hcall", fmep->id);
196 	fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1);
197 	(void) sprintf(nbuf, "fme%d.Rcall", fmep->id);
198 	fmep->Rcallcount = stats_new_counter(nbuf,
199 	    "calls to requirements_test()", 1);
200 	(void) sprintf(nbuf, "fme%d.Ccall", fmep->id);
201 	fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1);
202 	(void) sprintf(nbuf, "fme%d.Ecall", fmep->id);
203 	fmep->Ecallcount =
204 	    stats_new_counter(nbuf, "calls to effects_test()", 1);
205 	(void) sprintf(nbuf, "fme%d.Tcall", fmep->id);
206 	fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
207 	(void) sprintf(nbuf, "fme%d.Marrow", fmep->id);
208 	fmep->Marrowcount = stats_new_counter(nbuf,
209 	    "arrows marked by mark_arrows()", 1);
210 	(void) sprintf(nbuf, "fme%d.diags", fmep->id);
211 	fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);
212 
213 	out(O_ALTFP|O_VERB2, "newfme: config snapshot contains...");
214 	config_print(O_ALTFP|O_VERB2, fmep->config);
215 
216 	return (fmep);
217 }
218 
219 extern void ipath_dummy_lut(struct arrow *);
220 extern struct lut *itree_create_dummy(const char *, const struct ipath *);
221 
222 /* ARGSUSED */
223 static void
224 set_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
225 {
226 	struct bubble *bp;
227 	struct arrowlist *ap;
228 
229 	for (bp = itree_next_bubble(ep, NULL); bp;
230 	    bp = itree_next_bubble(ep, bp)) {
231 		if (bp->t != B_FROM)
232 			continue;
233 		for (ap = itree_next_arrow(bp, NULL); ap;
234 		    ap = itree_next_arrow(bp, ap)) {
235 			ap->arrowp->pnode->u.arrow.needed = 1;
236 			ipath_dummy_lut(ap->arrowp);
237 		}
238 	}
239 }
240 
241 /* ARGSUSED */
242 static void
243 unset_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
244 {
245 	struct bubble *bp;
246 	struct arrowlist *ap;
247 
248 	for (bp = itree_next_bubble(ep, NULL); bp;
249 	    bp = itree_next_bubble(ep, bp)) {
250 		if (bp->t != B_FROM)
251 			continue;
252 		for (ap = itree_next_arrow(bp, NULL); ap;
253 		    ap = itree_next_arrow(bp, ap))
254 			ap->arrowp->pnode->u.arrow.needed = 0;
255 	}
256 }
257 
258 static void globals_destructor(void *left, void *right, void *arg);
259 static void clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep);
260 
/*
 * prune_propagations -- dry-run the inference algorithm against a dummy
 *	instance tree for (e0class, e0ipp) and prune propagations that
 *	the evaluation never needed, trimming the tree before the real
 *	FME is built.  Uses the global Nfmep so the temporary fme can be
 *	freed on module abort.  Returns silently if e0 can't be found.
 */
static void
prune_propagations(const char *e0class, const struct ipath *e0ipp)
{
	char nbuf[100];
	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
	extern struct lut *Usednames;

	Nfmep = alloc_fme();
	Nfmep->id = Nextid;	/* note: Nextid NOT consumed; fme is temporary */
	Nfmep->state = FME_NOTHING;
	Nfmep->eventtree = itree_create_dummy(e0class, e0ipp);
	if ((Nfmep->e0 =
	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
		out(O_ALTFP, "prune_propagations: e0 not in instance tree");
		itree_free(Nfmep->eventtree);
		FREE(Nfmep);
		Nfmep = NULL;
		return;
	}
	Nfmep->ecurrent = Nfmep->observations = Nfmep->e0;
	Nfmep->e0->count++;

	/* stat counters must exist before hypothesise() runs */
	(void) sprintf(nbuf, "fme%d.Rcount", Nfmep->id);
	Nfmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
	(void) sprintf(nbuf, "fme%d.Hcall", Nfmep->id);
	Nfmep->Hcallcount =
	    stats_new_counter(nbuf, "calls to hypothesise()", 1);
	(void) sprintf(nbuf, "fme%d.Rcall", Nfmep->id);
	Nfmep->Rcallcount = stats_new_counter(nbuf,
	    "calls to requirements_test()", 1);
	(void) sprintf(nbuf, "fme%d.Ccall", Nfmep->id);
	Nfmep->Ccallcount =
	    stats_new_counter(nbuf, "calls to causes_test()", 1);
	(void) sprintf(nbuf, "fme%d.Ecall", Nfmep->id);
	Nfmep->Ecallcount =
	    stats_new_counter(nbuf, "calls to effects_test()", 1);
	(void) sprintf(nbuf, "fme%d.Tcall", Nfmep->id);
	Nfmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
	(void) sprintf(nbuf, "fme%d.Marrow", Nfmep->id);
	Nfmep->Marrowcount = stats_new_counter(nbuf,
	    "arrows marked by mark_arrows()", 1);
	(void) sprintf(nbuf, "fme%d.diags", Nfmep->id);
	Nfmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);

	/*
	 * Evaluate in "peek" mode: arrows are first cleared of any stale
	 * "needed" marks, hypothesise() sets them as it explores, then
	 * itree_prune() drops what was never needed and the survivors
	 * are re-marked.  Usednames is reset so only names this run
	 * touches remain recorded.
	 */
	Nfmep->peek = 1;
	lut_walk(Nfmep->eventtree, (lut_cb)unset_needed_arrows, (void *)Nfmep);
	lut_free(Usednames, NULL, NULL);
	Usednames = NULL;
	lut_walk(Nfmep->eventtree, (lut_cb)clear_arrows, (void *)Nfmep);
	(void) hypothesise(Nfmep, Nfmep->e0, Nfmep->ull, &my_delay);
	itree_prune(Nfmep->eventtree);
	lut_walk(Nfmep->eventtree, (lut_cb)set_needed_arrows, (void *)Nfmep);

	/* tear down the temporary fme; it never reached fme_ready() */
	stats_delete(Nfmep->Rcount);
	stats_delete(Nfmep->Hcallcount);
	stats_delete(Nfmep->Rcallcount);
	stats_delete(Nfmep->Ccallcount);
	stats_delete(Nfmep->Ecallcount);
	stats_delete(Nfmep->Tcallcount);
	stats_delete(Nfmep->Marrowcount);
	stats_delete(Nfmep->diags);
	itree_free(Nfmep->eventtree);
	lut_free(Nfmep->globals, globals_destructor, NULL);
	FREE(Nfmep);
}
326 
/*
 * newfme -- build a new FME for ereport class e0class at instance path
 *	e0ipp.  Snapshots and saves the platform config, creates the
 *	instance tree and locates e0 within it.  Returns the fme via
 *	fme_ready(), or NULL with Undiag_reason set on failure.
 */
static struct fme *
newfme(const char *e0class, const struct ipath *e0ipp, fmd_hdl_t *hdl,
	fmd_case_t *fmcase)
{
	struct cfgdata *cfgdata;
	int init_size;
	extern int alloc_total();

	/* init_size lets us report how much memory the snapshot costs */
	init_size = alloc_total();
	out(O_ALTFP|O_STAMP, "start config_snapshot using %d bytes", init_size);
	if ((cfgdata = config_snapshot()) == NULL) {
		out(O_ALTFP, "newfme: NULL configuration");
		Undiag_reason = UD_NOCONF;
		return (NULL);
	}
	platform_save_config(hdl, fmcase);
	out(O_ALTFP|O_STAMP, "config_snapshot added %d bytes",
	    alloc_total() - init_size);

	Nfmep = alloc_fme();

	Nfmep->id = Nextid++;
	/* keep only the cooked form; config_free() drops the raw data */
	Nfmep->config = cfgdata->cooked;
	config_free(cfgdata);
	Nfmep->posted_suspects = 0;
	Nfmep->uniqobs = 0;
	Nfmep->state = FME_NOTHING;
	Nfmep->pull = 0ULL;
	Nfmep->overflow = 0;

	Nfmep->fmcase = fmcase;
	Nfmep->hdl = hdl;

	if ((Nfmep->eventtree = itree_create(Nfmep->config)) == NULL) {
		out(O_ALTFP, "newfme: NULL instance tree");
		Undiag_reason = UD_INSTFAIL;
		structconfig_free(Nfmep->config);
		destroy_fme_bufs(Nfmep);
		FREE(Nfmep);
		Nfmep = NULL;
		return (NULL);
	}

	itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree);

	if ((Nfmep->e0 =
	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
		out(O_ALTFP, "newfme: e0 not in instance tree");
		Undiag_reason = UD_BADEVENTI;
		itree_free(Nfmep->eventtree);
		structconfig_free(Nfmep->config);
		destroy_fme_bufs(Nfmep);
		FREE(Nfmep);
		Nfmep = NULL;
		return (NULL);
	}

	return (fme_ready(Nfmep));
}
386 
387 void
388 fme_fini(void)
389 {
390 	struct fme *sfp, *fp;
391 	struct case_list *ucasep, *nextcasep;
392 
393 	ucasep = Undiagablecaselist;
394 	while (ucasep != NULL) {
395 		nextcasep = ucasep->next;
396 		FREE(ucasep);
397 		ucasep = nextcasep;
398 	}
399 	Undiagablecaselist = NULL;
400 
401 	/* clean up closed fmes */
402 	fp = ClosedFMEs;
403 	while (fp != NULL) {
404 		sfp = fp->next;
405 		destroy_fme(fp);
406 		fp = sfp;
407 	}
408 	ClosedFMEs = NULL;
409 
410 	fp = FMElist;
411 	while (fp != NULL) {
412 		sfp = fp->next;
413 		destroy_fme(fp);
414 		fp = sfp;
415 	}
416 	FMElist = EFMElist = NULL;
417 
418 	/* if we were in the middle of creating an fme, free it now */
419 	if (Nfmep) {
420 		destroy_fme(Nfmep);
421 		Nfmep = NULL;
422 	}
423 }
424 
425 /*
426  * Allocated space for a buffer name.  20 bytes allows for
427  * a ridiculous 9,999,999 unique observations.
428  */
429 #define	OBBUFNMSZ 20
430 
431 /*
432  *  serialize_observation
433  *
434  *  Create a recoverable version of the current observation
435  *  (f->ecurrent).  We keep a serialized version of each unique
436  *  observation in order that we may resume correctly the fme in the
437  *  correct state if eft or fmd crashes and we're restarted.
438  */
static void
serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp)
{
	size_t pkdlen;
	char tmpbuf[OBBUFNMSZ];
	char *pkd = NULL;
	char *estr;

	/* save "class@path" under buffer name "observed<n>" */
	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs);
	estr = ipath2str(cls, ipp);
	fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1);
	fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr,
	    strlen(estr) + 1);
	FREE(estr);

	/* if the ereport has an nvlist, save its packed XDR form too */
	if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) {
		(void) snprintf(tmpbuf,
		    OBBUFNMSZ, "observed%d.nvp", fp->uniqobs);
		if (nvlist_xpack(fp->ecurrent->nvp,
		    &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0)
			out(O_DIE|O_SYS, "pack of observed nvl failed");
		fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen);
		fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen);
		FREE(pkd);
	}

	/* bump and persist the observation count so restart matches */
	fp->uniqobs++;
	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
	    sizeof (fp->uniqobs));
}
469 
470 /*
471  *  init_fme_bufs -- We keep several bits of state about an fme for
472  *	use if eft or fmd crashes and we're restarted.
473  */
474 static void
475 init_fme_bufs(struct fme *fp)
476 {
477 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull));
478 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull,
479 	    sizeof (fp->pull));
480 
481 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id));
482 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id,
483 	    sizeof (fp->id));
484 
485 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs));
486 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
487 	    sizeof (fp->uniqobs));
488 
489 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD,
490 	    sizeof (fp->posted_suspects));
491 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD,
492 	    (void *)&fp->posted_suspects, sizeof (fp->posted_suspects));
493 }
494 
495 static void
496 destroy_fme_bufs(struct fme *fp)
497 {
498 	char tmpbuf[OBBUFNMSZ];
499 	int o;
500 
501 	platform_restore_config(fp->hdl, fp->fmcase);
502 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN);
503 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG);
504 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL);
505 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID);
506 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD);
507 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS);
508 
509 	for (o = 0; o < fp->uniqobs; o++) {
510 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o);
511 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
512 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o);
513 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
514 	}
515 }
516 
517 /*
518  * reconstitute_observations -- convert a case's serialized observations
519  *	back into struct events.  Returns zero if all observations are
520  *	successfully reconstituted.
521  */
522 static int
523 reconstitute_observations(struct fme *fmep)
524 {
525 	struct event *ep;
526 	struct node *epnamenp = NULL;
527 	size_t pkdlen;
528 	char *pkd = NULL;
529 	char *tmpbuf = alloca(OBBUFNMSZ);
530 	char *sepptr;
531 	char *estr;
532 	int ocnt;
533 	int elen;
534 
535 	for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) {
536 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt);
537 		elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
538 		if (elen == 0) {
539 			out(O_ALTFP,
540 			    "reconstitute_observation: no %s buffer found.",
541 			    tmpbuf);
542 			Undiag_reason = UD_MISSINGOBS;
543 			break;
544 		}
545 
546 		estr = MALLOC(elen);
547 		fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
548 		sepptr = strchr(estr, '@');
549 		if (sepptr == NULL) {
550 			out(O_ALTFP,
551 			    "reconstitute_observation: %s: "
552 			    "missing @ separator in %s.",
553 			    tmpbuf, estr);
554 			Undiag_reason = UD_MISSINGPATH;
555 			FREE(estr);
556 			break;
557 		}
558 
559 		*sepptr = '\0';
560 		if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
561 			out(O_ALTFP,
562 			    "reconstitute_observation: %s: "
563 			    "trouble converting path string \"%s\" "
564 			    "to internal representation.",
565 			    tmpbuf, sepptr + 1);
566 			Undiag_reason = UD_MISSINGPATH;
567 			FREE(estr);
568 			break;
569 		}
570 
571 		/* construct the event */
572 		ep = itree_lookup(fmep->eventtree,
573 		    stable(estr), ipath(epnamenp));
574 		if (ep == NULL) {
575 			out(O_ALTFP,
576 			    "reconstitute_observation: %s: "
577 			    "lookup of  \"%s\" in itree failed.",
578 			    tmpbuf, ipath2str(estr, ipath(epnamenp)));
579 			Undiag_reason = UD_BADOBS;
580 			tree_free(epnamenp);
581 			FREE(estr);
582 			break;
583 		}
584 		tree_free(epnamenp);
585 
586 		/*
587 		 * We may or may not have a saved nvlist for the observation
588 		 */
589 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt);
590 		pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
591 		if (pkdlen != 0) {
592 			pkd = MALLOC(pkdlen);
593 			fmd_buf_read(fmep->hdl,
594 			    fmep->fmcase, tmpbuf, pkd, pkdlen);
595 			ASSERT(ep->nvp == NULL);
596 			if (nvlist_xunpack(pkd,
597 			    pkdlen, &ep->nvp, &Eft_nv_hdl) != 0)
598 				out(O_DIE|O_SYS, "pack of observed nvl failed");
599 			FREE(pkd);
600 		}
601 
602 		if (ocnt == 0)
603 			fmep->e0 = ep;
604 
605 		FREE(estr);
606 		fmep->ecurrent = ep;
607 		ep->count++;
608 
609 		/* link it into list of observations seen */
610 		ep->observations = fmep->observations;
611 		fmep->observations = ep;
612 	}
613 
614 	if (ocnt == fmep->uniqobs) {
615 		(void) fme_ready(fmep);
616 		return (0);
617 	}
618 
619 	return (1);
620 }
621 
622 /*
623  * restart_fme -- called during eft initialization.  Reconstitutes
624  *	an in-progress fme.
625  */
void
fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress)
{
	nvlist_t *defect;
	struct case_list *bad;
	struct fme *fmep;
	struct cfgdata *cfgdata;
	size_t rawsz;
	struct event *ep;
	char *tmpbuf = alloca(OBBUFNMSZ);
	char *sepptr;
	char *estr;
	int elen;
	struct node *epnamenp = NULL;
	int init_size;
	extern int alloc_total();

	/*
	 * ignore solved or closed cases
	 */
	if (fmd_case_solved(hdl, inprogress) ||
	    fmd_case_closed(hdl, inprogress))
		return;

	fmep = alloc_fme();
	fmep->fmcase = inprogress;
	fmep->hdl = hdl;

	/* read back each persistent buffer written by init_fme_bufs() */
	if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) {
		out(O_ALTFP, "restart_fme: no saved posted status");
		Undiag_reason = UD_MISSINGINFO;
		goto badcase;
	} else {
		fmd_buf_read(hdl, inprogress, WOBUF_POSTD,
		    (void *)&fmep->posted_suspects,
		    sizeof (fmep->posted_suspects));
	}

	if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) {
		out(O_ALTFP, "restart_fme: no saved id");
		Undiag_reason = UD_MISSINGINFO;
		goto badcase;
	} else {
		fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id,
		    sizeof (fmep->id));
	}
	/* make sure future FMEs don't reuse this id */
	if (Nextid <= fmep->id)
		Nextid = fmep->id + 1;

	out(O_ALTFP, "Replay FME %d", fmep->id);

	if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) {
		out(O_ALTFP, "restart_fme: No config data");
		Undiag_reason = UD_MISSINGINFO;
		goto badcase;
	}
	fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz,
	    sizeof (size_t));

	if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) {
		out(O_ALTFP, "restart_fme: No event zero");
		Undiag_reason = UD_MISSINGZERO;
		goto badcase;
	}

	if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) {
		out(O_ALTFP, "restart_fme: no saved wait time");
		Undiag_reason = UD_MISSINGINFO;
		goto badcase;
	} else {
		fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull,
		    sizeof (fmep->pull));
	}

	if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) {
		out(O_ALTFP, "restart_fme: no count of observations");
		Undiag_reason = UD_MISSINGINFO;
		goto badcase;
	} else {
		fmd_buf_read(hdl, inprogress, WOBUF_NOBS,
		    (void *)&fmep->uniqobs, sizeof (fmep->uniqobs));
	}

	/*
	 * Parse the first saved observation ("observed0", written by
	 * serialize_observation() as "class@path") so the propagation
	 * tree can be pruned before it is rebuilt below.
	 */
	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed0");
	elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
	if (elen == 0) {
		out(O_ALTFP, "reconstitute_observation: no %s buffer found.",
		    tmpbuf);
		Undiag_reason = UD_MISSINGOBS;
		goto badcase;
	}
	estr = MALLOC(elen);
	fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
	sepptr = strchr(estr, '@');
	if (sepptr == NULL) {
		out(O_ALTFP, "reconstitute_observation: %s: "
		    "missing @ separator in %s.",
		    tmpbuf, estr);
		Undiag_reason = UD_MISSINGPATH;
		FREE(estr);
		goto badcase;
	}
	*sepptr = '\0';
	if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
		out(O_ALTFP, "reconstitute_observation: %s: "
		    "trouble converting path string \"%s\" "
		    "to internal representation.", tmpbuf, sepptr + 1);
		Undiag_reason = UD_MISSINGPATH;
		FREE(estr);
		goto badcase;
	}
	prune_propagations(stable(estr), ipath(epnamenp));
	tree_free(epnamenp);
	FREE(estr);

	/* rebuild the cooked configuration from the saved raw snapshot */
	init_size = alloc_total();
	out(O_ALTFP|O_STAMP, "start config_restore using %d bytes", init_size);
	cfgdata = MALLOC(sizeof (struct cfgdata));
	cfgdata->cooked = NULL;
	cfgdata->devcache = NULL;
	cfgdata->cpucache = NULL;
	cfgdata->raw_refcnt = 1;

	if (rawsz > 0) {
		if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) {
			out(O_ALTFP, "restart_fme: Config data size mismatch");
			Undiag_reason = UD_CFGMISMATCH;
			/*
			 * NOTE(review): cfgdata allocated above is not
			 * released on this path — looks like a leak; confirm.
			 */
			goto badcase;
		}
		cfgdata->begin = MALLOC(rawsz);
		cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz;
		fmd_buf_read(hdl,
		    inprogress, WOBUF_CFG, cfgdata->begin, rawsz);
	} else {
		cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL;
	}

	config_cook(cfgdata);
	fmep->config = cfgdata->cooked;
	config_free(cfgdata);
	out(O_ALTFP|O_STAMP, "config_restore added %d bytes",
	    alloc_total() - init_size);

	if ((fmep->eventtree = itree_create(fmep->config)) == NULL) {
		/* case not properly saved or irretrievable */
		out(O_ALTFP, "restart_fme: NULL instance tree");
		Undiag_reason = UD_INSTFAIL;
		goto badcase;
	}

	itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree);

	if (reconstitute_observations(fmep) != 0)
		goto badcase;

	out(O_ALTFP|O_NONL, "FME %d replay observations: ", fmep->id);
	for (ep = fmep->observations; ep; ep = ep->observations) {
		out(O_ALTFP|O_NONL, " ");
		itree_pevent_brief(O_ALTFP|O_NONL, ep);
	}
	out(O_ALTFP, NULL);

	Open_fme_count++;

	/* give the diagnosis algorithm a shot at the new FME state */
	fme_eval(fmep, fmep->e0r);
	return;

badcase:
	if (fmep->eventtree != NULL)
		itree_free(fmep->eventtree);
	if (fmep->config)
		structconfig_free(fmep->config);
	destroy_fme_bufs(fmep);
	FREE(fmep);

	/*
	 * Since we're unable to restart the case, add it to the undiagable
	 * list and solve and close it as appropriate.
	 */
	bad = MALLOC(sizeof (struct case_list));
	bad->next = NULL;

	if (Undiagablecaselist != NULL)
		bad->next = Undiagablecaselist;
	Undiagablecaselist = bad;
	bad->fmcase = inprogress;

	out(O_ALTFP|O_NONL, "[case %s (unable to restart), ",
	    fmd_case_uuid(hdl, bad->fmcase));

	if (fmd_case_solved(hdl, bad->fmcase)) {
		out(O_ALTFP|O_NONL, "already solved, ");
	} else {
		out(O_ALTFP|O_NONL, "solving, ");
		defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
		    NULL, NULL, NULL);
		if (Undiag_reason != NULL)
			(void) nvlist_add_string(defect,
			    UNDIAG_REASON, Undiag_reason);
		fmd_case_add_suspect(hdl, bad->fmcase, defect);
		fmd_case_solve(hdl, bad->fmcase);
	}

	if (fmd_case_closed(hdl, bad->fmcase)) {
		out(O_ALTFP, "already closed ]");
	} else {
		out(O_ALTFP, "closing ]");
		fmd_case_close(hdl, bad->fmcase);
	}
}
837 
/*ARGSUSED*/
/*
 * globals_destructor -- lut_free() callback for the fme's globals lut.
 *	right is the struct evalue stored for a global variable; free
 *	any node it references, then free the evalue itself.
 */
static void
globals_destructor(void *left, void *right, void *arg)
{
	struct evalue *evp = (struct evalue *)right;
	if (evp->t == NODEPTR)
		tree_free((struct node *)(uintptr_t)evp->v);
	/* NOTE(review): v looks non-pointer (cast above); assigning NULL
	 * presumably relies on NULL being plain 0 here — confirm. */
	evp->v = NULL;
	FREE(evp);
}
848 
849 void
850 destroy_fme(struct fme *f)
851 {
852 	stats_delete(f->Rcount);
853 	stats_delete(f->Hcallcount);
854 	stats_delete(f->Rcallcount);
855 	stats_delete(f->Ccallcount);
856 	stats_delete(f->Ecallcount);
857 	stats_delete(f->Tcallcount);
858 	stats_delete(f->Marrowcount);
859 	stats_delete(f->diags);
860 
861 	if (f->eventtree != NULL)
862 		itree_free(f->eventtree);
863 	if (f->config)
864 		structconfig_free(f->config);
865 	lut_free(f->globals, globals_destructor, NULL);
866 	FREE(f);
867 }
868 
869 static const char *
870 fme_state2str(enum fme_state s)
871 {
872 	switch (s) {
873 	case FME_NOTHING:	return ("NOTHING");
874 	case FME_WAIT:		return ("WAIT");
875 	case FME_CREDIBLE:	return ("CREDIBLE");
876 	case FME_DISPROVED:	return ("DISPROVED");
877 	case FME_DEFERRED:	return ("DEFERRED");
878 	default:		return ("UNKNOWN");
879 	}
880 }
881 
882 static int
883 is_problem(enum nametype t)
884 {
885 	return (t == N_FAULT || t == N_DEFECT || t == N_UPSET);
886 }
887 
888 static int
889 is_fault(enum nametype t)
890 {
891 	return (t == N_FAULT);
892 }
893 
894 static int
895 is_defect(enum nametype t)
896 {
897 	return (t == N_DEFECT);
898 }
899 
900 static int
901 is_upset(enum nametype t)
902 {
903 	return (t == N_UPSET);
904 }
905 
906 static void
907 fme_print(int flags, struct fme *fmep)
908 {
909 	struct event *ep;
910 
911 	out(flags, "Fault Management Exercise %d", fmep->id);
912 	out(flags, "\t       State: %s", fme_state2str(fmep->state));
913 	out(flags|O_NONL, "\t  Start time: ");
914 	ptree_timeval(flags|O_NONL, &fmep->ull);
915 	out(flags, NULL);
916 	if (fmep->wull) {
917 		out(flags|O_NONL, "\t   Wait time: ");
918 		ptree_timeval(flags|O_NONL, &fmep->wull);
919 		out(flags, NULL);
920 	}
921 	out(flags|O_NONL, "\t          E0: ");
922 	if (fmep->e0)
923 		itree_pevent_brief(flags|O_NONL, fmep->e0);
924 	else
925 		out(flags|O_NONL, "NULL");
926 	out(flags, NULL);
927 	out(flags|O_NONL, "\tObservations:");
928 	for (ep = fmep->observations; ep; ep = ep->observations) {
929 		out(flags|O_NONL, " ");
930 		itree_pevent_brief(flags|O_NONL, ep);
931 	}
932 	out(flags, NULL);
933 	out(flags|O_NONL, "\tSuspect list:");
934 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
935 		out(flags|O_NONL, " ");
936 		itree_pevent_brief(flags|O_NONL, ep);
937 	}
938 	out(flags, NULL);
939 	if (fmep->eventtree != NULL) {
940 		out(flags|O_VERB2, "\t        Tree:");
941 		itree_ptree(flags|O_VERB2, fmep->eventtree);
942 	}
943 }
944 
945 static struct node *
946 pathstring2epnamenp(char *path)
947 {
948 	char *sep = "/";
949 	struct node *ret;
950 	char *ptr;
951 
952 	if ((ptr = strtok(path, sep)) == NULL)
953 		out(O_DIE, "pathstring2epnamenp: invalid empty class");
954 
955 	ret = tree_iname(stable(ptr), NULL, 0);
956 
957 	while ((ptr = strtok(NULL, sep)) != NULL)
958 		ret = tree_name_append(ret,
959 		    tree_iname(stable(ptr), NULL, 0));
960 
961 	return (ret);
962 }
963 
964 /*
965  * for a given upset sp, increment the corresponding SERD engine.  if the
966  * SERD engine trips, return the ename and ipp of the resulting ereport.
967  * returns true if engine tripped and *enamep and *ippp were filled in.
968  */
969 static int
970 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep,
971     fmd_case_t *fmcase, struct event *sp, const char **enamep,
972     const struct ipath **ippp)
973 {
974 	struct node *serdinst;
975 	char *serdname;
976 	struct node *nid;
977 	struct serd_entry *newentp;
978 
979 	ASSERT(sp->t == N_UPSET);
980 	ASSERT(ffep != NULL);
981 
982 	/*
983 	 * obtain instanced SERD engine from the upset sp.  from this
984 	 * derive serdname, the string used to identify the SERD engine.
985 	 */
986 	serdinst = eventprop_lookup(sp, L_engine);
987 
988 	if (serdinst == NULL)
989 		return (NULL);
990 
991 	serdname = ipath2str(serdinst->u.stmt.np->u.event.ename->u.name.s,
992 	    ipath(serdinst->u.stmt.np->u.event.epname));
993 
994 	/* handle serd engine "id" property, if there is one */
995 	if ((nid =
996 	    lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) {
997 		struct evalue *gval;
998 		char suffixbuf[200];
999 		char *suffix;
1000 		char *nserdname;
1001 		size_t nname;
1002 
1003 		out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname);
1004 		ptree_name_iter(O_ALTFP|O_NONL, nid);
1005 
1006 		ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t));
1007 
1008 		if ((gval = lut_lookup(fmep->globals,
1009 		    (void *)nid->u.globid.s, NULL)) == NULL) {
1010 			out(O_ALTFP, " undefined");
1011 		} else if (gval->t == UINT64) {
1012 			out(O_ALTFP, " %llu", gval->v);
1013 			(void) sprintf(suffixbuf, "%llu", gval->v);
1014 			suffix = suffixbuf;
1015 		} else {
1016 			out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v);
1017 			suffix = (char *)(uintptr_t)gval->v;
1018 		}
1019 
1020 		nname = strlen(serdname) + strlen(suffix) + 2;
1021 		nserdname = MALLOC(nname);
1022 		(void) snprintf(nserdname, nname, "%s:%s", serdname, suffix);
1023 		FREE(serdname);
1024 		serdname = nserdname;
1025 	}
1026 
1027 	if (!fmd_serd_exists(hdl, serdname)) {
1028 		struct node *nN, *nT;
1029 		const char *s;
1030 		struct node *nodep;
1031 		struct config *cp;
1032 		char *path;
1033 		uint_t nval;
1034 		hrtime_t tval;
1035 		const char *name;
1036 		char *serd_name;
1037 		int i;
1038 		char *ptr;
1039 		int got_n_override = 0, got_t_override = 0;
1040 
1041 		/* no SERD engine yet, so create it */
1042 		nodep = serdinst->u.stmt.np->u.event.epname;
1043 		name = serdinst->u.stmt.np->u.event.ename->u.name.s;
1044 		path = ipath2str(NULL, ipath(nodep));
1045 		cp = config_lookup(fmep->config, path, 0);
1046 		FREE((void *)path);
1047 
1048 		/*
1049 		 * We allow serd paramaters to be overridden, either from
1050 		 * eft.conf file values (if Serd_Override is set) or from
1051 		 * driver properties (for "serd.io.device" engines).
1052 		 */
1053 		if (Serd_Override != NULL) {
1054 			char *save_ptr, *ptr1, *ptr2, *ptr3;
1055 			ptr3 = save_ptr = STRDUP(Serd_Override);
1056 			while (*ptr3 != '\0') {
1057 				ptr1 = strchr(ptr3, ',');
1058 				*ptr1 = '\0';
1059 				if (strcmp(ptr3, name) == 0) {
1060 					ptr2 =  strchr(ptr1 + 1, ',');
1061 					*ptr2 = '\0';
1062 					nval = atoi(ptr1 + 1);
1063 					out(O_ALTFP, "serd override %s_n %d",
1064 					    name, nval);
1065 					ptr3 =  strchr(ptr2 + 1, ' ');
1066 					if (ptr3)
1067 						*ptr3 = '\0';
1068 					ptr = STRDUP(ptr2 + 1);
1069 					out(O_ALTFP, "serd override %s_t %s",
1070 					    name, ptr);
1071 					got_n_override = 1;
1072 					got_t_override = 1;
1073 					break;
1074 				} else {
1075 					ptr2 =  strchr(ptr1 + 1, ',');
1076 					ptr3 =  strchr(ptr2 + 1, ' ');
1077 					if (ptr3 == NULL)
1078 						break;
1079 				}
1080 				ptr3++;
1081 			}
1082 			FREE(save_ptr);
1083 		}
1084 
1085 		if (cp && got_n_override == 0) {
1086 			/*
1087 			 * convert serd engine name into property name
1088 			 */
1089 			serd_name = MALLOC(strlen(name) + 3);
1090 			for (i = 0; i < strlen(name); i++) {
1091 				if (name[i] == '.')
1092 					serd_name[i] = '_';
1093 				else
1094 					serd_name[i] = name[i];
1095 			}
1096 			serd_name[i++] = '_';
1097 			serd_name[i++] = 'n';
1098 			serd_name[i] = '\0';
1099 			if (s = config_getprop(cp, serd_name)) {
1100 				nval = atoi(s);
1101 				out(O_ALTFP, "serd override %s_n %s", name, s);
1102 				got_n_override = 1;
1103 			}
1104 			serd_name[i - 1] = 't';
1105 			if (s = config_getprop(cp, serd_name)) {
1106 				ptr = STRDUP(s);
1107 				out(O_ALTFP, "serd override %s_t %s", name, s);
1108 				got_t_override = 1;
1109 			}
1110 			FREE(serd_name);
1111 		}
1112 
1113 		if (!got_n_override) {
1114 			nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N,
1115 			    NULL);
1116 			ASSERT(nN->t == T_NUM);
1117 			nval = (uint_t)nN->u.ull;
1118 		}
1119 		if (!got_t_override) {
1120 			nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T,
1121 			    NULL);
1122 			ASSERT(nT->t == T_TIMEVAL);
1123 			tval = (hrtime_t)nT->u.ull;
1124 		} else {
1125 			const unsigned long long *ullp;
1126 			const char *suffix;
1127 			int len;
1128 
1129 			len = strspn(ptr, "0123456789");
1130 			suffix = stable(&ptr[len]);
1131 			ullp = (unsigned long long *)lut_lookup(Timesuffixlut,
1132 			    (void *)suffix, NULL);
1133 			ptr[len] = '\0';
1134 			tval = (unsigned long long)strtoul(ptr, NULL, 0) *
1135 			    (ullp ? *ullp : 1ll);
1136 			FREE(ptr);
1137 		}
1138 		fmd_serd_create(hdl, serdname, nval, tval);
1139 	}
1140 
1141 	newentp = MALLOC(sizeof (*newentp));
1142 	newentp->ename = stable(serdinst->u.stmt.np->u.event.ename->u.name.s);
1143 	newentp->ipath = ipath(serdinst->u.stmt.np->u.event.epname);
1144 	newentp->hdl = hdl;
1145 	if (lut_lookup(SerdEngines, newentp, (lut_cmp)serd_cmp) == NULL) {
1146 		SerdEngines = lut_add(SerdEngines, (void *)newentp,
1147 		    (void *)newentp, (lut_cmp)serd_cmp);
1148 		Serd_need_save = 1;
1149 		serd_save();
1150 	} else {
1151 		FREE(newentp);
1152 	}
1153 
1154 
1155 	/*
1156 	 * increment SERD engine.  if engine fires, reset serd
1157 	 * engine and return trip_strcode
1158 	 */
1159 	if (fmd_serd_record(hdl, serdname, ffep)) {
1160 		struct node *tripinst = lut_lookup(serdinst->u.stmt.lutp,
1161 		    (void *)L_trip, NULL);
1162 
1163 		ASSERT(tripinst != NULL);
1164 
1165 		*enamep = tripinst->u.event.ename->u.name.s;
1166 		*ippp = ipath(tripinst->u.event.epname);
1167 
1168 		fmd_case_add_serd(hdl, fmcase, serdname);
1169 		fmd_serd_reset(hdl, serdname);
1170 		out(O_ALTFP|O_NONL, "[engine fired: %s, sending: ", serdname);
1171 		ipath_print(O_ALTFP|O_NONL, *enamep, *ippp);
1172 		out(O_ALTFP, "]");
1173 
1174 		FREE(serdname);
1175 		return (1);
1176 	}
1177 
1178 	FREE(serdname);
1179 	return (0);
1180 }
1181 
1182 /*
1183  * search a suspect list for upsets.  feed each upset to serd_eval() and
1184  * build up tripped[], an array of ereports produced by the firing of
1185  * any SERD engines.  then feed each ereport back into
1186  * fme_receive_report().
1187  *
1188  * returns ntrip, the number of these ereports produced.
1189  */
static int
upsets_eval(struct fme *fmep, fmd_event_t *ffep)
{
	/* we build an array of tripped ereports that we send ourselves */
	struct {
		const char *ename;
		const struct ipath *ipp;
	} *tripped;
	struct event *sp;
	int ntrip, nupset, i;

	/*
	 * count the number of upsets to determine the upper limit on
	 * expected trip ereport strings.  remember that one upset can
	 * lead to at most one ereport.
	 */
	nupset = 0;
	for (sp = fmep->suspects; sp; sp = sp->suspects) {
		if (sp->t == N_UPSET)
			nupset++;
	}

	if (nupset == 0)
		return (0);

	/*
	 * get to this point if we have upsets and expect some trip
	 * ereports
	 */
	tripped = alloca(sizeof (*tripped) * nupset);
	bzero((void *)tripped, sizeof (*tripped) * nupset);

	/* feed each upset to its SERD engine; collect any trip ereports */
	ntrip = 0;
	for (sp = fmep->suspects; sp; sp = sp->suspects)
		if (sp->t == N_UPSET &&
		    serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp,
		    &tripped[ntrip].ename, &tripped[ntrip].ipp))
			ntrip++;

	/* now "receive" each trip ereport via a duplicate FME */
	for (i = 0; i < ntrip; i++) {
		struct event *ep, *nep;
		struct fme *nfmep;
		fmd_case_t *fmcase;
		const struct ipath *ipp;
		const char *eventstring;
		int prev_verbose;
		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
		enum fme_state state;

		/*
		 * First try and evaluate a case with the trip ereport plus
		 * all the other ereports that cause the trip. If that fails
		 * to evaluate then try again with just this ereport on its own.
		 */
		out(O_ALTFP|O_NONL, "fme_receive_report_serd: ");
		ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp);
		out(O_ALTFP|O_STAMP, NULL);
		ep = fmep->e0;
		eventstring = ep->enode->u.event.ename->u.name.s;
		ipp = ep->ipp;
		prune_propagations(eventstring, ipp);

		/*
		 * create a duplicate fme and case
		 */
		fmcase = fmd_case_open(fmep->hdl, NULL);
		out(O_ALTFP|O_NONL, "duplicate fme for event [");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " ]");
		if ((nfmep = newfme(eventstring, ipp, fmep->hdl,
		    fmcase)) == NULL) {
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " CANNOT DIAGNOSE]");
			publish_undiagnosable(fmep->hdl, ffep, fmcase);
			continue;
		}
		Open_fme_count++;
		nfmep->pull = fmep->pull;
		init_fme_bufs(nfmep);
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " created FME%d, case %s]", nfmep->id,
		    fmd_case_uuid(nfmep->hdl, nfmep->fmcase));
		if (ffep) {
			fmd_case_setprincipal(nfmep->hdl, nfmep->fmcase, ffep);
			nfmep->e0r = ffep;
		}

		/*
		 * add the original ereports
		 */
		for (ep = fmep->observations; ep; ep = ep->observations) {
			eventstring = ep->enode->u.event.ename->u.name.s;
			ipp = ep->ipp;
			out(O_ALTFP|O_NONL, "adding event [");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " ]");
			nep = itree_lookup(nfmep->eventtree, eventstring, ipp);
			if (nep->count++ == 0) {
				/* first time seen in new tree: link & save */
				nep->observations = nfmep->observations;
				nfmep->observations = nep;
				serialize_observation(nfmep, eventstring, ipp);
				nep->nvp = evnv_dupnvl(ep->nvp);
			}
			if (ffep)
				fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase,
				    ffep);
			stats_counter_bump(nfmep->Rcount);
		}

		/*
		 * add the serd trigger ereport
		 */
		if ((ep = itree_lookup(nfmep->eventtree, tripped[i].ename,
		    tripped[i].ipp)) == NULL) {
			/*
			 * The trigger ereport is not in the instance tree. It
			 * was presumably removed by prune_propagations() as
			 * this combination of events is not present in the
			 * rules.
			 */
			out(O_ALTFP, "upsets_eval: e0 not in instance tree");
			Undiag_reason = UD_BADEVENTI;
			goto retry_lone_ereport;
		}
		out(O_ALTFP|O_NONL, "adding event [");
		ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp);
		out(O_ALTFP, " ]");
		nfmep->ecurrent = ep;
		ep->nvp = NULL;
		ep->count = 1;
		ep->observations = nfmep->observations;
		nfmep->observations = ep;

		/*
		 * just peek first.
		 */
		nfmep->peek = 1;
		prev_verbose = Verbose;
		if (Debug == 0)
			Verbose = 0;
		lut_walk(nfmep->eventtree, (lut_cb)clear_arrows, (void *)nfmep);
		state = hypothesise(nfmep, nfmep->e0, nfmep->ull, &my_delay);
		nfmep->peek = 0;
		Verbose = prev_verbose;
		if (state == FME_DISPROVED) {
			out(O_ALTFP, "upsets_eval: hypothesis disproved");
			Undiag_reason = UD_UNSOLVD;
retry_lone_ereport:
			/*
			 * However the trigger ereport on its own might be
			 * diagnosable, so check for that. Undo the new fme
			 * and case we just created and call fme_receive_report.
			 */
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, tripped[i].ename,
			    tripped[i].ipp);
			out(O_ALTFP, " retrying with just trigger ereport]");
			itree_free(nfmep->eventtree);
			nfmep->eventtree = NULL;
			structconfig_free(nfmep->config);
			nfmep->config = NULL;
			destroy_fme_bufs(nfmep);
			fmd_case_close(nfmep->hdl, nfmep->fmcase);
			fme_receive_report(fmep->hdl, ffep,
			    tripped[i].ename, tripped[i].ipp, NULL);
			continue;
		}

		/*
		 * and evaluate
		 */
		serialize_observation(nfmep, tripped[i].ename, tripped[i].ipp);
		if (ffep)
			fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, ffep);
		stats_counter_bump(nfmep->Rcount);
		fme_eval(nfmep, ffep);
	}

	return (ntrip);
}
1372 
1373 /*
1374  * fme_receive_external_report -- call when an external ereport comes in
1375  *
1376  * this routine just converts the relevant information from the ereport
1377  * into a format used internally and passes it on to fme_receive_report().
1378  */
1379 void
1380 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
1381     const char *eventstring)
1382 {
1383 	struct node *epnamenp = platform_getpath(nvl);
1384 	const struct ipath *ipp;
1385 
1386 	/*
1387 	 * XFILE: If we ended up without a path, it's an X-file.
1388 	 * For now, use our undiagnosable interface.
1389 	 */
1390 	if (epnamenp == NULL) {
1391 		fmd_case_t *fmcase;
1392 
1393 		out(O_ALTFP, "XFILE: Unable to get path from ereport");
1394 		Undiag_reason = UD_NOPATH;
1395 		fmcase = fmd_case_open(hdl, NULL);
1396 		publish_undiagnosable(hdl, ffep, fmcase);
1397 		return;
1398 	}
1399 
1400 	ipp = ipath(epnamenp);
1401 	tree_free(epnamenp);
1402 	fme_receive_report(hdl, ffep, stable(eventstring), ipp, nvl);
1403 }
1404 
1405 /*ARGSUSED*/
1406 void
1407 fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
1408     const char *eventstring)
1409 {
1410 	char *uuid;
1411 	nvlist_t **nva;
1412 	uint_t nvc;
1413 	const struct ipath *ipp;
1414 
1415 	if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 ||
1416 	    nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
1417 	    &nva, &nvc) != 0) {
1418 		out(O_ALTFP, "No uuid or fault list for list.repaired event");
1419 		return;
1420 	}
1421 
1422 	out(O_ALTFP, "Processing list.repaired from case %s", uuid);
1423 
1424 	while (nvc-- != 0) {
1425 		/*
1426 		 * Reset any istat or serd engine associated with this path.
1427 		 */
1428 		char *path;
1429 
1430 		if ((ipp = platform_fault2ipath(*nva++)) == NULL)
1431 			continue;
1432 
1433 		path = ipath2str(NULL, ipp);
1434 		out(O_ALTFP, "fme_receive_repair_list: resetting state for %s",
1435 		    path);
1436 		FREE(path);
1437 
1438 		lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp);
1439 		istat_save();
1440 
1441 		lut_walk(SerdEngines, (lut_cb)serd_reset_cb, (void *)ipp);
1442 		serd_save();
1443 	}
1444 }
1445 
1446 /*ARGSUSED*/
1447 void
1448 fme_receive_topology_change(void)
1449 {
1450 	lut_walk(Istats, (lut_cb)istat_counter_topo_chg_cb, NULL);
1451 	istat_save();
1452 
1453 	lut_walk(SerdEngines, (lut_cb)serd_topo_chg_cb, NULL);
1454 	serd_save();
1455 }
1456 
1457 static int mark_arrows(struct fme *fmep, struct event *ep, int mark,
1458     unsigned long long at_latest_by, unsigned long long *pdelay, int keep);
1459 
1460 /* ARGSUSED */
1461 static void
1462 clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
1463 {
1464 	struct bubble *bp;
1465 	struct arrowlist *ap;
1466 
1467 	ep->cached_state = 0;
1468 	ep->keep_in_tree = 0;
1469 	for (bp = itree_next_bubble(ep, NULL); bp;
1470 	    bp = itree_next_bubble(ep, bp)) {
1471 		if (bp->t != B_FROM)
1472 			continue;
1473 		bp->mark = 0;
1474 		for (ap = itree_next_arrow(bp, NULL); ap;
1475 		    ap = itree_next_arrow(bp, ap))
1476 			ap->arrowp->mark = 0;
1477 	}
1478 }
1479 
/*
 * fme_receive_report -- deliver an incoming ereport to the FME(s) that
 * can explain it, or open a new FME for it.
 *
 * Each open, unsolved FME whose instance tree contains this event is
 * given the observation tentatively and asked, via a "peek" run of
 * hypothesise(), whether the ereport is consistent with it.  Every FME
 * that is not disproved keeps the observation and is re-evaluated.  If
 * no FME explains the ereport, a new FME and fmd case are created --
 * unless the Max_fme limit has been reached, in which case the ereport
 * is attached to a special "overflow" FME solved as undiagnosable.
 */
static void
fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
    const char *eventstring, const struct ipath *ipp, nvlist_t *nvl)
{
	struct event *ep;
	struct fme *fmep = NULL;
	struct fme *ofmep = NULL;
	struct fme *cfmep, *svfmep;
	int matched = 0;
	nvlist_t *defect;
	fmd_case_t *fmcase;

	out(O_ALTFP|O_NONL, "fme_receive_report: ");
	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
	out(O_ALTFP|O_STAMP, NULL);

	/* decide which FME it goes to */
	for (fmep = FMElist; fmep; fmep = fmep->next) {
		int prev_verbose;
		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
		enum fme_state state;
		nvlist_t *pre_peek_nvp = NULL;

		/* remember the still-open overflow FME, if any, for later */
		if (fmep->overflow) {
			if (!(fmd_case_closed(fmep->hdl, fmep->fmcase)))
				ofmep = fmep;

			continue;
		}

		/*
		 * ignore solved or closed cases
		 */
		if (fmep->posted_suspects ||
		    fmd_case_solved(fmep->hdl, fmep->fmcase) ||
		    fmd_case_closed(fmep->hdl, fmep->fmcase))
			continue;

		/* look up event in event tree for this FME */
		if ((ep = itree_lookup(fmep->eventtree,
		    eventstring, ipp)) == NULL)
			continue;

		/* note observation */
		fmep->ecurrent = ep;
		if (ep->count++ == 0) {
			/* link it into list of observations seen */
			ep->observations = fmep->observations;
			fmep->observations = ep;
			ep->nvp = evnv_dupnvl(nvl);
		} else {
			/* use new payload values for peek */
			pre_peek_nvp = ep->nvp;
			ep->nvp = evnv_dupnvl(nvl);
		}

		/* tell hypothesise() not to mess with suspect list */
		fmep->peek = 1;

		/* don't want this to be verbose (unless Debug is set) */
		prev_verbose = Verbose;
		if (Debug == 0)
			Verbose = 0;

		lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
		state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);

		fmep->peek = 0;

		/* put verbose flag back */
		Verbose = prev_verbose;

		if (state != FME_DISPROVED) {
			/* found an FME that explains the ereport */
			matched++;
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " explained by FME%d]", fmep->id);

			if (pre_peek_nvp)
				nvlist_free(pre_peek_nvp);

			if (ep->count == 1)
				serialize_observation(fmep, eventstring, ipp);

			if (ffep)
				fmd_case_add_ereport(hdl, fmep->fmcase, ffep);

			stats_counter_bump(fmep->Rcount);

			/* re-eval FME */
			fme_eval(fmep, ffep);
		} else {

			/* not a match, undo noting of observation */
			fmep->ecurrent = NULL;
			if (--ep->count == 0) {
				/* unlink it from observations */
				fmep->observations = ep->observations;
				ep->observations = NULL;
				nvlist_free(ep->nvp);
				ep->nvp = NULL;
			} else {
				/* restore the payload saved before the peek */
				nvlist_free(ep->nvp);
				ep->nvp = pre_peek_nvp;
			}
		}
	}

	if (matched)
		return;	/* explained by at least one existing FME */

	/* clean up closed fmes */
	cfmep = ClosedFMEs;
	while (cfmep != NULL) {
		svfmep = cfmep->next;
		destroy_fme(cfmep);
		cfmep = svfmep;
	}
	ClosedFMEs = NULL;
	prune_propagations(eventstring, ipp);

	if (ofmep) {
		/* an overflow FME exists: just attach the ereport to it */
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " ADDING TO OVERFLOW FME]");
		if (ffep)
			fmd_case_add_ereport(hdl, ofmep->fmcase, ffep);

		return;

	} else if (Max_fme && (Open_fme_count >= Max_fme)) {
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " MAX OPEN FME REACHED]");

		fmcase = fmd_case_open(hdl, NULL);

		/* Create overflow fme */
		if ((fmep = newfme(eventstring, ipp, hdl, fmcase)) == NULL) {
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]");
			publish_undiagnosable(hdl, ffep, fmcase);
			return;
		}

		Open_fme_count++;

		init_fme_bufs(fmep);
		fmep->overflow = B_TRUE;

		if (ffep)
			fmd_case_add_ereport(hdl, fmep->fmcase, ffep);

		/* solve the overflow case as an undiagnosable defect */
		defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
		    NULL, NULL, NULL);
		(void) nvlist_add_string(defect, UNDIAG_REASON, UD_MAXFME);
		fmd_case_add_suspect(hdl, fmep->fmcase, defect);
		fmd_case_solve(hdl, fmep->fmcase);
		return;
	}

	/* open a case */
	fmcase = fmd_case_open(hdl, NULL);

	/* start a new FME */
	if ((fmep = newfme(eventstring, ipp, hdl, fmcase)) == NULL) {
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " CANNOT DIAGNOSE]");
		publish_undiagnosable(hdl, ffep, fmcase);
		return;
	}

	Open_fme_count++;

	init_fme_bufs(fmep);

	out(O_ALTFP|O_NONL, "[");
	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
	out(O_ALTFP, " created FME%d, case %s]", fmep->id,
	    fmd_case_uuid(hdl, fmep->fmcase));

	ep = fmep->e0;
	ASSERT(ep != NULL);

	/* note observation */
	fmep->ecurrent = ep;
	if (ep->count++ == 0) {
		/* link it into list of observations seen */
		ep->observations = fmep->observations;
		fmep->observations = ep;
		ep->nvp = evnv_dupnvl(nvl);
		serialize_observation(fmep, eventstring, ipp);
	} else {
		/* new payload overrides any previous */
		nvlist_free(ep->nvp);
		ep->nvp = evnv_dupnvl(nvl);
	}

	stats_counter_bump(fmep->Rcount);

	if (ffep) {
		fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
		fmd_case_setprincipal(hdl, fmep->fmcase, ffep);
		fmep->e0r = ffep;
	}

	/* give the diagnosis algorithm a shot at the new FME state */
	fme_eval(fmep, ffep);
}
1692 
1693 void
1694 fme_status(int flags)
1695 {
1696 	struct fme *fmep;
1697 
1698 	if (FMElist == NULL) {
1699 		out(flags, "No fault management exercises underway.");
1700 		return;
1701 	}
1702 
1703 	for (fmep = FMElist; fmep; fmep = fmep->next)
1704 		fme_print(flags, fmep);
1705 }
1706 
1707 /*
1708  * "indent" routines used mostly for nicely formatted debug output, but also
1709  * for sanity checking for infinite recursion bugs.
1710  */
1711 
1712 #define	MAX_INDENT 1024
1713 static const char *indent_s[MAX_INDENT];
1714 static int current_indent;
1715 
1716 static void
1717 indent_push(const char *s)
1718 {
1719 	if (current_indent < MAX_INDENT)
1720 		indent_s[current_indent++] = s;
1721 	else
1722 		out(O_DIE, "unexpected recursion depth (%d)", current_indent);
1723 }
1724 
1725 static void
1726 indent_set(const char *s)
1727 {
1728 	current_indent = 0;
1729 	indent_push(s);
1730 }
1731 
1732 static void
1733 indent_pop(void)
1734 {
1735 	if (current_indent > 0)
1736 		current_indent--;
1737 	else
1738 		out(O_DIE, "recursion underflow");
1739 }
1740 
1741 static void
1742 indent(void)
1743 {
1744 	int i;
1745 	if (!Verbose)
1746 		return;
1747 	for (i = 0; i < current_indent; i++)
1748 		out(O_ALTFP|O_VERB|O_NONL, indent_s[i]);
1749 }
1750 
1751 #define	SLNEW		1
1752 #define	SLCHANGED	2
1753 #define	SLWAIT		3
1754 #define	SLDISPROVED	4
1755 
1756 static void
1757 print_suspects(int circumstance, struct fme *fmep)
1758 {
1759 	struct event *ep;
1760 
1761 	out(O_ALTFP|O_NONL, "[");
1762 	if (circumstance == SLCHANGED) {
1763 		out(O_ALTFP|O_NONL, "FME%d diagnosis changed. state: %s, "
1764 		    "suspect list:", fmep->id, fme_state2str(fmep->state));
1765 	} else if (circumstance == SLWAIT) {
1766 		out(O_ALTFP|O_NONL, "FME%d set wait timer %ld ", fmep->id,
1767 		    fmep->timer);
1768 		ptree_timeval(O_ALTFP|O_NONL, &fmep->wull);
1769 	} else if (circumstance == SLDISPROVED) {
1770 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id);
1771 	} else {
1772 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id);
1773 	}
1774 
1775 	if (circumstance == SLWAIT || circumstance == SLDISPROVED) {
1776 		out(O_ALTFP, "]");
1777 		return;
1778 	}
1779 
1780 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
1781 		out(O_ALTFP|O_NONL, " ");
1782 		itree_pevent_brief(O_ALTFP|O_NONL, ep);
1783 	}
1784 	out(O_ALTFP, "]");
1785 }
1786 
1787 static struct node *
1788 eventprop_lookup(struct event *ep, const char *propname)
1789 {
1790 	return (lut_lookup(ep->props, (void *)propname, NULL));
1791 }
1792 
1793 #define	MAXDIGITIDX	23
1794 static char numbuf[MAXDIGITIDX + 1];
1795 
1796 static int
1797 node2uint(struct node *n, uint_t *valp)
1798 {
1799 	struct evalue value;
1800 	struct lut *globals = NULL;
1801 
1802 	if (n == NULL)
1803 		return (1);
1804 
1805 	/*
1806 	 * check value.v since we are being asked to convert an unsigned
1807 	 * long long int to an unsigned int
1808 	 */
1809 	if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) ||
1810 	    value.t != UINT64 || value.v > (1ULL << 32))
1811 		return (1);
1812 
1813 	*valp = (uint_t)value.v;
1814 
1815 	return (0);
1816 }
1817 
/*
 * node2fmri -- convert an instanced T_NAME node into an hc-scheme FMRI
 * nvlist.
 *
 * Returns NULL if n is NULL, is not a T_NAME, or has any component
 * whose child is not a T_NUM instance number (i.e. the name is not
 * fully instanced).  Dies on nvlist allocation/construction failure.
 * The caller owns the returned nvlist.
 */
static nvlist_t *
node2fmri(struct node *n)
{
	nvlist_t **pa, *f, *p;
	struct node *nc;
	uint_t depth = 0;
	char *numstr, *nullbyte;
	char *failure;
	int err, i;

	/* XXX do we need to be able to handle a non-T_NAME node? */
	if (n == NULL || n->t != T_NAME)
		return (NULL);

	/* count the component/instance pairs making up the name */
	for (nc = n; nc != NULL; nc = nc->u.name.next) {
		if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM)
			break;
		depth++;
	}

	if (nc != NULL) {
		/* We bailed early, something went wrong */
		return (NULL);
	}

	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
	pa = alloca(depth * sizeof (nvlist_t *));
	for (i = 0; i < depth; i++)
		pa[i] = NULL;

	/* fixed header fields of an hc-scheme FMRI */
	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
	if (err != 0) {
		failure = "basic construction of FMRI failed";
		goto boom;
	}

	/* ulltostr() is handed a pointer to the NUL at the end of numbuf */
	numbuf[MAXDIGITIDX] = '\0';
	nullbyte = &numbuf[MAXDIGITIDX];
	i = 0;

	/* one hc-pair (name, instance-number string) per component */
	for (nc = n; nc != NULL; nc = nc->u.name.next) {
		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
		if (err != 0) {
			failure = "alloc of an hc-pair failed";
			goto boom;
		}
		err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s);
		numstr = ulltostr(nc->u.name.child->u.ull, nullbyte);
		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
		if (err != 0) {
			failure = "construction of an hc-pair failed";
			goto boom;
		}
		pa[i++] = p;
	}

	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
	if (err == 0) {
		/* pairs were copied into f; free our working copies */
		for (i = 0; i < depth; i++)
			if (pa[i] != NULL)
				nvlist_free(pa[i]);
		return (f);
	}
	failure = "addition of hc-pair array to FMRI failed";

boom:
	/* free partial results, then die reporting the specific failure */
	for (i = 0; i < depth; i++)
		if (pa[i] != NULL)
			nvlist_free(pa[i]);
	nvlist_free(f);
	out(O_DIE, "%s", failure);
	/*NOTREACHED*/
	return (NULL);
}
1896 
/*
 * an ipath cache entry is an array of these, one per path component,
 * terminated by an entry with s==NULL
 */
struct ipath {
	const char *s;	/* component name (in stable) */
	int i;		/* instance number */
};
1902 
/*
 * ipath2fmri -- convert an ipath (NULL-terminated array of component
 * name/instance pairs) into an hc-scheme FMRI nvlist.
 *
 * Dies on nvlist allocation/construction failure; otherwise the caller
 * owns the returned nvlist.
 */
static nvlist_t *
ipath2fmri(struct ipath *ipath)
{
	nvlist_t **pa, *f, *p;
	uint_t depth = 0;
	char *numstr, *nullbyte;
	char *failure;
	int err, i;
	struct ipath *ipp;

	/* count components up to the s == NULL terminator */
	for (ipp = ipath; ipp->s != NULL; ipp++)
		depth++;

	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
	pa = alloca(depth * sizeof (nvlist_t *));
	for (i = 0; i < depth; i++)
		pa[i] = NULL;

	/* fixed header fields of an hc-scheme FMRI */
	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
	if (err != 0) {
		failure = "basic construction of FMRI failed";
		goto boom;
	}

	/* ulltostr() is handed a pointer to the NUL at the end of numbuf */
	numbuf[MAXDIGITIDX] = '\0';
	nullbyte = &numbuf[MAXDIGITIDX];
	i = 0;

	/* one hc-pair (name, instance-number string) per component */
	for (ipp = ipath; ipp->s != NULL; ipp++) {
		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
		if (err != 0) {
			failure = "alloc of an hc-pair failed";
			goto boom;
		}
		err = nvlist_add_string(p, FM_FMRI_HC_NAME, ipp->s);
		numstr = ulltostr(ipp->i, nullbyte);
		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
		if (err != 0) {
			failure = "construction of an hc-pair failed";
			goto boom;
		}
		pa[i++] = p;
	}

	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
	if (err == 0) {
		/* pairs were copied into f; free our working copies */
		for (i = 0; i < depth; i++)
			if (pa[i] != NULL)
				nvlist_free(pa[i]);
		return (f);
	}
	failure = "addition of hc-pair array to FMRI failed";

boom:
	/* free partial results, then die reporting the specific failure */
	for (i = 0; i < depth; i++)
		if (pa[i] != NULL)
			nvlist_free(pa[i]);
	nvlist_free(f);
	out(O_DIE, "%s", failure);
	/*NOTREACHED*/
	return (NULL);
}
1969 
/*
 * avg -- compute sum/cnt rounded to the nearest integer (.5 rounds up).
 *
 * The multiply is done in 64 bits so that sums larger than UINT_MAX/10
 * don't wrap in 32-bit arithmetic before the widening assignment (the
 * original computed "sum * 10" in unsigned int).  A cnt of zero would
 * divide by zero, so 0 is returned in that case.
 */
static uint_t
avg(uint_t sum, uint_t cnt)
{
	unsigned long long s = (unsigned long long)sum * 10;

	if (cnt == 0)
		return (0);

	return ((s / cnt / 10) + (((s / cnt % 10) >= 5) ? 1 : 0));
}
1977 
1978 static uint8_t
1979 percentof(uint_t part, uint_t whole)
1980 {
1981 	unsigned long long p = part * 1000;
1982 
1983 	return ((p / whole / 10) + (((p / whole % 10) >= 5) ? 1 : 0));
1984 }
1985 
/*
 * a suspect together with the resource nvlists that will be advertised
 * for it in the published suspect list (filled in by get_resources())
 */
struct rsl {
	struct event *suspect;	/* the suspect event itself */
	nvlist_t *asru;		/* ASRU FMRI, or NULL if none */
	nvlist_t *fru;		/* FRU FMRI, or NULL if none */
	nvlist_t *rsrc;		/* resource FMRI, or NULL if none */
};
1992 
1993 /*
1994  *  rslfree -- free internal members of struct rsl not expected to be
1995  *	freed elsewhere.
1996  */
1997 static void
1998 rslfree(struct rsl *freeme)
1999 {
2000 	if (freeme->asru != NULL)
2001 		nvlist_free(freeme->asru);
2002 	if (freeme->fru != NULL)
2003 		nvlist_free(freeme->fru);
2004 	if (freeme->rsrc != NULL && freeme->rsrc != freeme->asru)
2005 		nvlist_free(freeme->rsrc);
2006 }
2007 
2008 /*
2009  *  rslcmp -- compare two rsl structures.  Use the following
2010  *	comparisons to establish cardinality:
2011  *
2012  *	1. Name of the suspect's class. (simple strcmp)
2013  *	2. Name of the suspect's ASRU. (trickier, since nvlist)
2014  *
2015  */
2016 static int
2017 rslcmp(const void *a, const void *b)
2018 {
2019 	struct rsl *r1 = (struct rsl *)a;
2020 	struct rsl *r2 = (struct rsl *)b;
2021 	int rv;
2022 
2023 	rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s,
2024 	    r2->suspect->enode->u.event.ename->u.name.s);
2025 	if (rv != 0)
2026 		return (rv);
2027 
2028 	if (r1->asru == NULL && r2->asru == NULL)
2029 		return (0);
2030 	if (r1->asru == NULL)
2031 		return (-1);
2032 	if (r2->asru == NULL)
2033 		return (1);
2034 	return (evnv_cmpnvl(r1->asru, r2->asru, 0));
2035 }
2036 
2037 /*
2038  *  rsluniq -- given an array of rsl structures, seek out and "remove"
2039  *	any duplicates.  Dups are "remove"d by NULLing the suspect pointer
2040  *	of the array element.  Removal also means updating the number of
2041  *	problems and the number of problems which are not faults.  User
2042  *	provides the first and last element pointers.
2043  */
2044 static void
2045 rsluniq(struct rsl *first, struct rsl *last, int *nprobs, int *nnonf)
2046 {
2047 	struct rsl *cr;
2048 
2049 	if (*nprobs == 1)
2050 		return;
2051 
2052 	/*
2053 	 *  At this point, we only expect duplicate defects.
2054 	 *  Eversholt's diagnosis algorithm prevents duplicate
2055 	 *  suspects, but we rewrite defects in the platform code after
2056 	 *  the diagnosis is made, and that can introduce new
2057 	 *  duplicates.
2058 	 */
2059 	while (first <= last) {
2060 		if (first->suspect == NULL || !is_defect(first->suspect->t)) {
2061 			first++;
2062 			continue;
2063 		}
2064 		cr = first + 1;
2065 		while (cr <= last) {
2066 			if (is_defect(first->suspect->t)) {
2067 				if (rslcmp(first, cr) == 0) {
2068 					cr->suspect = NULL;
2069 					rslfree(cr);
2070 					(*nprobs)--;
2071 					(*nnonf)--;
2072 				}
2073 			}
2074 			/*
2075 			 * assume all defects are in order after our
2076 			 * sort and short circuit here with "else break" ?
2077 			 */
2078 			cr++;
2079 		}
2080 		first++;
2081 	}
2082 }
2083 
2084 /*
2085  * get_resources -- for a given suspect, determine what ASRU, FRU and
2086  *     RSRC nvlists should be advertised in the final suspect list.
2087  */
2088 void
2089 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot)
2090 {
2091 	struct node *asrudef, *frudef;
2092 	nvlist_t *asru, *fru;
2093 	nvlist_t *rsrc = NULL;
2094 	char *pathstr;
2095 
2096 	/*
2097 	 * First find any ASRU and/or FRU defined in the
2098 	 * initial fault tree.
2099 	 */
2100 	asrudef = eventprop_lookup(sp, L_ASRU);
2101 	frudef = eventprop_lookup(sp, L_FRU);
2102 
2103 	/*
2104 	 * Create FMRIs based on those definitions
2105 	 */
2106 	asru = node2fmri(asrudef);
2107 	fru = node2fmri(frudef);
2108 	pathstr = ipath2str(NULL, sp->ipp);
2109 
2110 	/*
2111 	 * Allow for platform translations of the FMRIs
2112 	 */
2113 	platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc,
2114 	    pathstr);
2115 
2116 	FREE(pathstr);
2117 	rsrcs->suspect = sp;
2118 	rsrcs->asru = asru;
2119 	rsrcs->fru = fru;
2120 	rsrcs->rsrc = rsrc;
2121 }
2122 
2123 /*
2124  * trim_suspects -- prior to publishing, we may need to remove some
2125  *    suspects from the list.  If we're auto-closing upsets, we don't
2126  *    want any of those in the published list.  If the ASRUs for multiple
2127  *    defects resolve to the same ASRU (driver) we only want to publish
2128  *    that as a single suspect.
2129  */
static void
trim_suspects(struct fme *fmep, boolean_t no_upsets, struct rsl **begin,
    struct rsl **end)
{
	struct event *ep;
	struct rsl *rp;
	int rpcnt;

	/*
	 * First save the suspects in the psuspects, then copy back
	 * only the ones we wish to retain.  This resets nsuspects to
	 * zero.
	 */
	rpcnt = fmep->nsuspects;	/* count before save_suspects() zeroes it */
	save_suspects(fmep);

	/*
	 * allocate an array of resource pointers for the suspects.
	 * We may end up using less than the full allocation, but this
	 * is a very short-lived array.  publish_suspects() will free
	 * this array when it's done using it.
	 */
	rp = *begin = MALLOC(rpcnt * sizeof (struct rsl));
	bzero(rp, rpcnt * sizeof (struct rsl));

	/* first pass, remove any unwanted upsets and populate our array */
	for (ep = fmep->psuspects; ep; ep = ep->psuspects) {
		if (no_upsets && is_upset(ep->t))
			continue;
		/* fills in rp->suspect/asru/fru/rsrc for this event */
		get_resources(ep, rp, fmep->config);
		rp++;
		fmep->nsuspects++;
		if (!is_fault(ep->t))
			fmep->nonfault++;
	}

	/* if all we had was unwanted upsets, we're done */
	/* NOTE: *end is left untouched here; callers must check nsuspects */
	if (fmep->nsuspects == 0)
		return;

	*end = rp - 1;	/* last populated element, inclusive */

	/* sort the array */
	qsort(*begin, fmep->nsuspects, sizeof (struct rsl), rslcmp);
	rsluniq(*begin, *end, &fmep->nsuspects, &fmep->nonfault);
}
2176 
2177 /*
2178  * addpayloadprop -- add a payload prop to a problem
2179  */
2180 static void
2181 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault)
2182 {
2183 	ASSERT(fault != NULL);
2184 	ASSERT(lhs != NULL);
2185 	ASSERT(rhs != NULL);
2186 
2187 	if (rhs->t == UINT64) {
2188 		out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v);
2189 
2190 		if (nvlist_add_uint64(fault, lhs, rhs->v) != 0)
2191 			out(O_DIE,
2192 			    "cannot add payloadprop \"%s\" to fault", lhs);
2193 	} else {
2194 		out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"",
2195 		    lhs, (char *)(uintptr_t)rhs->v);
2196 
2197 		if (nvlist_add_string(fault, lhs, (char *)(uintptr_t)rhs->v) !=
2198 		    0)
2199 			out(O_DIE,
2200 			    "cannot add payloadprop \"%s\" to fault", lhs);
2201 	}
2202 }
2203 
2204 static char *Istatbuf;
2205 static char *Istatbufptr;
2206 static int Istatsz;
2207 
2208 /*
2209  * istataddsize -- calculate size of istat and add it to Istatsz
2210  */
2211 /*ARGSUSED2*/
2212 static void
2213 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg)
2214 {
2215 	int val;
2216 
2217 	ASSERT(lhs != NULL);
2218 	ASSERT(rhs != NULL);
2219 
2220 	if ((val = stats_counter_value(rhs)) == 0)
2221 		return;	/* skip zero-valued stats */
2222 
2223 	/* count up the size of the stat name */
2224 	Istatsz += ipath2strlen(lhs->ename, lhs->ipath);
2225 	Istatsz++;	/* for the trailing NULL byte */
2226 
2227 	/* count up the size of the stat value */
2228 	Istatsz += snprintf(NULL, 0, "%d", val);
2229 	Istatsz++;	/* for the trailing NULL byte */
2230 }
2231 
2232 /*
2233  * istat2str -- serialize an istat, writing result to *Istatbufptr
2234  */
/*ARGSUSED2*/
static void
istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg)
{
	char *str;
	int len;
	int val;

	ASSERT(lhs != NULL);
	ASSERT(rhs != NULL);

	/*
	 * Must skip exactly the same entries istataddsize() skipped,
	 * or the buffer sizing assertions below will not hold.
	 */
	if ((val = stats_counter_value(rhs)) == 0)
		return;	/* skip zero-valued stats */

	/* serialize the stat name */
	str = ipath2str(lhs->ename, lhs->ipath);
	len = strlen(str);

	ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]);
	(void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr);
	Istatbufptr += len;
	FREE(str);
	*Istatbufptr++ = '\0';	/* NULL byte terminates the name field */

	/* serialize the stat value */
	Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr,
	    "%d", val);
	*Istatbufptr++ = '\0';	/* NULL byte terminates the value field */

	ASSERT(Istatbufptr <= &Istatbuf[Istatsz]);
}
2266 
/*
 * istat_save -- serialize all non-zero istats into the WOBUF_ISTATS
 *	fmd buffer so they survive a module restart.  No-op unless
 *	Istat_need_save has been set.
 */
void
istat_save()
{
	if (Istat_need_save == 0)
		return;

	/* figure out how big the serialized info is */
	Istatsz = 0;
	lut_walk(Istats, (lut_cb)istataddsize, NULL);

	if (Istatsz == 0) {
		/* no stats to save; drop any previously saved buffer */
		fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS);
		return;
	}

	/* create the serialized buffer */
	Istatbufptr = Istatbuf = MALLOC(Istatsz);
	lut_walk(Istats, (lut_cb)istat2str, NULL);

	/* clear out current saved stats */
	fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS);

	/* write out the new version */
	fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz);
	FREE(Istatbuf);

	Istat_need_save = 0;
}
2296 
2297 int
2298 istat_cmp(struct istat_entry *ent1, struct istat_entry *ent2)
2299 {
2300 	if (ent1->ename != ent2->ename)
2301 		return (ent2->ename - ent1->ename);
2302 	if (ent1->ipath != ent2->ipath)
2303 		return ((char *)ent2->ipath - (char *)ent1->ipath);
2304 
2305 	return (0);
2306 }
2307 
2308 /*
2309  * istat-verify -- verify the component associated with a stat still exists
2310  *
2311  * if the component no longer exists, this routine resets the stat and
2312  * returns 0.  if the component still exists, it returns 1.
2313  */
2314 static int
2315 istat_verify(struct node *snp, struct istat_entry *entp)
2316 {
2317 	struct stats *statp;
2318 	nvlist_t *fmri;
2319 
2320 	fmri = node2fmri(snp->u.event.epname);
2321 	if (platform_path_exists(fmri)) {
2322 		nvlist_free(fmri);
2323 		return (1);
2324 	}
2325 	nvlist_free(fmri);
2326 
2327 	/* component no longer in system.  zero out the associated stats */
2328 	if ((statp = (struct stats *)
2329 	    lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL ||
2330 	    stats_counter_value(statp) == 0)
2331 		return (0);	/* stat is already reset */
2332 
2333 	Istat_need_save = 1;
2334 	stats_counter_reset(statp);
2335 	return (0);
2336 }
2337 
/*
 * istat_bump -- bump (or, if n is non-zero, set) the instanced stat
 *	counter named by event node snp, creating the counter on first
 *	use.  Does nothing if the component has left the topology.
 */
static void
istat_bump(struct node *snp, int n)
{
	struct stats *statp;
	struct istat_entry ent;

	ASSERT(snp != NULL);
	ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t));
	ASSERT(snp->u.event.epname != NULL);

	/* class name should be hoisted into a single stable entry */
	ASSERT(snp->u.event.ename->u.name.next == NULL);
	ent.ename = snp->u.event.ename->u.name.s;
	ent.ipath = ipath(snp->u.event.epname);

	if (!istat_verify(snp, &ent)) {
		/* component no longer exists in system, nothing to do */
		return;
	}

	if ((statp = (struct stats *)
	    lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) {
		/* need to create the counter */
		int cnt = 0;
		struct node *np;
		char *sname;
		char *snamep;
		struct istat_entry *newentp;

		/*
		 * count up the size of the stat name; the layout built
		 * below is "ename@comp0/comp1/..." with numeric
		 * instance numbers appended to each path component.
		 */
		np = snp->u.event.ename;
		while (np != NULL) {
			cnt += strlen(np->u.name.s);
			cnt++;	/* for the '.' or '@' */
			np = np->u.name.next;
		}
		np = snp->u.event.epname;
		while (np != NULL) {
			cnt += snprintf(NULL, 0, "%s%llu",
			    np->u.name.s, np->u.name.child->u.ull);
			cnt++;	/* for the '/' or trailing NULL byte */
			np = np->u.name.next;
		}

		/* build the stat name (sized exactly by the loops above) */
		snamep = sname = alloca(cnt);
		np = snp->u.event.ename;
		while (np != NULL) {
			snamep += snprintf(snamep, &sname[cnt] - snamep,
			    "%s", np->u.name.s);
			np = np->u.name.next;
			if (np)
				*snamep++ = '.';
		}
		*snamep++ = '@';
		np = snp->u.event.epname;
		while (np != NULL) {
			snamep += snprintf(snamep, &sname[cnt] - snamep,
			    "%s%llu", np->u.name.s, np->u.name.child->u.ull);
			np = np->u.name.next;
			if (np)
				*snamep++ = '/';
		}
		*snamep++ = '\0';

		/* create the new stat & add it to our list */
		newentp = MALLOC(sizeof (*newentp));
		*newentp = ent;
		statp = stats_new_counter(NULL, sname, 0);
		Istats = lut_add(Istats, (void *)newentp, (void *)statp,
		    (lut_cmp)istat_cmp);
	}

	/* if n is non-zero, set that value instead of bumping */
	if (n) {
		stats_counter_reset(statp);
		stats_counter_add(statp, n);
	} else
		stats_counter_bump(statp);
	Istat_need_save = 1;	/* counter changed; persist on next save */

	ipath_print(O_ALTFP|O_VERB2, ent.ename, ent.ipath);
	out(O_ALTFP|O_VERB2, " %s to value %d", n ? "set" : "incremented",
	    stats_counter_value(statp));
}
2423 
/*ARGSUSED*/
static void
istat_destructor(void *left, void *right, void *arg)
{
	/* lut key is the istat_entry, lut value is its stats counter */
	stats_delete((struct stats *)right);
	FREE((struct istat_entry *)left);
}
2433 
2434 /*
2435  * Callback used in a walk of the Istats to reset matching stat counters.
2436  */
2437 static void
2438 istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp,
2439     const struct ipath *ipp)
2440 {
2441 	char *path;
2442 
2443 	if (entp->ipath == ipp) {
2444 		path = ipath2str(entp->ename, ipp);
2445 		out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path);
2446 		FREE(path);
2447 		stats_counter_reset(statp);
2448 		Istat_need_save = 1;
2449 	}
2450 }
2451 
2452 /*ARGSUSED*/
2453 static void
2454 istat_counter_topo_chg_cb(struct istat_entry *entp, struct stats *statp,
2455     void *unused)
2456 {
2457 	char *path;
2458 	nvlist_t *fmri;
2459 
2460 	fmri = ipath2fmri((struct ipath *)(entp->ipath));
2461 	if (!platform_path_exists(fmri)) {
2462 		path = ipath2str(entp->ename, entp->ipath);
2463 		out(O_ALTFP, "istat_counter_topo_chg_cb: not present %s", path);
2464 		FREE(path);
2465 		stats_counter_reset(statp);
2466 		Istat_need_save = 1;
2467 	}
2468 	nvlist_free(fmri);
2469 }
2470 
/*
 * istat_fini -- free every istat entry and its counter at unload time
 */
void
istat_fini(void)
{
	lut_free(Istats, istat_destructor, NULL);
}
2476 
2477 static char *Serdbuf;
2478 static char *Serdbufptr;
2479 static int Serdsz;
2480 
2481 /*
2482  * serdaddsize -- calculate size of serd and add it to Serdsz
2483  */
2484 /*ARGSUSED*/
2485 static void
2486 serdaddsize(const struct serd_entry *lhs, struct stats *rhs, void *arg)
2487 {
2488 	ASSERT(lhs != NULL);
2489 
2490 	/* count up the size of the stat name */
2491 	Serdsz += ipath2strlen(lhs->ename, lhs->ipath);
2492 	Serdsz++;	/* for the trailing NULL byte */
2493 }
2494 
2495 /*
2496  * serd2str -- serialize a serd engine, writing result to *Serdbufptr
2497  */
/*ARGSUSED*/
static void
serd2str(const struct serd_entry *lhs, struct stats *rhs, void *arg)
{
	char *str;
	int len;

	ASSERT(lhs != NULL);

	/* serialize the serd engine name */
	str = ipath2str(lhs->ename, lhs->ipath);
	len = strlen(str);

	/* Serdbuf was sized by serdaddsize() over the same lut walk */
	ASSERT(Serdbufptr + len + 1 <= &Serdbuf[Serdsz]);
	(void) strlcpy(Serdbufptr, str, &Serdbuf[Serdsz] - Serdbufptr);
	Serdbufptr += len;
	FREE(str);
	*Serdbufptr++ = '\0';	/* NULL byte separates consecutive entries */
	ASSERT(Serdbufptr <= &Serdbuf[Serdsz]);
}
2518 
/*
 * serd_save -- serialize the serd engine list into the WOBUF_SERDS
 *	fmd buffer so fme_serd_load() can restore it after a module
 *	restart.  No-op unless Serd_need_save has been set.
 */
void
serd_save()
{
	if (Serd_need_save == 0)
		return;

	/* figure out how big the serialized info is */
	Serdsz = 0;
	lut_walk(SerdEngines, (lut_cb)serdaddsize, NULL);

	if (Serdsz == 0) {
		/* no serd engines to save; drop any stale buffer */
		fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS);
		return;
	}

	/* create the serialized buffer */
	Serdbufptr = Serdbuf = MALLOC(Serdsz);
	lut_walk(SerdEngines, (lut_cb)serd2str, NULL);

	/* clear out current saved stats */
	fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS);

	/* write out the new version */
	fmd_buf_write(Hdl, NULL, WOBUF_SERDS, Serdbuf, Serdsz);
	FREE(Serdbuf);
	Serd_need_save = 0;
}
2547 
2548 int
2549 serd_cmp(struct serd_entry *ent1, struct serd_entry *ent2)
2550 {
2551 	if (ent1->ename != ent2->ename)
2552 		return (ent2->ename - ent1->ename);
2553 	if (ent1->ipath != ent2->ipath)
2554 		return ((char *)ent2->ipath - (char *)ent1->ipath);
2555 
2556 	return (0);
2557 }
2558 
/*
 * fme_serd_load -- rebuild the SerdEngines lut from the WOBUF_SERDS
 *	buffer written by serd_save().  Engines whose component path no
 *	longer exists in the topology are dropped and the buffer is
 *	rewritten.
 */
void
fme_serd_load(fmd_hdl_t *hdl)
{
	int sz;
	char *sbuf;
	char *sepptr;
	char *ptr;
	struct serd_entry *newentp;
	struct node *epname;
	nvlist_t *fmri;
	char *namestring;

	if ((sz = fmd_buf_size(hdl, NULL, WOBUF_SERDS)) == 0)
		return;
	sbuf = alloca(sz);
	fmd_buf_read(hdl, NULL, WOBUF_SERDS, sbuf, sz);
	ptr = sbuf;
	/*
	 * Each entry is "<ename>@<path>\0" as produced by serd2str().
	 * NOTE(review): the '@' is assumed present because we wrote
	 * the buffer ourselves; strchr() is not checked for NULL.
	 */
	while (ptr < &sbuf[sz]) {
		sepptr = strchr(ptr, '@');
		*sepptr = '\0';	/* split entry into name and path in place */
		namestring = ptr;
		sepptr++;
		ptr = sepptr;
		ptr += strlen(ptr);
		ptr++;	/* move past the '\0' separating paths */
		epname = pathstring2epnamenp(sepptr);
		fmri = node2fmri(epname);
		if (platform_path_exists(fmri)) {
			newentp = MALLOC(sizeof (*newentp));
			newentp->hdl = hdl;
			newentp->ipath = ipath(epname);
			newentp->ename = stable(namestring);
			SerdEngines = lut_add(SerdEngines, (void *)newentp,
			    (void *)newentp, (lut_cmp)serd_cmp);
		} else
			Serd_need_save = 1;	/* entry dropped; rewrite */
		tree_free(epname);
		nvlist_free(fmri);
	}
	/* save it back again in case some of the paths no longer exist */
	serd_save();
}
2601 
/*ARGSUSED*/
static void
serd_destructor(void *left, void *right, void *arg)
{
	/* key and value are the same serd_entry; free it once */
	FREE((struct serd_entry *)left);
}
2609 
2610 /*
2611  * Callback used in a walk of the SerdEngines to reset matching serd engines.
2612  */
2613 /*ARGSUSED*/
2614 static void
2615 serd_reset_cb(struct serd_entry *entp, void *unused, const struct ipath *ipp)
2616 {
2617 	char *path;
2618 
2619 	if (entp->ipath == ipp) {
2620 		path = ipath2str(entp->ename, ipp);
2621 		out(O_ALTFP, "serd_reset_cb: resetting %s", path);
2622 		fmd_serd_reset(entp->hdl, path);
2623 		FREE(path);
2624 		Serd_need_save = 1;
2625 	}
2626 }
2627 
2628 /*ARGSUSED*/
2629 static void
2630 serd_topo_chg_cb(struct serd_entry *entp, void *unused, void *unused2)
2631 {
2632 	char *path;
2633 	nvlist_t *fmri;
2634 
2635 	fmri = ipath2fmri((struct ipath *)(entp->ipath));
2636 	if (!platform_path_exists(fmri)) {
2637 		path = ipath2str(entp->ename, entp->ipath);
2638 		out(O_ALTFP, "serd_topo_chg_cb: not present %s", path);
2639 		fmd_serd_reset(entp->hdl, path);
2640 		FREE(path);
2641 		Serd_need_save = 1;
2642 	}
2643 	nvlist_free(fmri);
2644 }
2645 
/*
 * serd_fini -- free every serd_entry record at unload time
 */
void
serd_fini(void)
{
	lut_free(SerdEngines, serd_destructor, NULL);
}
2651 
/*
 * publish_suspects -- convert the fme's suspect list into fmd faults,
 *	compute per-suspect certainties from FITrates, and either solve
 *	the case or (if Dupclose is set and every ASRU is already
 *	faulty) close it without publishing.
 */
static void
publish_suspects(struct fme *fmep)
{
	struct rsl *srl = NULL;
	struct rsl *erl;
	struct rsl *rp;
	nvlist_t *fault;
	uint8_t cert;
	uint_t *frs;
	uint_t fravg, frsum, fr;
	uint_t messval;
	struct node *snp;
	int frcnt, fridx;
	boolean_t no_upsets = B_FALSE;
	boolean_t allfaulty = B_TRUE;

	stats_counter_bump(fmep->diags);

	/*
	 * If we're auto-closing upsets, we don't want to include them
	 * in any produced suspect lists or certainty accounting.
	 */
	if (Autoclose != NULL)
		if (strcmp(Autoclose, "true") == 0 ||
		    strcmp(Autoclose, "all") == 0 ||
		    strcmp(Autoclose, "upsets") == 0)
			no_upsets = B_TRUE;

	/* builds the [srl, erl] array; nsuspects/nonfault are recounted */
	trim_suspects(fmep, no_upsets, &srl, &erl);

	/*
	 * If the resulting suspect list has no members, we're
	 * done.  Returning here will simply close the case.
	 */
	if (fmep->nsuspects == 0) {
		out(O_ALTFP,
		    "[FME%d, case %s (all suspects are upsets)]",
		    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
		FREE(srl);	/* trim_suspects() allocated this */
		restore_suspects(fmep);
		return;
	}

	/*
	 * If the suspect list is all faults, then for a given fault,
	 * say X of N, X's certainty is computed via:
	 *
	 * fitrate(X) / (fitrate(1) + ... + fitrate(N)) * 100
	 *
	 * If none of the suspects are faults, and there are N suspects,
	 * the certainty of a given suspect is 100/N.
	 *
	 * If there are are a mixture of faults and other problems in
	 * the suspect list, we take an average of the faults'
	 * FITrates and treat this average as the FITrate for any
	 * non-faults.  The fitrate of any given suspect is then
	 * computed per the first formula above.
	 */
	if (fmep->nonfault == fmep->nsuspects) {
		/* NO faults in the suspect list */
		cert = percentof(1, fmep->nsuspects);
	} else {
		/* sum the fitrates */
		frs = alloca(fmep->nsuspects * sizeof (uint_t));
		fridx = frcnt = frsum = 0;

		/* frs[] gets one slot per non-NULL suspect, in array order */
		for (rp = srl; rp <= erl; rp++) {
			struct node *n;

			if (rp->suspect == NULL)
				continue;
			if (!is_fault(rp->suspect->t)) {
				/* placeholder; replaced by fravg below */
				frs[fridx++] = 0;
				continue;
			}
			n = eventprop_lookup(rp->suspect, L_FITrate);
			if (node2uint(n, &fr) != 0) {
				out(O_DEBUG|O_NONL, "event ");
				ipath_print(O_DEBUG|O_NONL,
				    rp->suspect->enode->u.event.ename->u.name.s,
				    rp->suspect->ipp);
				out(O_DEBUG, " has no FITrate (using 1)");
				fr = 1;
			} else if (fr == 0) {
				out(O_DEBUG|O_NONL, "event ");
				ipath_print(O_DEBUG|O_NONL,
				    rp->suspect->enode->u.event.ename->u.name.s,
				    rp->suspect->ipp);
				out(O_DEBUG, " has zero FITrate (using 1)");
				fr = 1;
			}

			frs[fridx++] = fr;
			frsum += fr;
			frcnt++;
		}
		/* give each non-fault the average fault FITrate */
		fravg = avg(frsum, frcnt);
		for (fridx = 0; fridx < fmep->nsuspects; fridx++)
			if (frs[fridx] == 0) {
				frs[fridx] = fravg;
				frsum += fravg;
			}
	}

	/* Add them in reverse order of our sort, as fmd reverses order */
	/* (fridx == nsuspects here; the loop below walks frs[] backwards) */
	for (rp = erl; rp >= srl; rp--) {
		if (rp->suspect == NULL)
			continue;
		if (!is_fault(rp->suspect->t))
			allfaulty = B_FALSE;
		if (fmep->nonfault != fmep->nsuspects)
			cert = percentof(frs[--fridx], frsum);
		fault = fmd_nvl_create_fault(fmep->hdl,
		    rp->suspect->enode->u.event.ename->u.name.s,
		    cert,
		    rp->asru,
		    rp->fru,
		    rp->rsrc);
		if (fault == NULL)
			out(O_DIE, "fault creation failed");
		/* if "message" property exists, add it to the fault */
		if (node2uint(eventprop_lookup(rp->suspect, L_message),
		    &messval) == 0) {

			out(O_ALTFP,
			    "[FME%d, %s adds message=%d to suspect list]",
			    fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    messval);
			if (nvlist_add_boolean_value(fault,
			    FM_SUSPECT_MESSAGE,
			    (messval) ? B_TRUE : B_FALSE) != 0) {
				out(O_DIE, "cannot add no-message to fault");
			}
		}
		/* add any payload properties */
		lut_walk(rp->suspect->payloadprops,
		    (lut_cb)addpayloadprop, (void *)fault);
		fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault);
		rslfree(rp);	/* asru/fru/rsrc nvlists no longer needed */

		/*
		 * If "action" property exists, evaluate it;  this must be done
		 * before the dupclose check below since some actions may
		 * modify the asru to be used in fmd_nvl_fmri_faulty.  This
		 * needs to be restructured if any new actions are introduced
		 * that have effects that we do not want to be visible if
		 * we decide not to publish in the dupclose check below.
		 */
		if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) {
			struct evalue evalue;

			out(O_ALTFP|O_NONL,
			    "[FME%d, %s action ", fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s);
			ptree_name_iter(O_ALTFP|O_NONL, snp);
			out(O_ALTFP, "]");
			Action_nvl = fault;
			(void) eval_expr(snp, NULL, NULL, NULL, NULL,
			    NULL, 0, &evalue);
		}

		/*
		 * if "dupclose" tunable is set, check if the asru is
		 * already marked as "faulty".
		 */
		if (Dupclose && allfaulty) {
			nvlist_t *asru;

			out(O_ALTFP|O_VERB, "FMD%d dupclose check ", fmep->id);
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect);
			out(O_ALTFP|O_VERB|O_NONL, " ");
			if (nvlist_lookup_nvlist(fault,
			    FM_FAULT_ASRU, &asru) != 0) {
				out(O_ALTFP|O_VERB, "NULL asru");
				allfaulty = B_FALSE;
			} else if (fmd_nvl_fmri_faulty(fmep->hdl, asru)) {
				out(O_ALTFP|O_VERB, "faulty");
			} else {
				out(O_ALTFP|O_VERB, "not faulty");
				allfaulty = B_FALSE;
			}
		}

	}

	/*
	 * Close the case if all asrus are already known to be faulty and if
	 * Dupclose is enabled.  Otherwise we are going to publish so take
	 * any pre-publication actions.
	 */
	if (Dupclose && allfaulty) {
		out(O_ALTFP, "[dupclose FME%d, case %s]", fmep->id,
		    fmd_case_uuid(fmep->hdl, fmep->fmcase));
		fmd_case_close(fmep->hdl, fmep->fmcase);
	} else {
		for (rp = erl; rp >= srl; rp--) {
			struct event *suspect = rp->suspect;

			if (suspect == NULL)
				continue;

			/* if "count" exists, increment the appropriate stat */
			if ((snp = eventprop_lookup(suspect,
			    L_count)) != NULL) {
				out(O_ALTFP|O_NONL,
				    "[FME%d, %s count ", fmep->id,
				    suspect->enode->u.event.ename->u.name.s);
				ptree_name_iter(O_ALTFP|O_NONL, snp);
				out(O_ALTFP, "]");
				istat_bump(snp, 0);

			}
		}
		istat_save();	/* write out any istat changes */

		out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
		    fmd_case_uuid(fmep->hdl, fmep->fmcase));
		fmd_case_solve(fmep->hdl, fmep->fmcase);
	}

	/*
	 * revert to the original suspect list
	 */
	FREE(srl);
	restore_suspects(fmep);
}
2879 
2880 static void
2881 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase)
2882 {
2883 	struct case_list *newcase;
2884 	nvlist_t *defect;
2885 
2886 	out(O_ALTFP,
2887 	    "[undiagnosable ereport received, "
2888 	    "creating and closing a new case (%s)]",
2889 	    Undiag_reason ? Undiag_reason : "reason not provided");
2890 
2891 	newcase = MALLOC(sizeof (struct case_list));
2892 	newcase->next = NULL;
2893 	newcase->fmcase = fmcase;
2894 	if (Undiagablecaselist != NULL)
2895 		newcase->next = Undiagablecaselist;
2896 	Undiagablecaselist = newcase;
2897 
2898 	if (ffep != NULL)
2899 		fmd_case_add_ereport(hdl, newcase->fmcase, ffep);
2900 
2901 	defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
2902 	    NULL, NULL, NULL);
2903 	if (Undiag_reason != NULL)
2904 		(void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason);
2905 	fmd_case_add_suspect(hdl, newcase->fmcase, defect);
2906 
2907 	fmd_case_solve(hdl, newcase->fmcase);
2908 	fmd_case_close(hdl, newcase->fmcase);
2909 }
2910 
/*
 * fme_undiagnosble_pci -- solve and close an unsolvable PCIE case,
 *	publishing an unknown_pci_fault suspect whose ASRU is the
 *	dev path of the detecting root complex.
 *	NOTE(review): "undiagnosble" is a typo in the function name;
 *	left as-is since renaming would also require changing callers.
 */
static void
fme_undiagnosble_pci(struct fme *f, nvlist_t *rc_detector) {
	nvlist_t *defect, *asru;
	char *path;

	/* dev path of the root complex that detected the error */
	(void) nvlist_lookup_string(rc_detector, FM_FMRI_DEV_PATH, &path);
	out(O_ALTFP, "[solving/closing PCIE FME%d PATH %s]", f->id, path);

	/* build a dev-scheme ASRU naming that path */
	(void) nvlist_xalloc(&asru, NV_UNIQUE_NAME, &Eft_nv_hdl);
	(void) nvlist_add_uint8(asru, FM_VERSION, FM_HC_SCHEME_VERSION);
	(void) nvlist_add_string(asru, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV);
	(void) nvlist_add_string(asru, FM_FMRI_DEV_PATH, path);

	defect = fmd_nvl_create_fault(f->hdl,
	    "fault.sunos.eft.unknown_pci_fault", 100,
	    asru, NULL, NULL);

	(void) nvlist_add_string(defect, UNDIAG_REASON, UD_PCIUNSOLVD);
	fmd_case_pci_undiagnosable(f->hdl, f->fmcase, defect);

	fmd_case_add_suspect(f->hdl, f->fmcase, defect);
	fmd_case_solve(f->hdl, f->fmcase);
	fmd_case_close(f->hdl, f->fmcase);
}
2935 
2936 static void
2937 fme_undiagnosable(struct fme *f)
2938 {
2939 	nvlist_t *defect;
2940 	nvlist_t *rc_detector;
2941 
2942 	out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]",
2943 	    f->id, fmd_case_uuid(f->hdl, f->fmcase),
2944 	    Undiag_reason ? Undiag_reason : "undiagnosable");
2945 
2946 	if ((strcmp(Undiag_reason, UD_UNSOLVD) == 0) &&
2947 	    fmd_case_is_pcie(f->hdl, f->fmcase, &rc_detector)) {
2948 		fme_undiagnosble_pci(f, rc_detector);
2949 		return;
2950 	}
2951 
2952 	defect = fmd_nvl_create_fault(f->hdl, UNDIAGNOSABLE_DEFECT, 100,
2953 	    NULL, NULL, NULL);
2954 	if (Undiag_reason != NULL)
2955 		(void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason);
2956 	fmd_case_add_suspect(f->hdl, f->fmcase, defect);
2957 	fmd_case_solve(f->hdl, f->fmcase);
2958 	fmd_case_close(f->hdl, f->fmcase);
2959 }
2960 
2961 /*
2962  * fme_close_case
2963  *
2964  *	Find the requested case amongst our fmes and close it.  Free up
2965  *	the related fme.
2966  */
void
fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase)
{
	struct case_list *ucasep, *prevcasep = NULL;
	struct fme *prev = NULL;
	struct fme *fmep;

	/* the case may be one of the "undiagnosable" placeholders */
	for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) {
		if (fmcase != ucasep->fmcase) {
			prevcasep = ucasep;
			continue;
		}

		/* unlink and free the placeholder record; nothing else */
		if (prevcasep == NULL)
			Undiagablecaselist = Undiagablecaselist->next;
		else
			prevcasep->next = ucasep->next;

		FREE(ucasep);
		return;
	}

	/* otherwise locate the matching FME on the active list */
	for (fmep = FMElist; fmep; fmep = fmep->next) {
		if (fmep->hdl == hdl && fmep->fmcase == fmcase)
			break;
		prev = fmep;
	}

	if (fmep == NULL) {
		out(O_WARN, "Eft asked to close unrecognized case [%s].",
		    fmd_case_uuid(hdl, fmcase));
		return;
	}

	/* maintain the end-of-list pointer while unlinking */
	if (EFMElist == fmep)
		EFMElist = prev;

	if (prev == NULL)
		FMElist = FMElist->next;
	else
		prev->next = fmep->next;

	fmep->next = NULL;

	/* Get rid of any timer this fme has set */
	if (fmep->wull != 0)
		fmd_timer_remove(fmep->hdl, fmep->timer);

	/* move the fme onto the closed list */
	if (ClosedFMEs == NULL) {
		ClosedFMEs = fmep;
	} else {
		fmep->next = ClosedFMEs;
		ClosedFMEs = fmep;
	}

	Open_fme_count--;

	/* See if we can close the overflow FME */
	if (Open_fme_count <= Max_fme) {
		for (fmep = FMElist; fmep; fmep = fmep->next) {
			if (fmep->overflow && !(fmd_case_closed(fmep->hdl,
			    fmep->fmcase)))
				break;
		}

		if (fmep != NULL)
			fmd_case_close(fmep->hdl, fmep->fmcase);
	}
}
3036 
3037 /*
3038  * fme_set_timer()
3039  *	If the time we need to wait for the given FME is less than the
3040  *	current timer, kick that old timer out and establish a new one.
3041  */
3042 static int
3043 fme_set_timer(struct fme *fmep, unsigned long long wull)
3044 {
3045 	out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait ");
3046 	ptree_timeval(O_ALTFP|O_VERB, &wull);
3047 
3048 	if (wull <= fmep->pull) {
3049 		out(O_ALTFP|O_VERB|O_NONL, "already have waited at least ");
3050 		ptree_timeval(O_ALTFP|O_VERB, &fmep->pull);
3051 		out(O_ALTFP|O_VERB, NULL);
3052 		/* we've waited at least wull already, don't need timer */
3053 		return (0);
3054 	}
3055 
3056 	out(O_ALTFP|O_VERB|O_NONL, " currently ");
3057 	if (fmep->wull != 0) {
3058 		out(O_ALTFP|O_VERB|O_NONL, "waiting ");
3059 		ptree_timeval(O_ALTFP|O_VERB, &fmep->wull);
3060 		out(O_ALTFP|O_VERB, NULL);
3061 	} else {
3062 		out(O_ALTFP|O_VERB|O_NONL, "not waiting");
3063 		out(O_ALTFP|O_VERB, NULL);
3064 	}
3065 
3066 	if (fmep->wull != 0)
3067 		if (wull >= fmep->wull)
3068 			/* New timer would fire later than established timer */
3069 			return (0);
3070 
3071 	if (fmep->wull != 0) {
3072 		fmd_timer_remove(fmep->hdl, fmep->timer);
3073 	}
3074 
3075 	fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep,
3076 	    fmep->e0r, wull);
3077 	out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer);
3078 	fmep->wull = wull;
3079 	return (1);
3080 }
3081 
3082 void
3083 fme_timer_fired(struct fme *fmep, id_t tid)
3084 {
3085 	struct fme *ffmep = NULL;
3086 
3087 	for (ffmep = FMElist; ffmep; ffmep = ffmep->next)
3088 		if (ffmep == fmep)
3089 			break;
3090 
3091 	if (ffmep == NULL) {
3092 		out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.",
3093 		    (void *)fmep);
3094 		return;
3095 	}
3096 
3097 	out(O_ALTFP|O_VERB, "Timer fired %lx", tid);
3098 	fmep->pull = fmep->wull;
3099 	fmep->wull = 0;
3100 	fmd_buf_write(fmep->hdl, fmep->fmcase,
3101 	    WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull));
3102 
3103 	fme_eval(fmep, fmep->e0r);
3104 }
3105 
3106 /*
3107  * Preserve the fme's suspect list in its psuspects list, NULLing the
3108  * suspects list in the meantime.
3109  */
3110 static void
3111 save_suspects(struct fme *fmep)
3112 {
3113 	struct event *ep;
3114 	struct event *nextep;
3115 
3116 	/* zero out the previous suspect list */
3117 	for (ep = fmep->psuspects; ep; ep = nextep) {
3118 		nextep = ep->psuspects;
3119 		ep->psuspects = NULL;
3120 	}
3121 	fmep->psuspects = NULL;
3122 
3123 	/* zero out the suspect list, copying it to previous suspect list */
3124 	fmep->psuspects = fmep->suspects;
3125 	for (ep = fmep->suspects; ep; ep = nextep) {
3126 		nextep = ep->suspects;
3127 		ep->psuspects = ep->suspects;
3128 		ep->suspects = NULL;
3129 		ep->is_suspect = 0;
3130 	}
3131 	fmep->suspects = NULL;
3132 	fmep->nsuspects = 0;
3133 	fmep->nonfault = 0;
3134 }
3135 
3136 /*
3137  * Retrieve the fme's suspect list from its psuspects list.
3138  */
3139 static void
3140 restore_suspects(struct fme *fmep)
3141 {
3142 	struct event *ep;
3143 	struct event *nextep;
3144 
3145 	fmep->nsuspects = fmep->nonfault = 0;
3146 	fmep->suspects = fmep->psuspects;
3147 	for (ep = fmep->psuspects; ep; ep = nextep) {
3148 		fmep->nsuspects++;
3149 		if (!is_fault(ep->t))
3150 			fmep->nonfault++;
3151 		nextep = ep->psuspects;
3152 		ep->suspects = ep->psuspects;
3153 	}
3154 }
3155 
3156 /*
3157  * this is what we use to call the Emrys prototype code instead of main()
3158  */
3159 static void
3160 fme_eval(struct fme *fmep, fmd_event_t *ffep)
3161 {
3162 	struct event *ep;
3163 	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
3164 
3165 	save_suspects(fmep);
3166 
3167 	out(O_ALTFP, "Evaluate FME %d", fmep->id);
3168 	indent_set("  ");
3169 
3170 	lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
3171 	fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);
3172 
3173 	out(O_ALTFP|O_NONL, "FME%d state: %s, suspect list:", fmep->id,
3174 	    fme_state2str(fmep->state));
3175 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
3176 		out(O_ALTFP|O_NONL, " ");
3177 		itree_pevent_brief(O_ALTFP|O_NONL, ep);
3178 	}
3179 	out(O_ALTFP, NULL);
3180 
3181 	switch (fmep->state) {
3182 	case FME_CREDIBLE:
3183 		print_suspects(SLNEW, fmep);
3184 		(void) upsets_eval(fmep, ffep);
3185 
3186 		/*
3187 		 * we may have already posted suspects in upsets_eval() which
3188 		 * can recurse into fme_eval() again. If so then just return.
3189 		 */
3190 		if (fmep->posted_suspects)
3191 			return;
3192 
3193 		publish_suspects(fmep);
3194 		fmep->posted_suspects = 1;
3195 		fmd_buf_write(fmep->hdl, fmep->fmcase,
3196 		    WOBUF_POSTD,
3197 		    (void *)&fmep->posted_suspects,
3198 		    sizeof (fmep->posted_suspects));
3199 
3200 		/*
3201 		 * Now the suspects have been posted, we can clear up
3202 		 * the instance tree as we won't be looking at it again.
3203 		 * Also cancel the timer as the case is now solved.
3204 		 */
3205 		if (fmep->wull != 0) {
3206 			fmd_timer_remove(fmep->hdl, fmep->timer);
3207 			fmep->wull = 0;
3208 		}
3209 		break;
3210 
3211 	case FME_WAIT:
3212 		ASSERT(my_delay > fmep->ull);
3213 		(void) fme_set_timer(fmep, my_delay);
3214 		print_suspects(SLWAIT, fmep);
3215 		itree_prune(fmep->eventtree);
3216 		return;
3217 
3218 	case FME_DISPROVED:
3219 		print_suspects(SLDISPROVED, fmep);
3220 		Undiag_reason = UD_UNSOLVD;
3221 		fme_undiagnosable(fmep);
3222 		break;
3223 	}
3224 
3225 	if (fmep->posted_suspects == 1 && Autoclose != NULL) {
3226 		int doclose = 0;
3227 
3228 		if (strcmp(Autoclose, "true") == 0 ||
3229 		    strcmp(Autoclose, "all") == 0)
3230 			doclose = 1;
3231 
3232 		if (strcmp(Autoclose, "upsets") == 0) {
3233 			doclose = 1;
3234 			for (ep = fmep->suspects; ep; ep = ep->suspects) {
3235 				if (ep->t != N_UPSET) {
3236 					doclose = 0;
3237 					break;
3238 				}
3239 			}
3240 		}
3241 
3242 		if (doclose) {
3243 			out(O_ALTFP, "[closing FME%d, case %s (autoclose)]",
3244 			    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
3245 			fmd_case_close(fmep->hdl, fmep->fmcase);
3246 		}
3247 	}
3248 	itree_free(fmep->eventtree);
3249 	fmep->eventtree = NULL;
3250 	structconfig_free(fmep->config);
3251 	fmep->config = NULL;
3252 	destroy_fme_bufs(fmep);
3253 }
3254 
3255 static void indent(void);
3256 static int triggered(struct fme *fmep, struct event *ep, int mark);
3257 static enum fme_state effects_test(struct fme *fmep,
3258     struct event *fault_event, unsigned long long at_latest_by,
3259     unsigned long long *pdelay);
3260 static enum fme_state requirements_test(struct fme *fmep, struct event *ep,
3261     unsigned long long at_latest_by, unsigned long long *pdelay);
3262 static enum fme_state causes_test(struct fme *fmep, struct event *ep,
3263     unsigned long long at_latest_by, unsigned long long *pdelay);
3264 
3265 static int
3266 checkconstraints(struct fme *fmep, struct arrow *arrowp)
3267 {
3268 	struct constraintlist *ctp;
3269 	struct evalue value;
3270 	char *sep = "";
3271 
3272 	if (arrowp->forever_false) {
3273 		indent();
3274 		out(O_ALTFP|O_VERB|O_NONL, "  Forever false constraint: ");
3275 		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
3276 			out(O_ALTFP|O_VERB|O_NONL, sep);
3277 			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3278 			sep = ", ";
3279 		}
3280 		out(O_ALTFP|O_VERB, NULL);
3281 		return (0);
3282 	}
3283 	if (arrowp->forever_true) {
3284 		indent();
3285 		out(O_ALTFP|O_VERB|O_NONL, "  Forever true constraint: ");
3286 		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
3287 			out(O_ALTFP|O_VERB|O_NONL, sep);
3288 			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3289 			sep = ", ";
3290 		}
3291 		out(O_ALTFP|O_VERB, NULL);
3292 		return (1);
3293 	}
3294 
3295 	for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
3296 		if (eval_expr(ctp->cnode, NULL, NULL,
3297 		    &fmep->globals, fmep->config,
3298 		    arrowp, 0, &value)) {
3299 			/* evaluation successful */
3300 			if (value.t == UNDEFINED || value.v == 0) {
3301 				/* known false */
3302 				arrowp->forever_false = 1;
3303 				indent();
3304 				out(O_ALTFP|O_VERB|O_NONL,
3305 				    "  False constraint: ");
3306 				ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3307 				out(O_ALTFP|O_VERB, NULL);
3308 				return (0);
3309 			}
3310 		} else {
3311 			/* evaluation unsuccessful -- unknown value */
3312 			indent();
3313 			out(O_ALTFP|O_VERB|O_NONL,
3314 			    "  Deferred constraint: ");
3315 			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3316 			out(O_ALTFP|O_VERB, NULL);
3317 			return (1);
3318 		}
3319 	}
3320 	/* known true */
3321 	arrowp->forever_true = 1;
3322 	indent();
3323 	out(O_ALTFP|O_VERB|O_NONL, "  True constraint: ");
3324 	for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
3325 		out(O_ALTFP|O_VERB|O_NONL, sep);
3326 		ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3327 		sep = ", ";
3328 	}
3329 	out(O_ALTFP|O_VERB, NULL);
3330 	return (1);
3331 }
3332 
3333 static int
3334 triggered(struct fme *fmep, struct event *ep, int mark)
3335 {
3336 	struct bubble *bp;
3337 	struct arrowlist *ap;
3338 	int count = 0;
3339 
3340 	stats_counter_bump(fmep->Tcallcount);
3341 	for (bp = itree_next_bubble(ep, NULL); bp;
3342 	    bp = itree_next_bubble(ep, bp)) {
3343 		if (bp->t != B_TO)
3344 			continue;
3345 		for (ap = itree_next_arrow(bp, NULL); ap;
3346 		    ap = itree_next_arrow(bp, ap)) {
3347 			/* check count of marks against K in the bubble */
3348 			if ((ap->arrowp->mark & mark) &&
3349 			    ++count >= bp->nork)
3350 				return (1);
3351 		}
3352 	}
3353 	return (0);
3354 }
3355 
/*
 * mark_arrows -- propagate (or clear) effect marks downstream from ep.
 *
 * With a nonzero mark (CREDIBLE_EFFECT or PARENT_WAIT) this walks every
 * B_FROM bubble of ep, evaluates each arrow's constraints, and for head
 * events whose K-count is met runs requirements_test() on the head,
 * recursing to mark further downstream effects.  With mark == 0 it
 * instead clears previously set marks; if "keep" is set, events whose
 * marks are being cleared are flagged keep_in_tree first.
 *
 * Returns WAIT_EFFECT if some downstream event must be waited for, in
 * which case *pdelay is set to the earliest such deadline; returns 0
 * otherwise.  *pdelay is only written on the WAIT_EFFECT return, so
 * callers pass NULL for pdelay only in the mark == 0 (clearing) case.
 */
static int
mark_arrows(struct fme *fmep, struct event *ep, int mark,
    unsigned long long at_latest_by, unsigned long long *pdelay, int keep)
{
	struct bubble *bp;
	struct arrowlist *ap;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long my_delay;
	enum fme_state result;
	int retval = 0;

	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_FROM)
			continue;
		stats_counter_bump(fmep->Marrowcount);
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			struct event *ep2 = ap->arrowp->head->myevent;
			/*
			 * if we're clearing marks, we can avoid doing
			 * all that work evaluating constraints.
			 */
			if (mark == 0) {
				if (ap->arrowp->arrow_marked == 0)
					continue;
				ap->arrowp->arrow_marked = 0;
				ap->arrowp->mark &= ~EFFECTS_COUNTER;
				if (keep && (ep2->cached_state &
				    (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT)))
					ep2->keep_in_tree = 1;
				ep2->cached_state &=
				    ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT);
				(void) mark_arrows(fmep, ep2, mark, 0, NULL,
				    keep);
				continue;
			}
			ap->arrowp->arrow_marked = 1;
			/* skip heads whose state was already decided */
			if (ep2->cached_state & REQMNTS_DISPROVED) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  ALREADY DISPROVED ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			if (ep2->cached_state & WAIT_EFFECT) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  ALREADY EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			if (ep2->cached_state & CREDIBLE_EFFECT) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  ALREADY EFFECTS CREDIBLE ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			if ((ep2->cached_state & PARENT_WAIT) &&
			    (mark & PARENT_WAIT)) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  ALREADY PARENT EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			/* constraints are evaluated against ep2's payload */
			platform_set_payloadnvp(ep2->nvp);
			if (checkconstraints(fmep, ap->arrowp) == 0) {
				platform_set_payloadnvp(NULL);
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  CONSTRAINTS FAIL ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			platform_set_payloadnvp(NULL);
			ap->arrowp->mark |= EFFECTS_COUNTER;
			/* don't descend until ep2's K-count is satisfied */
			if (!triggered(fmep, ep2, EFFECTS_COUNTER)) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  K-COUNT NOT YET MET ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			ep2->cached_state &= ~PARENT_WAIT;
			/*
			 * if we've reached an ereport and no propagation time
			 * is specified, use the Hesitate value
			 */
			if (ep2->t == N_EREPORT && at_latest_by == 0ULL &&
			    ap->arrowp->maxdelay == 0ULL) {
				out(O_ALTFP|O_VERB|O_NONL, "  default wait ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				result = requirements_test(fmep, ep2, Hesitate,
				    &my_delay);
			} else {
				result = requirements_test(fmep, ep2,
				    at_latest_by + ap->arrowp->maxdelay,
				    &my_delay);
			}
			if (result == FME_WAIT) {
				/* must wait; recurse with PARENT_WAIT mark */
				retval = WAIT_EFFECT;
				if (overall_delay > my_delay)
					overall_delay = my_delay;
				ep2->cached_state |= WAIT_EFFECT;
				indent();
				out(O_ALTFP|O_VERB|O_NONL, "  EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				indent_push("  E");
				if (mark_arrows(fmep, ep2, PARENT_WAIT,
				    at_latest_by, &my_delay, 0) ==
				    WAIT_EFFECT) {
					retval = WAIT_EFFECT;
					if (overall_delay > my_delay)
						overall_delay = my_delay;
				}
				indent_pop();
			} else if (result == FME_DISPROVED) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  EFFECTS DISPROVED ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
			} else {
				/* credible: mark ep2 and recurse downstream */
				ep2->cached_state |= mark;
				indent();
				if (mark == CREDIBLE_EFFECT)
					out(O_ALTFP|O_VERB|O_NONL,
					    "  EFFECTS CREDIBLE ");
				else
					out(O_ALTFP|O_VERB|O_NONL,
					    "  PARENT EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				indent_push("  E");
				if (mark_arrows(fmep, ep2, mark, at_latest_by,
				    &my_delay, 0) == WAIT_EFFECT) {
					retval = WAIT_EFFECT;
					if (overall_delay > my_delay)
						overall_delay = my_delay;
				}
				indent_pop();
			}
		}
	}
	if (retval == WAIT_EFFECT)
		*pdelay = overall_delay;
	return (retval);
}
3514 
3515 static enum fme_state
3516 effects_test(struct fme *fmep, struct event *fault_event,
3517     unsigned long long at_latest_by, unsigned long long *pdelay)
3518 {
3519 	struct event *error_event;
3520 	enum fme_state return_value = FME_CREDIBLE;
3521 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
3522 	unsigned long long my_delay;
3523 
3524 	stats_counter_bump(fmep->Ecallcount);
3525 	indent_push("  E");
3526 	indent();
3527 	out(O_ALTFP|O_VERB|O_NONL, "->");
3528 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
3529 	out(O_ALTFP|O_VERB, NULL);
3530 
3531 	if (mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by,
3532 	    &my_delay, 0) == WAIT_EFFECT) {
3533 		return_value = FME_WAIT;
3534 		if (overall_delay > my_delay)
3535 			overall_delay = my_delay;
3536 	}
3537 	for (error_event = fmep->observations;
3538 	    error_event; error_event = error_event->observations) {
3539 		indent();
3540 		out(O_ALTFP|O_VERB|O_NONL, " ");
3541 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event);
3542 		if (!(error_event->cached_state & CREDIBLE_EFFECT)) {
3543 			if (error_event->cached_state &
3544 			    (PARENT_WAIT|WAIT_EFFECT)) {
3545 				out(O_ALTFP|O_VERB, " NOT YET triggered");
3546 				continue;
3547 			}
3548 			return_value = FME_DISPROVED;
3549 			out(O_ALTFP|O_VERB, " NOT triggered");
3550 			break;
3551 		} else {
3552 			out(O_ALTFP|O_VERB, " triggered");
3553 		}
3554 	}
3555 	if (return_value == FME_DISPROVED) {
3556 		(void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0);
3557 	} else {
3558 		fault_event->keep_in_tree = 1;
3559 		(void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1);
3560 	}
3561 
3562 	indent();
3563 	out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ",
3564 	    fme_state2str(return_value));
3565 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
3566 	out(O_ALTFP|O_VERB, NULL);
3567 	indent_pop();
3568 	if (return_value == FME_WAIT)
3569 		*pdelay = overall_delay;
3570 	return (return_value);
3571 }
3572 
/*
 * requirements_test -- check whether event ep's propagation requirements
 * can be met by at_latest_by.
 *
 * For an ereport leaf: credible if already observed (count != 0),
 * disproved if the pull time has passed the deadline, otherwise wait.
 * For interior events: descend each B_FROM bubble, requiring at least N
 * (the bubble's nork, or the arrow count for 'A') arrows whose head
 * events themselves pass requirements_test().  Results are cached in
 * ep->cached_state (REQMNTS_*) so repeat visits are cheap; *pdelay is
 * set only on the FME_WAIT return.
 */
static enum fme_state
requirements_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay)
{
	int waiting_events;
	int credible_events;
	int deferred_events;
	enum fme_state return_value = FME_CREDIBLE;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long arrow_delay;
	unsigned long long my_delay;
	struct event *ep2;
	struct bubble *bp;
	struct arrowlist *ap;

	/* fast paths: answer already cached on this event */
	if (ep->cached_state & REQMNTS_CREDIBLE) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY CREDIBLE ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, NULL);
		return (FME_CREDIBLE);
	}
	if (ep->cached_state & REQMNTS_DISPROVED) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, NULL);
		return (FME_DISPROVED);
	}
	if (ep->cached_state & REQMNTS_WAIT) {
		indent();
		*pdelay = ep->cached_delay;
		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY WAIT ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB|O_NONL, ", wait for: ");
		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
		out(O_ALTFP|O_VERB, NULL);
		return (FME_WAIT);
	}
	stats_counter_bump(fmep->Rcallcount);
	indent_push("  R");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
	out(O_ALTFP|O_VERB, NULL);

	/* leaf case: an ereport is credible iff it has been observed */
	if (ep->t == N_EREPORT) {
		if (ep->count == 0) {
			if (fmep->pull >= at_latest_by) {
				/* deadline already passed without a report */
				return_value = FME_DISPROVED;
			} else {
				ep->cached_delay = *pdelay = at_latest_by;
				return_value = FME_WAIT;
			}
		}

		indent();
		switch (return_value) {
		case FME_CREDIBLE:
			ep->cached_state |= REQMNTS_CREDIBLE;
			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			break;
		case FME_DISPROVED:
			ep->cached_state |= REQMNTS_DISPROVED;
			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			break;
		case FME_WAIT:
			ep->cached_state |= REQMNTS_WAIT;
			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			out(O_ALTFP|O_VERB|O_NONL, " to ");
			ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
			break;
		default:
			out(O_DIE, "requirements_test: unexpected fme_state");
			break;
		}
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();

		return (return_value);
	}

	/* this event is not a report, descend the tree */
	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		int n;

		if (bp->t != B_FROM)
			continue;

		n = bp->nork;

		credible_events = 0;
		waiting_events = 0;
		deferred_events = 0;
		arrow_delay = TIMEVAL_EVENTUALLY;
		/*
		 * n is -1 for 'A' so adjust it.
		 * XXX just count up the arrows for now.
		 */
		if (n < 0) {
			n = 0;
			for (ap = itree_next_arrow(bp, NULL); ap;
			    ap = itree_next_arrow(bp, ap))
				n++;
			indent();
			out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n);
		} else {
			indent();
			out(O_ALTFP|O_VERB, " Bubble N=%d", n);
		}

		if (n == 0)
			continue;
		/* first visit: see if constraints invalidate any arrow */
		if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) {
			for (ap = itree_next_arrow(bp, NULL); ap;
			    ap = itree_next_arrow(bp, ap)) {
				ep2 = ap->arrowp->head->myevent;
				platform_set_payloadnvp(ep2->nvp);
				if (checkconstraints(fmep, ap->arrowp) == 0) {
					/*
					 * if any arrow is invalidated by the
					 * constraints, then we should elide the
					 * whole bubble to be consistant with
					 * the tree creation time behaviour
					 */
					bp->mark |= BUBBLE_ELIDED;
					platform_set_payloadnvp(NULL);
					break;
				}
				platform_set_payloadnvp(NULL);
			}
		}
		if (bp->mark & BUBBLE_ELIDED)
			continue;
		bp->mark |= BUBBLE_OK;
		/* classify each arrow's head until N credible are found */
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			ep2 = ap->arrowp->head->myevent;
			if (n <= credible_events)
				break;

			ap->arrowp->mark |= REQMNTS_COUNTER;
			if (triggered(fmep, ep2, REQMNTS_COUNTER))
				/* XXX adding max timevals! */
				switch (requirements_test(fmep, ep2,
				    at_latest_by + ap->arrowp->maxdelay,
				    &my_delay)) {
				case FME_DEFERRED:
					deferred_events++;
					break;
				case FME_CREDIBLE:
					credible_events++;
					break;
				case FME_DISPROVED:
					break;
				case FME_WAIT:
					if (my_delay < arrow_delay)
						arrow_delay = my_delay;
					waiting_events++;
					break;
				default:
					out(O_DIE,
					"Bug in requirements_test.");
				}
			else
				deferred_events++;
		}
		indent();
		out(O_ALTFP|O_VERB, " Credible: %d Waiting %d",
		    credible_events + deferred_events, waiting_events);
		if (credible_events + deferred_events + waiting_events < n) {
			/* Can never meet requirements */
			ep->cached_state |= REQMNTS_DISPROVED;
			indent();
			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			out(O_ALTFP|O_VERB, NULL);
			indent_pop();
			return (FME_DISPROVED);
		}
		if (credible_events + deferred_events < n) {
			/* will have to wait */
			/* wait time is shortest known */
			if (arrow_delay < overall_delay)
				overall_delay = arrow_delay;
			return_value = FME_WAIT;
		} else if (credible_events < n) {
			if (return_value != FME_WAIT)
				return_value = FME_DEFERRED;
		}
	}

	/*
	 * don't mark as FME_DEFERRED. If this event isn't reached by another
	 * path, then this will be considered FME_CREDIBLE. But if it is
	 * reached by a different path so the K-count is met, then might
	 * get overridden by FME_WAIT or FME_DISPROVED.
	 */
	if (return_value == FME_WAIT) {
		ep->cached_state |= REQMNTS_WAIT;
		ep->cached_delay = *pdelay = overall_delay;
	} else if (return_value == FME_CREDIBLE) {
		ep->cached_state |= REQMNTS_CREDIBLE;
	}
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ",
	    fme_state2str(return_value));
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);
	indent_pop();
	return (return_value);
}
3791 
/*
 * causes_test -- check whether some upstream cause of event ep is
 * credible.
 *
 * Walks each B_TO bubble of ep and runs hypothesise() on the tail event
 * of every arrow whose constraints don't rule it out, then compares the
 * number of credible/waiting causes against the bubble's K value.
 * Returns FME_CREDIBLE, FME_WAIT (setting *pdelay to the earliest wait
 * deadline), or FME_DISPROVED.  Tail events are flagged CAUSES_TESTED
 * so each is hypothesised at most once per evaluation pass.
 */
static enum fme_state
causes_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay)
{
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long my_delay;
	int credible_results = 0;
	int waiting_results = 0;
	enum fme_state fstate;
	struct event *tail_event;
	struct bubble *bp;
	struct arrowlist *ap;
	int k = 1;

	stats_counter_bump(fmep->Ccallcount);
	indent_push("  C");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);

	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_TO)
			continue;
		k = bp->nork;	/* remember the K value */
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			int do_not_follow = 0;

			/*
			 * if we get to the same event multiple times
			 * only worry about the first one.
			 */
			if (ap->arrowp->tail->myevent->cached_state &
			    CAUSES_TESTED) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  causes test already run for ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
				    ap->arrowp->tail->myevent);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}

			/*
			 * see if false constraint prevents us
			 * from traversing this arrow
			 */
			platform_set_payloadnvp(ep->nvp);
			if (checkconstraints(fmep, ap->arrowp) == 0)
				do_not_follow = 1;
			platform_set_payloadnvp(NULL);
			if (do_not_follow) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  False arrow from ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
				    ap->arrowp->tail->myevent);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}

			ap->arrowp->tail->myevent->cached_state |=
			    CAUSES_TESTED;
			tail_event = ap->arrowp->tail->myevent;
			/* recurse upstream on the candidate cause */
			fstate = hypothesise(fmep, tail_event, at_latest_by,
			    &my_delay);

			switch (fstate) {
			case FME_WAIT:
				if (my_delay < overall_delay)
					overall_delay = my_delay;
				waiting_results++;
				break;
			case FME_CREDIBLE:
				credible_results++;
				break;
			case FME_DISPROVED:
				break;
			default:
				out(O_DIE, "Bug in causes_test");
			}
		}
	}
	/* compare against K */
	if (credible_results + waiting_results < k) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();
		return (FME_DISPROVED);
	}
	if (waiting_results != 0) {
		*pdelay = overall_delay;
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB|O_NONL, " to ");
		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();
		return (FME_WAIT);
	}
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES CREDIBLE ");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);
	indent_pop();
	return (FME_CREDIBLE);
}
3904 
/*
 * hypothesise -- test whether event ep is a credible hypothesis.
 *
 * An event is credible when (a) its requirements can be met
 * (requirements_test), and (b) either it is a problem event that
 * explains all observed reports (effects_test) or, for intermediate
 * events, at least one upstream cause is credible (causes_test).
 * Credible problem events are pushed onto fmep->suspects unless this is
 * a peek pass or the event is already listed.  Returns FME_CREDIBLE,
 * FME_WAIT (setting *pdelay to the earliest wait deadline), or
 * FME_DISPROVED.
 */
static enum fme_state
hypothesise(struct fme *fmep, struct event *ep,
	unsigned long long at_latest_by, unsigned long long *pdelay)
{
	enum fme_state rtr, otr;
	unsigned long long my_delay;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;

	stats_counter_bump(fmep->Hcallcount);
	indent_push("  H");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
	out(O_ALTFP|O_VERB, NULL);

	/* rtr: requirements result; otr: effects/causes result */
	rtr = requirements_test(fmep, ep, at_latest_by, &my_delay);
	if ((rtr == FME_WAIT) && (my_delay < overall_delay))
		overall_delay = my_delay;
	if (rtr != FME_DISPROVED) {
		if (is_problem(ep->t)) {
			otr = effects_test(fmep, ep, at_latest_by, &my_delay);
			if (otr != FME_DISPROVED) {
				/* credible problem -- add to suspect list */
				if (fmep->peek == 0 && ep->is_suspect == 0) {
					ep->suspects = fmep->suspects;
					ep->is_suspect = 1;
					fmep->suspects = ep;
					fmep->nsuspects++;
					if (!is_fault(ep->t))
						fmep->nonfault++;
				}
			}
		} else
			otr = causes_test(fmep, ep, at_latest_by, &my_delay);
		if ((otr == FME_WAIT) && (my_delay < overall_delay))
			overall_delay = my_delay;
		if ((otr != FME_DISPROVED) &&
		    ((rtr == FME_WAIT) || (otr == FME_WAIT)))
			*pdelay = overall_delay;
	}
	/* note: otr is only consulted below when rtr != FME_DISPROVED */
	if (rtr == FME_DISPROVED) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, " (doesn't meet requirements)");
		indent_pop();
		return (FME_DISPROVED);
	}
	if ((otr == FME_DISPROVED) && is_problem(ep->t)) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, " (doesn't explain all reports)");
		indent_pop();
		return (FME_DISPROVED);
	}
	if (otr == FME_DISPROVED) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, " (causes are not credible)");
		indent_pop();
		return (FME_DISPROVED);
	}
	if ((rtr == FME_WAIT) || (otr == FME_WAIT)) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-WAIT ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB|O_NONL, " to ");
		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay);
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();
		return (FME_WAIT);
	}
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE ");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);
	indent_pop();
	return (FME_CREDIBLE);
}
3987 
3988 /*
3989  * fme_istat_load -- reconstitute any persistent istats
3990  */
3991 void
3992 fme_istat_load(fmd_hdl_t *hdl)
3993 {
3994 	int sz;
3995 	char *sbuf;
3996 	char *ptr;
3997 
3998 	if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) {
3999 		out(O_ALTFP, "fme_istat_load: No stats");
4000 		return;
4001 	}
4002 
4003 	sbuf = alloca(sz);
4004 
4005 	fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz);
4006 
4007 	/*
4008 	 * pick apart the serialized stats
4009 	 *
4010 	 * format is:
4011 	 *	<class-name>, '@', <path>, '\0', <value>, '\0'
4012 	 * for example:
4013 	 *	"stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0"
4014 	 *
4015 	 * since this is parsing our own serialized data, any parsing issues
4016 	 * are fatal, so we check for them all with ASSERT() below.
4017 	 */
4018 	ptr = sbuf;
4019 	while (ptr < &sbuf[sz]) {
4020 		char *sepptr;
4021 		struct node *np;
4022 		int val;
4023 
4024 		sepptr = strchr(ptr, '@');
4025 		ASSERT(sepptr != NULL);
4026 		*sepptr = '\0';
4027 
4028 		/* construct the event */
4029 		np = newnode(T_EVENT, NULL, 0);
4030 		np->u.event.ename = newnode(T_NAME, NULL, 0);
4031 		np->u.event.ename->u.name.t = N_STAT;
4032 		np->u.event.ename->u.name.s = stable(ptr);
4033 		np->u.event.ename->u.name.it = IT_ENAME;
4034 		np->u.event.ename->u.name.last = np->u.event.ename;
4035 
4036 		ptr = sepptr + 1;
4037 		ASSERT(ptr < &sbuf[sz]);
4038 		ptr += strlen(ptr);
4039 		ptr++;	/* move past the '\0' separating path from value */
4040 		ASSERT(ptr < &sbuf[sz]);
4041 		ASSERT(isdigit(*ptr));
4042 		val = atoi(ptr);
4043 		ASSERT(val > 0);
4044 		ptr += strlen(ptr);
4045 		ptr++;	/* move past the final '\0' for this entry */
4046 
4047 		np->u.event.epname = pathstring2epnamenp(sepptr + 1);
4048 		ASSERT(np->u.event.epname != NULL);
4049 
4050 		istat_bump(np, val);
4051 		tree_free(np);
4052 	}
4053 
4054 	istat_save();
4055 }
4056