xref: /titanic_50/usr/src/cmd/fm/modules/common/eversholt/fme.c (revision 567cc2e6a4ceb0e421e1cf9bead0f43c55603d27)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  *
26  * fme.c -- fault management exercise module
27  *
28  * this module provides the simulated fault management exercise.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <strings.h>
37 #include <ctype.h>
38 #include <alloca.h>
39 #include <libnvpair.h>
40 #include <sys/fm/protocol.h>
41 #include <fm/fmd_api.h>
42 #include "alloc.h"
43 #include "out.h"
44 #include "stats.h"
45 #include "stable.h"
46 #include "literals.h"
47 #include "lut.h"
48 #include "tree.h"
49 #include "ptree.h"
50 #include "itree.h"
51 #include "ipath.h"
52 #include "fme.h"
53 #include "evnv.h"
54 #include "eval.h"
55 #include "config.h"
56 #include "platform.h"
57 #include "esclex.h"
58 
59 /* imported from eft.c... */
60 extern char *Autoclose;
61 extern hrtime_t Hesitate;
62 extern char *Serd_Override;
63 extern nv_alloc_t Eft_nv_hdl;
64 extern int Max_fme;
65 extern fmd_hdl_t *Hdl;
66 
67 static int Istat_need_save;
68 static int Serd_need_save;
69 void istat_save(void);
70 void serd_save(void);
71 
72 /* fme under construction is global so we can free it on module abort */
73 static struct fme *Nfmep;
74 
75 static const char *Undiag_reason;
76 
77 static int Nextid = 0;
78 
79 static int Open_fme_count = 0;	/* Count of open FMEs */
80 
81 /* list of fault management exercises underway */
static struct fme {
	struct fme *next;		/* next exercise */
	unsigned long long ull;		/* time when fme was created */
	int id;				/* FME id */
	struct config *config;		/* cooked configuration data */
	struct lut *eventtree;		/* propagation tree for this FME */
	/*
	 * The initial error report that created this FME is kept in
	 * two forms.  e0 points to the instance tree node and is used
	 * by fme_eval() as the starting point for the inference
	 * algorithm.  e0r is the event handle FMD passed to us when
	 * the ereport first arrived and is used when setting timers,
	 * which are always relative to the time of this initial
	 * report.
	 */
	struct event *e0;		/* instance tree node for e0 */
	fmd_event_t *e0r;		/* fmd's handle for e0's ereport */

	id_t    timer;			/* for setting an fmd time-out */

	struct event *ecurrent;		/* ereport under consideration */
	struct event *suspects;		/* current suspect list */
	struct event *psuspects;	/* previous suspect list */
	int nsuspects;			/* count of suspects */
	int nonfault;			/* zero if all suspects T_FAULT */
	int posted_suspects;		/* true if we've posted a diagnosis */
	int uniqobs;			/* number of unique events observed */
	int peek;			/* just peeking, don't track suspects */
	int overflow;			/* true if overflow FME */
	enum fme_state {
		FME_NOTHING = 5000,	/* not evaluated yet */
		FME_WAIT,		/* need to wait for more info */
		FME_CREDIBLE,		/* suspect list is credible */
		FME_DISPROVED,		/* no valid suspects found */
		FME_DEFERRED		/* don't know yet (k-count not met) */
	} state;

	unsigned long long pull;	/* time passed since created */
	unsigned long long wull;	/* wait until this time for re-eval */
	struct event *observations;	/* observation list */
	struct lut *globals;		/* values of global variables */
	/* fmd interfacing */
	fmd_hdl_t *hdl;			/* handle for talking with fmd */
	fmd_case_t *fmcase;		/* what fmd 'case' we associate with */
	/* stats (names/descriptions established in fme_ready()) */
	struct stats *Rcount;		/* ereports received */
	struct stats *Hcallcount;	/* calls to hypothesise() */
	struct stats *Rcallcount;	/* calls to requirements_test() */
	struct stats *Ccallcount;	/* calls to causes_test() */
	struct stats *Ecallcount;	/* calls to effects_test() */
	struct stats *Tcallcount;	/* calls to triggered() */
	struct stats *Marrowcount;	/* arrows marked by mark_arrows() */
	struct stats *diags;		/* suspect lists diagnosed */
} *FMElist, *EFMElist, *ClosedFMEs;
136 
/* cases we were unable to restart; solved/closed as undiagnosable */
static struct case_list {
	fmd_case_t *fmcase;		/* the un-restartable fmd case */
	struct case_list *next;
} *Undiagablecaselist;
141 
142 static void fme_eval(struct fme *fmep, fmd_event_t *ffep);
143 static enum fme_state hypothesise(struct fme *fmep, struct event *ep,
144 	unsigned long long at_latest_by, unsigned long long *pdelay);
145 static struct node *eventprop_lookup(struct event *ep, const char *propname);
146 static struct node *pathstring2epnamenp(char *path);
147 static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep,
148 	fmd_case_t *fmcase);
149 static void restore_suspects(struct fme *fmep);
150 static void save_suspects(struct fme *fmep);
151 static void destroy_fme(struct fme *f);
152 static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
153     const char *eventstring, const struct ipath *ipp, nvlist_t *nvl);
154 static void istat_counter_reset_cb(struct istat_entry *entp,
155     struct stats *statp, const struct ipath *ipp);
156 static void istat_counter_topo_chg_cb(struct istat_entry *entp,
157     struct stats *statp, void *unused);
158 static void serd_reset_cb(struct serd_entry *entp, void *unused,
159     const struct ipath *ipp);
160 static void serd_topo_chg_cb(struct serd_entry *entp, void *unused,
161     void *unused2);
162 static void destroy_fme_bufs(struct fme *fp);
163 
164 static struct fme *
165 alloc_fme(void)
166 {
167 	struct fme *fmep;
168 
169 	fmep = MALLOC(sizeof (*fmep));
170 	bzero(fmep, sizeof (*fmep));
171 	return (fmep);
172 }
173 
174 /*
175  * fme_ready -- called when all initialization of the FME (except for
176  *	stats) has completed successfully.  Adds the fme to global lists
177  *	and establishes its stats.
178  */
179 static struct fme *
180 fme_ready(struct fme *fmep)
181 {
182 	char nbuf[100];
183 
184 	Nfmep = NULL;	/* don't need to free this on module abort now */
185 
186 	if (EFMElist) {
187 		EFMElist->next = fmep;
188 		EFMElist = fmep;
189 	} else
190 		FMElist = EFMElist = fmep;
191 
192 	(void) sprintf(nbuf, "fme%d.Rcount", fmep->id);
193 	fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
194 	(void) sprintf(nbuf, "fme%d.Hcall", fmep->id);
195 	fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1);
196 	(void) sprintf(nbuf, "fme%d.Rcall", fmep->id);
197 	fmep->Rcallcount = stats_new_counter(nbuf,
198 	    "calls to requirements_test()", 1);
199 	(void) sprintf(nbuf, "fme%d.Ccall", fmep->id);
200 	fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1);
201 	(void) sprintf(nbuf, "fme%d.Ecall", fmep->id);
202 	fmep->Ecallcount =
203 	    stats_new_counter(nbuf, "calls to effects_test()", 1);
204 	(void) sprintf(nbuf, "fme%d.Tcall", fmep->id);
205 	fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
206 	(void) sprintf(nbuf, "fme%d.Marrow", fmep->id);
207 	fmep->Marrowcount = stats_new_counter(nbuf,
208 	    "arrows marked by mark_arrows()", 1);
209 	(void) sprintf(nbuf, "fme%d.diags", fmep->id);
210 	fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);
211 
212 	out(O_ALTFP|O_VERB2, "newfme: config snapshot contains...");
213 	config_print(O_ALTFP|O_VERB2, fmep->config);
214 
215 	return (fmep);
216 }
217 
218 extern void ipath_dummy_lut(struct arrow *);
219 extern struct lut *itree_create_dummy(const char *, const struct ipath *);
220 
221 /* ARGSUSED */
222 static void
223 set_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
224 {
225 	struct bubble *bp;
226 	struct arrowlist *ap;
227 
228 	for (bp = itree_next_bubble(ep, NULL); bp;
229 	    bp = itree_next_bubble(ep, bp)) {
230 		if (bp->t != B_FROM)
231 			continue;
232 		for (ap = itree_next_arrow(bp, NULL); ap;
233 		    ap = itree_next_arrow(bp, ap)) {
234 			ap->arrowp->pnode->u.arrow.needed = 1;
235 			ipath_dummy_lut(ap->arrowp);
236 		}
237 	}
238 }
239 
240 /* ARGSUSED */
241 static void
242 unset_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
243 {
244 	struct bubble *bp;
245 	struct arrowlist *ap;
246 
247 	for (bp = itree_next_bubble(ep, NULL); bp;
248 	    bp = itree_next_bubble(ep, bp)) {
249 		if (bp->t != B_FROM)
250 			continue;
251 		for (ap = itree_next_arrow(bp, NULL); ap;
252 		    ap = itree_next_arrow(bp, ap))
253 			ap->arrowp->pnode->u.arrow.needed = 0;
254 	}
255 }
256 
257 static void globals_destructor(void *left, void *right, void *arg);
258 static void clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep);
259 
260 static void
261 prune_propagations(const char *e0class, const struct ipath *e0ipp)
262 {
263 	char nbuf[100];
264 	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
265 	extern struct lut *Usednames;
266 
267 	Nfmep = alloc_fme();
268 	Nfmep->id = Nextid;
269 	Nfmep->state = FME_NOTHING;
270 	Nfmep->eventtree = itree_create_dummy(e0class, e0ipp);
271 	if ((Nfmep->e0 =
272 	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
273 		out(O_ALTFP, "prune_propagations: e0 not in instance tree");
274 		itree_free(Nfmep->eventtree);
275 		FREE(Nfmep);
276 		Nfmep = NULL;
277 		return;
278 	}
279 	Nfmep->ecurrent = Nfmep->observations = Nfmep->e0;
280 	Nfmep->e0->count++;
281 
282 	(void) sprintf(nbuf, "fme%d.Rcount", Nfmep->id);
283 	Nfmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
284 	(void) sprintf(nbuf, "fme%d.Hcall", Nfmep->id);
285 	Nfmep->Hcallcount =
286 	    stats_new_counter(nbuf, "calls to hypothesise()", 1);
287 	(void) sprintf(nbuf, "fme%d.Rcall", Nfmep->id);
288 	Nfmep->Rcallcount = stats_new_counter(nbuf,
289 	    "calls to requirements_test()", 1);
290 	(void) sprintf(nbuf, "fme%d.Ccall", Nfmep->id);
291 	Nfmep->Ccallcount =
292 	    stats_new_counter(nbuf, "calls to causes_test()", 1);
293 	(void) sprintf(nbuf, "fme%d.Ecall", Nfmep->id);
294 	Nfmep->Ecallcount =
295 	    stats_new_counter(nbuf, "calls to effects_test()", 1);
296 	(void) sprintf(nbuf, "fme%d.Tcall", Nfmep->id);
297 	Nfmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
298 	(void) sprintf(nbuf, "fme%d.Marrow", Nfmep->id);
299 	Nfmep->Marrowcount = stats_new_counter(nbuf,
300 	    "arrows marked by mark_arrows()", 1);
301 	(void) sprintf(nbuf, "fme%d.diags", Nfmep->id);
302 	Nfmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);
303 
304 	Nfmep->peek = 1;
305 	lut_walk(Nfmep->eventtree, (lut_cb)unset_needed_arrows, (void *)Nfmep);
306 	lut_free(Usednames, NULL, NULL);
307 	Usednames = NULL;
308 	lut_walk(Nfmep->eventtree, (lut_cb)clear_arrows, (void *)Nfmep);
309 	(void) hypothesise(Nfmep, Nfmep->e0, Nfmep->ull, &my_delay);
310 	itree_prune(Nfmep->eventtree);
311 	lut_walk(Nfmep->eventtree, (lut_cb)set_needed_arrows, (void *)Nfmep);
312 
313 	stats_delete(Nfmep->Rcount);
314 	stats_delete(Nfmep->Hcallcount);
315 	stats_delete(Nfmep->Rcallcount);
316 	stats_delete(Nfmep->Ccallcount);
317 	stats_delete(Nfmep->Ecallcount);
318 	stats_delete(Nfmep->Tcallcount);
319 	stats_delete(Nfmep->Marrowcount);
320 	stats_delete(Nfmep->diags);
321 	itree_free(Nfmep->eventtree);
322 	lut_free(Nfmep->globals, globals_destructor, NULL);
323 	FREE(Nfmep);
324 }
325 
/*
 * newfme -- create a new FME for the ereport named by e0class/e0ipp.
 *	Takes a configuration snapshot, builds the instance tree, and
 *	looks up event zero in it.  On success returns the FME via
 *	fme_ready(); on failure sets Undiag_reason, cleans up, and
 *	returns NULL.
 */
static struct fme *
newfme(const char *e0class, const struct ipath *e0ipp, fmd_hdl_t *hdl,
	fmd_case_t *fmcase)
{
	struct cfgdata *cfgdata;
	int init_size;
	extern int alloc_total();

	/* snapshot the configuration; alloc_total() deltas are logged */
	init_size = alloc_total();
	out(O_ALTFP|O_STAMP, "start config_snapshot using %d bytes", init_size);
	if ((cfgdata = config_snapshot()) == NULL) {
		out(O_ALTFP, "newfme: NULL configuration");
		Undiag_reason = UD_NOCONF;
		return (NULL);
	}
	platform_save_config(hdl, fmcase);
	out(O_ALTFP|O_STAMP, "config_snapshot added %d bytes",
	    alloc_total() - init_size);

	/* Nfmep is global so fme_fini() can free it on module abort */
	Nfmep = alloc_fme();

	Nfmep->id = Nextid++;
	Nfmep->config = cfgdata->cooked;
	config_free(cfgdata);
	Nfmep->posted_suspects = 0;
	Nfmep->uniqobs = 0;
	Nfmep->state = FME_NOTHING;
	Nfmep->pull = 0ULL;
	Nfmep->overflow = 0;

	Nfmep->fmcase = fmcase;
	Nfmep->hdl = hdl;

	if ((Nfmep->eventtree = itree_create(Nfmep->config)) == NULL) {
		out(O_ALTFP, "newfme: NULL instance tree");
		Undiag_reason = UD_INSTFAIL;
		structconfig_free(Nfmep->config);
		destroy_fme_bufs(Nfmep);
		FREE(Nfmep);
		Nfmep = NULL;
		return (NULL);
	}

	itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree);

	/* event zero must exist in the instance tree we just built */
	if ((Nfmep->e0 =
	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
		out(O_ALTFP, "newfme: e0 not in instance tree");
		Undiag_reason = UD_BADEVENTI;
		itree_free(Nfmep->eventtree);
		structconfig_free(Nfmep->config);
		destroy_fme_bufs(Nfmep);
		FREE(Nfmep);
		Nfmep = NULL;
		return (NULL);
	}

	return (fme_ready(Nfmep));
}
385 
386 void
387 fme_fini(void)
388 {
389 	struct fme *sfp, *fp;
390 	struct case_list *ucasep, *nextcasep;
391 
392 	ucasep = Undiagablecaselist;
393 	while (ucasep != NULL) {
394 		nextcasep = ucasep->next;
395 		FREE(ucasep);
396 		ucasep = nextcasep;
397 	}
398 	Undiagablecaselist = NULL;
399 
400 	/* clean up closed fmes */
401 	fp = ClosedFMEs;
402 	while (fp != NULL) {
403 		sfp = fp->next;
404 		destroy_fme(fp);
405 		fp = sfp;
406 	}
407 	ClosedFMEs = NULL;
408 
409 	fp = FMElist;
410 	while (fp != NULL) {
411 		sfp = fp->next;
412 		destroy_fme(fp);
413 		fp = sfp;
414 	}
415 	FMElist = EFMElist = NULL;
416 
417 	/* if we were in the middle of creating an fme, free it now */
418 	if (Nfmep) {
419 		destroy_fme(Nfmep);
420 		Nfmep = NULL;
421 	}
422 }
423 
424 /*
425  * Allocated space for a buffer name.  20 bytes allows for
426  * a ridiculous 9,999,999 unique observations.
427  */
428 #define	OBBUFNMSZ 20
429 
430 /*
431  *  serialize_observation
432  *
433  *  Create a recoverable version of the current observation
434  *  (f->ecurrent).  We keep a serialized version of each unique
435  *  observation in order that we may resume correctly the fme in the
436  *  correct state if eft or fmd crashes and we're restarted.
437  */
438 static void
439 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp)
440 {
441 	size_t pkdlen;
442 	char tmpbuf[OBBUFNMSZ];
443 	char *pkd = NULL;
444 	char *estr;
445 
446 	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs);
447 	estr = ipath2str(cls, ipp);
448 	fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1);
449 	fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr,
450 	    strlen(estr) + 1);
451 	FREE(estr);
452 
453 	if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) {
454 		(void) snprintf(tmpbuf,
455 		    OBBUFNMSZ, "observed%d.nvp", fp->uniqobs);
456 		if (nvlist_xpack(fp->ecurrent->nvp,
457 		    &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0)
458 			out(O_DIE|O_SYS, "pack of observed nvl failed");
459 		fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen);
460 		fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen);
461 		FREE(pkd);
462 	}
463 
464 	fp->uniqobs++;
465 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
466 	    sizeof (fp->uniqobs));
467 }
468 
469 /*
470  *  init_fme_bufs -- We keep several bits of state about an fme for
471  *	use if eft or fmd crashes and we're restarted.
472  */
473 static void
474 init_fme_bufs(struct fme *fp)
475 {
476 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull));
477 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull,
478 	    sizeof (fp->pull));
479 
480 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id));
481 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id,
482 	    sizeof (fp->id));
483 
484 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs));
485 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
486 	    sizeof (fp->uniqobs));
487 
488 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD,
489 	    sizeof (fp->posted_suspects));
490 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD,
491 	    (void *)&fp->posted_suspects, sizeof (fp->posted_suspects));
492 }
493 
494 static void
495 destroy_fme_bufs(struct fme *fp)
496 {
497 	char tmpbuf[OBBUFNMSZ];
498 	int o;
499 
500 	platform_restore_config(fp->hdl, fp->fmcase);
501 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN);
502 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG);
503 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL);
504 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID);
505 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD);
506 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS);
507 
508 	for (o = 0; o < fp->uniqobs; o++) {
509 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o);
510 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
511 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o);
512 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
513 	}
514 }
515 
516 /*
517  * reconstitute_observations -- convert a case's serialized observations
518  *	back into struct events.  Returns zero if all observations are
519  *	successfully reconstituted.
520  */
521 static int
522 reconstitute_observations(struct fme *fmep)
523 {
524 	struct event *ep;
525 	struct node *epnamenp = NULL;
526 	size_t pkdlen;
527 	char *pkd = NULL;
528 	char *tmpbuf = alloca(OBBUFNMSZ);
529 	char *sepptr;
530 	char *estr;
531 	int ocnt;
532 	int elen;
533 
534 	for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) {
535 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt);
536 		elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
537 		if (elen == 0) {
538 			out(O_ALTFP,
539 			    "reconstitute_observation: no %s buffer found.",
540 			    tmpbuf);
541 			Undiag_reason = UD_MISSINGOBS;
542 			break;
543 		}
544 
545 		estr = MALLOC(elen);
546 		fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
547 		sepptr = strchr(estr, '@');
548 		if (sepptr == NULL) {
549 			out(O_ALTFP,
550 			    "reconstitute_observation: %s: "
551 			    "missing @ separator in %s.",
552 			    tmpbuf, estr);
553 			Undiag_reason = UD_MISSINGPATH;
554 			FREE(estr);
555 			break;
556 		}
557 
558 		*sepptr = '\0';
559 		if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
560 			out(O_ALTFP,
561 			    "reconstitute_observation: %s: "
562 			    "trouble converting path string \"%s\" "
563 			    "to internal representation.",
564 			    tmpbuf, sepptr + 1);
565 			Undiag_reason = UD_MISSINGPATH;
566 			FREE(estr);
567 			break;
568 		}
569 
570 		/* construct the event */
571 		ep = itree_lookup(fmep->eventtree,
572 		    stable(estr), ipath(epnamenp));
573 		if (ep == NULL) {
574 			out(O_ALTFP,
575 			    "reconstitute_observation: %s: "
576 			    "lookup of  \"%s\" in itree failed.",
577 			    tmpbuf, ipath2str(estr, ipath(epnamenp)));
578 			Undiag_reason = UD_BADOBS;
579 			tree_free(epnamenp);
580 			FREE(estr);
581 			break;
582 		}
583 		tree_free(epnamenp);
584 
585 		/*
586 		 * We may or may not have a saved nvlist for the observation
587 		 */
588 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt);
589 		pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
590 		if (pkdlen != 0) {
591 			pkd = MALLOC(pkdlen);
592 			fmd_buf_read(fmep->hdl,
593 			    fmep->fmcase, tmpbuf, pkd, pkdlen);
594 			ASSERT(ep->nvp == NULL);
595 			if (nvlist_xunpack(pkd,
596 			    pkdlen, &ep->nvp, &Eft_nv_hdl) != 0)
597 				out(O_DIE|O_SYS, "pack of observed nvl failed");
598 			FREE(pkd);
599 		}
600 
601 		if (ocnt == 0)
602 			fmep->e0 = ep;
603 
604 		FREE(estr);
605 		fmep->ecurrent = ep;
606 		ep->count++;
607 
608 		/* link it into list of observations seen */
609 		ep->observations = fmep->observations;
610 		fmep->observations = ep;
611 	}
612 
613 	if (ocnt == fmep->uniqobs) {
614 		(void) fme_ready(fmep);
615 		return (0);
616 	}
617 
618 	return (1);
619 }
620 
621 /*
622  * restart_fme -- called during eft initialization.  Reconstitutes
623  *	an in-progress fme.
624  */
void
fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress)
{
	nvlist_t *defect;
	struct case_list *bad;
	struct fme *fmep;
	struct cfgdata *cfgdata;
	size_t rawsz;
	struct event *ep;
	char *tmpbuf = alloca(OBBUFNMSZ);
	char *sepptr;
	char *estr;
	int elen;
	struct node *epnamenp = NULL;
	int init_size;
	extern int alloc_total();

	/*
	 * ignore solved or closed cases
	 */
	if (fmd_case_solved(hdl, inprogress) ||
	    fmd_case_closed(hdl, inprogress))
		return;

	fmep = alloc_fme();
	fmep->fmcase = inprogress;
	fmep->hdl = hdl;

	/* restore each persisted buffer; any missing one aborts the replay */
	if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) {
		out(O_ALTFP, "restart_fme: no saved posted status");
		Undiag_reason = UD_MISSINGINFO;
		goto badcase;
	} else {
		fmd_buf_read(hdl, inprogress, WOBUF_POSTD,
		    (void *)&fmep->posted_suspects,
		    sizeof (fmep->posted_suspects));
	}

	if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) {
		out(O_ALTFP, "restart_fme: no saved id");
		Undiag_reason = UD_MISSINGINFO;
		goto badcase;
	} else {
		fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id,
		    sizeof (fmep->id));
	}
	/* keep Nextid ahead of every replayed FME id */
	if (Nextid <= fmep->id)
		Nextid = fmep->id + 1;

	out(O_ALTFP, "Replay FME %d", fmep->id);

	if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) {
		out(O_ALTFP, "restart_fme: No config data");
		Undiag_reason = UD_MISSINGINFO;
		goto badcase;
	}
	fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz,
	    sizeof (size_t));

	if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) {
		out(O_ALTFP, "restart_fme: No event zero");
		Undiag_reason = UD_MISSINGZERO;
		goto badcase;
	}

	if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) {
		out(O_ALTFP, "restart_fme: no saved wait time");
		Undiag_reason = UD_MISSINGINFO;
		goto badcase;
	} else {
		fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull,
		    sizeof (fmep->pull));
	}

	if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) {
		out(O_ALTFP, "restart_fme: no count of observations");
		Undiag_reason = UD_MISSINGINFO;
		goto badcase;
	} else {
		fmd_buf_read(hdl, inprogress, WOBUF_NOBS,
		    (void *)&fmep->uniqobs, sizeof (fmep->uniqobs));
	}

	/*
	 * Read observation zero ("class@path") so the propagation tree
	 * can be pruned before the instance tree is rebuilt.
	 */
	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed0");
	elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
	if (elen == 0) {
		out(O_ALTFP, "reconstitute_observation: no %s buffer found.",
		    tmpbuf);
		Undiag_reason = UD_MISSINGOBS;
		goto badcase;
	}
	estr = MALLOC(elen);
	fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
	sepptr = strchr(estr, '@');
	if (sepptr == NULL) {
		out(O_ALTFP, "reconstitute_observation: %s: "
		    "missing @ separator in %s.",
		    tmpbuf, estr);
		Undiag_reason = UD_MISSINGPATH;
		FREE(estr);
		goto badcase;
	}
	*sepptr = '\0';
	if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
		out(O_ALTFP, "reconstitute_observation: %s: "
		    "trouble converting path string \"%s\" "
		    "to internal representation.", tmpbuf, sepptr + 1);
		Undiag_reason = UD_MISSINGPATH;
		FREE(estr);
		goto badcase;
	}
	prune_propagations(stable(estr), ipath(epnamenp));
	tree_free(epnamenp);
	FREE(estr);

	/* restore the raw config snapshot saved with the case */
	init_size = alloc_total();
	out(O_ALTFP|O_STAMP, "start config_restore using %d bytes", init_size);
	cfgdata = MALLOC(sizeof (struct cfgdata));
	cfgdata->cooked = NULL;
	cfgdata->devcache = NULL;
	cfgdata->cpucache = NULL;
	cfgdata->raw_refcnt = 1;

	if (rawsz > 0) {
		if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) {
			/*
			 * NOTE(review): cfgdata allocated above appears to
			 * leak on this goto -- confirm and free it here.
			 */
			out(O_ALTFP, "restart_fme: Config data size mismatch");
			Undiag_reason = UD_CFGMISMATCH;
			goto badcase;
		}
		cfgdata->begin = MALLOC(rawsz);
		cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz;
		fmd_buf_read(hdl,
		    inprogress, WOBUF_CFG, cfgdata->begin, rawsz);
	} else {
		cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL;
	}

	config_cook(cfgdata);
	fmep->config = cfgdata->cooked;
	config_free(cfgdata);
	out(O_ALTFP|O_STAMP, "config_restore added %d bytes",
	    alloc_total() - init_size);

	if ((fmep->eventtree = itree_create(fmep->config)) == NULL) {
		/* case not properly saved or irretrievable */
		out(O_ALTFP, "restart_fme: NULL instance tree");
		Undiag_reason = UD_INSTFAIL;
		goto badcase;
	}

	itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree);

	if (reconstitute_observations(fmep) != 0)
		goto badcase;

	out(O_ALTFP|O_NONL, "FME %d replay observations: ", fmep->id);
	for (ep = fmep->observations; ep; ep = ep->observations) {
		out(O_ALTFP|O_NONL, " ");
		itree_pevent_brief(O_ALTFP|O_NONL, ep);
	}
	out(O_ALTFP, NULL);

	Open_fme_count++;

	/* give the diagnosis algorithm a shot at the new FME state */
	fme_eval(fmep, fmep->e0r);
	return;

badcase:
	/* free whatever was rebuilt so far, then discard the fme */
	if (fmep->eventtree != NULL)
		itree_free(fmep->eventtree);
	if (fmep->config)
		structconfig_free(fmep->config);
	destroy_fme_bufs(fmep);
	FREE(fmep);

	/*
	 * Since we're unable to restart the case, add it to the undiagable
	 * list and solve and close it as appropriate.
	 */
	bad = MALLOC(sizeof (struct case_list));
	bad->next = NULL;

	if (Undiagablecaselist != NULL)
		bad->next = Undiagablecaselist;
	Undiagablecaselist = bad;
	bad->fmcase = inprogress;

	out(O_ALTFP|O_NONL, "[case %s (unable to restart), ",
	    fmd_case_uuid(hdl, bad->fmcase));

	if (fmd_case_solved(hdl, bad->fmcase)) {
		out(O_ALTFP|O_NONL, "already solved, ");
	} else {
		/* solve as an undiagnosable defect, noting why if known */
		out(O_ALTFP|O_NONL, "solving, ");
		defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
		    NULL, NULL, NULL);
		if (Undiag_reason != NULL)
			(void) nvlist_add_string(defect,
			    UNDIAG_REASON, Undiag_reason);
		fmd_case_add_suspect(hdl, bad->fmcase, defect);
		fmd_case_solve(hdl, bad->fmcase);
	}

	if (fmd_case_closed(hdl, bad->fmcase)) {
		out(O_ALTFP, "already closed ]");
	} else {
		out(O_ALTFP, "closing ]");
		fmd_case_close(hdl, bad->fmcase);
	}
}
836 
837 /*ARGSUSED*/
838 static void
839 globals_destructor(void *left, void *right, void *arg)
840 {
841 	struct evalue *evp = (struct evalue *)right;
842 	if (evp->t == NODEPTR)
843 		tree_free((struct node *)(uintptr_t)evp->v);
844 	evp->v = NULL;
845 	FREE(evp);
846 }
847 
848 void
849 destroy_fme(struct fme *f)
850 {
851 	stats_delete(f->Rcount);
852 	stats_delete(f->Hcallcount);
853 	stats_delete(f->Rcallcount);
854 	stats_delete(f->Ccallcount);
855 	stats_delete(f->Ecallcount);
856 	stats_delete(f->Tcallcount);
857 	stats_delete(f->Marrowcount);
858 	stats_delete(f->diags);
859 
860 	if (f->eventtree != NULL)
861 		itree_free(f->eventtree);
862 	if (f->config)
863 		structconfig_free(f->config);
864 	lut_free(f->globals, globals_destructor, NULL);
865 	FREE(f);
866 }
867 
868 static const char *
869 fme_state2str(enum fme_state s)
870 {
871 	switch (s) {
872 	case FME_NOTHING:	return ("NOTHING");
873 	case FME_WAIT:		return ("WAIT");
874 	case FME_CREDIBLE:	return ("CREDIBLE");
875 	case FME_DISPROVED:	return ("DISPROVED");
876 	case FME_DEFERRED:	return ("DEFERRED");
877 	default:		return ("UNKNOWN");
878 	}
879 }
880 
881 static int
882 is_problem(enum nametype t)
883 {
884 	return (t == N_FAULT || t == N_DEFECT || t == N_UPSET);
885 }
886 
887 static int
888 is_fault(enum nametype t)
889 {
890 	return (t == N_FAULT);
891 }
892 
893 static int
894 is_defect(enum nametype t)
895 {
896 	return (t == N_DEFECT);
897 }
898 
899 static int
900 is_upset(enum nametype t)
901 {
902 	return (t == N_UPSET);
903 }
904 
905 static void
906 fme_print(int flags, struct fme *fmep)
907 {
908 	struct event *ep;
909 
910 	out(flags, "Fault Management Exercise %d", fmep->id);
911 	out(flags, "\t       State: %s", fme_state2str(fmep->state));
912 	out(flags|O_NONL, "\t  Start time: ");
913 	ptree_timeval(flags|O_NONL, &fmep->ull);
914 	out(flags, NULL);
915 	if (fmep->wull) {
916 		out(flags|O_NONL, "\t   Wait time: ");
917 		ptree_timeval(flags|O_NONL, &fmep->wull);
918 		out(flags, NULL);
919 	}
920 	out(flags|O_NONL, "\t          E0: ");
921 	if (fmep->e0)
922 		itree_pevent_brief(flags|O_NONL, fmep->e0);
923 	else
924 		out(flags|O_NONL, "NULL");
925 	out(flags, NULL);
926 	out(flags|O_NONL, "\tObservations:");
927 	for (ep = fmep->observations; ep; ep = ep->observations) {
928 		out(flags|O_NONL, " ");
929 		itree_pevent_brief(flags|O_NONL, ep);
930 	}
931 	out(flags, NULL);
932 	out(flags|O_NONL, "\tSuspect list:");
933 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
934 		out(flags|O_NONL, " ");
935 		itree_pevent_brief(flags|O_NONL, ep);
936 	}
937 	out(flags, NULL);
938 	if (fmep->eventtree != NULL) {
939 		out(flags|O_VERB2, "\t        Tree:");
940 		itree_ptree(flags|O_VERB2, fmep->eventtree);
941 	}
942 }
943 
944 static struct node *
945 pathstring2epnamenp(char *path)
946 {
947 	char *sep = "/";
948 	struct node *ret;
949 	char *ptr;
950 
951 	if ((ptr = strtok(path, sep)) == NULL)
952 		out(O_DIE, "pathstring2epnamenp: invalid empty class");
953 
954 	ret = tree_iname(stable(ptr), NULL, 0);
955 
956 	while ((ptr = strtok(NULL, sep)) != NULL)
957 		ret = tree_name_append(ret,
958 		    tree_iname(stable(ptr), NULL, 0));
959 
960 	return (ret);
961 }
962 
963 /*
964  * for a given upset sp, increment the corresponding SERD engine.  if the
965  * SERD engine trips, return the ename and ipp of the resulting ereport.
966  * returns true if engine tripped and *enamep and *ippp were filled in.
967  */
968 static int
969 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep,
970     fmd_case_t *fmcase, struct event *sp, const char **enamep,
971     const struct ipath **ippp)
972 {
973 	struct node *serdinst;
974 	char *serdname;
975 	struct node *nid;
976 	struct serd_entry *newentp;
977 
978 	ASSERT(sp->t == N_UPSET);
979 	ASSERT(ffep != NULL);
980 
981 	/*
982 	 * obtain instanced SERD engine from the upset sp.  from this
983 	 * derive serdname, the string used to identify the SERD engine.
984 	 */
985 	serdinst = eventprop_lookup(sp, L_engine);
986 
987 	if (serdinst == NULL)
988 		return (NULL);
989 
990 	serdname = ipath2str(serdinst->u.stmt.np->u.event.ename->u.name.s,
991 	    ipath(serdinst->u.stmt.np->u.event.epname));
992 
993 	/* handle serd engine "id" property, if there is one */
994 	if ((nid =
995 	    lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) {
996 		struct evalue *gval;
997 		char suffixbuf[200];
998 		char *suffix;
999 		char *nserdname;
1000 		size_t nname;
1001 
1002 		out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname);
1003 		ptree_name_iter(O_ALTFP|O_NONL, nid);
1004 
1005 		ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t));
1006 
1007 		if ((gval = lut_lookup(fmep->globals,
1008 		    (void *)nid->u.globid.s, NULL)) == NULL) {
1009 			out(O_ALTFP, " undefined");
1010 		} else if (gval->t == UINT64) {
1011 			out(O_ALTFP, " %llu", gval->v);
1012 			(void) sprintf(suffixbuf, "%llu", gval->v);
1013 			suffix = suffixbuf;
1014 		} else {
1015 			out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v);
1016 			suffix = (char *)(uintptr_t)gval->v;
1017 		}
1018 
1019 		nname = strlen(serdname) + strlen(suffix) + 2;
1020 		nserdname = MALLOC(nname);
1021 		(void) snprintf(nserdname, nname, "%s:%s", serdname, suffix);
1022 		FREE(serdname);
1023 		serdname = nserdname;
1024 	}
1025 
1026 	if (!fmd_serd_exists(hdl, serdname)) {
1027 		struct node *nN, *nT;
1028 		const char *s;
1029 		struct node *nodep;
1030 		struct config *cp;
1031 		char *path;
1032 		uint_t nval;
1033 		hrtime_t tval;
1034 		const char *name;
1035 		char *serd_name;
1036 		int i;
1037 		char *ptr;
1038 		int got_n_override = 0, got_t_override = 0;
1039 
1040 		/* no SERD engine yet, so create it */
1041 		nodep = serdinst->u.stmt.np->u.event.epname;
1042 		name = serdinst->u.stmt.np->u.event.ename->u.name.s;
1043 		path = ipath2str(NULL, ipath(nodep));
1044 		cp = config_lookup(fmep->config, path, 0);
1045 		FREE((void *)path);
1046 
1047 		/*
1048 		 * We allow serd paramaters to be overridden, either from
1049 		 * eft.conf file values (if Serd_Override is set) or from
1050 		 * driver properties (for "serd.io.device" engines).
1051 		 */
1052 		if (Serd_Override != NULL) {
1053 			char *save_ptr, *ptr1, *ptr2, *ptr3;
1054 			ptr3 = save_ptr = STRDUP(Serd_Override);
1055 			while (*ptr3 != '\0') {
1056 				ptr1 = strchr(ptr3, ',');
1057 				*ptr1 = '\0';
1058 				if (strcmp(ptr3, name) == 0) {
1059 					ptr2 =  strchr(ptr1 + 1, ',');
1060 					*ptr2 = '\0';
1061 					nval = atoi(ptr1 + 1);
1062 					out(O_ALTFP, "serd override %s_n %d",
1063 					    name, nval);
1064 					ptr3 =  strchr(ptr2 + 1, ' ');
1065 					if (ptr3)
1066 						*ptr3 = '\0';
1067 					ptr = STRDUP(ptr2 + 1);
1068 					out(O_ALTFP, "serd override %s_t %s",
1069 					    name, ptr);
1070 					got_n_override = 1;
1071 					got_t_override = 1;
1072 					break;
1073 				} else {
1074 					ptr2 =  strchr(ptr1 + 1, ',');
1075 					ptr3 =  strchr(ptr2 + 1, ' ');
1076 					if (ptr3 == NULL)
1077 						break;
1078 				}
1079 				ptr3++;
1080 			}
1081 			FREE(save_ptr);
1082 		}
1083 
1084 		if (cp && got_n_override == 0) {
1085 			/*
1086 			 * convert serd engine name into property name
1087 			 */
1088 			serd_name = MALLOC(strlen(name) + 3);
1089 			for (i = 0; i < strlen(name); i++) {
1090 				if (name[i] == '.')
1091 					serd_name[i] = '_';
1092 				else
1093 					serd_name[i] = name[i];
1094 			}
1095 			serd_name[i++] = '_';
1096 			serd_name[i++] = 'n';
1097 			serd_name[i] = '\0';
1098 			if (s = config_getprop(cp, serd_name)) {
1099 				nval = atoi(s);
1100 				out(O_ALTFP, "serd override %s_n %s", name, s);
1101 				got_n_override = 1;
1102 			}
1103 			serd_name[i - 1] = 't';
1104 			if (s = config_getprop(cp, serd_name)) {
1105 				ptr = STRDUP(s);
1106 				out(O_ALTFP, "serd override %s_t %s", name, s);
1107 				got_t_override = 1;
1108 			}
1109 			FREE(serd_name);
1110 		}
1111 
1112 		if (!got_n_override) {
1113 			nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N,
1114 			    NULL);
1115 			ASSERT(nN->t == T_NUM);
1116 			nval = (uint_t)nN->u.ull;
1117 		}
1118 		if (!got_t_override) {
1119 			nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T,
1120 			    NULL);
1121 			ASSERT(nT->t == T_TIMEVAL);
1122 			tval = (hrtime_t)nT->u.ull;
1123 		} else {
1124 			const unsigned long long *ullp;
1125 			const char *suffix;
1126 			int len;
1127 
1128 			len = strspn(ptr, "0123456789");
1129 			suffix = stable(&ptr[len]);
1130 			ullp = (unsigned long long *)lut_lookup(Timesuffixlut,
1131 			    (void *)suffix, NULL);
1132 			ptr[len] = '\0';
1133 			tval = (unsigned long long)strtoul(ptr, NULL, 0) *
1134 			    (ullp ? *ullp : 1ll);
1135 			FREE(ptr);
1136 		}
1137 		fmd_serd_create(hdl, serdname, nval, tval);
1138 	}
1139 
1140 	newentp = MALLOC(sizeof (*newentp));
1141 	newentp->ename = stable(serdinst->u.stmt.np->u.event.ename->u.name.s);
1142 	newentp->ipath = ipath(serdinst->u.stmt.np->u.event.epname);
1143 	newentp->hdl = hdl;
1144 	if (lut_lookup(SerdEngines, newentp, (lut_cmp)serd_cmp) == NULL) {
1145 		SerdEngines = lut_add(SerdEngines, (void *)newentp,
1146 		    (void *)newentp, (lut_cmp)serd_cmp);
1147 		Serd_need_save = 1;
1148 		serd_save();
1149 	} else {
1150 		FREE(newentp);
1151 	}
1152 
1153 
1154 	/*
1155 	 * increment SERD engine.  if engine fires, reset serd
1156 	 * engine and return trip_strcode
1157 	 */
1158 	if (fmd_serd_record(hdl, serdname, ffep)) {
1159 		struct node *tripinst = lut_lookup(serdinst->u.stmt.lutp,
1160 		    (void *)L_trip, NULL);
1161 
1162 		ASSERT(tripinst != NULL);
1163 
1164 		*enamep = tripinst->u.event.ename->u.name.s;
1165 		*ippp = ipath(tripinst->u.event.epname);
1166 
1167 		fmd_case_add_serd(hdl, fmcase, serdname);
1168 		fmd_serd_reset(hdl, serdname);
1169 		out(O_ALTFP|O_NONL, "[engine fired: %s, sending: ", serdname);
1170 		ipath_print(O_ALTFP|O_NONL, *enamep, *ippp);
1171 		out(O_ALTFP, "]");
1172 
1173 		FREE(serdname);
1174 		return (1);
1175 	}
1176 
1177 	FREE(serdname);
1178 	return (0);
1179 }
1180 
1181 /*
1182  * search a suspect list for upsets.  feed each upset to serd_eval() and
1183  * build up tripped[], an array of ereports produced by the firing of
1184  * any SERD engines.  then feed each ereport back into
1185  * fme_receive_report().
1186  *
1187  * returns ntrip, the number of these ereports produced.
1188  */
1189 static int
1190 upsets_eval(struct fme *fmep, fmd_event_t *ffep)
1191 {
1192 	/* we build an array of tripped ereports that we send ourselves */
1193 	struct {
1194 		const char *ename;
1195 		const struct ipath *ipp;
1196 	} *tripped;
1197 	struct event *sp;
1198 	int ntrip, nupset, i;
1199 
1200 	/*
1201 	 * count the number of upsets to determine the upper limit on
1202 	 * expected trip ereport strings.  remember that one upset can
1203 	 * lead to at most one ereport.
1204 	 */
1205 	nupset = 0;
1206 	for (sp = fmep->suspects; sp; sp = sp->suspects) {
1207 		if (sp->t == N_UPSET)
1208 			nupset++;
1209 	}
1210 
1211 	if (nupset == 0)
1212 		return (0);
1213 
1214 	/*
1215 	 * get to this point if we have upsets and expect some trip
1216 	 * ereports
1217 	 */
1218 	tripped = alloca(sizeof (*tripped) * nupset);
1219 	bzero((void *)tripped, sizeof (*tripped) * nupset);
1220 
1221 	ntrip = 0;
1222 	for (sp = fmep->suspects; sp; sp = sp->suspects)
1223 		if (sp->t == N_UPSET &&
1224 		    serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp,
1225 		    &tripped[ntrip].ename, &tripped[ntrip].ipp))
1226 			ntrip++;
1227 
1228 	for (i = 0; i < ntrip; i++) {
1229 		struct event *ep, *nep;
1230 		struct fme *nfmep;
1231 		fmd_case_t *fmcase;
1232 		const struct ipath *ipp;
1233 		const char *eventstring;
1234 		int prev_verbose;
1235 		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
1236 		enum fme_state state;
1237 
1238 		/*
1239 		 * First try and evaluate a case with the trip ereport plus
1240 		 * all the other ereports that cause the trip. If that fails
1241 		 * to evaluate then try again with just this ereport on its own.
1242 		 */
1243 		out(O_ALTFP|O_NONL, "fme_receive_report_serd: ");
1244 		ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp);
1245 		out(O_ALTFP|O_STAMP, NULL);
1246 		ep = fmep->e0;
1247 		eventstring = ep->enode->u.event.ename->u.name.s;
1248 		ipp = ep->ipp;
1249 		prune_propagations(eventstring, ipp);
1250 
1251 		/*
1252 		 * create a duplicate fme and case
1253 		 */
1254 		fmcase = fmd_case_open(fmep->hdl, NULL);
1255 		out(O_ALTFP|O_NONL, "duplicate fme for event [");
1256 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1257 		out(O_ALTFP, " ]");
1258 		if ((nfmep = newfme(eventstring, ipp, fmep->hdl,
1259 		    fmcase)) == NULL) {
1260 			out(O_ALTFP|O_NONL, "[");
1261 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1262 			out(O_ALTFP, " CANNOT DIAGNOSE]");
1263 			publish_undiagnosable(fmep->hdl, ffep, fmcase);
1264 			continue;
1265 		}
1266 		Open_fme_count++;
1267 		nfmep->pull = fmep->pull;
1268 		init_fme_bufs(nfmep);
1269 		out(O_ALTFP|O_NONL, "[");
1270 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1271 		out(O_ALTFP, " created FME%d, case %s]", nfmep->id,
1272 		    fmd_case_uuid(nfmep->hdl, nfmep->fmcase));
1273 		if (ffep) {
1274 			fmd_case_setprincipal(nfmep->hdl, nfmep->fmcase, ffep);
1275 			nfmep->e0r = ffep;
1276 		}
1277 
1278 		/*
1279 		 * add the original ereports
1280 		 */
1281 		for (ep = fmep->observations; ep; ep = ep->observations) {
1282 			eventstring = ep->enode->u.event.ename->u.name.s;
1283 			ipp = ep->ipp;
1284 			out(O_ALTFP|O_NONL, "adding event [");
1285 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1286 			out(O_ALTFP, " ]");
1287 			nep = itree_lookup(nfmep->eventtree, eventstring, ipp);
1288 			if (nep->count++ == 0) {
1289 				nep->observations = nfmep->observations;
1290 				nfmep->observations = nep;
1291 				serialize_observation(nfmep, eventstring, ipp);
1292 				nep->nvp = evnv_dupnvl(ep->nvp);
1293 			}
1294 			if (ffep)
1295 				fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase,
1296 				    ffep);
1297 			stats_counter_bump(nfmep->Rcount);
1298 		}
1299 
1300 		/*
1301 		 * add the serd trigger ereport
1302 		 */
1303 		if ((ep = itree_lookup(nfmep->eventtree, tripped[i].ename,
1304 		    tripped[i].ipp)) == NULL) {
1305 			/*
1306 			 * The trigger ereport is not in the instance tree. It
1307 			 * was presumably removed by prune_propagations() as
1308 			 * this combination of events is not present in the
1309 			 * rules.
1310 			 */
1311 			out(O_ALTFP, "upsets_eval: e0 not in instance tree");
1312 			Undiag_reason = UD_BADEVENTI;
1313 			goto retry_lone_ereport;
1314 		}
1315 		out(O_ALTFP|O_NONL, "adding event [");
1316 		ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp);
1317 		out(O_ALTFP, " ]");
1318 		nfmep->ecurrent = ep;
1319 		ep->nvp = NULL;
1320 		ep->count = 1;
1321 		ep->observations = nfmep->observations;
1322 		nfmep->observations = ep;
1323 
1324 		/*
1325 		 * just peek first.
1326 		 */
1327 		nfmep->peek = 1;
1328 		prev_verbose = Verbose;
1329 		if (Debug == 0)
1330 			Verbose = 0;
1331 		lut_walk(nfmep->eventtree, (lut_cb)clear_arrows, (void *)nfmep);
1332 		state = hypothesise(nfmep, nfmep->e0, nfmep->ull, &my_delay);
1333 		nfmep->peek = 0;
1334 		Verbose = prev_verbose;
1335 		if (state == FME_DISPROVED) {
1336 			out(O_ALTFP, "upsets_eval: hypothesis disproved");
1337 			Undiag_reason = UD_UNSOLVD;
1338 retry_lone_ereport:
1339 			/*
1340 			 * However the trigger ereport on its own might be
1341 			 * diagnosable, so check for that. Undo the new fme
1342 			 * and case we just created and call fme_receive_report.
1343 			 */
1344 			out(O_ALTFP|O_NONL, "[");
1345 			ipath_print(O_ALTFP|O_NONL, tripped[i].ename,
1346 			    tripped[i].ipp);
1347 			out(O_ALTFP, " retrying with just trigger ereport]");
1348 			itree_free(nfmep->eventtree);
1349 			nfmep->eventtree = NULL;
1350 			structconfig_free(nfmep->config);
1351 			nfmep->config = NULL;
1352 			destroy_fme_bufs(nfmep);
1353 			fmd_case_close(nfmep->hdl, nfmep->fmcase);
1354 			fme_receive_report(fmep->hdl, ffep,
1355 			    tripped[i].ename, tripped[i].ipp, NULL);
1356 			continue;
1357 		}
1358 
1359 		/*
1360 		 * and evaluate
1361 		 */
1362 		serialize_observation(nfmep, tripped[i].ename, tripped[i].ipp);
1363 		if (ffep)
1364 			fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, ffep);
1365 		stats_counter_bump(nfmep->Rcount);
1366 		fme_eval(nfmep, ffep);
1367 	}
1368 
1369 	return (ntrip);
1370 }
1371 
1372 /*
1373  * fme_receive_external_report -- call when an external ereport comes in
1374  *
1375  * this routine just converts the relevant information from the ereport
1376  * into a format used internally and passes it on to fme_receive_report().
1377  */
1378 void
1379 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
1380     const char *eventstring)
1381 {
1382 	struct node *epnamenp = platform_getpath(nvl);
1383 	const struct ipath *ipp;
1384 
1385 	/*
1386 	 * XFILE: If we ended up without a path, it's an X-file.
1387 	 * For now, use our undiagnosable interface.
1388 	 */
1389 	if (epnamenp == NULL) {
1390 		fmd_case_t *fmcase;
1391 
1392 		out(O_ALTFP, "XFILE: Unable to get path from ereport");
1393 		Undiag_reason = UD_NOPATH;
1394 		fmcase = fmd_case_open(hdl, NULL);
1395 		publish_undiagnosable(hdl, ffep, fmcase);
1396 		return;
1397 	}
1398 
1399 	ipp = ipath(epnamenp);
1400 	tree_free(epnamenp);
1401 	fme_receive_report(hdl, ffep, stable(eventstring), ipp, nvl);
1402 }
1403 
1404 /*ARGSUSED*/
1405 void
1406 fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
1407     const char *eventstring)
1408 {
1409 	char *uuid;
1410 	nvlist_t **nva;
1411 	uint_t nvc;
1412 	const struct ipath *ipp;
1413 
1414 	if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 ||
1415 	    nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
1416 	    &nva, &nvc) != 0) {
1417 		out(O_ALTFP, "No uuid or fault list for list.repaired event");
1418 		return;
1419 	}
1420 
1421 	out(O_ALTFP, "Processing list.repaired from case %s", uuid);
1422 
1423 	while (nvc-- != 0) {
1424 		/*
1425 		 * Reset any istat or serd engine associated with this path.
1426 		 */
1427 		char *path;
1428 
1429 		if ((ipp = platform_fault2ipath(*nva++)) == NULL)
1430 			continue;
1431 
1432 		path = ipath2str(NULL, ipp);
1433 		out(O_ALTFP, "fme_receive_repair_list: resetting state for %s",
1434 		    path);
1435 		FREE(path);
1436 
1437 		lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp);
1438 		istat_save();
1439 
1440 		lut_walk(SerdEngines, (lut_cb)serd_reset_cb, (void *)ipp);
1441 		serd_save();
1442 	}
1443 }
1444 
/*ARGSUSED*/
/*
 * fme_receive_topology_change -- invoked on a topology change; walks
 * every istat counter and SERD engine through its topo-change callback,
 * persisting the state after each walk.
 */
void
fme_receive_topology_change(void)
{
	lut_walk(Istats, (lut_cb)istat_counter_topo_chg_cb, NULL);
	istat_save();

	lut_walk(SerdEngines, (lut_cb)serd_topo_chg_cb, NULL);
	serd_save();
}
1455 
/* forward declaration (mark_arrows() is defined later in this file) */
static int mark_arrows(struct fme *fmep, struct event *ep, int mark,
    unsigned long long at_latest_by, unsigned long long *pdelay, int keep);
1458 
1459 /* ARGSUSED */
1460 static void
1461 clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
1462 {
1463 	struct bubble *bp;
1464 	struct arrowlist *ap;
1465 
1466 	ep->cached_state = 0;
1467 	ep->keep_in_tree = 0;
1468 	for (bp = itree_next_bubble(ep, NULL); bp;
1469 	    bp = itree_next_bubble(ep, bp)) {
1470 		if (bp->t != B_FROM)
1471 			continue;
1472 		bp->mark = 0;
1473 		for (ap = itree_next_arrow(bp, NULL); ap;
1474 		    ap = itree_next_arrow(bp, ap))
1475 			ap->arrowp->mark = 0;
1476 	}
1477 }
1478 
/*
 * fme_receive_report -- route an incoming ereport (internal form)
 *
 * offer the ereport to each open FME in "peek" mode (so a rejected
 * offer leaves no trace).  if no open FME explains it, either attach
 * it to an existing overflow FME, refuse it via a new overflow FME
 * when the Max_fme limit is reached, or open a brand new FME for it.
 */
static void
fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
    const char *eventstring, const struct ipath *ipp, nvlist_t *nvl)
{
	struct event *ep;
	struct fme *fmep = NULL;
	struct fme *ofmep = NULL;	/* overflow FME with an open case */
	struct fme *cfmep, *svfmep;
	int matched = 0;
	nvlist_t *defect;
	fmd_case_t *fmcase;

	out(O_ALTFP|O_NONL, "fme_receive_report: ");
	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
	out(O_ALTFP|O_STAMP, NULL);

	/* decide which FME it goes to */
	for (fmep = FMElist; fmep; fmep = fmep->next) {
		int prev_verbose;
		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
		enum fme_state state;
		nvlist_t *pre_peek_nvp = NULL;

		if (fmep->overflow) {
			/* remember an overflow FME whose case is still open */
			if (!(fmd_case_closed(fmep->hdl, fmep->fmcase)))
				ofmep = fmep;

			continue;
		}

		/*
		 * ignore solved or closed cases
		 */
		if (fmep->posted_suspects ||
		    fmd_case_solved(fmep->hdl, fmep->fmcase) ||
		    fmd_case_closed(fmep->hdl, fmep->fmcase))
			continue;

		/* look up event in event tree for this FME */
		if ((ep = itree_lookup(fmep->eventtree,
		    eventstring, ipp)) == NULL)
			continue;

		/* note observation */
		fmep->ecurrent = ep;
		if (ep->count++ == 0) {
			/* link it into list of observations seen */
			ep->observations = fmep->observations;
			fmep->observations = ep;
			ep->nvp = evnv_dupnvl(nvl);
		} else {
			/* use new payload values for peek */
			pre_peek_nvp = ep->nvp;
			ep->nvp = evnv_dupnvl(nvl);
		}

		/* tell hypothesise() not to mess with suspect list */
		fmep->peek = 1;

		/* don't want this to be verbose (unless Debug is set) */
		prev_verbose = Verbose;
		if (Debug == 0)
			Verbose = 0;

		lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
		state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);

		fmep->peek = 0;

		/* put verbose flag back */
		Verbose = prev_verbose;

		if (state != FME_DISPROVED) {
			/* found an FME that explains the ereport */
			matched++;
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " explained by FME%d]", fmep->id);

			/* the new payload replaces the old one for good */
			if (pre_peek_nvp)
				nvlist_free(pre_peek_nvp);

			if (ep->count == 1)
				serialize_observation(fmep, eventstring, ipp);

			if (ffep)
				fmd_case_add_ereport(hdl, fmep->fmcase, ffep);

			stats_counter_bump(fmep->Rcount);

			/* re-eval FME */
			fme_eval(fmep, ffep);
		} else {

			/* not a match, undo noting of observation */
			fmep->ecurrent = NULL;
			if (--ep->count == 0) {
				/* unlink it from observations */
				fmep->observations = ep->observations;
				ep->observations = NULL;
				nvlist_free(ep->nvp);
				ep->nvp = NULL;
			} else {
				/* restore the pre-peek payload */
				nvlist_free(ep->nvp);
				ep->nvp = pre_peek_nvp;
			}
		}
	}

	if (matched)
		return;	/* explained by at least one existing FME */

	/* clean up closed fmes */
	cfmep = ClosedFMEs;
	while (cfmep != NULL) {
		svfmep = cfmep->next;
		destroy_fme(cfmep);
		cfmep = svfmep;
	}
	ClosedFMEs = NULL;
	prune_propagations(eventstring, ipp);

	if (ofmep) {
		/* unexplained ereport goes to the existing overflow FME */
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " ADDING TO OVERFLOW FME]");
		if (ffep)
			fmd_case_add_ereport(hdl, ofmep->fmcase, ffep);

		return;

	} else if (Max_fme && (Open_fme_count >= Max_fme)) {
		/* too many open FMEs -- create the overflow FME instead */
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " MAX OPEN FME REACHED]");

		fmcase = fmd_case_open(hdl, NULL);

		/* Create overflow fme */
		if ((fmep = newfme(eventstring, ipp, hdl, fmcase)) == NULL) {
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]");
			publish_undiagnosable(hdl, ffep, fmcase);
			return;
		}

		Open_fme_count++;

		init_fme_bufs(fmep);
		fmep->overflow = B_TRUE;

		if (ffep)
			fmd_case_add_ereport(hdl, fmep->fmcase, ffep);

		/* solve the overflow case with an undiagnosable defect */
		defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
		    NULL, NULL, NULL);
		(void) nvlist_add_string(defect, UNDIAG_REASON, UD_MAXFME);
		fmd_case_add_suspect(hdl, fmep->fmcase, defect);
		fmd_case_solve(hdl, fmep->fmcase);
		return;
	}

	/* open a case */
	fmcase = fmd_case_open(hdl, NULL);

	/* start a new FME */
	if ((fmep = newfme(eventstring, ipp, hdl, fmcase)) == NULL) {
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " CANNOT DIAGNOSE]");
		publish_undiagnosable(hdl, ffep, fmcase);
		return;
	}

	Open_fme_count++;

	init_fme_bufs(fmep);

	out(O_ALTFP|O_NONL, "[");
	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
	out(O_ALTFP, " created FME%d, case %s]", fmep->id,
	    fmd_case_uuid(hdl, fmep->fmcase));

	ep = fmep->e0;
	ASSERT(ep != NULL);

	/* note observation */
	fmep->ecurrent = ep;
	if (ep->count++ == 0) {
		/* link it into list of observations seen */
		ep->observations = fmep->observations;
		fmep->observations = ep;
		ep->nvp = evnv_dupnvl(nvl);
		serialize_observation(fmep, eventstring, ipp);
	} else {
		/* new payload overrides any previous */
		nvlist_free(ep->nvp);
		ep->nvp = evnv_dupnvl(nvl);
	}

	stats_counter_bump(fmep->Rcount);

	if (ffep) {
		fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
		fmd_case_setprincipal(hdl, fmep->fmcase, ffep);
		fmep->e0r = ffep;
	}

	/* give the diagnosis algorithm a shot at the new FME state */
	fme_eval(fmep, ffep);
}
1691 
1692 void
1693 fme_status(int flags)
1694 {
1695 	struct fme *fmep;
1696 
1697 	if (FMElist == NULL) {
1698 		out(flags, "No fault management exercises underway.");
1699 		return;
1700 	}
1701 
1702 	for (fmep = FMElist; fmep; fmep = fmep->next)
1703 		fme_print(flags, fmep);
1704 }
1705 
1706 /*
1707  * "indent" routines used mostly for nicely formatted debug output, but also
1708  * for sanity checking for infinite recursion bugs.
1709  */
1710 
#define	MAX_INDENT 1024		/* maximum expected recursion depth */
static const char *indent_s[MAX_INDENT];	/* stack of indent strings */
static int current_indent;	/* current depth of indent_s[] stack */
1714 
1715 static void
1716 indent_push(const char *s)
1717 {
1718 	if (current_indent < MAX_INDENT)
1719 		indent_s[current_indent++] = s;
1720 	else
1721 		out(O_DIE, "unexpected recursion depth (%d)", current_indent);
1722 }
1723 
1724 static void
1725 indent_set(const char *s)
1726 {
1727 	current_indent = 0;
1728 	indent_push(s);
1729 }
1730 
1731 static void
1732 indent_pop(void)
1733 {
1734 	if (current_indent > 0)
1735 		current_indent--;
1736 	else
1737 		out(O_DIE, "recursion underflow");
1738 }
1739 
1740 static void
1741 indent(void)
1742 {
1743 	int i;
1744 	if (!Verbose)
1745 		return;
1746 	for (i = 0; i < current_indent; i++)
1747 		out(O_ALTFP|O_VERB|O_NONL, indent_s[i]);
1748 }
1749 
/* circumstance codes for print_suspects() */
#define	SLNEW		1	/* diagnosis produced (new suspect list) */
#define	SLCHANGED	2	/* diagnosis changed */
#define	SLWAIT		3	/* waiting on inter-event timer */
#define	SLDISPROVED	4	/* diagnosis unknown/disproved */
1754 
1755 static void
1756 print_suspects(int circumstance, struct fme *fmep)
1757 {
1758 	struct event *ep;
1759 
1760 	out(O_ALTFP|O_NONL, "[");
1761 	if (circumstance == SLCHANGED) {
1762 		out(O_ALTFP|O_NONL, "FME%d diagnosis changed. state: %s, "
1763 		    "suspect list:", fmep->id, fme_state2str(fmep->state));
1764 	} else if (circumstance == SLWAIT) {
1765 		out(O_ALTFP|O_NONL, "FME%d set wait timer %ld ", fmep->id,
1766 		    fmep->timer);
1767 		ptree_timeval(O_ALTFP|O_NONL, &fmep->wull);
1768 	} else if (circumstance == SLDISPROVED) {
1769 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id);
1770 	} else {
1771 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id);
1772 	}
1773 
1774 	if (circumstance == SLWAIT || circumstance == SLDISPROVED) {
1775 		out(O_ALTFP, "]");
1776 		return;
1777 	}
1778 
1779 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
1780 		out(O_ALTFP|O_NONL, " ");
1781 		itree_pevent_brief(O_ALTFP|O_NONL, ep);
1782 	}
1783 	out(O_ALTFP, "]");
1784 }
1785 
/*
 * eventprop_lookup -- look up a named property (e.g. L_engine, L_ASRU,
 * L_FRU) in an event's property lut; returns NULL if not present.
 */
static struct node *
eventprop_lookup(struct event *ep, const char *propname)
{
	return (lut_lookup(ep->props, (void *)propname, NULL));
}
1791 
/* shared scratch buffer for ulltostr() conversions in the fmri builders */
#define	MAXDIGITIDX	23
static char numbuf[MAXDIGITIDX + 1];
1794 
1795 static int
1796 node2uint(struct node *n, uint_t *valp)
1797 {
1798 	struct evalue value;
1799 	struct lut *globals = NULL;
1800 
1801 	if (n == NULL)
1802 		return (1);
1803 
1804 	/*
1805 	 * check value.v since we are being asked to convert an unsigned
1806 	 * long long int to an unsigned int
1807 	 */
1808 	if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) ||
1809 	    value.t != UINT64 || value.v > (1ULL << 32))
1810 		return (1);
1811 
1812 	*valp = (uint_t)value.v;
1813 
1814 	return (0);
1815 }
1816 
/*
 * node2fmri -- convert a fully-instanced T_NAME node into an hc-scheme
 * FMRI nvlist.
 *
 * returns NULL if n is NULL, not a T_NAME, or any path component lacks
 * a T_NUM instance number.  dies via out(O_DIE) on nvlist failures.
 * the caller owns the returned nvlist.
 */
static nvlist_t *
node2fmri(struct node *n)
{
	nvlist_t **pa, *f, *p;
	struct node *nc;
	uint_t depth = 0;
	char *numstr, *nullbyte;
	char *failure;
	int err, i;

	/* XXX do we need to be able to handle a non-T_NAME node? */
	if (n == NULL || n->t != T_NAME)
		return (NULL);

	/* count components; each must carry a T_NUM instance number */
	for (nc = n; nc != NULL; nc = nc->u.name.next) {
		if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM)
			break;
		depth++;
	}

	if (nc != NULL) {
		/* We bailed early, something went wrong */
		return (NULL);
	}

	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
	pa = alloca(depth * sizeof (nvlist_t *));
	for (i = 0; i < depth; i++)
		pa[i] = NULL;

	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
	if (err != 0) {
		failure = "basic construction of FMRI failed";
		goto boom;
	}

	/* set up the scratch buffer used by ulltostr() conversions */
	numbuf[MAXDIGITIDX] = '\0';
	nullbyte = &numbuf[MAXDIGITIDX];
	i = 0;

	/* build one name/id hc-pair nvlist per path component */
	for (nc = n; nc != NULL; nc = nc->u.name.next) {
		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
		if (err != 0) {
			failure = "alloc of an hc-pair failed";
			goto boom;
		}
		err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s);
		numstr = ulltostr(nc->u.name.child->u.ull, nullbyte);
		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
		if (err != 0) {
			failure = "construction of an hc-pair failed";
			goto boom;
		}
		pa[i++] = p;
	}

	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
	if (err == 0) {
		/* success: the add copied the pairs, free our locals */
		for (i = 0; i < depth; i++)
			if (pa[i] != NULL)
				nvlist_free(pa[i]);
		return (f);
	}
	failure = "addition of hc-pair array to FMRI failed";

boom:
	/* free any pairs built so far, plus the partial fmri, then die */
	for (i = 0; i < depth; i++)
		if (pa[i] != NULL)
			nvlist_free(pa[i]);
	nvlist_free(f);
	out(O_DIE, "%s", failure);
	/*NOTREACHED*/
	return (NULL);
}
1895 
/* an ipath cache entry is an array of these, with s==NULL at the end */
/* NOTE(review): layout must match the entries produced by ipath() -- */
/* confirm against the canonical definition in ipath.c */
struct ipath {
	const char *s;	/* component name (in stable) */
	int i;		/* instance number */
};
1901 
/*
 * ipath2fmri -- convert an ipath cache entry (array terminated by a
 * NULL component name) into an hc-scheme FMRI nvlist.
 *
 * dies via out(O_DIE) on nvlist failures; the caller owns the
 * returned nvlist.
 */
static nvlist_t *
ipath2fmri(struct ipath *ipath)
{
	nvlist_t **pa, *f, *p;
	uint_t depth = 0;
	char *numstr, *nullbyte;
	char *failure;
	int err, i;
	struct ipath *ipp;

	/* count the components up to the NULL terminator */
	for (ipp = ipath; ipp->s != NULL; ipp++)
		depth++;

	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
	pa = alloca(depth * sizeof (nvlist_t *));
	for (i = 0; i < depth; i++)
		pa[i] = NULL;

	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
	if (err != 0) {
		failure = "basic construction of FMRI failed";
		goto boom;
	}

	/* set up the scratch buffer used by ulltostr() conversions */
	numbuf[MAXDIGITIDX] = '\0';
	nullbyte = &numbuf[MAXDIGITIDX];
	i = 0;

	/* build one name/id hc-pair nvlist per path component */
	for (ipp = ipath; ipp->s != NULL; ipp++) {
		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
		if (err != 0) {
			failure = "alloc of an hc-pair failed";
			goto boom;
		}
		err = nvlist_add_string(p, FM_FMRI_HC_NAME, ipp->s);
		numstr = ulltostr(ipp->i, nullbyte);
		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
		if (err != 0) {
			failure = "construction of an hc-pair failed";
			goto boom;
		}
		pa[i++] = p;
	}

	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
	if (err == 0) {
		/* success: the add copied the pairs, free our locals */
		for (i = 0; i < depth; i++)
			if (pa[i] != NULL)
				nvlist_free(pa[i]);
		return (f);
	}
	failure = "addition of hc-pair array to FMRI failed";

boom:
	/* free any pairs built so far, plus the partial fmri, then die */
	for (i = 0; i < depth; i++)
		if (pa[i] != NULL)
			nvlist_free(pa[i]);
	nvlist_free(f);
	out(O_DIE, "%s", failure);
	/*NOTREACHED*/
	return (NULL);
}
1968 
1969 static uint_t
1970 avg(uint_t sum, uint_t cnt)
1971 {
1972 	unsigned long long s = sum * 10;
1973 
1974 	return ((s / cnt / 10) + (((s / cnt % 10) >= 5) ? 1 : 0));
1975 }
1976 
1977 static uint8_t
1978 percentof(uint_t part, uint_t whole)
1979 {
1980 	unsigned long long p = part * 1000;
1981 
1982 	return ((p / whole / 10) + (((p / whole % 10) >= 5) ? 1 : 0));
1983 }
1984 
/* resources advertised for one suspect in the final suspect list */
struct rsl {
	struct event *suspect;	/* suspect event; NULLed when deduped */
	nvlist_t *asru;		/* ASRU FMRI, or NULL */
	nvlist_t *fru;		/* FRU FMRI, or NULL */
	nvlist_t *rsrc;		/* resource FMRI; may equal asru */
};
1991 
1992 /*
1993  *  rslfree -- free internal members of struct rsl not expected to be
1994  *	freed elsewhere.
1995  */
1996 static void
1997 rslfree(struct rsl *freeme)
1998 {
1999 	if (freeme->asru != NULL)
2000 		nvlist_free(freeme->asru);
2001 	if (freeme->fru != NULL)
2002 		nvlist_free(freeme->fru);
2003 	if (freeme->rsrc != NULL && freeme->rsrc != freeme->asru)
2004 		nvlist_free(freeme->rsrc);
2005 }
2006 
2007 /*
2008  *  rslcmp -- compare two rsl structures.  Use the following
2009  *	comparisons to establish cardinality:
2010  *
2011  *	1. Name of the suspect's class. (simple strcmp)
2012  *	2. Name of the suspect's ASRU. (trickier, since nvlist)
2013  *
2014  */
2015 static int
2016 rslcmp(const void *a, const void *b)
2017 {
2018 	struct rsl *r1 = (struct rsl *)a;
2019 	struct rsl *r2 = (struct rsl *)b;
2020 	int rv;
2021 
2022 	rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s,
2023 	    r2->suspect->enode->u.event.ename->u.name.s);
2024 	if (rv != 0)
2025 		return (rv);
2026 
2027 	if (r1->asru == NULL && r2->asru == NULL)
2028 		return (0);
2029 	if (r1->asru == NULL)
2030 		return (-1);
2031 	if (r2->asru == NULL)
2032 		return (1);
2033 	return (evnv_cmpnvl(r1->asru, r2->asru, 0));
2034 }
2035 
2036 /*
2037  *  rsluniq -- given an array of rsl structures, seek out and "remove"
2038  *	any duplicates.  Dups are "remove"d by NULLing the suspect pointer
2039  *	of the array element.  Removal also means updating the number of
2040  *	problems and the number of problems which are not faults.  User
2041  *	provides the first and last element pointers.
2042  */
2043 static void
2044 rsluniq(struct rsl *first, struct rsl *last, int *nprobs, int *nnonf)
2045 {
2046 	struct rsl *cr;
2047 
2048 	if (*nprobs == 1)
2049 		return;
2050 
2051 	/*
2052 	 *  At this point, we only expect duplicate defects.
2053 	 *  Eversholt's diagnosis algorithm prevents duplicate
2054 	 *  suspects, but we rewrite defects in the platform code after
2055 	 *  the diagnosis is made, and that can introduce new
2056 	 *  duplicates.
2057 	 */
2058 	while (first <= last) {
2059 		if (first->suspect == NULL || !is_defect(first->suspect->t)) {
2060 			first++;
2061 			continue;
2062 		}
2063 		cr = first + 1;
2064 		while (cr <= last) {
2065 			if (is_defect(first->suspect->t)) {
2066 				if (rslcmp(first, cr) == 0) {
2067 					cr->suspect = NULL;
2068 					rslfree(cr);
2069 					(*nprobs)--;
2070 					(*nnonf)--;
2071 				}
2072 			}
2073 			/*
2074 			 * assume all defects are in order after our
2075 			 * sort and short circuit here with "else break" ?
2076 			 */
2077 			cr++;
2078 		}
2079 		first++;
2080 	}
2081 }
2082 
2083 /*
2084  * get_resources -- for a given suspect, determine what ASRU, FRU and
2085  *     RSRC nvlists should be advertised in the final suspect list.
2086  */
2087 void
2088 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot)
2089 {
2090 	struct node *asrudef, *frudef;
2091 	nvlist_t *asru, *fru;
2092 	nvlist_t *rsrc = NULL;
2093 	char *pathstr;
2094 
2095 	/*
2096 	 * First find any ASRU and/or FRU defined in the
2097 	 * initial fault tree.
2098 	 */
2099 	asrudef = eventprop_lookup(sp, L_ASRU);
2100 	frudef = eventprop_lookup(sp, L_FRU);
2101 
2102 	/*
2103 	 * Create FMRIs based on those definitions
2104 	 */
2105 	asru = node2fmri(asrudef);
2106 	fru = node2fmri(frudef);
2107 	pathstr = ipath2str(NULL, sp->ipp);
2108 
2109 	/*
2110 	 * Allow for platform translations of the FMRIs
2111 	 */
2112 	platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc,
2113 	    pathstr);
2114 
2115 	FREE(pathstr);
2116 	rsrcs->suspect = sp;
2117 	rsrcs->asru = asru;
2118 	rsrcs->fru = fru;
2119 	rsrcs->rsrc = rsrc;
2120 }
2121 
2122 /*
2123  * trim_suspects -- prior to publishing, we may need to remove some
2124  *    suspects from the list.  If we're auto-closing upsets, we don't
2125  *    want any of those in the published list.  If the ASRUs for multiple
2126  *    defects resolve to the same ASRU (driver) we only want to publish
2127  *    that as a single suspect.
2128  */
static void
trim_suspects(struct fme *fmep, boolean_t no_upsets, struct rsl **begin,
    struct rsl **end)
{
	struct event *ep;
	struct rsl *rp;
	int rpcnt;

	/*
	 * First save the suspects in the psuspects, then copy back
	 * only the ones we wish to retain.  This resets nsuspects to
	 * zero.
	 */
	rpcnt = fmep->nsuspects;	/* captured before save_suspects() zeroes it */
	save_suspects(fmep);

	/*
	 * allocate an array of resource pointers for the suspects.
	 * We may end up using less than the full allocation, but this
	 * is a very short-lived array.  publish_suspects() will free
	 * this array when it's done using it.
	 */
	rp = *begin = MALLOC(rpcnt * sizeof (struct rsl));
	bzero(rp, rpcnt * sizeof (struct rsl));

	/* first pass, remove any unwanted upsets and populate our array */
	for (ep = fmep->psuspects; ep; ep = ep->psuspects) {
		if (no_upsets && is_upset(ep->t))
			continue;
		/* fill in this suspect's asru/fru/rsrc nvlists */
		get_resources(ep, rp, fmep->config);
		rp++;
		fmep->nsuspects++;
		if (!is_fault(ep->t))
			fmep->nonfault++;
	}

	/* if all we had was unwanted upsets, we're done */
	/* (note: *end is left unset here; callers check nsuspects first) */
	if (fmep->nsuspects == 0)
		return;

	*end = rp - 1;	/* *end points at the last populated element */

	/* sort the array so duplicates are adjacent, then drop them */
	qsort(*begin, fmep->nsuspects, sizeof (struct rsl), rslcmp);
	rsluniq(*begin, *end, &fmep->nsuspects, &fmep->nonfault);
}
2175 
2176 /*
2177  * addpayloadprop -- add a payload prop to a problem
2178  */
2179 static void
2180 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault)
2181 {
2182 	ASSERT(fault != NULL);
2183 	ASSERT(lhs != NULL);
2184 	ASSERT(rhs != NULL);
2185 
2186 	if (rhs->t == UINT64) {
2187 		out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v);
2188 
2189 		if (nvlist_add_uint64(fault, lhs, rhs->v) != 0)
2190 			out(O_DIE,
2191 			    "cannot add payloadprop \"%s\" to fault", lhs);
2192 	} else {
2193 		out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"",
2194 		    lhs, (char *)(uintptr_t)rhs->v);
2195 
2196 		if (nvlist_add_string(fault, lhs, (char *)(uintptr_t)rhs->v) !=
2197 		    0)
2198 			out(O_DIE,
2199 			    "cannot add payloadprop \"%s\" to fault", lhs);
2200 	}
2201 }
2202 
static char *Istatbuf;		/* serialized istats, built by istat_save() */
static char *Istatbufptr;	/* current write position within Istatbuf */
static int Istatsz;		/* total size in bytes of Istatbuf */
2206 
2207 /*
2208  * istataddsize -- calculate size of istat and add it to Istatsz
2209  */
2210 /*ARGSUSED2*/
2211 static void
2212 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg)
2213 {
2214 	int val;
2215 
2216 	ASSERT(lhs != NULL);
2217 	ASSERT(rhs != NULL);
2218 
2219 	if ((val = stats_counter_value(rhs)) == 0)
2220 		return;	/* skip zero-valued stats */
2221 
2222 	/* count up the size of the stat name */
2223 	Istatsz += ipath2strlen(lhs->ename, lhs->ipath);
2224 	Istatsz++;	/* for the trailing NULL byte */
2225 
2226 	/* count up the size of the stat value */
2227 	Istatsz += snprintf(NULL, 0, "%d", val);
2228 	Istatsz++;	/* for the trailing NULL byte */
2229 }
2230 
2231 /*
2232  * istat2str -- serialize an istat, writing result to *Istatbufptr
2233  */
/*ARGSUSED2*/
static void
istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg)
{
	char *str;
	int len;
	int val;

	ASSERT(lhs != NULL);
	ASSERT(rhs != NULL);

	if ((val = stats_counter_value(rhs)) == 0)
		return;	/* skip zero-valued stats, matching istataddsize() */

	/* serialize the stat name */
	str = ipath2str(lhs->ename, lhs->ipath);
	len = strlen(str);

	/* strict '<' is safe: the value digits and their NUL still follow */
	ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]);
	(void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr);
	Istatbufptr += len;
	FREE(str);
	*Istatbufptr++ = '\0';	/* NUL separates name from value */

	/* serialize the stat value */
	Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr,
	    "%d", val);
	*Istatbufptr++ = '\0';	/* NUL terminates this record */

	ASSERT(Istatbufptr <= &Istatbuf[Istatsz]);
}
2265 
void
istat_save()
{
	if (Istat_need_save == 0)
		return;		/* nothing has changed since the last save */

	/* figure out how big the serialized info is */
	Istatsz = 0;
	lut_walk(Istats, (lut_cb)istataddsize, NULL);

	if (Istatsz == 0) {
		/* no stats to save */
		fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS);
		return;
	}

	/* create the serialized buffer */
	Istatbufptr = Istatbuf = MALLOC(Istatsz);
	lut_walk(Istats, (lut_cb)istat2str, NULL);

	/* clear out current saved stats */
	fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS);

	/* write out the new version */
	fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz);
	FREE(Istatbuf);

	Istat_need_save = 0;
}
2295 
2296 int
2297 istat_cmp(struct istat_entry *ent1, struct istat_entry *ent2)
2298 {
2299 	if (ent1->ename != ent2->ename)
2300 		return (ent2->ename - ent1->ename);
2301 	if (ent1->ipath != ent2->ipath)
2302 		return ((char *)ent2->ipath - (char *)ent1->ipath);
2303 
2304 	return (0);
2305 }
2306 
2307 /*
2308  * istat-verify -- verify the component associated with a stat still exists
2309  *
2310  * if the component no longer exists, this routine resets the stat and
2311  * returns 0.  if the component still exists, it returns 1.
2312  */
2313 static int
2314 istat_verify(struct node *snp, struct istat_entry *entp)
2315 {
2316 	struct stats *statp;
2317 	nvlist_t *fmri;
2318 
2319 	fmri = node2fmri(snp->u.event.epname);
2320 	if (platform_path_exists(fmri)) {
2321 		nvlist_free(fmri);
2322 		return (1);
2323 	}
2324 	nvlist_free(fmri);
2325 
2326 	/* component no longer in system.  zero out the associated stats */
2327 	if ((statp = (struct stats *)
2328 	    lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL ||
2329 	    stats_counter_value(statp) == 0)
2330 		return (0);	/* stat is already reset */
2331 
2332 	Istat_need_save = 1;
2333 	stats_counter_reset(statp);
2334 	return (0);
2335 }
2336 
/*
 * istat_bump -- bump or set the per-instance stat counter for snp.
 *
 * snp must be a T_EVENT node; its ename and epname identify the
 * counter.  If n is zero the counter is incremented by one, otherwise
 * it is reset and set to n.  The counter is created on first use.  If
 * the component named by epname no longer exists in the system, the
 * stat is left alone (istat_verify() handles resetting it).
 */
static void
istat_bump(struct node *snp, int n)
{
	struct stats *statp;
	struct istat_entry ent;

	ASSERT(snp != NULL);
	ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t));
	ASSERT(snp->u.event.epname != NULL);

	/* class name should be hoisted into a single stable entry */
	ASSERT(snp->u.event.ename->u.name.next == NULL);
	ent.ename = snp->u.event.ename->u.name.s;
	ent.ipath = ipath(snp->u.event.epname);

	if (!istat_verify(snp, &ent)) {
		/* component no longer exists in system, nothing to do */
		return;
	}

	if ((statp = (struct stats *)
	    lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) {
		/* need to create the counter */
		int cnt = 0;
		struct node *np;
		char *sname;
		char *snamep;
		struct istat_entry *newentp;

		/*
		 * count up the size of the stat name, which is built
		 * below as "ename.parts@comp0/comp1..." plus a NUL
		 */
		np = snp->u.event.ename;
		while (np != NULL) {
			cnt += strlen(np->u.name.s);
			cnt++;	/* for the '.' or '@' */
			np = np->u.name.next;
		}
		np = snp->u.event.epname;
		while (np != NULL) {
			cnt += snprintf(NULL, 0, "%s%llu",
			    np->u.name.s, np->u.name.child->u.ull);
			cnt++;	/* for the '/' or trailing NULL byte */
			np = np->u.name.next;
		}

		/* build the stat name */
		snamep = sname = alloca(cnt);
		np = snp->u.event.ename;
		while (np != NULL) {
			snamep += snprintf(snamep, &sname[cnt] - snamep,
			    "%s", np->u.name.s);
			np = np->u.name.next;
			if (np)
				*snamep++ = '.';
		}
		*snamep++ = '@';
		np = snp->u.event.epname;
		while (np != NULL) {
			snamep += snprintf(snamep, &sname[cnt] - snamep,
			    "%s%llu", np->u.name.s, np->u.name.child->u.ull);
			np = np->u.name.next;
			if (np)
				*snamep++ = '/';
		}
		*snamep++ = '\0';

		/* create the new stat & add it to our list */
		newentp = MALLOC(sizeof (*newentp));
		*newentp = ent;
		statp = stats_new_counter(NULL, sname, 0);
		Istats = lut_add(Istats, (void *)newentp, (void *)statp,
		    (lut_cmp)istat_cmp);
	}

	/* if n is non-zero, set that value instead of bumping */
	if (n) {
		stats_counter_reset(statp);
		stats_counter_add(statp, n);
	} else
		stats_counter_bump(statp);
	Istat_need_save = 1;	/* counter changed; istat_save() must run */

	ipath_print(O_ALTFP|O_VERB2, ent.ename, ent.ipath);
	out(O_ALTFP|O_VERB2, " %s to value %d", n ? "set" : "incremented",
	    stats_counter_value(statp));
}
2422 
/*ARGSUSED*/
static void
istat_destructor(void *left, void *right, void *arg)
{
	/* lut key is the istat_entry, value is its stats counter */
	struct istat_entry *keyp = (struct istat_entry *)left;
	struct stats *valp = (struct stats *)right;

	FREE(keyp);
	stats_delete(valp);
}
2432 
2433 /*
2434  * Callback used in a walk of the Istats to reset matching stat counters.
2435  */
2436 static void
2437 istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp,
2438     const struct ipath *ipp)
2439 {
2440 	char *path;
2441 
2442 	if (entp->ipath == ipp) {
2443 		path = ipath2str(entp->ename, ipp);
2444 		out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path);
2445 		FREE(path);
2446 		stats_counter_reset(statp);
2447 		Istat_need_save = 1;
2448 	}
2449 }
2450 
2451 /*ARGSUSED*/
2452 static void
2453 istat_counter_topo_chg_cb(struct istat_entry *entp, struct stats *statp,
2454     void *unused)
2455 {
2456 	char *path;
2457 	nvlist_t *fmri;
2458 
2459 	fmri = ipath2fmri((struct ipath *)(entp->ipath));
2460 	if (!platform_path_exists(fmri)) {
2461 		path = ipath2str(entp->ename, entp->ipath);
2462 		out(O_ALTFP, "istat_counter_topo_chg_cb: not present %s", path);
2463 		FREE(path);
2464 		stats_counter_reset(statp);
2465 		Istat_need_save = 1;
2466 	}
2467 	nvlist_free(fmri);
2468 }
2469 
void
istat_fini(void)
{
	/* free every istat_entry and its stats counter via the destructor */
	lut_free(Istats, istat_destructor, NULL);
}
2475 
static char *Serdbuf;		/* serialized serd engine names, see serd_save() */
static char *Serdbufptr;	/* current write position within Serdbuf */
static int Serdsz;		/* total size in bytes of Serdbuf */
2479 
2480 /*
2481  * serdaddsize -- calculate size of serd and add it to Serdsz
2482  */
2483 /*ARGSUSED*/
2484 static void
2485 serdaddsize(const struct serd_entry *lhs, struct stats *rhs, void *arg)
2486 {
2487 	ASSERT(lhs != NULL);
2488 
2489 	/* count up the size of the stat name */
2490 	Serdsz += ipath2strlen(lhs->ename, lhs->ipath);
2491 	Serdsz++;	/* for the trailing NULL byte */
2492 }
2493 
2494 /*
2495  * serd2str -- serialize a serd engine, writing result to *Serdbufptr
2496  */
/*ARGSUSED*/
static void
serd2str(const struct serd_entry *lhs, struct stats *rhs, void *arg)
{
	char *str;
	int len;

	ASSERT(lhs != NULL);

	/* serialize the serd engine name */
	str = ipath2str(lhs->ename, lhs->ipath);
	len = strlen(str);

	/* serdaddsize() reserved exactly len + 1 bytes for this engine */
	ASSERT(Serdbufptr + len + 1 <= &Serdbuf[Serdsz]);
	(void) strlcpy(Serdbufptr, str, &Serdbuf[Serdsz] - Serdbufptr);
	Serdbufptr += len;
	FREE(str);
	*Serdbufptr++ = '\0';	/* NUL terminates this engine's record */
	ASSERT(Serdbufptr <= &Serdbuf[Serdsz]);
}
2517 
void
serd_save()
{
	if (Serd_need_save == 0)
		return;		/* nothing has changed since the last save */

	/* figure out how big the serialized info is */
	Serdsz = 0;
	lut_walk(SerdEngines, (lut_cb)serdaddsize, NULL);

	if (Serdsz == 0) {
		/* no serd engines to save */
		fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS);
		return;
	}

	/* create the serialized buffer */
	Serdbufptr = Serdbuf = MALLOC(Serdsz);
	lut_walk(SerdEngines, (lut_cb)serd2str, NULL);

	/* clear out current saved stats */
	fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS);

	/* write out the new version */
	fmd_buf_write(Hdl, NULL, WOBUF_SERDS, Serdbuf, Serdsz);
	FREE(Serdbuf);
	Serd_need_save = 0;
}
2546 
2547 int
2548 serd_cmp(struct serd_entry *ent1, struct serd_entry *ent2)
2549 {
2550 	if (ent1->ename != ent2->ename)
2551 		return (ent2->ename - ent1->ename);
2552 	if (ent1->ipath != ent2->ipath)
2553 		return ((char *)ent2->ipath - (char *)ent1->ipath);
2554 
2555 	return (0);
2556 }
2557 
/*
 * fme_serd_load -- rebuild the SerdEngines lut from the persisted
 *	WOBUF_SERDS buffer, dropping engines whose component path no
 *	longer exists in the current topology.
 */
void
fme_serd_load(fmd_hdl_t *hdl)
{
	int sz;
	char *sbuf;
	char *sepptr;
	char *ptr;
	struct serd_entry *newentp;
	struct node *epname;
	nvlist_t *fmri;
	char *namestring;

	if ((sz = fmd_buf_size(hdl, NULL, WOBUF_SERDS)) == 0)
		return;		/* nothing was persisted */
	sbuf = alloca(sz);
	fmd_buf_read(hdl, NULL, WOBUF_SERDS, sbuf, sz);
	ptr = sbuf;
	/*
	 * The buffer is a sequence of NUL-terminated "ename@path"
	 * records as written by serd2str().  NOTE(review): every
	 * record is assumed to contain a '@'; the strchr() result is
	 * not checked for NULL here.
	 */
	while (ptr < &sbuf[sz]) {
		sepptr = strchr(ptr, '@');
		*sepptr = '\0';		/* split ename from path in place */
		namestring = ptr;
		sepptr++;
		ptr = sepptr;
		ptr += strlen(ptr);
		ptr++;	/* move past the '\0' separating paths */
		epname = pathstring2epnamenp(sepptr);
		fmri = node2fmri(epname);
		if (platform_path_exists(fmri)) {
			newentp = MALLOC(sizeof (*newentp));
			newentp->hdl = hdl;
			newentp->ipath = ipath(epname);
			newentp->ename = stable(namestring);
			SerdEngines = lut_add(SerdEngines, (void *)newentp,
			    (void *)newentp, (lut_cmp)serd_cmp);
		} else
			Serd_need_save = 1;	/* engine dropped */
		tree_free(epname);
		nvlist_free(fmri);
	}
	/* save it back again in case some of the paths no longer exist */
	serd_save();
}
2600 
/*ARGSUSED*/
static void
serd_destructor(void *left, void *right, void *arg)
{
	/* key and value are the same serd_entry (see fme_serd_load()) */
	struct serd_entry *keyp = (struct serd_entry *)left;

	FREE(keyp);
}
2608 
2609 /*
2610  * Callback used in a walk of the SerdEngines to reset matching serd engines.
2611  */
2612 /*ARGSUSED*/
2613 static void
2614 serd_reset_cb(struct serd_entry *entp, void *unused, const struct ipath *ipp)
2615 {
2616 	char *path;
2617 
2618 	if (entp->ipath == ipp) {
2619 		path = ipath2str(entp->ename, ipp);
2620 		out(O_ALTFP, "serd_reset_cb: resetting %s", path);
2621 		fmd_serd_reset(entp->hdl, path);
2622 		FREE(path);
2623 		Serd_need_save = 1;
2624 	}
2625 }
2626 
2627 /*ARGSUSED*/
2628 static void
2629 serd_topo_chg_cb(struct serd_entry *entp, void *unused, void *unused2)
2630 {
2631 	char *path;
2632 	nvlist_t *fmri;
2633 
2634 	fmri = ipath2fmri((struct ipath *)(entp->ipath));
2635 	if (!platform_path_exists(fmri)) {
2636 		path = ipath2str(entp->ename, entp->ipath);
2637 		out(O_ALTFP, "serd_topo_chg_cb: not present %s", path);
2638 		fmd_serd_reset(entp->hdl, path);
2639 		FREE(path);
2640 		Serd_need_save = 1;
2641 	}
2642 	nvlist_free(fmri);
2643 }
2644 
void
serd_fini(void)
{
	/* free every serd_entry tracked in the lut via the destructor */
	lut_free(SerdEngines, serd_destructor, NULL);
}
2650 
/*
 * publish_suspects -- build the final suspect list for this fme,
 *	compute per-suspect certainties from FITrate properties, add
 *	the resulting faults to the fmd case and solve it.
 */
static void
publish_suspects(struct fme *fmep)
{
	struct rsl *srl = NULL;
	struct rsl *erl;
	struct rsl *rp;
	nvlist_t *fault;
	uint8_t cert;
	uint_t *frs;
	uint_t fravg, frsum, fr;
	uint_t messval;
	struct node *snp;
	int frcnt, fridx;
	boolean_t no_upsets = B_FALSE;
	boolean_t allfaulty = B_TRUE;

	stats_counter_bump(fmep->diags);

	/*
	 * If we're auto-closing upsets, we don't want to include them
	 * in any produced suspect lists or certainty accounting.
	 */
	if (Autoclose != NULL)
		if (strcmp(Autoclose, "true") == 0 ||
		    strcmp(Autoclose, "all") == 0 ||
		    strcmp(Autoclose, "upsets") == 0)
			no_upsets = B_TRUE;

	/* srl/erl bracket the de-duplicated array built by trim_suspects() */
	trim_suspects(fmep, no_upsets, &srl, &erl);

	/*
	 * If the resulting suspect list has no members, we're
	 * done.  Returning here will simply close the case.
	 */
	if (fmep->nsuspects == 0) {
		out(O_ALTFP,
		    "[FME%d, case %s (all suspects are upsets)]",
		    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
		FREE(srl);
		restore_suspects(fmep);
		return;
	}

	/*
	 * If the suspect list is all faults, then for a given fault,
	 * say X of N, X's certainty is computed via:
	 *
	 * fitrate(X) / (fitrate(1) + ... + fitrate(N)) * 100
	 *
	 * If none of the suspects are faults, and there are N suspects,
	 * the certainty of a given suspect is 100/N.
	 *
	 * If there are are a mixture of faults and other problems in
	 * the suspect list, we take an average of the faults'
	 * FITrates and treat this average as the FITrate for any
	 * non-faults.  The fitrate of any given suspect is then
	 * computed per the first formula above.
	 */
	if (fmep->nonfault == fmep->nsuspects) {
		/* NO faults in the suspect list */
		cert = percentof(1, fmep->nsuspects);
	} else {
		/* sum the fitrates */
		frs = alloca(fmep->nsuspects * sizeof (uint_t));
		fridx = frcnt = frsum = 0;

		for (rp = srl; rp <= erl; rp++) {
			struct node *n;

			if (rp->suspect == NULL)
				continue;	/* removed by rsluniq() */
			if (!is_fault(rp->suspect->t)) {
				/* zero is a placeholder, replaced below */
				frs[fridx++] = 0;
				continue;
			}
			n = eventprop_lookup(rp->suspect, L_FITrate);
			if (node2uint(n, &fr) != 0) {
				out(O_DEBUG|O_NONL, "event ");
				ipath_print(O_DEBUG|O_NONL,
				    rp->suspect->enode->u.event.ename->u.name.s,
				    rp->suspect->ipp);
				out(O_DEBUG, " has no FITrate (using 1)");
				fr = 1;
			} else if (fr == 0) {
				out(O_DEBUG|O_NONL, "event ");
				ipath_print(O_DEBUG|O_NONL,
				    rp->suspect->enode->u.event.ename->u.name.s,
				    rp->suspect->ipp);
				out(O_DEBUG, " has zero FITrate (using 1)");
				fr = 1;
			}

			frs[fridx++] = fr;
			frsum += fr;
			frcnt++;
		}
		/* give each non-fault the average fault FITrate */
		fravg = avg(frsum, frcnt);
		for (fridx = 0; fridx < fmep->nsuspects; fridx++)
			if (frs[fridx] == 0) {
				frs[fridx] = fravg;
				frsum += fravg;
			}
	}

	/* Add them in reverse order of our sort, as fmd reverses order */
	/* (fridx is nsuspects here; frs[--fridx] walks the array back down) */
	for (rp = erl; rp >= srl; rp--) {
		if (rp->suspect == NULL)
			continue;	/* removed by rsluniq() */
		if (!is_fault(rp->suspect->t))
			allfaulty = B_FALSE;
		if (fmep->nonfault != fmep->nsuspects)
			cert = percentof(frs[--fridx], frsum);
		fault = fmd_nvl_create_fault(fmep->hdl,
		    rp->suspect->enode->u.event.ename->u.name.s,
		    cert,
		    rp->asru,
		    rp->fru,
		    rp->rsrc);
		if (fault == NULL)
			out(O_DIE, "fault creation failed");
		/* if "message" property exists, add it to the fault */
		if (node2uint(eventprop_lookup(rp->suspect, L_message),
		    &messval) == 0) {

			out(O_ALTFP,
			    "[FME%d, %s adds message=%d to suspect list]",
			    fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    messval);
			if (nvlist_add_boolean_value(fault,
			    FM_SUSPECT_MESSAGE,
			    (messval) ? B_TRUE : B_FALSE) != 0) {
				out(O_DIE, "cannot add no-message to fault");
			}
		}
		/* add any payload properties */
		lut_walk(rp->suspect->payloadprops,
		    (lut_cb)addpayloadprop, (void *)fault);
		fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault);
		rslfree(rp);

		/*
		 * If "action" property exists, evaluate it;  this must be done
		 * before the dupclose check below since some actions may
		 * modify the asru to be used in fmd_nvl_fmri_faulty.  This
		 * needs to be restructured if any new actions are introduced
		 * that have effects that we do not want to be visible if
		 * we decide not to publish in the dupclose check below.
		 */
		if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) {
			struct evalue evalue;

			out(O_ALTFP|O_NONL,
			    "[FME%d, %s action ", fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s);
			ptree_name_iter(O_ALTFP|O_NONL, snp);
			out(O_ALTFP, "]");
			Action_nvl = fault;
			(void) eval_expr(snp, NULL, NULL, NULL, NULL,
			    NULL, 0, &evalue);
		}

		/*
		 * check if the asru is already marked as "faulty".
		 * allfaulty stays B_TRUE only while every suspect so far
		 * is a fault whose asru is already in the faulty state.
		 */
		if (allfaulty) {
			nvlist_t *asru;

			out(O_ALTFP|O_VERB, "FMD%d dup check ", fmep->id);
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect);
			out(O_ALTFP|O_VERB|O_NONL, " ");
			if (nvlist_lookup_nvlist(fault,
			    FM_FAULT_ASRU, &asru) != 0) {
				out(O_ALTFP|O_VERB, "NULL asru");
				allfaulty = B_FALSE;
			} else if (fmd_nvl_fmri_faulty(fmep->hdl, asru)) {
				out(O_ALTFP|O_VERB, "faulty");
			} else {
				out(O_ALTFP|O_VERB, "not faulty");
				allfaulty = B_FALSE;
			}
		}

	}

	/*
	 * We are going to publish so take any pre-publication actions.
	 */
	if (!allfaulty) {
		/*
		 * don't update the count stat if all asrus are already
		 * present and unrepaired in the asru cache
		 */
		for (rp = erl; rp >= srl; rp--) {
			struct event *suspect = rp->suspect;

			if (suspect == NULL)
				continue;

			/* if "count" exists, increment the appropriate stat */
			if ((snp = eventprop_lookup(suspect,
			    L_count)) != NULL) {
				out(O_ALTFP|O_NONL,
				    "[FME%d, %s count ", fmep->id,
				    suspect->enode->u.event.ename->u.name.s);
				ptree_name_iter(O_ALTFP|O_NONL, snp);
				out(O_ALTFP, "]");
				istat_bump(snp, 0);

			}
		}
		istat_save();	/* write out any istat changes */
	}

	out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
	    fmd_case_uuid(fmep->hdl, fmep->fmcase));
	fmd_case_solve(fmep->hdl, fmep->fmcase);

	/*
	 * revert to the original suspect list
	 */
	FREE(srl);
	restore_suspects(fmep);
}
2875 
2876 static void
2877 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase)
2878 {
2879 	struct case_list *newcase;
2880 	nvlist_t *defect;
2881 
2882 	out(O_ALTFP,
2883 	    "[undiagnosable ereport received, "
2884 	    "creating and closing a new case (%s)]",
2885 	    Undiag_reason ? Undiag_reason : "reason not provided");
2886 
2887 	newcase = MALLOC(sizeof (struct case_list));
2888 	newcase->next = NULL;
2889 	newcase->fmcase = fmcase;
2890 	if (Undiagablecaselist != NULL)
2891 		newcase->next = Undiagablecaselist;
2892 	Undiagablecaselist = newcase;
2893 
2894 	if (ffep != NULL)
2895 		fmd_case_add_ereport(hdl, newcase->fmcase, ffep);
2896 
2897 	defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
2898 	    NULL, NULL, NULL);
2899 	if (Undiag_reason != NULL)
2900 		(void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason);
2901 	fmd_case_add_suspect(hdl, newcase->fmcase, defect);
2902 
2903 	fmd_case_solve(hdl, newcase->fmcase);
2904 	fmd_case_close(hdl, newcase->fmcase);
2905 }
2906 
2907 static void
2908 fme_undiagnosble_pci(struct fme *f, nvlist_t *rc_detector) {
2909 	nvlist_t *defect, *asru;
2910 	char *path;
2911 
2912 	(void) nvlist_lookup_string(rc_detector, FM_FMRI_DEV_PATH, &path);
2913 	out(O_ALTFP, "[solving/closing PCIE FME%d PATH %s]", f->id, path);
2914 
2915 	(void) nvlist_xalloc(&asru, NV_UNIQUE_NAME, &Eft_nv_hdl);
2916 	(void) nvlist_add_uint8(asru, FM_VERSION, FM_HC_SCHEME_VERSION);
2917 	(void) nvlist_add_string(asru, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV);
2918 	(void) nvlist_add_string(asru, FM_FMRI_DEV_PATH, path);
2919 
2920 	defect = fmd_nvl_create_fault(f->hdl,
2921 	    "fault.sunos.eft.unknown_pci_fault", 100,
2922 	    asru, NULL, NULL);
2923 
2924 	(void) nvlist_add_string(defect, UNDIAG_REASON, UD_PCIUNSOLVD);
2925 	fmd_case_pci_undiagnosable(f->hdl, f->fmcase, defect);
2926 
2927 	fmd_case_add_suspect(f->hdl, f->fmcase, defect);
2928 	fmd_case_solve(f->hdl, f->fmcase);
2929 	fmd_case_close(f->hdl, f->fmcase);
2930 }
2931 
2932 static void
2933 fme_undiagnosable(struct fme *f)
2934 {
2935 	nvlist_t *defect;
2936 	nvlist_t *rc_detector;
2937 
2938 	out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]",
2939 	    f->id, fmd_case_uuid(f->hdl, f->fmcase),
2940 	    Undiag_reason ? Undiag_reason : "undiagnosable");
2941 
2942 	if ((strcmp(Undiag_reason, UD_UNSOLVD) == 0) &&
2943 	    fmd_case_is_pcie(f->hdl, f->fmcase, &rc_detector)) {
2944 		fme_undiagnosble_pci(f, rc_detector);
2945 		return;
2946 	}
2947 
2948 	defect = fmd_nvl_create_fault(f->hdl, UNDIAGNOSABLE_DEFECT, 100,
2949 	    NULL, NULL, NULL);
2950 	if (Undiag_reason != NULL)
2951 		(void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason);
2952 	fmd_case_add_suspect(f->hdl, f->fmcase, defect);
2953 	fmd_case_solve(f->hdl, f->fmcase);
2954 	fmd_case_close(f->hdl, f->fmcase);
2955 }
2956 
2957 /*
2958  * fme_close_case
2959  *
2960  *	Find the requested case amongst our fmes and close it.  Free up
2961  *	the related fme.
2962  */
void
fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase)
{
	struct case_list *ucasep, *prevcasep = NULL;
	struct fme *prev = NULL;
	struct fme *fmep;

	/* it may be one of the undiagnosable cases; just unlink and free */
	for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) {
		if (fmcase != ucasep->fmcase) {
			prevcasep = ucasep;
			continue;
		}

		if (prevcasep == NULL)
			Undiagablecaselist = Undiagablecaselist->next;
		else
			prevcasep->next = ucasep->next;

		FREE(ucasep);
		return;
	}

	/* otherwise it should belong to one of our fmes */
	for (fmep = FMElist; fmep; fmep = fmep->next) {
		if (fmep->hdl == hdl && fmep->fmcase == fmcase)
			break;
		prev = fmep;
	}

	if (fmep == NULL) {
		out(O_WARN, "Eft asked to close unrecognized case [%s].",
		    fmd_case_uuid(hdl, fmcase));
		return;
	}

	/* keep the end-of-list pointer accurate */
	if (EFMElist == fmep)
		EFMElist = prev;

	/* unlink the fme from the active list */
	if (prev == NULL)
		FMElist = FMElist->next;
	else
		prev->next = fmep->next;

	fmep->next = NULL;

	/* Get rid of any timer this fme has set */
	if (fmep->wull != 0)
		fmd_timer_remove(fmep->hdl, fmep->timer);

	/* move the fme onto the closed list */
	if (ClosedFMEs == NULL) {
		ClosedFMEs = fmep;
	} else {
		fmep->next = ClosedFMEs;
		ClosedFMEs = fmep;
	}

	Open_fme_count--;

	/* See if we can close the overflow FME */
	if (Open_fme_count <= Max_fme) {
		for (fmep = FMElist; fmep; fmep = fmep->next) {
			if (fmep->overflow && !(fmd_case_closed(fmep->hdl,
			    fmep->fmcase)))
				break;
		}

		if (fmep != NULL)
			fmd_case_close(fmep->hdl, fmep->fmcase);
	}
}
3032 
3033 /*
3034  * fme_set_timer()
3035  *	If the time we need to wait for the given FME is less than the
3036  *	current timer, kick that old timer out and establish a new one.
3037  */
3038 static int
3039 fme_set_timer(struct fme *fmep, unsigned long long wull)
3040 {
3041 	out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait ");
3042 	ptree_timeval(O_ALTFP|O_VERB, &wull);
3043 
3044 	if (wull <= fmep->pull) {
3045 		out(O_ALTFP|O_VERB|O_NONL, "already have waited at least ");
3046 		ptree_timeval(O_ALTFP|O_VERB, &fmep->pull);
3047 		out(O_ALTFP|O_VERB, NULL);
3048 		/* we've waited at least wull already, don't need timer */
3049 		return (0);
3050 	}
3051 
3052 	out(O_ALTFP|O_VERB|O_NONL, " currently ");
3053 	if (fmep->wull != 0) {
3054 		out(O_ALTFP|O_VERB|O_NONL, "waiting ");
3055 		ptree_timeval(O_ALTFP|O_VERB, &fmep->wull);
3056 		out(O_ALTFP|O_VERB, NULL);
3057 	} else {
3058 		out(O_ALTFP|O_VERB|O_NONL, "not waiting");
3059 		out(O_ALTFP|O_VERB, NULL);
3060 	}
3061 
3062 	if (fmep->wull != 0)
3063 		if (wull >= fmep->wull)
3064 			/* New timer would fire later than established timer */
3065 			return (0);
3066 
3067 	if (fmep->wull != 0) {
3068 		fmd_timer_remove(fmep->hdl, fmep->timer);
3069 	}
3070 
3071 	fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep,
3072 	    fmep->e0r, wull);
3073 	out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer);
3074 	fmep->wull = wull;
3075 	return (1);
3076 }
3077 
3078 void
3079 fme_timer_fired(struct fme *fmep, id_t tid)
3080 {
3081 	struct fme *ffmep = NULL;
3082 
3083 	for (ffmep = FMElist; ffmep; ffmep = ffmep->next)
3084 		if (ffmep == fmep)
3085 			break;
3086 
3087 	if (ffmep == NULL) {
3088 		out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.",
3089 		    (void *)fmep);
3090 		return;
3091 	}
3092 
3093 	out(O_ALTFP|O_VERB, "Timer fired %lx", tid);
3094 	fmep->pull = fmep->wull;
3095 	fmep->wull = 0;
3096 	fmd_buf_write(fmep->hdl, fmep->fmcase,
3097 	    WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull));
3098 
3099 	fme_eval(fmep, fmep->e0r);
3100 }
3101 
3102 /*
3103  * Preserve the fme's suspect list in its psuspects list, NULLing the
3104  * suspects list in the meantime.
3105  */
static void
save_suspects(struct fme *fmep)
{
	struct event *ep;
	struct event *nextep;

	/* zero out the previous suspect list */
	for (ep = fmep->psuspects; ep; ep = nextep) {
		nextep = ep->psuspects;
		ep->psuspects = NULL;
	}
	fmep->psuspects = NULL;

	/* zero out the suspect list, copying it to previous suspect list */
	fmep->psuspects = fmep->suspects;
	for (ep = fmep->suspects; ep; ep = nextep) {
		nextep = ep->suspects;
		/* each event's chain link moves from suspects to psuspects */
		ep->psuspects = ep->suspects;
		ep->suspects = NULL;
		ep->is_suspect = 0;
	}
	fmep->suspects = NULL;
	/* counts are recomputed by restore_suspects() when copied back */
	fmep->nsuspects = 0;
	fmep->nonfault = 0;
}
3131 
3132 /*
3133  * Retrieve the fme's suspect list from its psuspects list.
3134  */
3135 static void
3136 restore_suspects(struct fme *fmep)
3137 {
3138 	struct event *ep;
3139 	struct event *nextep;
3140 
3141 	fmep->nsuspects = fmep->nonfault = 0;
3142 	fmep->suspects = fmep->psuspects;
3143 	for (ep = fmep->psuspects; ep; ep = nextep) {
3144 		fmep->nsuspects++;
3145 		if (!is_fault(ep->t))
3146 			fmep->nonfault++;
3147 		nextep = ep->psuspects;
3148 		ep->suspects = ep->psuspects;
3149 	}
3150 }
3151 
3152 /*
3153  * this is what we use to call the Emrys prototype code instead of main()
3154  */
3155 static void
3156 fme_eval(struct fme *fmep, fmd_event_t *ffep)
3157 {
3158 	struct event *ep;
3159 	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
3160 
3161 	save_suspects(fmep);
3162 
3163 	out(O_ALTFP, "Evaluate FME %d", fmep->id);
3164 	indent_set("  ");
3165 
3166 	lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
3167 	fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);
3168 
3169 	out(O_ALTFP|O_NONL, "FME%d state: %s, suspect list:", fmep->id,
3170 	    fme_state2str(fmep->state));
3171 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
3172 		out(O_ALTFP|O_NONL, " ");
3173 		itree_pevent_brief(O_ALTFP|O_NONL, ep);
3174 	}
3175 	out(O_ALTFP, NULL);
3176 
3177 	switch (fmep->state) {
3178 	case FME_CREDIBLE:
3179 		print_suspects(SLNEW, fmep);
3180 		(void) upsets_eval(fmep, ffep);
3181 
3182 		/*
3183 		 * we may have already posted suspects in upsets_eval() which
3184 		 * can recurse into fme_eval() again. If so then just return.
3185 		 */
3186 		if (fmep->posted_suspects)
3187 			return;
3188 
3189 		publish_suspects(fmep);
3190 		fmep->posted_suspects = 1;
3191 		fmd_buf_write(fmep->hdl, fmep->fmcase,
3192 		    WOBUF_POSTD,
3193 		    (void *)&fmep->posted_suspects,
3194 		    sizeof (fmep->posted_suspects));
3195 
3196 		/*
3197 		 * Now the suspects have been posted, we can clear up
3198 		 * the instance tree as we won't be looking at it again.
3199 		 * Also cancel the timer as the case is now solved.
3200 		 */
3201 		if (fmep->wull != 0) {
3202 			fmd_timer_remove(fmep->hdl, fmep->timer);
3203 			fmep->wull = 0;
3204 		}
3205 		break;
3206 
3207 	case FME_WAIT:
3208 		ASSERT(my_delay > fmep->ull);
3209 		(void) fme_set_timer(fmep, my_delay);
3210 		print_suspects(SLWAIT, fmep);
3211 		itree_prune(fmep->eventtree);
3212 		return;
3213 
3214 	case FME_DISPROVED:
3215 		print_suspects(SLDISPROVED, fmep);
3216 		Undiag_reason = UD_UNSOLVD;
3217 		fme_undiagnosable(fmep);
3218 		break;
3219 	}
3220 
3221 	if (fmep->posted_suspects == 1 && Autoclose != NULL) {
3222 		int doclose = 0;
3223 
3224 		if (strcmp(Autoclose, "true") == 0 ||
3225 		    strcmp(Autoclose, "all") == 0)
3226 			doclose = 1;
3227 
3228 		if (strcmp(Autoclose, "upsets") == 0) {
3229 			doclose = 1;
3230 			for (ep = fmep->suspects; ep; ep = ep->suspects) {
3231 				if (ep->t != N_UPSET) {
3232 					doclose = 0;
3233 					break;
3234 				}
3235 			}
3236 		}
3237 
3238 		if (doclose) {
3239 			out(O_ALTFP, "[closing FME%d, case %s (autoclose)]",
3240 			    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
3241 			fmd_case_close(fmep->hdl, fmep->fmcase);
3242 		}
3243 	}
3244 	itree_free(fmep->eventtree);
3245 	fmep->eventtree = NULL;
3246 	structconfig_free(fmep->config);
3247 	fmep->config = NULL;
3248 	destroy_fme_bufs(fmep);
3249 }
3250 
3251 static void indent(void);
3252 static int triggered(struct fme *fmep, struct event *ep, int mark);
3253 static enum fme_state effects_test(struct fme *fmep,
3254     struct event *fault_event, unsigned long long at_latest_by,
3255     unsigned long long *pdelay);
3256 static enum fme_state requirements_test(struct fme *fmep, struct event *ep,
3257     unsigned long long at_latest_by, unsigned long long *pdelay);
3258 static enum fme_state causes_test(struct fme *fmep, struct event *ep,
3259     unsigned long long at_latest_by, unsigned long long *pdelay);
3260 
3261 static int
3262 checkconstraints(struct fme *fmep, struct arrow *arrowp)
3263 {
3264 	struct constraintlist *ctp;
3265 	struct evalue value;
3266 	char *sep = "";
3267 
3268 	if (arrowp->forever_false) {
3269 		indent();
3270 		out(O_ALTFP|O_VERB|O_NONL, "  Forever false constraint: ");
3271 		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
3272 			out(O_ALTFP|O_VERB|O_NONL, sep);
3273 			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3274 			sep = ", ";
3275 		}
3276 		out(O_ALTFP|O_VERB, NULL);
3277 		return (0);
3278 	}
3279 	if (arrowp->forever_true) {
3280 		indent();
3281 		out(O_ALTFP|O_VERB|O_NONL, "  Forever true constraint: ");
3282 		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
3283 			out(O_ALTFP|O_VERB|O_NONL, sep);
3284 			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3285 			sep = ", ";
3286 		}
3287 		out(O_ALTFP|O_VERB, NULL);
3288 		return (1);
3289 	}
3290 
3291 	for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
3292 		if (eval_expr(ctp->cnode, NULL, NULL,
3293 		    &fmep->globals, fmep->config,
3294 		    arrowp, 0, &value)) {
3295 			/* evaluation successful */
3296 			if (value.t == UNDEFINED || value.v == 0) {
3297 				/* known false */
3298 				arrowp->forever_false = 1;
3299 				indent();
3300 				out(O_ALTFP|O_VERB|O_NONL,
3301 				    "  False constraint: ");
3302 				ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3303 				out(O_ALTFP|O_VERB, NULL);
3304 				return (0);
3305 			}
3306 		} else {
3307 			/* evaluation unsuccessful -- unknown value */
3308 			indent();
3309 			out(O_ALTFP|O_VERB|O_NONL,
3310 			    "  Deferred constraint: ");
3311 			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3312 			out(O_ALTFP|O_VERB, NULL);
3313 			return (1);
3314 		}
3315 	}
3316 	/* known true */
3317 	arrowp->forever_true = 1;
3318 	indent();
3319 	out(O_ALTFP|O_VERB|O_NONL, "  True constraint: ");
3320 	for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
3321 		out(O_ALTFP|O_VERB|O_NONL, sep);
3322 		ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3323 		sep = ", ";
3324 	}
3325 	out(O_ALTFP|O_VERB, NULL);
3326 	return (1);
3327 }
3328 
3329 static int
3330 triggered(struct fme *fmep, struct event *ep, int mark)
3331 {
3332 	struct bubble *bp;
3333 	struct arrowlist *ap;
3334 	int count = 0;
3335 
3336 	stats_counter_bump(fmep->Tcallcount);
3337 	for (bp = itree_next_bubble(ep, NULL); bp;
3338 	    bp = itree_next_bubble(ep, bp)) {
3339 		if (bp->t != B_TO)
3340 			continue;
3341 		for (ap = itree_next_arrow(bp, NULL); ap;
3342 		    ap = itree_next_arrow(bp, ap)) {
3343 			/* check count of marks against K in the bubble */
3344 			if ((ap->arrowp->mark & mark) &&
3345 			    ++count >= bp->nork)
3346 				return (1);
3347 		}
3348 	}
3349 	return (0);
3350 }
3351 
/*
 * mark_arrows -- walk the propagation arrows leading out of event "ep",
 * marking each arrow whose constraints hold and testing the
 * requirements of each head event reached, recursing downstream.
 *
 * When "mark" is 0 this instead clears the marks laid down by a
 * previous pass; with "keep" set, events whose cached state shows they
 * took part in the effects evaluation get keep_in_tree set so pruning
 * preserves them.  Returns WAIT_EFFECT with *pdelay set to the shortest
 * wait seen if any head event is still waiting on a report, else 0.
 */
static int
mark_arrows(struct fme *fmep, struct event *ep, int mark,
    unsigned long long at_latest_by, unsigned long long *pdelay, int keep)
{
	struct bubble *bp;
	struct arrowlist *ap;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long my_delay;
	enum fme_state result;
	int retval = 0;

	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_FROM)
			continue;
		stats_counter_bump(fmep->Marrowcount);
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			struct event *ep2 = ap->arrowp->head->myevent;
			/*
			 * if we're clearing marks, we can avoid doing
			 * all that work evaluating constraints.
			 */
			if (mark == 0) {
				if (ap->arrowp->arrow_marked == 0)
					continue;
				ap->arrowp->arrow_marked = 0;
				ap->arrowp->mark &= ~EFFECTS_COUNTER;
				if (keep && (ep2->cached_state &
				    (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT)))
					ep2->keep_in_tree = 1;
				ep2->cached_state &=
				    ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT);
				(void) mark_arrows(fmep, ep2, mark, 0, NULL,
				    keep);
				continue;
			}
			/* marking pass: remember we visited this arrow */
			ap->arrowp->arrow_marked = 1;
			if (ep2->cached_state & REQMNTS_DISPROVED) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  ALREADY DISPROVED ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			if (ep2->cached_state & WAIT_EFFECT) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  ALREADY EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			if (ep2->cached_state & CREDIBLE_EFFECT) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  ALREADY EFFECTS CREDIBLE ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			if ((ep2->cached_state & PARENT_WAIT) &&
			    (mark & PARENT_WAIT)) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  ALREADY PARENT EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			/* only follow the arrow if its constraints hold */
			platform_set_payloadnvp(ep2->nvp);
			if (checkconstraints(fmep, ap->arrowp) == 0) {
				platform_set_payloadnvp(NULL);
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  CONSTRAINTS FAIL ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			platform_set_payloadnvp(NULL);
			ap->arrowp->mark |= EFFECTS_COUNTER;
			if (!triggered(fmep, ep2, EFFECTS_COUNTER)) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  K-COUNT NOT YET MET ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			ep2->cached_state &= ~PARENT_WAIT;
			/*
			 * if we've reached an ereport and no propagation time
			 * is specified, use the Hesitate value
			 */
			if (ep2->t == N_EREPORT && at_latest_by == 0ULL &&
			    ap->arrowp->maxdelay == 0ULL) {
				out(O_ALTFP|O_VERB|O_NONL, "  default wait ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				result = requirements_test(fmep, ep2, Hesitate,
				    &my_delay);
			} else {
				result = requirements_test(fmep, ep2,
				    at_latest_by + ap->arrowp->maxdelay,
				    &my_delay);
			}
			if (result == FME_WAIT) {
				retval = WAIT_EFFECT;
				if (overall_delay > my_delay)
					overall_delay = my_delay;
				ep2->cached_state |= WAIT_EFFECT;
				indent();
				out(O_ALTFP|O_VERB|O_NONL, "  EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				indent_push("  E");
				/* descendants of a waiting event get PARENT_WAIT */
				if (mark_arrows(fmep, ep2, PARENT_WAIT,
				    at_latest_by, &my_delay, 0) ==
				    WAIT_EFFECT) {
					retval = WAIT_EFFECT;
					if (overall_delay > my_delay)
						overall_delay = my_delay;
				}
				indent_pop();
			} else if (result == FME_DISPROVED) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  EFFECTS DISPROVED ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
			} else {
				ep2->cached_state |= mark;
				indent();
				if (mark == CREDIBLE_EFFECT)
					out(O_ALTFP|O_VERB|O_NONL,
					    "  EFFECTS CREDIBLE ");
				else
					out(O_ALTFP|O_VERB|O_NONL,
					    "  PARENT EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				indent_push("  E");
				if (mark_arrows(fmep, ep2, mark, at_latest_by,
				    &my_delay, 0) == WAIT_EFFECT) {
					retval = WAIT_EFFECT;
					if (overall_delay > my_delay)
						overall_delay = my_delay;
				}
				indent_pop();
			}
		}
	}
	if (retval == WAIT_EFFECT)
		*pdelay = overall_delay;
	return (retval);
}
3510 
/*
 * effects_test -- check whether "fault_event" can explain the observed
 * reports.
 *
 * Marks the arrows downstream of fault_event as credible effects via
 * mark_arrows(), then checks that every event on the observations list
 * was reached.  Returns FME_CREDIBLE, FME_DISPROVED (some observation
 * was not triggered), or FME_WAIT with *pdelay set to the earliest
 * useful re-evaluation time.  The marks are cleared again before
 * returning, with keep_in_tree recorded on the credible path.
 */
static enum fme_state
effects_test(struct fme *fmep, struct event *fault_event,
    unsigned long long at_latest_by, unsigned long long *pdelay)
{
	struct event *error_event;
	enum fme_state return_value = FME_CREDIBLE;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long my_delay;

	stats_counter_bump(fmep->Ecallcount);
	indent_push("  E");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
	out(O_ALTFP|O_VERB, NULL);

	if (mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by,
	    &my_delay, 0) == WAIT_EFFECT) {
		return_value = FME_WAIT;
		if (overall_delay > my_delay)
			overall_delay = my_delay;
	}
	/* every observation must have been reached by the marking pass */
	for (error_event = fmep->observations;
	    error_event; error_event = error_event->observations) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, " ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event);
		if (!(error_event->cached_state & CREDIBLE_EFFECT)) {
			if (error_event->cached_state &
			    (PARENT_WAIT|WAIT_EFFECT)) {
				out(O_ALTFP|O_VERB, " NOT YET triggered");
				continue;
			}
			return_value = FME_DISPROVED;
			out(O_ALTFP|O_VERB, " NOT triggered");
			break;
		} else {
			out(O_ALTFP|O_VERB, " triggered");
		}
	}
	/* clear the marks; on the non-disproved path remember kept events */
	if (return_value == FME_DISPROVED) {
		(void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0);
	} else {
		fault_event->keep_in_tree = 1;
		(void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1);
	}

	indent();
	out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ",
	    fme_state2str(return_value));
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
	out(O_ALTFP|O_VERB, NULL);
	indent_pop();
	if (return_value == FME_WAIT)
		*pdelay = overall_delay;
	return (return_value);
}
3568 
/*
 * requirements_test -- check whether the reports required by event "ep"
 * have been observed, or can still arrive in time.
 *
 * An ereport leaf is credible if it has been observed (count != 0);
 * otherwise it is disproved once the pull time reaches at_latest_by,
 * or a wait until at_latest_by.  A non-report event descends its
 * B_FROM bubbles, needing at least N credible (or deferred) arrows per
 * bubble.  Results are cached in ep->cached_state (and cached_delay for
 * waits) so repeat visits short-circuit.  Returns FME_CREDIBLE,
 * FME_DISPROVED, FME_WAIT (with *pdelay set) or FME_DEFERRED.
 */
static enum fme_state
requirements_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay)
{
	int waiting_events;
	int credible_events;
	int deferred_events;
	enum fme_state return_value = FME_CREDIBLE;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long arrow_delay;
	unsigned long long my_delay;
	struct event *ep2;
	struct bubble *bp;
	struct arrowlist *ap;

	/* cached results from an earlier visit to this event */
	if (ep->cached_state & REQMNTS_CREDIBLE) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY CREDIBLE ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, NULL);
		return (FME_CREDIBLE);
	}
	if (ep->cached_state & REQMNTS_DISPROVED) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, NULL);
		return (FME_DISPROVED);
	}
	if (ep->cached_state & REQMNTS_WAIT) {
		indent();
		*pdelay = ep->cached_delay;
		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY WAIT ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB|O_NONL, ", wait for: ");
		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
		out(O_ALTFP|O_VERB, NULL);
		return (FME_WAIT);
	}
	stats_counter_bump(fmep->Rcallcount);
	indent_push("  R");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
	out(O_ALTFP|O_VERB, NULL);

	/*
	 * ereport leaf: credible if observed; if not observed, disproved
	 * once the pull time has passed at_latest_by, otherwise wait.
	 */
	if (ep->t == N_EREPORT) {
		if (ep->count == 0) {
			if (fmep->pull >= at_latest_by) {
				return_value = FME_DISPROVED;
			} else {
				ep->cached_delay = *pdelay = at_latest_by;
				return_value = FME_WAIT;
			}
		}

		indent();
		switch (return_value) {
		case FME_CREDIBLE:
			ep->cached_state |= REQMNTS_CREDIBLE;
			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			break;
		case FME_DISPROVED:
			ep->cached_state |= REQMNTS_DISPROVED;
			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			break;
		case FME_WAIT:
			ep->cached_state |= REQMNTS_WAIT;
			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			out(O_ALTFP|O_VERB|O_NONL, " to ");
			ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
			break;
		default:
			out(O_DIE, "requirements_test: unexpected fme_state");
			break;
		}
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();

		return (return_value);
	}

	/* this event is not a report, descend the tree */
	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		int n;

		if (bp->t != B_FROM)
			continue;

		n = bp->nork;

		credible_events = 0;
		waiting_events = 0;
		deferred_events = 0;
		arrow_delay = TIMEVAL_EVENTUALLY;
		/*
		 * n is -1 for 'A' so adjust it.
		 * XXX just count up the arrows for now.
		 */
		if (n < 0) {
			n = 0;
			for (ap = itree_next_arrow(bp, NULL); ap;
			    ap = itree_next_arrow(bp, ap))
				n++;
			indent();
			out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n);
		} else {
			indent();
			out(O_ALTFP|O_VERB, " Bubble N=%d", n);
		}

		if (n == 0)
			continue;
		/*
		 * A single false constraint elides the whole bubble, to
		 * match the tree creation time behaviour (see below).
		 */
		if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) {
			for (ap = itree_next_arrow(bp, NULL); ap;
			    ap = itree_next_arrow(bp, ap)) {
				ep2 = ap->arrowp->head->myevent;
				platform_set_payloadnvp(ep2->nvp);
				if (checkconstraints(fmep, ap->arrowp) == 0) {
					/*
					 * if any arrow is invalidated by the
					 * constraints, then we should elide the
					 * whole bubble to be consistant with
					 * the tree creation time behaviour
					 */
					bp->mark |= BUBBLE_ELIDED;
					platform_set_payloadnvp(NULL);
					break;
				}
				platform_set_payloadnvp(NULL);
			}
		}
		if (bp->mark & BUBBLE_ELIDED)
			continue;
		bp->mark |= BUBBLE_OK;
		/* recursively test each arrow's head until N are credible */
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			ep2 = ap->arrowp->head->myevent;
			if (n <= credible_events)
				break;

			ap->arrowp->mark |= REQMNTS_COUNTER;
			if (triggered(fmep, ep2, REQMNTS_COUNTER))
				/* XXX adding max timevals! */
				switch (requirements_test(fmep, ep2,
				    at_latest_by + ap->arrowp->maxdelay,
				    &my_delay)) {
				case FME_DEFERRED:
					deferred_events++;
					break;
				case FME_CREDIBLE:
					credible_events++;
					break;
				case FME_DISPROVED:
					break;
				case FME_WAIT:
					if (my_delay < arrow_delay)
						arrow_delay = my_delay;
					waiting_events++;
					break;
				default:
					out(O_DIE,
					"Bug in requirements_test.");
				}
			else
				deferred_events++;
		}
		indent();
		out(O_ALTFP|O_VERB, " Credible: %d Waiting %d",
		    credible_events + deferred_events, waiting_events);
		if (credible_events + deferred_events + waiting_events < n) {
			/* Can never meet requirements */
			ep->cached_state |= REQMNTS_DISPROVED;
			indent();
			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			out(O_ALTFP|O_VERB, NULL);
			indent_pop();
			return (FME_DISPROVED);
		}
		if (credible_events + deferred_events < n) {
			/* will have to wait */
			/* wait time is shortest known */
			if (arrow_delay < overall_delay)
				overall_delay = arrow_delay;
			return_value = FME_WAIT;
		} else if (credible_events < n) {
			if (return_value != FME_WAIT)
				return_value = FME_DEFERRED;
		}
	}

	/*
	 * don't mark as FME_DEFERRED. If this event isn't reached by another
	 * path, then this will be considered FME_CREDIBLE. But if it is
	 * reached by a different path so the K-count is met, then might
	 * get overridden by FME_WAIT or FME_DISPROVED.
	 */
	if (return_value == FME_WAIT) {
		ep->cached_state |= REQMNTS_WAIT;
		ep->cached_delay = *pdelay = overall_delay;
	} else if (return_value == FME_CREDIBLE) {
		ep->cached_state |= REQMNTS_CREDIBLE;
	}
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ",
	    fme_state2str(return_value));
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);
	indent_pop();
	return (return_value);
}
3787 
/*
 * causes_test -- check whether enough upstream causes of "ep" are
 * credible to account for it.
 *
 * For each B_TO bubble, each untested tail event whose arrow's
 * constraints hold is hypothesised in turn; the count of credible and
 * waiting results is then compared against the bubble's K value.
 * Returns FME_CREDIBLE, FME_DISPROVED, or FME_WAIT with *pdelay set to
 * the shortest wait seen.
 */
static enum fme_state
causes_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay)
{
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long my_delay;
	int credible_results = 0;
	int waiting_results = 0;
	enum fme_state fstate;
	struct event *tail_event;
	struct bubble *bp;
	struct arrowlist *ap;
	int k = 1;

	stats_counter_bump(fmep->Ccallcount);
	indent_push("  C");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);

	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_TO)
			continue;
		k = bp->nork;	/* remember the K value */
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			int do_not_follow = 0;

			/*
			 * if we get to the same event multiple times
			 * only worry about the first one.
			 */
			if (ap->arrowp->tail->myevent->cached_state &
			    CAUSES_TESTED) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  causes test already run for ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
				    ap->arrowp->tail->myevent);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}

			/*
			 * see if false constraint prevents us
			 * from traversing this arrow
			 */
			platform_set_payloadnvp(ep->nvp);
			if (checkconstraints(fmep, ap->arrowp) == 0)
				do_not_follow = 1;
			platform_set_payloadnvp(NULL);
			if (do_not_follow) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  False arrow from ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
				    ap->arrowp->tail->myevent);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}

			ap->arrowp->tail->myevent->cached_state |=
			    CAUSES_TESTED;
			tail_event = ap->arrowp->tail->myevent;
			fstate = hypothesise(fmep, tail_event, at_latest_by,
			    &my_delay);

			switch (fstate) {
			case FME_WAIT:
				if (my_delay < overall_delay)
					overall_delay = my_delay;
				waiting_results++;
				break;
			case FME_CREDIBLE:
				credible_results++;
				break;
			case FME_DISPROVED:
				break;
			default:
				out(O_DIE, "Bug in causes_test");
			}
		}
	}
	/* compare against K */
	if (credible_results + waiting_results < k) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();
		return (FME_DISPROVED);
	}
	if (waiting_results != 0) {
		*pdelay = overall_delay;
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB|O_NONL, " to ");
		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();
		return (FME_WAIT);
	}
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES CREDIBLE ");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);
	indent_pop();
	return (FME_CREDIBLE);
}
3900 
/*
 * hypothesise -- test whether event "ep" is a credible hypothesis.
 *
 * An event is credible when its requirements are met
 * (requirements_test) and either, for a problem event, its effects
 * explain all of the observations (effects_test) -- in which case it is
 * added to the suspect list -- or, for a non-problem event, its own
 * causes are credible (causes_test).  Returns FME_CREDIBLE,
 * FME_DISPROVED, or FME_WAIT with *pdelay set to the shortest wait
 * reported by the subtests.
 */
static enum fme_state
hypothesise(struct fme *fmep, struct event *ep,
	unsigned long long at_latest_by, unsigned long long *pdelay)
{
	enum fme_state rtr, otr;
	unsigned long long my_delay;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;

	stats_counter_bump(fmep->Hcallcount);
	indent_push("  H");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
	out(O_ALTFP|O_VERB, NULL);

	rtr = requirements_test(fmep, ep, at_latest_by, &my_delay);
	if ((rtr == FME_WAIT) && (my_delay < overall_delay))
		overall_delay = my_delay;
	/* otr is assigned on every path through this block */
	if (rtr != FME_DISPROVED) {
		if (is_problem(ep->t)) {
			otr = effects_test(fmep, ep, at_latest_by, &my_delay);
			if (otr != FME_DISPROVED) {
				/*
				 * not disproved: add to the suspect list
				 * (unless we are only peeking, or it is
				 * already on the list)
				 */
				if (fmep->peek == 0 && ep->is_suspect == 0) {
					ep->suspects = fmep->suspects;
					ep->is_suspect = 1;
					fmep->suspects = ep;
					fmep->nsuspects++;
					if (!is_fault(ep->t))
						fmep->nonfault++;
				}
			}
		} else
			otr = causes_test(fmep, ep, at_latest_by, &my_delay);
		if ((otr == FME_WAIT) && (my_delay < overall_delay))
			overall_delay = my_delay;
		if ((otr != FME_DISPROVED) &&
		    ((rtr == FME_WAIT) || (otr == FME_WAIT)))
			*pdelay = overall_delay;
	}
	/* note: otr is only examined below when rtr != FME_DISPROVED */
	if (rtr == FME_DISPROVED) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, " (doesn't meet requirements)");
		indent_pop();
		return (FME_DISPROVED);
	}
	if ((otr == FME_DISPROVED) && is_problem(ep->t)) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, " (doesn't explain all reports)");
		indent_pop();
		return (FME_DISPROVED);
	}
	if (otr == FME_DISPROVED) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, " (causes are not credible)");
		indent_pop();
		return (FME_DISPROVED);
	}
	if ((rtr == FME_WAIT) || (otr == FME_WAIT)) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-WAIT ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB|O_NONL, " to ");
		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay);
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();
		return (FME_WAIT);
	}
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE ");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);
	indent_pop();
	return (FME_CREDIBLE);
}
3983 
3984 /*
3985  * fme_istat_load -- reconstitute any persistent istats
3986  */
3987 void
3988 fme_istat_load(fmd_hdl_t *hdl)
3989 {
3990 	int sz;
3991 	char *sbuf;
3992 	char *ptr;
3993 
3994 	if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) {
3995 		out(O_ALTFP, "fme_istat_load: No stats");
3996 		return;
3997 	}
3998 
3999 	sbuf = alloca(sz);
4000 
4001 	fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz);
4002 
4003 	/*
4004 	 * pick apart the serialized stats
4005 	 *
4006 	 * format is:
4007 	 *	<class-name>, '@', <path>, '\0', <value>, '\0'
4008 	 * for example:
4009 	 *	"stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0"
4010 	 *
4011 	 * since this is parsing our own serialized data, any parsing issues
4012 	 * are fatal, so we check for them all with ASSERT() below.
4013 	 */
4014 	ptr = sbuf;
4015 	while (ptr < &sbuf[sz]) {
4016 		char *sepptr;
4017 		struct node *np;
4018 		int val;
4019 
4020 		sepptr = strchr(ptr, '@');
4021 		ASSERT(sepptr != NULL);
4022 		*sepptr = '\0';
4023 
4024 		/* construct the event */
4025 		np = newnode(T_EVENT, NULL, 0);
4026 		np->u.event.ename = newnode(T_NAME, NULL, 0);
4027 		np->u.event.ename->u.name.t = N_STAT;
4028 		np->u.event.ename->u.name.s = stable(ptr);
4029 		np->u.event.ename->u.name.it = IT_ENAME;
4030 		np->u.event.ename->u.name.last = np->u.event.ename;
4031 
4032 		ptr = sepptr + 1;
4033 		ASSERT(ptr < &sbuf[sz]);
4034 		ptr += strlen(ptr);
4035 		ptr++;	/* move past the '\0' separating path from value */
4036 		ASSERT(ptr < &sbuf[sz]);
4037 		ASSERT(isdigit(*ptr));
4038 		val = atoi(ptr);
4039 		ASSERT(val > 0);
4040 		ptr += strlen(ptr);
4041 		ptr++;	/* move past the final '\0' for this entry */
4042 
4043 		np->u.event.epname = pathstring2epnamenp(sepptr + 1);
4044 		ASSERT(np->u.event.epname != NULL);
4045 
4046 		istat_bump(np, val);
4047 		tree_free(np);
4048 	}
4049 
4050 	istat_save();
4051 }
4052