xref: /titanic_51/usr/src/cmd/fm/modules/common/eversholt/fme.c (revision fc51f9bbbff02dbd8c3adf640b1a184ceeb58fa5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  *
26  * fme.c -- fault management exercise module
27  *
28  * this module provides the simulated fault management exercise.
29  */
30 
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <strings.h>
35 #include <ctype.h>
36 #include <alloca.h>
37 #include <libnvpair.h>
38 #include <sys/fm/protocol.h>
39 #include <fm/fmd_api.h>
40 #include "alloc.h"
41 #include "out.h"
42 #include "stats.h"
43 #include "stable.h"
44 #include "literals.h"
45 #include "lut.h"
46 #include "tree.h"
47 #include "ptree.h"
48 #include "itree.h"
49 #include "ipath.h"
50 #include "fme.h"
51 #include "evnv.h"
52 #include "eval.h"
53 #include "config.h"
54 #include "platform.h"
55 #include "esclex.h"
56 
57 /* imported from eft.c... */
58 extern hrtime_t Hesitate;
59 extern char *Serd_Override;
60 extern nv_alloc_t Eft_nv_hdl;
61 extern int Max_fme;
62 extern fmd_hdl_t *Hdl;
63 
64 static int Istat_need_save;
65 static int Serd_need_save;
66 void istat_save(void);
67 void serd_save(void);
68 
69 /* fme under construction is global so we can free it on module abort */
70 static struct fme *Nfmep;
71 
72 static int Undiag_reason = UD_VAL_UNKNOWN;
73 
74 static int Nextid = 0;
75 
76 static int Open_fme_count = 0;	/* Count of open FMEs */
77 
78 /* list of fault management exercises underway */
79 static struct fme {
80 	struct fme *next;		/* next exercise */
81 	unsigned long long ull;		/* time when fme was created */
82 	int id;				/* FME id */
83 	struct config *config;		/* cooked configuration data */
84 	struct lut *eventtree;		/* propagation tree for this FME */
85 	/*
86 	 * The initial error report that created this FME is kept in
87 	 * two forms.  e0 points to the instance tree node and is used
88 	 * by fme_eval() as the starting point for the inference
89 	 * algorithm.  e0r is the event handle FMD passed to us when
90 	 * the ereport first arrived and is used when setting timers,
91 	 * which are always relative to the time of this initial
92 	 * report.
93 	 */
94 	struct event *e0;
95 	fmd_event_t *e0r;
96 
97 	id_t    timer;			/* for setting an fmd time-out */
98 
99 	struct event *ecurrent;		/* ereport under consideration */
100 	struct event *suspects;		/* current suspect list */
101 	struct event *psuspects;	/* previous suspect list */
102 	int nsuspects;			/* count of suspects */
103 	int posted_suspects;		/* true if we've posted a diagnosis */
104 	int uniqobs;			/* number of unique events observed */
105 	int peek;			/* just peeking, don't track suspects */
106 	int overflow;			/* true if overflow FME */
107 	enum fme_state {
108 		FME_NOTHING = 5000,	/* not evaluated yet */
109 		FME_WAIT,		/* need to wait for more info */
110 		FME_CREDIBLE,		/* suspect list is credible */
111 		FME_DISPROVED,		/* no valid suspects found */
112 		FME_DEFERRED		/* don't know yet (k-count not met) */
113 	} state;
114 
115 	unsigned long long pull;	/* time passed since created */
116 	unsigned long long wull;	/* wait until this time for re-eval */
117 	struct event *observations;	/* observation list */
118 	struct lut *globals;		/* values of global variables */
119 	/* fmd interfacing */
120 	fmd_hdl_t *hdl;			/* handle for talking with fmd */
121 	fmd_case_t *fmcase;		/* what fmd 'case' we associate with */
122 	/* stats */
123 	struct stats *Rcount;
124 	struct stats *Hcallcount;
125 	struct stats *Rcallcount;
126 	struct stats *Ccallcount;
127 	struct stats *Ecallcount;
128 	struct stats *Tcallcount;
129 	struct stats *Marrowcount;
130 	struct stats *diags;
131 } *FMElist, *EFMElist, *ClosedFMEs;
132 
133 static struct case_list {
134 	fmd_case_t *fmcase;
135 	struct case_list *next;
136 } *Undiagablecaselist;
137 
138 static void fme_eval(struct fme *fmep, fmd_event_t *ffep);
139 static enum fme_state hypothesise(struct fme *fmep, struct event *ep,
140 	unsigned long long at_latest_by, unsigned long long *pdelay);
141 static struct node *eventprop_lookup(struct event *ep, const char *propname);
142 static struct node *pathstring2epnamenp(char *path);
143 static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep,
144 	fmd_case_t *fmcase);
145 static const char *undiag_2reason_str(int ud);
146 static const char *undiag_2defect_str(int ud);
147 static void restore_suspects(struct fme *fmep);
148 static void save_suspects(struct fme *fmep);
149 static void destroy_fme(struct fme *f);
150 static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
151     const char *eventstring, const struct ipath *ipp, nvlist_t *nvl);
152 static void istat_counter_reset_cb(struct istat_entry *entp,
153     struct stats *statp, const struct ipath *ipp);
154 static void istat_counter_topo_chg_cb(struct istat_entry *entp,
155     struct stats *statp, void *unused);
156 static void serd_reset_cb(struct serd_entry *entp, void *unused,
157     const struct ipath *ipp);
158 static void serd_topo_chg_cb(struct serd_entry *entp, void *unused,
159     void *unused2);
160 static void destroy_fme_bufs(struct fme *fp);
161 
162 static struct fme *
163 alloc_fme(void)
164 {
165 	struct fme *fmep;
166 
167 	fmep = MALLOC(sizeof (*fmep));
168 	bzero(fmep, sizeof (*fmep));
169 	return (fmep);
170 }
171 
172 /*
173  * fme_ready -- called when all initialization of the FME (except for
174  *	stats) has completed successfully.  Adds the fme to global lists
175  *	and establishes its stats.
176  */
177 static struct fme *
178 fme_ready(struct fme *fmep)
179 {
180 	char nbuf[100];
181 
182 	Nfmep = NULL;	/* don't need to free this on module abort now */
183 
184 	if (EFMElist) {
185 		EFMElist->next = fmep;
186 		EFMElist = fmep;
187 	} else
188 		FMElist = EFMElist = fmep;
189 
190 	(void) sprintf(nbuf, "fme%d.Rcount", fmep->id);
191 	fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
192 	(void) sprintf(nbuf, "fme%d.Hcall", fmep->id);
193 	fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1);
194 	(void) sprintf(nbuf, "fme%d.Rcall", fmep->id);
195 	fmep->Rcallcount = stats_new_counter(nbuf,
196 	    "calls to requirements_test()", 1);
197 	(void) sprintf(nbuf, "fme%d.Ccall", fmep->id);
198 	fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1);
199 	(void) sprintf(nbuf, "fme%d.Ecall", fmep->id);
200 	fmep->Ecallcount =
201 	    stats_new_counter(nbuf, "calls to effects_test()", 1);
202 	(void) sprintf(nbuf, "fme%d.Tcall", fmep->id);
203 	fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
204 	(void) sprintf(nbuf, "fme%d.Marrow", fmep->id);
205 	fmep->Marrowcount = stats_new_counter(nbuf,
206 	    "arrows marked by mark_arrows()", 1);
207 	(void) sprintf(nbuf, "fme%d.diags", fmep->id);
208 	fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);
209 
210 	out(O_ALTFP|O_VERB2, "newfme: config snapshot contains...");
211 	config_print(O_ALTFP|O_VERB2, fmep->config);
212 
213 	return (fmep);
214 }
215 
216 extern void ipath_dummy_lut(struct arrow *);
217 extern struct lut *itree_create_dummy(const char *, const struct ipath *);
218 
219 /* ARGSUSED */
220 static void
221 set_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
222 {
223 	struct bubble *bp;
224 	struct arrowlist *ap;
225 
226 	for (bp = itree_next_bubble(ep, NULL); bp;
227 	    bp = itree_next_bubble(ep, bp)) {
228 		if (bp->t != B_FROM)
229 			continue;
230 		for (ap = itree_next_arrow(bp, NULL); ap;
231 		    ap = itree_next_arrow(bp, ap)) {
232 			ap->arrowp->pnode->u.arrow.needed = 1;
233 			ipath_dummy_lut(ap->arrowp);
234 		}
235 	}
236 }
237 
238 /* ARGSUSED */
239 static void
240 unset_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
241 {
242 	struct bubble *bp;
243 	struct arrowlist *ap;
244 
245 	for (bp = itree_next_bubble(ep, NULL); bp;
246 	    bp = itree_next_bubble(ep, bp)) {
247 		if (bp->t != B_FROM)
248 			continue;
249 		for (ap = itree_next_arrow(bp, NULL); ap;
250 		    ap = itree_next_arrow(bp, ap))
251 			ap->arrowp->pnode->u.arrow.needed = 0;
252 	}
253 }
254 
255 static void globals_destructor(void *left, void *right, void *arg);
256 static void clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep);
257 
258 static void
259 prune_propagations(const char *e0class, const struct ipath *e0ipp)
260 {
261 	char nbuf[100];
262 	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
263 	extern struct lut *Usednames;
264 
265 	Nfmep = alloc_fme();
266 	Nfmep->id = Nextid;
267 	Nfmep->state = FME_NOTHING;
268 	Nfmep->eventtree = itree_create_dummy(e0class, e0ipp);
269 	if ((Nfmep->e0 =
270 	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
271 		out(O_ALTFP, "prune_propagations: e0 not in instance tree");
272 		itree_free(Nfmep->eventtree);
273 		FREE(Nfmep);
274 		Nfmep = NULL;
275 		return;
276 	}
277 	Nfmep->ecurrent = Nfmep->observations = Nfmep->e0;
278 	Nfmep->e0->count++;
279 
280 	(void) sprintf(nbuf, "fme%d.Rcount", Nfmep->id);
281 	Nfmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
282 	(void) sprintf(nbuf, "fme%d.Hcall", Nfmep->id);
283 	Nfmep->Hcallcount =
284 	    stats_new_counter(nbuf, "calls to hypothesise()", 1);
285 	(void) sprintf(nbuf, "fme%d.Rcall", Nfmep->id);
286 	Nfmep->Rcallcount = stats_new_counter(nbuf,
287 	    "calls to requirements_test()", 1);
288 	(void) sprintf(nbuf, "fme%d.Ccall", Nfmep->id);
289 	Nfmep->Ccallcount =
290 	    stats_new_counter(nbuf, "calls to causes_test()", 1);
291 	(void) sprintf(nbuf, "fme%d.Ecall", Nfmep->id);
292 	Nfmep->Ecallcount =
293 	    stats_new_counter(nbuf, "calls to effects_test()", 1);
294 	(void) sprintf(nbuf, "fme%d.Tcall", Nfmep->id);
295 	Nfmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
296 	(void) sprintf(nbuf, "fme%d.Marrow", Nfmep->id);
297 	Nfmep->Marrowcount = stats_new_counter(nbuf,
298 	    "arrows marked by mark_arrows()", 1);
299 	(void) sprintf(nbuf, "fme%d.diags", Nfmep->id);
300 	Nfmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);
301 
302 	Nfmep->peek = 1;
303 	lut_walk(Nfmep->eventtree, (lut_cb)unset_needed_arrows, (void *)Nfmep);
304 	lut_free(Usednames, NULL, NULL);
305 	Usednames = NULL;
306 	lut_walk(Nfmep->eventtree, (lut_cb)clear_arrows, (void *)Nfmep);
307 	(void) hypothesise(Nfmep, Nfmep->e0, Nfmep->ull, &my_delay);
308 	itree_prune(Nfmep->eventtree);
309 	lut_walk(Nfmep->eventtree, (lut_cb)set_needed_arrows, (void *)Nfmep);
310 
311 	stats_delete(Nfmep->Rcount);
312 	stats_delete(Nfmep->Hcallcount);
313 	stats_delete(Nfmep->Rcallcount);
314 	stats_delete(Nfmep->Ccallcount);
315 	stats_delete(Nfmep->Ecallcount);
316 	stats_delete(Nfmep->Tcallcount);
317 	stats_delete(Nfmep->Marrowcount);
318 	stats_delete(Nfmep->diags);
319 	itree_free(Nfmep->eventtree);
320 	lut_free(Nfmep->globals, globals_destructor, NULL);
321 	FREE(Nfmep);
322 }
323 
324 static struct fme *
325 newfme(const char *e0class, const struct ipath *e0ipp, fmd_hdl_t *hdl,
326 	fmd_case_t *fmcase)
327 {
328 	struct cfgdata *cfgdata;
329 	int init_size;
330 	extern int alloc_total();
331 
332 	init_size = alloc_total();
333 	out(O_ALTFP|O_STAMP, "start config_snapshot using %d bytes", init_size);
334 	cfgdata = config_snapshot();
335 	platform_save_config(hdl, fmcase);
336 	out(O_ALTFP|O_STAMP, "config_snapshot added %d bytes",
337 	    alloc_total() - init_size);
338 
339 	Nfmep = alloc_fme();
340 
341 	Nfmep->id = Nextid++;
342 	Nfmep->config = cfgdata->cooked;
343 	config_free(cfgdata);
344 	Nfmep->posted_suspects = 0;
345 	Nfmep->uniqobs = 0;
346 	Nfmep->state = FME_NOTHING;
347 	Nfmep->pull = 0ULL;
348 	Nfmep->overflow = 0;
349 
350 	Nfmep->fmcase = fmcase;
351 	Nfmep->hdl = hdl;
352 
353 	if ((Nfmep->eventtree = itree_create(Nfmep->config)) == NULL) {
354 		out(O_ALTFP, "newfme: NULL instance tree");
355 		Undiag_reason = UD_VAL_INSTFAIL;
356 		structconfig_free(Nfmep->config);
357 		destroy_fme_bufs(Nfmep);
358 		FREE(Nfmep);
359 		Nfmep = NULL;
360 		return (NULL);
361 	}
362 
363 	itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree);
364 
365 	if ((Nfmep->e0 =
366 	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
367 		out(O_ALTFP, "newfme: e0 not in instance tree");
368 		Undiag_reason = UD_VAL_BADEVENTI;
369 		itree_free(Nfmep->eventtree);
370 		structconfig_free(Nfmep->config);
371 		destroy_fme_bufs(Nfmep);
372 		FREE(Nfmep);
373 		Nfmep = NULL;
374 		return (NULL);
375 	}
376 
377 	return (fme_ready(Nfmep));
378 }
379 
380 void
381 fme_fini(void)
382 {
383 	struct fme *sfp, *fp;
384 	struct case_list *ucasep, *nextcasep;
385 
386 	ucasep = Undiagablecaselist;
387 	while (ucasep != NULL) {
388 		nextcasep = ucasep->next;
389 		FREE(ucasep);
390 		ucasep = nextcasep;
391 	}
392 	Undiagablecaselist = NULL;
393 
394 	/* clean up closed fmes */
395 	fp = ClosedFMEs;
396 	while (fp != NULL) {
397 		sfp = fp->next;
398 		destroy_fme(fp);
399 		fp = sfp;
400 	}
401 	ClosedFMEs = NULL;
402 
403 	fp = FMElist;
404 	while (fp != NULL) {
405 		sfp = fp->next;
406 		destroy_fme(fp);
407 		fp = sfp;
408 	}
409 	FMElist = EFMElist = NULL;
410 
411 	/* if we were in the middle of creating an fme, free it now */
412 	if (Nfmep) {
413 		destroy_fme(Nfmep);
414 		Nfmep = NULL;
415 	}
416 }
417 
418 /*
419  * Allocated space for a buffer name.  20 bytes allows for
420  * a ridiculous 9,999,999 unique observations.
421  */
422 #define	OBBUFNMSZ 20
423 
424 /*
425  *  serialize_observation
426  *
427  *  Create a recoverable version of the current observation
428  *  (f->ecurrent).  We keep a serialized version of each unique
429  *  observation in order that we may resume correctly the fme in the
430  *  correct state if eft or fmd crashes and we're restarted.
431  */
432 static void
433 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp)
434 {
435 	size_t pkdlen;
436 	char tmpbuf[OBBUFNMSZ];
437 	char *pkd = NULL;
438 	char *estr;
439 
440 	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs);
441 	estr = ipath2str(cls, ipp);
442 	fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1);
443 	fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr,
444 	    strlen(estr) + 1);
445 	FREE(estr);
446 
447 	if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) {
448 		(void) snprintf(tmpbuf,
449 		    OBBUFNMSZ, "observed%d.nvp", fp->uniqobs);
450 		if (nvlist_xpack(fp->ecurrent->nvp,
451 		    &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0)
452 			out(O_DIE|O_SYS, "pack of observed nvl failed");
453 		fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen);
454 		fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen);
455 		FREE(pkd);
456 	}
457 
458 	fp->uniqobs++;
459 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
460 	    sizeof (fp->uniqobs));
461 }
462 
463 /*
464  *  init_fme_bufs -- We keep several bits of state about an fme for
465  *	use if eft or fmd crashes and we're restarted.
466  */
467 static void
468 init_fme_bufs(struct fme *fp)
469 {
470 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull));
471 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull,
472 	    sizeof (fp->pull));
473 
474 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id));
475 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id,
476 	    sizeof (fp->id));
477 
478 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs));
479 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
480 	    sizeof (fp->uniqobs));
481 
482 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD,
483 	    sizeof (fp->posted_suspects));
484 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD,
485 	    (void *)&fp->posted_suspects, sizeof (fp->posted_suspects));
486 }
487 
488 static void
489 destroy_fme_bufs(struct fme *fp)
490 {
491 	char tmpbuf[OBBUFNMSZ];
492 	int o;
493 
494 	platform_restore_config(fp->hdl, fp->fmcase);
495 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN);
496 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG);
497 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL);
498 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID);
499 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD);
500 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS);
501 
502 	for (o = 0; o < fp->uniqobs; o++) {
503 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o);
504 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
505 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o);
506 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
507 	}
508 }
509 
510 /*
511  * reconstitute_observations -- convert a case's serialized observations
512  *	back into struct events.  Returns zero if all observations are
513  *	successfully reconstituted.
514  */
515 static int
516 reconstitute_observations(struct fme *fmep)
517 {
518 	struct event *ep;
519 	struct node *epnamenp = NULL;
520 	size_t pkdlen;
521 	char *pkd = NULL;
522 	char *tmpbuf = alloca(OBBUFNMSZ);
523 	char *sepptr;
524 	char *estr;
525 	int ocnt;
526 	int elen;
527 
528 	for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) {
529 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt);
530 		elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
531 		if (elen == 0) {
532 			out(O_ALTFP,
533 			    "reconstitute_observation: no %s buffer found.",
534 			    tmpbuf);
535 			Undiag_reason = UD_VAL_MISSINGOBS;
536 			break;
537 		}
538 
539 		estr = MALLOC(elen);
540 		fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
541 		sepptr = strchr(estr, '@');
542 		if (sepptr == NULL) {
543 			out(O_ALTFP,
544 			    "reconstitute_observation: %s: "
545 			    "missing @ separator in %s.",
546 			    tmpbuf, estr);
547 			Undiag_reason = UD_VAL_MISSINGPATH;
548 			FREE(estr);
549 			break;
550 		}
551 
552 		*sepptr = '\0';
553 		if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
554 			out(O_ALTFP,
555 			    "reconstitute_observation: %s: "
556 			    "trouble converting path string \"%s\" "
557 			    "to internal representation.",
558 			    tmpbuf, sepptr + 1);
559 			Undiag_reason = UD_VAL_MISSINGPATH;
560 			FREE(estr);
561 			break;
562 		}
563 
564 		/* construct the event */
565 		ep = itree_lookup(fmep->eventtree,
566 		    stable(estr), ipath(epnamenp));
567 		if (ep == NULL) {
568 			out(O_ALTFP,
569 			    "reconstitute_observation: %s: "
570 			    "lookup of  \"%s\" in itree failed.",
571 			    tmpbuf, ipath2str(estr, ipath(epnamenp)));
572 			Undiag_reason = UD_VAL_BADOBS;
573 			tree_free(epnamenp);
574 			FREE(estr);
575 			break;
576 		}
577 		tree_free(epnamenp);
578 
579 		/*
580 		 * We may or may not have a saved nvlist for the observation
581 		 */
582 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt);
583 		pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
584 		if (pkdlen != 0) {
585 			pkd = MALLOC(pkdlen);
586 			fmd_buf_read(fmep->hdl,
587 			    fmep->fmcase, tmpbuf, pkd, pkdlen);
588 			ASSERT(ep->nvp == NULL);
589 			if (nvlist_xunpack(pkd,
590 			    pkdlen, &ep->nvp, &Eft_nv_hdl) != 0)
591 				out(O_DIE|O_SYS, "pack of observed nvl failed");
592 			FREE(pkd);
593 		}
594 
595 		if (ocnt == 0)
596 			fmep->e0 = ep;
597 
598 		FREE(estr);
599 		fmep->ecurrent = ep;
600 		ep->count++;
601 
602 		/* link it into list of observations seen */
603 		ep->observations = fmep->observations;
604 		fmep->observations = ep;
605 	}
606 
607 	if (ocnt == fmep->uniqobs) {
608 		(void) fme_ready(fmep);
609 		return (0);
610 	}
611 
612 	return (1);
613 }
614 
615 /*
616  * restart_fme -- called during eft initialization.  Reconstitutes
617  *	an in-progress fme.
618  */
619 void
620 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress)
621 {
622 	nvlist_t *defect;
623 	struct case_list *bad;
624 	struct fme *fmep;
625 	struct cfgdata *cfgdata;
626 	size_t rawsz;
627 	struct event *ep;
628 	char *tmpbuf = alloca(OBBUFNMSZ);
629 	char *sepptr;
630 	char *estr;
631 	int elen;
632 	struct node *epnamenp = NULL;
633 	int init_size;
634 	extern int alloc_total();
635 
636 	/*
637 	 * ignore solved or closed cases
638 	 */
639 	if (fmd_case_solved(hdl, inprogress) ||
640 	    fmd_case_closed(hdl, inprogress))
641 		return;
642 
643 	fmep = alloc_fme();
644 	fmep->fmcase = inprogress;
645 	fmep->hdl = hdl;
646 
647 	if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) {
648 		out(O_ALTFP, "restart_fme: no saved posted status");
649 		Undiag_reason = UD_VAL_MISSINGINFO;
650 		goto badcase;
651 	} else {
652 		fmd_buf_read(hdl, inprogress, WOBUF_POSTD,
653 		    (void *)&fmep->posted_suspects,
654 		    sizeof (fmep->posted_suspects));
655 	}
656 
657 	if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) {
658 		out(O_ALTFP, "restart_fme: no saved id");
659 		Undiag_reason = UD_VAL_MISSINGINFO;
660 		goto badcase;
661 	} else {
662 		fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id,
663 		    sizeof (fmep->id));
664 	}
665 	if (Nextid <= fmep->id)
666 		Nextid = fmep->id + 1;
667 
668 	out(O_ALTFP, "Replay FME %d", fmep->id);
669 
670 	if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) {
671 		out(O_ALTFP, "restart_fme: No config data");
672 		Undiag_reason = UD_VAL_MISSINGINFO;
673 		goto badcase;
674 	}
675 	fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz,
676 	    sizeof (size_t));
677 
678 	if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) {
679 		out(O_ALTFP, "restart_fme: No event zero");
680 		Undiag_reason = UD_VAL_MISSINGZERO;
681 		goto badcase;
682 	}
683 
684 	if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) {
685 		out(O_ALTFP, "restart_fme: no saved wait time");
686 		Undiag_reason = UD_VAL_MISSINGINFO;
687 		goto badcase;
688 	} else {
689 		fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull,
690 		    sizeof (fmep->pull));
691 	}
692 
693 	if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) {
694 		out(O_ALTFP, "restart_fme: no count of observations");
695 		Undiag_reason = UD_VAL_MISSINGINFO;
696 		goto badcase;
697 	} else {
698 		fmd_buf_read(hdl, inprogress, WOBUF_NOBS,
699 		    (void *)&fmep->uniqobs, sizeof (fmep->uniqobs));
700 	}
701 
702 	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed0");
703 	elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
704 	if (elen == 0) {
705 		out(O_ALTFP, "reconstitute_observation: no %s buffer found.",
706 		    tmpbuf);
707 		Undiag_reason = UD_VAL_MISSINGOBS;
708 		goto badcase;
709 	}
710 	estr = MALLOC(elen);
711 	fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
712 	sepptr = strchr(estr, '@');
713 	if (sepptr == NULL) {
714 		out(O_ALTFP, "reconstitute_observation: %s: "
715 		    "missing @ separator in %s.",
716 		    tmpbuf, estr);
717 		Undiag_reason = UD_VAL_MISSINGPATH;
718 		FREE(estr);
719 		goto badcase;
720 	}
721 	*sepptr = '\0';
722 	if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
723 		out(O_ALTFP, "reconstitute_observation: %s: "
724 		    "trouble converting path string \"%s\" "
725 		    "to internal representation.", tmpbuf, sepptr + 1);
726 		Undiag_reason = UD_VAL_MISSINGPATH;
727 		FREE(estr);
728 		goto badcase;
729 	}
730 	prune_propagations(stable(estr), ipath(epnamenp));
731 	tree_free(epnamenp);
732 	FREE(estr);
733 
734 	init_size = alloc_total();
735 	out(O_ALTFP|O_STAMP, "start config_restore using %d bytes", init_size);
736 	cfgdata = MALLOC(sizeof (struct cfgdata));
737 	cfgdata->cooked = NULL;
738 	cfgdata->devcache = NULL;
739 	cfgdata->devidcache = NULL;
740 	cfgdata->cpucache = NULL;
741 	cfgdata->raw_refcnt = 1;
742 
743 	if (rawsz > 0) {
744 		if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) {
745 			out(O_ALTFP, "restart_fme: Config data size mismatch");
746 			Undiag_reason = UD_VAL_CFGMISMATCH;
747 			goto badcase;
748 		}
749 		cfgdata->begin = MALLOC(rawsz);
750 		cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz;
751 		fmd_buf_read(hdl,
752 		    inprogress, WOBUF_CFG, cfgdata->begin, rawsz);
753 	} else {
754 		cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL;
755 	}
756 
757 	config_cook(cfgdata);
758 	fmep->config = cfgdata->cooked;
759 	config_free(cfgdata);
760 	out(O_ALTFP|O_STAMP, "config_restore added %d bytes",
761 	    alloc_total() - init_size);
762 
763 	if ((fmep->eventtree = itree_create(fmep->config)) == NULL) {
764 		/* case not properly saved or irretrievable */
765 		out(O_ALTFP, "restart_fme: NULL instance tree");
766 		Undiag_reason = UD_VAL_INSTFAIL;
767 		goto badcase;
768 	}
769 
770 	itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree);
771 
772 	if (reconstitute_observations(fmep) != 0)
773 		goto badcase;
774 
775 	out(O_ALTFP|O_NONL, "FME %d replay observations: ", fmep->id);
776 	for (ep = fmep->observations; ep; ep = ep->observations) {
777 		out(O_ALTFP|O_NONL, " ");
778 		itree_pevent_brief(O_ALTFP|O_NONL, ep);
779 	}
780 	out(O_ALTFP, NULL);
781 
782 	Open_fme_count++;
783 
784 	/* give the diagnosis algorithm a shot at the new FME state */
785 	fme_eval(fmep, fmep->e0r);
786 	return;
787 
788 badcase:
789 	if (fmep->eventtree != NULL)
790 		itree_free(fmep->eventtree);
791 	if (fmep->config)
792 		structconfig_free(fmep->config);
793 	destroy_fme_bufs(fmep);
794 	FREE(fmep);
795 
796 	/*
797 	 * Since we're unable to restart the case, add it to the undiagable
798 	 * list and solve and close it as appropriate.
799 	 */
800 	bad = MALLOC(sizeof (struct case_list));
801 	bad->next = NULL;
802 
803 	if (Undiagablecaselist != NULL)
804 		bad->next = Undiagablecaselist;
805 	Undiagablecaselist = bad;
806 	bad->fmcase = inprogress;
807 
808 	out(O_ALTFP|O_NONL, "[case %s (unable to restart), ",
809 	    fmd_case_uuid(hdl, bad->fmcase));
810 
811 	if (fmd_case_solved(hdl, bad->fmcase)) {
812 		out(O_ALTFP|O_NONL, "already solved, ");
813 	} else {
814 		out(O_ALTFP|O_NONL, "solving, ");
815 		defect = fmd_nvl_create_fault(hdl,
816 		    undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL);
817 		(void) nvlist_add_string(defect, UNDIAG_REASON,
818 		    undiag_2reason_str(Undiag_reason));
819 		fmd_case_add_suspect(hdl, bad->fmcase, defect);
820 		fmd_case_solve(hdl, bad->fmcase);
821 		Undiag_reason = UD_VAL_UNKNOWN;
822 	}
823 
824 	if (fmd_case_closed(hdl, bad->fmcase)) {
825 		out(O_ALTFP, "already closed ]");
826 	} else {
827 		out(O_ALTFP, "closing ]");
828 		fmd_case_close(hdl, bad->fmcase);
829 	}
830 }
831 
832 /*ARGSUSED*/
833 static void
834 globals_destructor(void *left, void *right, void *arg)
835 {
836 	struct evalue *evp = (struct evalue *)right;
837 	if (evp->t == NODEPTR)
838 		tree_free((struct node *)(uintptr_t)evp->v);
839 	evp->v = (uintptr_t)NULL;
840 	FREE(evp);
841 }
842 
843 void
844 destroy_fme(struct fme *f)
845 {
846 	stats_delete(f->Rcount);
847 	stats_delete(f->Hcallcount);
848 	stats_delete(f->Rcallcount);
849 	stats_delete(f->Ccallcount);
850 	stats_delete(f->Ecallcount);
851 	stats_delete(f->Tcallcount);
852 	stats_delete(f->Marrowcount);
853 	stats_delete(f->diags);
854 
855 	if (f->eventtree != NULL)
856 		itree_free(f->eventtree);
857 	if (f->config)
858 		structconfig_free(f->config);
859 	lut_free(f->globals, globals_destructor, NULL);
860 	FREE(f);
861 }
862 
863 static const char *
864 fme_state2str(enum fme_state s)
865 {
866 	switch (s) {
867 	case FME_NOTHING:	return ("NOTHING");
868 	case FME_WAIT:		return ("WAIT");
869 	case FME_CREDIBLE:	return ("CREDIBLE");
870 	case FME_DISPROVED:	return ("DISPROVED");
871 	case FME_DEFERRED:	return ("DEFERRED");
872 	default:		return ("UNKNOWN");
873 	}
874 }
875 
876 static int
877 is_problem(enum nametype t)
878 {
879 	return (t == N_FAULT || t == N_DEFECT || t == N_UPSET);
880 }
881 
882 static int
883 is_defect(enum nametype t)
884 {
885 	return (t == N_DEFECT);
886 }
887 
888 static int
889 is_upset(enum nametype t)
890 {
891 	return (t == N_UPSET);
892 }
893 
894 static void
895 fme_print(int flags, struct fme *fmep)
896 {
897 	struct event *ep;
898 
899 	out(flags, "Fault Management Exercise %d", fmep->id);
900 	out(flags, "\t       State: %s", fme_state2str(fmep->state));
901 	out(flags|O_NONL, "\t  Start time: ");
902 	ptree_timeval(flags|O_NONL, &fmep->ull);
903 	out(flags, NULL);
904 	if (fmep->wull) {
905 		out(flags|O_NONL, "\t   Wait time: ");
906 		ptree_timeval(flags|O_NONL, &fmep->wull);
907 		out(flags, NULL);
908 	}
909 	out(flags|O_NONL, "\t          E0: ");
910 	if (fmep->e0)
911 		itree_pevent_brief(flags|O_NONL, fmep->e0);
912 	else
913 		out(flags|O_NONL, "NULL");
914 	out(flags, NULL);
915 	out(flags|O_NONL, "\tObservations:");
916 	for (ep = fmep->observations; ep; ep = ep->observations) {
917 		out(flags|O_NONL, " ");
918 		itree_pevent_brief(flags|O_NONL, ep);
919 	}
920 	out(flags, NULL);
921 	out(flags|O_NONL, "\tSuspect list:");
922 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
923 		out(flags|O_NONL, " ");
924 		itree_pevent_brief(flags|O_NONL, ep);
925 	}
926 	out(flags, NULL);
927 	if (fmep->eventtree != NULL) {
928 		out(flags|O_VERB2, "\t        Tree:");
929 		itree_ptree(flags|O_VERB2, fmep->eventtree);
930 	}
931 }
932 
933 static struct node *
934 pathstring2epnamenp(char *path)
935 {
936 	char *sep = "/";
937 	struct node *ret;
938 	char *ptr;
939 
940 	if ((ptr = strtok(path, sep)) == NULL)
941 		out(O_DIE, "pathstring2epnamenp: invalid empty class");
942 
943 	ret = tree_iname(stable(ptr), NULL, 0);
944 
945 	while ((ptr = strtok(NULL, sep)) != NULL)
946 		ret = tree_name_append(ret,
947 		    tree_iname(stable(ptr), NULL, 0));
948 
949 	return (ret);
950 }
951 
952 /*
953  * for a given upset sp, increment the corresponding SERD engine.  if the
954  * SERD engine trips, return the ename and ipp of the resulting ereport.
955  * returns true if engine tripped and *enamep and *ippp were filled in.
956  */
957 static int
958 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep,
959     fmd_case_t *fmcase, struct event *sp, const char **enamep,
960     const struct ipath **ippp)
961 {
962 	struct node *serdinst;
963 	char *serdname;
964 	char *serdresource;
965 	char *serdclass;
966 	struct node *nid;
967 	struct serd_entry *newentp;
968 	int i, serdn = -1, serdincrement = 1, len = 0;
969 	char *serdsuffix = NULL, *serdt = NULL;
970 	struct evalue *ep;
971 
972 	ASSERT(sp->t == N_UPSET);
973 	ASSERT(ffep != NULL);
974 
975 	if ((ep = (struct evalue *)lut_lookup(sp->serdprops,
976 	    (void *)"n", (lut_cmp)strcmp)) != NULL) {
977 		ASSERT(ep->t == UINT64);
978 		serdn = (int)ep->v;
979 	}
980 	if ((ep = (struct evalue *)lut_lookup(sp->serdprops,
981 	    (void *)"t", (lut_cmp)strcmp)) != NULL) {
982 		ASSERT(ep->t == STRING);
983 		serdt = (char *)(uintptr_t)ep->v;
984 	}
985 	if ((ep = (struct evalue *)lut_lookup(sp->serdprops,
986 	    (void *)"suffix", (lut_cmp)strcmp)) != NULL) {
987 		ASSERT(ep->t == STRING);
988 		serdsuffix = (char *)(uintptr_t)ep->v;
989 	}
990 	if ((ep = (struct evalue *)lut_lookup(sp->serdprops,
991 	    (void *)"increment", (lut_cmp)strcmp)) != NULL) {
992 		ASSERT(ep->t == UINT64);
993 		serdincrement = (int)ep->v;
994 	}
995 
996 	/*
997 	 * obtain instanced SERD engine from the upset sp.  from this
998 	 * derive serdname, the string used to identify the SERD engine.
999 	 */
1000 	serdinst = eventprop_lookup(sp, L_engine);
1001 
1002 	if (serdinst == NULL)
1003 		return (-1);
1004 
1005 	len = strlen(serdinst->u.stmt.np->u.event.ename->u.name.s) + 1;
1006 	if (serdsuffix != NULL)
1007 		len += strlen(serdsuffix);
1008 	serdclass = MALLOC(len);
1009 	if (serdsuffix != NULL)
1010 		(void) snprintf(serdclass, len, "%s%s",
1011 		    serdinst->u.stmt.np->u.event.ename->u.name.s, serdsuffix);
1012 	else
1013 		(void) snprintf(serdclass, len, "%s",
1014 		    serdinst->u.stmt.np->u.event.ename->u.name.s);
1015 	serdresource = ipath2str(NULL,
1016 	    ipath(serdinst->u.stmt.np->u.event.epname));
1017 	len += strlen(serdresource) + 1;
1018 	serdname = MALLOC(len);
1019 	(void) snprintf(serdname, len, "%s@%s", serdclass, serdresource);
1020 	FREE(serdresource);
1021 
1022 	/* handle serd engine "id" property, if there is one */
1023 	if ((nid =
1024 	    lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) {
1025 		struct evalue *gval;
1026 		char suffixbuf[200];
1027 		char *suffix;
1028 		char *nserdname;
1029 		size_t nname;
1030 
1031 		out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname);
1032 		ptree_name_iter(O_ALTFP|O_NONL, nid);
1033 
1034 		ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t));
1035 
1036 		if ((gval = lut_lookup(fmep->globals,
1037 		    (void *)nid->u.globid.s, NULL)) == NULL) {
1038 			out(O_ALTFP, " undefined");
1039 		} else if (gval->t == UINT64) {
1040 			out(O_ALTFP, " %llu", gval->v);
1041 			(void) sprintf(suffixbuf, "%llu", gval->v);
1042 			suffix = suffixbuf;
1043 		} else {
1044 			out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v);
1045 			suffix = (char *)(uintptr_t)gval->v;
1046 		}
1047 
1048 		nname = strlen(serdname) + strlen(suffix) + 2;
1049 		nserdname = MALLOC(nname);
1050 		(void) snprintf(nserdname, nname, "%s:%s", serdname, suffix);
1051 		FREE(serdname);
1052 		serdname = nserdname;
1053 	}
1054 
1055 	/*
1056 	 * if the engine is empty, and we have an override for n/t then
1057 	 * destroy and recreate it.
1058 	 */
1059 	if ((serdn != -1 || serdt != NULL) && fmd_serd_exists(hdl, serdname) &&
1060 	    fmd_serd_empty(hdl, serdname))
1061 		fmd_serd_destroy(hdl, serdname);
1062 
1063 	if (!fmd_serd_exists(hdl, serdname)) {
1064 		struct node *nN, *nT;
1065 		const char *s;
1066 		struct node *nodep;
1067 		struct config *cp;
1068 		char *path;
1069 		uint_t nval;
1070 		hrtime_t tval;
1071 		int i;
1072 		char *ptr;
1073 		int got_n_override = 0, got_t_override = 0;
1074 
1075 		/* no SERD engine yet, so create it */
1076 		nodep = serdinst->u.stmt.np->u.event.epname;
1077 		path = ipath2str(NULL, ipath(nodep));
1078 		cp = config_lookup(fmep->config, path, 0);
1079 		FREE((void *)path);
1080 
1081 		/*
1082 		 * We allow serd paramaters to be overridden, either from
1083 		 * eft.conf file values (if Serd_Override is set) or from
1084 		 * driver properties (for "serd.io.device" engines).
1085 		 */
1086 		if (Serd_Override != NULL) {
1087 			char *save_ptr, *ptr1, *ptr2, *ptr3;
1088 			ptr3 = save_ptr = STRDUP(Serd_Override);
1089 			while (*ptr3 != '\0') {
1090 				ptr1 = strchr(ptr3, ',');
1091 				*ptr1 = '\0';
1092 				if (strcmp(ptr3, serdclass) == 0) {
1093 					ptr2 =  strchr(ptr1 + 1, ',');
1094 					*ptr2 = '\0';
1095 					nval = atoi(ptr1 + 1);
1096 					out(O_ALTFP, "serd override %s_n %d",
1097 					    serdclass, nval);
1098 					ptr3 =  strchr(ptr2 + 1, ' ');
1099 					if (ptr3)
1100 						*ptr3 = '\0';
1101 					ptr = STRDUP(ptr2 + 1);
1102 					out(O_ALTFP, "serd override %s_t %s",
1103 					    serdclass, ptr);
1104 					got_n_override = 1;
1105 					got_t_override = 1;
1106 					break;
1107 				} else {
1108 					ptr2 =  strchr(ptr1 + 1, ',');
1109 					ptr3 =  strchr(ptr2 + 1, ' ');
1110 					if (ptr3 == NULL)
1111 						break;
1112 				}
1113 				ptr3++;
1114 			}
1115 			FREE(save_ptr);
1116 		}
1117 
1118 		if (cp && got_n_override == 0) {
1119 			/*
1120 			 * convert serd engine class into property name
1121 			 */
1122 			char *prop_name = MALLOC(strlen(serdclass) + 3);
1123 			for (i = 0; i < strlen(serdclass); i++) {
1124 				if (serdclass[i] == '.')
1125 					prop_name[i] = '_';
1126 				else
1127 					prop_name[i] = serdclass[i];
1128 			}
1129 			prop_name[i++] = '_';
1130 			prop_name[i++] = 'n';
1131 			prop_name[i] = '\0';
1132 			if (s = config_getprop(cp, prop_name)) {
1133 				nval = atoi(s);
1134 				out(O_ALTFP, "serd override %s_n %s",
1135 				    serdclass, s);
1136 				got_n_override = 1;
1137 			}
1138 			prop_name[i - 1] = 't';
1139 			if (s = config_getprop(cp, prop_name)) {
1140 				ptr = STRDUP(s);
1141 				out(O_ALTFP, "serd override %s_t %s",
1142 				    serdclass, s);
1143 				got_t_override = 1;
1144 			}
1145 			FREE(prop_name);
1146 		}
1147 
1148 		if (serdn != -1 && got_n_override == 0) {
1149 			nval = serdn;
1150 			out(O_ALTFP, "serd override %s_n %d", serdclass, serdn);
1151 			got_n_override = 1;
1152 		}
1153 		if (serdt != NULL && got_t_override == 0) {
1154 			ptr = STRDUP(serdt);
1155 			out(O_ALTFP, "serd override %s_t %s", serdclass, serdt);
1156 			got_t_override = 1;
1157 		}
1158 
1159 		if (!got_n_override) {
1160 			nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N,
1161 			    NULL);
1162 			ASSERT(nN->t == T_NUM);
1163 			nval = (uint_t)nN->u.ull;
1164 		}
1165 		if (!got_t_override) {
1166 			nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T,
1167 			    NULL);
1168 			ASSERT(nT->t == T_TIMEVAL);
1169 			tval = (hrtime_t)nT->u.ull;
1170 		} else {
1171 			const unsigned long long *ullp;
1172 			const char *suffix;
1173 			int len;
1174 
1175 			len = strspn(ptr, "0123456789");
1176 			suffix = stable(&ptr[len]);
1177 			ullp = (unsigned long long *)lut_lookup(Timesuffixlut,
1178 			    (void *)suffix, NULL);
1179 			ptr[len] = '\0';
1180 			tval = strtoull(ptr, NULL, 0) * (ullp ? *ullp : 1ll);
1181 			FREE(ptr);
1182 		}
1183 		fmd_serd_create(hdl, serdname, nval, tval);
1184 	}
1185 
1186 	newentp = MALLOC(sizeof (*newentp));
1187 	newentp->ename = stable(serdclass);
1188 	FREE(serdclass);
1189 	newentp->ipath = ipath(serdinst->u.stmt.np->u.event.epname);
1190 	newentp->hdl = hdl;
1191 	if (lut_lookup(SerdEngines, newentp, (lut_cmp)serd_cmp) == NULL) {
1192 		SerdEngines = lut_add(SerdEngines, (void *)newentp,
1193 		    (void *)newentp, (lut_cmp)serd_cmp);
1194 		Serd_need_save = 1;
1195 		serd_save();
1196 	} else {
1197 		FREE(newentp);
1198 	}
1199 
1200 
1201 	/*
1202 	 * increment SERD engine.  if engine fires, reset serd
1203 	 * engine and return trip_strcode if required.
1204 	 */
1205 	for (i = 0; i < serdincrement; i++) {
1206 		if (fmd_serd_record(hdl, serdname, ffep)) {
1207 			fmd_case_add_serd(hdl, fmcase, serdname);
1208 			fmd_serd_reset(hdl, serdname);
1209 
1210 			if (ippp) {
1211 				struct node *tripinst =
1212 				    lut_lookup(serdinst->u.stmt.lutp,
1213 				    (void *)L_trip, NULL);
1214 				ASSERT(tripinst != NULL);
1215 				*enamep = tripinst->u.event.ename->u.name.s;
1216 				*ippp = ipath(tripinst->u.event.epname);
1217 				out(O_ALTFP|O_NONL,
1218 				    "[engine fired: %s, sending: ", serdname);
1219 				ipath_print(O_ALTFP|O_NONL, *enamep, *ippp);
1220 				out(O_ALTFP, "]");
1221 			} else {
1222 				out(O_ALTFP, "[engine fired: %s, no trip]",
1223 				    serdname);
1224 			}
1225 			FREE(serdname);
1226 			return (1);
1227 		}
1228 	}
1229 
1230 	FREE(serdname);
1231 	return (0);
1232 }
1233 
1234 /*
1235  * search a suspect list for upsets.  feed each upset to serd_eval() and
1236  * build up tripped[], an array of ereports produced by the firing of
1237  * any SERD engines.  then feed each ereport back into
1238  * fme_receive_report().
1239  *
1240  * returns ntrip, the number of these ereports produced.
1241  */
1242 static int
1243 upsets_eval(struct fme *fmep, fmd_event_t *ffep)
1244 {
1245 	/* we build an array of tripped ereports that we send ourselves */
1246 	struct {
1247 		const char *ename;
1248 		const struct ipath *ipp;
1249 	} *tripped;
1250 	struct event *sp;
1251 	int ntrip, nupset, i;
1252 
1253 	/*
1254 	 * count the number of upsets to determine the upper limit on
1255 	 * expected trip ereport strings.  remember that one upset can
1256 	 * lead to at most one ereport.
1257 	 */
1258 	nupset = 0;
1259 	for (sp = fmep->suspects; sp; sp = sp->suspects) {
1260 		if (sp->t == N_UPSET)
1261 			nupset++;
1262 	}
1263 
1264 	if (nupset == 0)
1265 		return (0);
1266 
1267 	/*
1268 	 * get to this point if we have upsets and expect some trip
1269 	 * ereports
1270 	 */
1271 	tripped = alloca(sizeof (*tripped) * nupset);
1272 	bzero((void *)tripped, sizeof (*tripped) * nupset);
1273 
1274 	ntrip = 0;
1275 	for (sp = fmep->suspects; sp; sp = sp->suspects)
1276 		if (sp->t == N_UPSET &&
1277 		    serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp,
1278 		    &tripped[ntrip].ename, &tripped[ntrip].ipp) == 1)
1279 			ntrip++;
1280 
1281 	for (i = 0; i < ntrip; i++) {
1282 		struct event *ep, *nep;
1283 		struct fme *nfmep;
1284 		fmd_case_t *fmcase;
1285 		const struct ipath *ipp;
1286 		const char *eventstring;
1287 		int prev_verbose;
1288 		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
1289 		enum fme_state state;
1290 
1291 		/*
1292 		 * First try and evaluate a case with the trip ereport plus
1293 		 * all the other ereports that cause the trip. If that fails
1294 		 * to evaluate then try again with just this ereport on its own.
1295 		 */
1296 		out(O_ALTFP|O_NONL, "fme_receive_report_serd: ");
1297 		ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp);
1298 		out(O_ALTFP|O_STAMP, NULL);
1299 		ep = fmep->e0;
1300 		eventstring = ep->enode->u.event.ename->u.name.s;
1301 		ipp = ep->ipp;
1302 		prune_propagations(eventstring, ipp);
1303 
1304 		/*
1305 		 * create a duplicate fme and case
1306 		 */
1307 		fmcase = fmd_case_open(fmep->hdl, NULL);
1308 		out(O_ALTFP|O_NONL, "duplicate fme for event [");
1309 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1310 		out(O_ALTFP, " ]");
1311 		if ((nfmep = newfme(eventstring, ipp, fmep->hdl,
1312 		    fmcase)) == NULL) {
1313 			out(O_ALTFP|O_NONL, "[");
1314 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1315 			out(O_ALTFP, " CANNOT DIAGNOSE]");
1316 			publish_undiagnosable(fmep->hdl, ffep, fmcase);
1317 			continue;
1318 		}
1319 		Open_fme_count++;
1320 		nfmep->pull = fmep->pull;
1321 		init_fme_bufs(nfmep);
1322 		out(O_ALTFP|O_NONL, "[");
1323 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1324 		out(O_ALTFP, " created FME%d, case %s]", nfmep->id,
1325 		    fmd_case_uuid(nfmep->hdl, nfmep->fmcase));
1326 		if (ffep) {
1327 			fmd_case_setprincipal(nfmep->hdl, nfmep->fmcase, ffep);
1328 			fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, ffep);
1329 			nfmep->e0r = ffep;
1330 		}
1331 
1332 		/*
1333 		 * add the original ereports
1334 		 */
1335 		for (ep = fmep->observations; ep; ep = ep->observations) {
1336 			eventstring = ep->enode->u.event.ename->u.name.s;
1337 			ipp = ep->ipp;
1338 			out(O_ALTFP|O_NONL, "adding event [");
1339 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1340 			out(O_ALTFP, " ]");
1341 			nep = itree_lookup(nfmep->eventtree, eventstring, ipp);
1342 			if (nep->count++ == 0) {
1343 				nep->observations = nfmep->observations;
1344 				nfmep->observations = nep;
1345 				serialize_observation(nfmep, eventstring, ipp);
1346 				nep->nvp = evnv_dupnvl(ep->nvp);
1347 			}
1348 			if (ep->ffep && ep->ffep != ffep)
1349 				fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase,
1350 				    ep->ffep);
1351 			stats_counter_bump(nfmep->Rcount);
1352 		}
1353 
1354 		/*
1355 		 * add the serd trigger ereport
1356 		 */
1357 		if ((ep = itree_lookup(nfmep->eventtree, tripped[i].ename,
1358 		    tripped[i].ipp)) == NULL) {
1359 			/*
1360 			 * The trigger ereport is not in the instance tree. It
1361 			 * was presumably removed by prune_propagations() as
1362 			 * this combination of events is not present in the
1363 			 * rules.
1364 			 */
1365 			out(O_ALTFP, "upsets_eval: e0 not in instance tree");
1366 			Undiag_reason = UD_VAL_BADEVENTI;
1367 			goto retry_lone_ereport;
1368 		}
1369 		out(O_ALTFP|O_NONL, "adding event [");
1370 		ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp);
1371 		out(O_ALTFP, " ]");
1372 		nfmep->ecurrent = ep;
1373 		ep->nvp = NULL;
1374 		ep->count = 1;
1375 		ep->observations = nfmep->observations;
1376 		nfmep->observations = ep;
1377 
1378 		/*
1379 		 * just peek first.
1380 		 */
1381 		nfmep->peek = 1;
1382 		prev_verbose = Verbose;
1383 		if (Debug == 0)
1384 			Verbose = 0;
1385 		lut_walk(nfmep->eventtree, (lut_cb)clear_arrows, (void *)nfmep);
1386 		state = hypothesise(nfmep, nfmep->e0, nfmep->ull, &my_delay);
1387 		nfmep->peek = 0;
1388 		Verbose = prev_verbose;
1389 		if (state == FME_DISPROVED) {
1390 			out(O_ALTFP, "upsets_eval: hypothesis disproved");
1391 			Undiag_reason = UD_VAL_UNSOLVD;
1392 retry_lone_ereport:
1393 			/*
1394 			 * However the trigger ereport on its own might be
1395 			 * diagnosable, so check for that. Undo the new fme
1396 			 * and case we just created and call fme_receive_report.
1397 			 */
1398 			out(O_ALTFP|O_NONL, "[");
1399 			ipath_print(O_ALTFP|O_NONL, tripped[i].ename,
1400 			    tripped[i].ipp);
1401 			out(O_ALTFP, " retrying with just trigger ereport]");
1402 			itree_free(nfmep->eventtree);
1403 			nfmep->eventtree = NULL;
1404 			structconfig_free(nfmep->config);
1405 			nfmep->config = NULL;
1406 			destroy_fme_bufs(nfmep);
1407 			fmd_case_close(nfmep->hdl, nfmep->fmcase);
1408 			fme_receive_report(fmep->hdl, ffep,
1409 			    tripped[i].ename, tripped[i].ipp, NULL);
1410 			continue;
1411 		}
1412 
1413 		/*
1414 		 * and evaluate
1415 		 */
1416 		serialize_observation(nfmep, tripped[i].ename, tripped[i].ipp);
1417 		fme_eval(nfmep, ffep);
1418 	}
1419 
1420 	return (ntrip);
1421 }
1422 
1423 /*
1424  * fme_receive_external_report -- call when an external ereport comes in
1425  *
1426  * this routine just converts the relevant information from the ereport
1427  * into a format used internally and passes it on to fme_receive_report().
1428  */
1429 void
1430 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
1431     const char *class)
1432 {
1433 	struct node		*epnamenp;
1434 	fmd_case_t		*fmcase;
1435 	const struct ipath	*ipp;
1436 
1437 	class = stable(class);
1438 
1439 	/* Get the component path from the ereport */
1440 	epnamenp = platform_getpath(nvl);
1441 
1442 	/* See if we ended up without a path. */
1443 	if (epnamenp == NULL) {
1444 		/* See if class permits silent discard on unknown component. */
1445 		if (lut_lookup(Ereportenames_discard, (void *)class, NULL)) {
1446 			out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport "
1447 			    "to component path, but silent discard allowed.",
1448 			    class);
1449 		} else {
1450 			/*
1451 			 * XFILE: Failure to find a component is bad unless
1452 			 * 'discard_if_config_unknown=1' was specified in the
1453 			 * ereport definition. Indicate undiagnosable.
1454 			 */
1455 			out(O_ALTFP, "XFILE: Unable to map \"%s\" ereport "
1456 			    "to component path.", class);
1457 			Undiag_reason = UD_VAL_NOPATH;
1458 			fmcase = fmd_case_open(hdl, NULL);
1459 			publish_undiagnosable(hdl, ffep, fmcase);
1460 		}
1461 		return;
1462 	}
1463 
1464 	ipp = ipath(epnamenp);
1465 	tree_free(epnamenp);
1466 	fme_receive_report(hdl, ffep, class, ipp, nvl);
1467 }
1468 
1469 /*ARGSUSED*/
1470 void
1471 fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
1472     const char *eventstring)
1473 {
1474 	char *uuid;
1475 	nvlist_t **nva;
1476 	uint_t nvc;
1477 	const struct ipath *ipp;
1478 
1479 	if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 ||
1480 	    nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
1481 	    &nva, &nvc) != 0) {
1482 		out(O_ALTFP, "No uuid or fault list for list.repaired event");
1483 		return;
1484 	}
1485 
1486 	out(O_ALTFP, "Processing list.repaired from case %s", uuid);
1487 
1488 	while (nvc-- != 0) {
1489 		/*
1490 		 * Reset any istat or serd engine associated with this path.
1491 		 */
1492 		char *path;
1493 
1494 		if ((ipp = platform_fault2ipath(*nva++)) == NULL)
1495 			continue;
1496 
1497 		path = ipath2str(NULL, ipp);
1498 		out(O_ALTFP, "fme_receive_repair_list: resetting state for %s",
1499 		    path);
1500 		FREE(path);
1501 
1502 		lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp);
1503 		istat_save();
1504 
1505 		lut_walk(SerdEngines, (lut_cb)serd_reset_cb, (void *)ipp);
1506 		serd_save();
1507 	}
1508 }
1509 
1510 /*ARGSUSED*/
1511 void
1512 fme_receive_topology_change(void)
1513 {
1514 	lut_walk(Istats, (lut_cb)istat_counter_topo_chg_cb, NULL);
1515 	istat_save();
1516 
1517 	lut_walk(SerdEngines, (lut_cb)serd_topo_chg_cb, NULL);
1518 	serd_save();
1519 }
1520 
1521 static int mark_arrows(struct fme *fmep, struct event *ep, int mark,
1522     unsigned long long at_latest_by, unsigned long long *pdelay, int keep);
1523 
1524 /* ARGSUSED */
1525 static void
1526 clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
1527 {
1528 	struct bubble *bp;
1529 	struct arrowlist *ap;
1530 
1531 	ep->cached_state = 0;
1532 	ep->keep_in_tree = 0;
1533 	for (bp = itree_next_bubble(ep, NULL); bp;
1534 	    bp = itree_next_bubble(ep, bp)) {
1535 		if (bp->t != B_FROM)
1536 			continue;
1537 		bp->mark = 0;
1538 		for (ap = itree_next_arrow(bp, NULL); ap;
1539 		    ap = itree_next_arrow(bp, ap))
1540 			ap->arrowp->mark = 0;
1541 	}
1542 }
1543 
1544 static void
1545 fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
1546     const char *eventstring, const struct ipath *ipp, nvlist_t *nvl)
1547 {
1548 	struct event *ep;
1549 	struct fme *fmep = NULL;
1550 	struct fme *ofmep = NULL;
1551 	struct fme *cfmep, *svfmep;
1552 	int matched = 0;
1553 	nvlist_t *defect;
1554 	fmd_case_t *fmcase;
1555 
1556 	out(O_ALTFP|O_NONL, "fme_receive_report: ");
1557 	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1558 	out(O_ALTFP|O_STAMP, NULL);
1559 
1560 	/* decide which FME it goes to */
1561 	for (fmep = FMElist; fmep; fmep = fmep->next) {
1562 		int prev_verbose;
1563 		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
1564 		enum fme_state state;
1565 		nvlist_t *pre_peek_nvp = NULL;
1566 
1567 		if (fmep->overflow) {
1568 			if (!(fmd_case_closed(fmep->hdl, fmep->fmcase)))
1569 				ofmep = fmep;
1570 
1571 			continue;
1572 		}
1573 
1574 		/*
1575 		 * ignore solved or closed cases
1576 		 */
1577 		if (fmep->posted_suspects ||
1578 		    fmd_case_solved(fmep->hdl, fmep->fmcase) ||
1579 		    fmd_case_closed(fmep->hdl, fmep->fmcase))
1580 			continue;
1581 
1582 		/* look up event in event tree for this FME */
1583 		if ((ep = itree_lookup(fmep->eventtree,
1584 		    eventstring, ipp)) == NULL)
1585 			continue;
1586 
1587 		/* note observation */
1588 		fmep->ecurrent = ep;
1589 		if (ep->count++ == 0) {
1590 			/* link it into list of observations seen */
1591 			ep->observations = fmep->observations;
1592 			fmep->observations = ep;
1593 			ep->nvp = evnv_dupnvl(nvl);
1594 		} else {
1595 			/* use new payload values for peek */
1596 			pre_peek_nvp = ep->nvp;
1597 			ep->nvp = evnv_dupnvl(nvl);
1598 		}
1599 
1600 		/* tell hypothesise() not to mess with suspect list */
1601 		fmep->peek = 1;
1602 
1603 		/* don't want this to be verbose (unless Debug is set) */
1604 		prev_verbose = Verbose;
1605 		if (Debug == 0)
1606 			Verbose = 0;
1607 
1608 		lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
1609 		state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);
1610 
1611 		fmep->peek = 0;
1612 
1613 		/* put verbose flag back */
1614 		Verbose = prev_verbose;
1615 
1616 		if (state != FME_DISPROVED) {
1617 			/* found an FME that explains the ereport */
1618 			matched++;
1619 			out(O_ALTFP|O_NONL, "[");
1620 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1621 			out(O_ALTFP, " explained by FME%d]", fmep->id);
1622 
1623 			if (pre_peek_nvp)
1624 				nvlist_free(pre_peek_nvp);
1625 
1626 			if (ep->count == 1)
1627 				serialize_observation(fmep, eventstring, ipp);
1628 
1629 			if (ffep) {
1630 				fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1631 				ep->ffep = ffep;
1632 			}
1633 
1634 			stats_counter_bump(fmep->Rcount);
1635 
1636 			/* re-eval FME */
1637 			fme_eval(fmep, ffep);
1638 		} else {
1639 
1640 			/* not a match, undo noting of observation */
1641 			fmep->ecurrent = NULL;
1642 			if (--ep->count == 0) {
1643 				/* unlink it from observations */
1644 				fmep->observations = ep->observations;
1645 				ep->observations = NULL;
1646 				nvlist_free(ep->nvp);
1647 				ep->nvp = NULL;
1648 			} else {
1649 				nvlist_free(ep->nvp);
1650 				ep->nvp = pre_peek_nvp;
1651 			}
1652 		}
1653 	}
1654 
1655 	if (matched)
1656 		return;	/* explained by at least one existing FME */
1657 
1658 	/* clean up closed fmes */
1659 	cfmep = ClosedFMEs;
1660 	while (cfmep != NULL) {
1661 		svfmep = cfmep->next;
1662 		destroy_fme(cfmep);
1663 		cfmep = svfmep;
1664 	}
1665 	ClosedFMEs = NULL;
1666 	prune_propagations(eventstring, ipp);
1667 
1668 	if (ofmep) {
1669 		out(O_ALTFP|O_NONL, "[");
1670 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1671 		out(O_ALTFP, " ADDING TO OVERFLOW FME]");
1672 		if (ffep)
1673 			fmd_case_add_ereport(hdl, ofmep->fmcase, ffep);
1674 
1675 		return;
1676 
1677 	} else if (Max_fme && (Open_fme_count >= Max_fme)) {
1678 		out(O_ALTFP|O_NONL, "[");
1679 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1680 		out(O_ALTFP, " MAX OPEN FME REACHED]");
1681 
1682 		fmcase = fmd_case_open(hdl, NULL);
1683 
1684 		/* Create overflow fme */
1685 		if ((fmep = newfme(eventstring, ipp, hdl, fmcase)) == NULL) {
1686 			out(O_ALTFP|O_NONL, "[");
1687 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1688 			out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]");
1689 			publish_undiagnosable(hdl, ffep, fmcase);
1690 			return;
1691 		}
1692 
1693 		Open_fme_count++;
1694 
1695 		init_fme_bufs(fmep);
1696 		fmep->overflow = B_TRUE;
1697 
1698 		if (ffep)
1699 			fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1700 
1701 		Undiag_reason = UD_VAL_MAXFME;
1702 		defect = fmd_nvl_create_fault(hdl,
1703 		    undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL);
1704 		(void) nvlist_add_string(defect, UNDIAG_REASON,
1705 		    undiag_2reason_str(Undiag_reason));
1706 		fmd_case_add_suspect(hdl, fmep->fmcase, defect);
1707 		fmd_case_solve(hdl, fmep->fmcase);
1708 		Undiag_reason = UD_VAL_UNKNOWN;
1709 		return;
1710 	}
1711 
1712 	/* open a case */
1713 	fmcase = fmd_case_open(hdl, NULL);
1714 
1715 	/* start a new FME */
1716 	if ((fmep = newfme(eventstring, ipp, hdl, fmcase)) == NULL) {
1717 		out(O_ALTFP|O_NONL, "[");
1718 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1719 		out(O_ALTFP, " CANNOT DIAGNOSE]");
1720 		publish_undiagnosable(hdl, ffep, fmcase);
1721 		return;
1722 	}
1723 
1724 	Open_fme_count++;
1725 
1726 	init_fme_bufs(fmep);
1727 
1728 	out(O_ALTFP|O_NONL, "[");
1729 	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1730 	out(O_ALTFP, " created FME%d, case %s]", fmep->id,
1731 	    fmd_case_uuid(hdl, fmep->fmcase));
1732 
1733 	ep = fmep->e0;
1734 	ASSERT(ep != NULL);
1735 
1736 	/* note observation */
1737 	fmep->ecurrent = ep;
1738 	if (ep->count++ == 0) {
1739 		/* link it into list of observations seen */
1740 		ep->observations = fmep->observations;
1741 		fmep->observations = ep;
1742 		ep->nvp = evnv_dupnvl(nvl);
1743 		serialize_observation(fmep, eventstring, ipp);
1744 	} else {
1745 		/* new payload overrides any previous */
1746 		nvlist_free(ep->nvp);
1747 		ep->nvp = evnv_dupnvl(nvl);
1748 	}
1749 
1750 	stats_counter_bump(fmep->Rcount);
1751 
1752 	if (ffep) {
1753 		fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1754 		fmd_case_setprincipal(hdl, fmep->fmcase, ffep);
1755 		fmep->e0r = ffep;
1756 		ep->ffep = ffep;
1757 	}
1758 
1759 	/* give the diagnosis algorithm a shot at the new FME state */
1760 	fme_eval(fmep, ffep);
1761 }
1762 
1763 void
1764 fme_status(int flags)
1765 {
1766 	struct fme *fmep;
1767 
1768 	if (FMElist == NULL) {
1769 		out(flags, "No fault management exercises underway.");
1770 		return;
1771 	}
1772 
1773 	for (fmep = FMElist; fmep; fmep = fmep->next)
1774 		fme_print(flags, fmep);
1775 }
1776 
1777 /*
1778  * "indent" routines used mostly for nicely formatted debug output, but also
1779  * for sanity checking for infinite recursion bugs.
1780  */
1781 
1782 #define	MAX_INDENT 1024
1783 static const char *indent_s[MAX_INDENT];
1784 static int current_indent;
1785 
1786 static void
1787 indent_push(const char *s)
1788 {
1789 	if (current_indent < MAX_INDENT)
1790 		indent_s[current_indent++] = s;
1791 	else
1792 		out(O_DIE, "unexpected recursion depth (%d)", current_indent);
1793 }
1794 
1795 static void
1796 indent_set(const char *s)
1797 {
1798 	current_indent = 0;
1799 	indent_push(s);
1800 }
1801 
1802 static void
1803 indent_pop(void)
1804 {
1805 	if (current_indent > 0)
1806 		current_indent--;
1807 	else
1808 		out(O_DIE, "recursion underflow");
1809 }
1810 
1811 static void
1812 indent(void)
1813 {
1814 	int i;
1815 	if (!Verbose)
1816 		return;
1817 	for (i = 0; i < current_indent; i++)
1818 		out(O_ALTFP|O_VERB|O_NONL, indent_s[i]);
1819 }
1820 
1821 #define	SLNEW		1
1822 #define	SLCHANGED	2
1823 #define	SLWAIT		3
1824 #define	SLDISPROVED	4
1825 
1826 static void
1827 print_suspects(int circumstance, struct fme *fmep)
1828 {
1829 	struct event *ep;
1830 
1831 	out(O_ALTFP|O_NONL, "[");
1832 	if (circumstance == SLCHANGED) {
1833 		out(O_ALTFP|O_NONL, "FME%d diagnosis changed. state: %s, "
1834 		    "suspect list:", fmep->id, fme_state2str(fmep->state));
1835 	} else if (circumstance == SLWAIT) {
1836 		out(O_ALTFP|O_NONL, "FME%d set wait timer %ld ", fmep->id,
1837 		    fmep->timer);
1838 		ptree_timeval(O_ALTFP|O_NONL, &fmep->wull);
1839 	} else if (circumstance == SLDISPROVED) {
1840 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id);
1841 	} else {
1842 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id);
1843 	}
1844 
1845 	if (circumstance == SLWAIT || circumstance == SLDISPROVED) {
1846 		out(O_ALTFP, "]");
1847 		return;
1848 	}
1849 
1850 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
1851 		out(O_ALTFP|O_NONL, " ");
1852 		itree_pevent_brief(O_ALTFP|O_NONL, ep);
1853 	}
1854 	out(O_ALTFP, "]");
1855 }
1856 
1857 static struct node *
1858 eventprop_lookup(struct event *ep, const char *propname)
1859 {
1860 	return (lut_lookup(ep->props, (void *)propname, NULL));
1861 }
1862 
1863 #define	MAXDIGITIDX	23
1864 static char numbuf[MAXDIGITIDX + 1];
1865 
1866 static int
1867 node2uint(struct node *n, uint_t *valp)
1868 {
1869 	struct evalue value;
1870 	struct lut *globals = NULL;
1871 
1872 	if (n == NULL)
1873 		return (1);
1874 
1875 	/*
1876 	 * check value.v since we are being asked to convert an unsigned
1877 	 * long long int to an unsigned int
1878 	 */
1879 	if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) ||
1880 	    value.t != UINT64 || value.v > (1ULL << 32))
1881 		return (1);
1882 
1883 	*valp = (uint_t)value.v;
1884 
1885 	return (0);
1886 }
1887 
1888 static nvlist_t *
1889 node2fmri(struct node *n)
1890 {
1891 	nvlist_t **pa, *f, *p;
1892 	struct node *nc;
1893 	uint_t depth = 0;
1894 	char *numstr, *nullbyte;
1895 	char *failure;
1896 	int err, i;
1897 
1898 	/* XXX do we need to be able to handle a non-T_NAME node? */
1899 	if (n == NULL || n->t != T_NAME)
1900 		return (NULL);
1901 
1902 	for (nc = n; nc != NULL; nc = nc->u.name.next) {
1903 		if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM)
1904 			break;
1905 		depth++;
1906 	}
1907 
1908 	if (nc != NULL) {
1909 		/* We bailed early, something went wrong */
1910 		return (NULL);
1911 	}
1912 
1913 	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
1914 		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
1915 	pa = alloca(depth * sizeof (nvlist_t *));
1916 	for (i = 0; i < depth; i++)
1917 		pa[i] = NULL;
1918 
1919 	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
1920 	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
1921 	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
1922 	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
1923 	if (err != 0) {
1924 		failure = "basic construction of FMRI failed";
1925 		goto boom;
1926 	}
1927 
1928 	numbuf[MAXDIGITIDX] = '\0';
1929 	nullbyte = &numbuf[MAXDIGITIDX];
1930 	i = 0;
1931 
1932 	for (nc = n; nc != NULL; nc = nc->u.name.next) {
1933 		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
1934 		if (err != 0) {
1935 			failure = "alloc of an hc-pair failed";
1936 			goto boom;
1937 		}
1938 		err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s);
1939 		numstr = ulltostr(nc->u.name.child->u.ull, nullbyte);
1940 		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
1941 		if (err != 0) {
1942 			failure = "construction of an hc-pair failed";
1943 			goto boom;
1944 		}
1945 		pa[i++] = p;
1946 	}
1947 
1948 	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
1949 	if (err == 0) {
1950 		for (i = 0; i < depth; i++)
1951 			if (pa[i] != NULL)
1952 				nvlist_free(pa[i]);
1953 		return (f);
1954 	}
1955 	failure = "addition of hc-pair array to FMRI failed";
1956 
1957 boom:
1958 	for (i = 0; i < depth; i++)
1959 		if (pa[i] != NULL)
1960 			nvlist_free(pa[i]);
1961 	nvlist_free(f);
1962 	out(O_DIE, "%s", failure);
1963 	/*NOTREACHED*/
1964 	return (NULL);
1965 }
1966 
/*
 * an ipath cache entry is an array of these, with s==NULL at the end.
 * each element describes one path component: its name and its instance
 * number.
 */
struct ipath {
	const char *s;	/* component name (in stable) */
	int i;		/* instance number */
};
1972 
1973 static nvlist_t *
1974 ipath2fmri(struct ipath *ipath)
1975 {
1976 	nvlist_t **pa, *f, *p;
1977 	uint_t depth = 0;
1978 	char *numstr, *nullbyte;
1979 	char *failure;
1980 	int err, i;
1981 	struct ipath *ipp;
1982 
1983 	for (ipp = ipath; ipp->s != NULL; ipp++)
1984 		depth++;
1985 
1986 	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
1987 		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
1988 	pa = alloca(depth * sizeof (nvlist_t *));
1989 	for (i = 0; i < depth; i++)
1990 		pa[i] = NULL;
1991 
1992 	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
1993 	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
1994 	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
1995 	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
1996 	if (err != 0) {
1997 		failure = "basic construction of FMRI failed";
1998 		goto boom;
1999 	}
2000 
2001 	numbuf[MAXDIGITIDX] = '\0';
2002 	nullbyte = &numbuf[MAXDIGITIDX];
2003 	i = 0;
2004 
2005 	for (ipp = ipath; ipp->s != NULL; ipp++) {
2006 		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
2007 		if (err != 0) {
2008 			failure = "alloc of an hc-pair failed";
2009 			goto boom;
2010 		}
2011 		err = nvlist_add_string(p, FM_FMRI_HC_NAME, ipp->s);
2012 		numstr = ulltostr(ipp->i, nullbyte);
2013 		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
2014 		if (err != 0) {
2015 			failure = "construction of an hc-pair failed";
2016 			goto boom;
2017 		}
2018 		pa[i++] = p;
2019 	}
2020 
2021 	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
2022 	if (err == 0) {
2023 		for (i = 0; i < depth; i++)
2024 			if (pa[i] != NULL)
2025 				nvlist_free(pa[i]);
2026 		return (f);
2027 	}
2028 	failure = "addition of hc-pair array to FMRI failed";
2029 
2030 boom:
2031 	for (i = 0; i < depth; i++)
2032 		if (pa[i] != NULL)
2033 			nvlist_free(pa[i]);
2034 	nvlist_free(f);
2035 	out(O_DIE, "%s", failure);
2036 	/*NOTREACHED*/
2037 	return (NULL);
2038 }
2039 
/*
 * percentof -- return part as a percentage of whole, rounded to the
 * nearest integer (halves round up).  returns 0 when whole is 0.
 */
static uint8_t
percentof(uint_t part, uint_t whole)
{
	unsigned long long permille;

	/* avoid dividing by zero */
	if (whole == 0)
		return (0);

	/*
	 * widen before multiplying: in 32-bit arithmetic part * 1000
	 * wraps around for any part above roughly 4.29 million.
	 */
	permille = ((unsigned long long)part * 1000) / whole;

	return ((permille / 10) + (((permille % 10) >= 5) ? 1 : 0));
}
2047 
/*
 * struct rsl -- a suspect event together with the ASRU, FRU and resource
 * nvlists associated with it.
 */
struct rsl {
	struct event *suspect;	/* the suspect event itself */
	nvlist_t *asru;		/* ASRU nvlist, or NULL */
	nvlist_t *fru;		/* FRU nvlist, or NULL */
	nvlist_t *rsrc;		/* resource nvlist, or NULL; may alias asru */
};
2054 
2055 static void publish_suspects(struct fme *fmep, struct rsl *srl);
2056 
2057 /*
2058  *  rslfree -- free internal members of struct rsl not expected to be
2059  *	freed elsewhere.
2060  */
2061 static void
2062 rslfree(struct rsl *freeme)
2063 {
2064 	if (freeme->asru != NULL)
2065 		nvlist_free(freeme->asru);
2066 	if (freeme->fru != NULL)
2067 		nvlist_free(freeme->fru);
2068 	if (freeme->rsrc != NULL && freeme->rsrc != freeme->asru)
2069 		nvlist_free(freeme->rsrc);
2070 }
2071 
2072 /*
2073  *  rslcmp -- compare two rsl structures.  Use the following
2074  *	comparisons to establish cardinality:
2075  *
2076  *	1. Name of the suspect's class. (simple strcmp)
2077  *	2. Name of the suspect's ASRU. (trickier, since nvlist)
2078  *
2079  */
2080 static int
2081 rslcmp(const void *a, const void *b)
2082 {
2083 	struct rsl *r1 = (struct rsl *)a;
2084 	struct rsl *r2 = (struct rsl *)b;
2085 	int rv;
2086 
2087 	rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s,
2088 	    r2->suspect->enode->u.event.ename->u.name.s);
2089 	if (rv != 0)
2090 		return (rv);
2091 
2092 	if (r1->rsrc == NULL && r2->rsrc == NULL)
2093 		return (0);
2094 	if (r1->rsrc == NULL)
2095 		return (-1);
2096 	if (r2->rsrc == NULL)
2097 		return (1);
2098 	return (evnv_cmpnvl(r1->rsrc, r2->rsrc, 0));
2099 }
2100 
2101 /*
2102  * get_resources -- for a given suspect, determine what ASRU, FRU and
2103  *     RSRC nvlists should be advertised in the final suspect list.
2104  */
2105 void
2106 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot)
2107 {
2108 	struct node *asrudef, *frudef;
2109 	nvlist_t *asru, *fru;
2110 	nvlist_t *rsrc = NULL;
2111 	char *pathstr;
2112 
2113 	/*
2114 	 * First find any ASRU and/or FRU defined in the
2115 	 * initial fault tree.
2116 	 */
2117 	asrudef = eventprop_lookup(sp, L_ASRU);
2118 	frudef = eventprop_lookup(sp, L_FRU);
2119 
2120 	/*
2121 	 * Create FMRIs based on those definitions
2122 	 */
2123 	asru = node2fmri(asrudef);
2124 	fru = node2fmri(frudef);
2125 	pathstr = ipath2str(NULL, sp->ipp);
2126 
2127 	/*
2128 	 *  Allow for platform translations of the FMRIs
2129 	 */
2130 	platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc,
2131 	    pathstr);
2132 
2133 	FREE(pathstr);
2134 	rsrcs->suspect = sp;
2135 	rsrcs->asru = asru;
2136 	rsrcs->fru = fru;
2137 	rsrcs->rsrc = rsrc;
2138 }
2139 
2140 /*
2141  * trim_suspects -- prior to publishing, we may need to remove some
2142  *    suspects from the list.  If we're auto-closing upsets, we don't
2143  *    want any of those in the published list.  If the ASRUs for multiple
2144  *    defects resolve to the same ASRU (driver) we only want to publish
2145  *    that as a single suspect.
2146  */
2147 static int
2148 trim_suspects(struct fme *fmep, struct rsl *begin, struct rsl *begin2,
2149     fmd_event_t *ffep)
2150 {
2151 	struct event *ep;
2152 	struct rsl *rp = begin;
2153 	struct rsl *rp2 = begin2;
2154 	int mess_zero_count = 0;
2155 	int serd_rval;
2156 	uint_t messval;
2157 
2158 	/* remove any unwanted upsets and populate our array */
2159 	for (ep = fmep->psuspects; ep; ep = ep->psuspects) {
2160 		if (is_upset(ep->t))
2161 			continue;
2162 		serd_rval = serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, ep,
2163 		    NULL, NULL);
2164 		if (serd_rval == 0)
2165 			continue;
2166 		if (node2uint(eventprop_lookup(ep, L_message),
2167 		    &messval) == 0 && messval == 0) {
2168 			get_resources(ep, rp2, fmep->config);
2169 			rp2++;
2170 			mess_zero_count++;
2171 		} else {
2172 			get_resources(ep, rp, fmep->config);
2173 			rp++;
2174 			fmep->nsuspects++;
2175 		}
2176 	}
2177 	return (mess_zero_count);
2178 }
2179 
2180 /*
2181  * addpayloadprop -- add a payload prop to a problem
2182  */
2183 static void
2184 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault)
2185 {
2186 	nvlist_t *rsrc, *hcs;
2187 
2188 	ASSERT(fault != NULL);
2189 	ASSERT(lhs != NULL);
2190 	ASSERT(rhs != NULL);
2191 
2192 	if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, &rsrc) != 0)
2193 		out(O_DIE, "cannot add payloadprop \"%s\" to fault", lhs);
2194 
2195 	if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) {
2196 		out(O_ALTFP|O_VERB2, "addpayloadprop: create hc_specific");
2197 		if (nvlist_xalloc(&hcs, NV_UNIQUE_NAME, &Eft_nv_hdl) != 0)
2198 			out(O_DIE,
2199 			    "cannot add payloadprop \"%s\" to fault", lhs);
2200 		if (nvlist_add_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, hcs) != 0)
2201 			out(O_DIE,
2202 			    "cannot add payloadprop \"%s\" to fault", lhs);
2203 		nvlist_free(hcs);
2204 		if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0)
2205 			out(O_DIE,
2206 			    "cannot add payloadprop \"%s\" to fault", lhs);
2207 	} else
2208 		out(O_ALTFP|O_VERB2, "addpayloadprop: reuse hc_specific");
2209 
2210 	if (rhs->t == UINT64) {
2211 		out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v);
2212 
2213 		if (nvlist_add_uint64(hcs, lhs, rhs->v) != 0)
2214 			out(O_DIE,
2215 			    "cannot add payloadprop \"%s\" to fault", lhs);
2216 	} else {
2217 		out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"",
2218 		    lhs, (char *)(uintptr_t)rhs->v);
2219 
2220 		if (nvlist_add_string(hcs, lhs, (char *)(uintptr_t)rhs->v) != 0)
2221 			out(O_DIE,
2222 			    "cannot add payloadprop \"%s\" to fault", lhs);
2223 	}
2224 }
2225 
/*
 * Scratch state shared by istat_save() and its lut_walk() callbacks
 * (istataddsize() and istat2str()) while the istats are serialized.
 */
static char *Istatbuf;		/* start of the serialization buffer */
static char *Istatbufptr;	/* next byte to write within Istatbuf */
static int Istatsz;		/* size in bytes of the serialized istats */
2229 
2230 /*
2231  * istataddsize -- calculate size of istat and add it to Istatsz
2232  */
2233 /*ARGSUSED2*/
2234 static void
2235 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg)
2236 {
2237 	int val;
2238 
2239 	ASSERT(lhs != NULL);
2240 	ASSERT(rhs != NULL);
2241 
2242 	if ((val = stats_counter_value(rhs)) == 0)
2243 		return;	/* skip zero-valued stats */
2244 
2245 	/* count up the size of the stat name */
2246 	Istatsz += ipath2strlen(lhs->ename, lhs->ipath);
2247 	Istatsz++;	/* for the trailing NULL byte */
2248 
2249 	/* count up the size of the stat value */
2250 	Istatsz += snprintf(NULL, 0, "%d", val);
2251 	Istatsz++;	/* for the trailing NULL byte */
2252 }
2253 
2254 /*
2255  * istat2str -- serialize an istat, writing result to *Istatbufptr
2256  */
2257 /*ARGSUSED2*/
2258 static void
2259 istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg)
2260 {
2261 	char *str;
2262 	int len;
2263 	int val;
2264 
2265 	ASSERT(lhs != NULL);
2266 	ASSERT(rhs != NULL);
2267 
2268 	if ((val = stats_counter_value(rhs)) == 0)
2269 		return;	/* skip zero-valued stats */
2270 
2271 	/* serialize the stat name */
2272 	str = ipath2str(lhs->ename, lhs->ipath);
2273 	len = strlen(str);
2274 
2275 	ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]);
2276 	(void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr);
2277 	Istatbufptr += len;
2278 	FREE(str);
2279 	*Istatbufptr++ = '\0';
2280 
2281 	/* serialize the stat value */
2282 	Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr,
2283 	    "%d", val);
2284 	*Istatbufptr++ = '\0';
2285 
2286 	ASSERT(Istatbufptr <= &Istatbuf[Istatsz]);
2287 }
2288 
2289 void
2290 istat_save()
2291 {
2292 	if (Istat_need_save == 0)
2293 		return;
2294 
2295 	/* figure out how big the serialzed info is */
2296 	Istatsz = 0;
2297 	lut_walk(Istats, (lut_cb)istataddsize, NULL);
2298 
2299 	if (Istatsz == 0) {
2300 		/* no stats to save */
2301 		fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS);
2302 		return;
2303 	}
2304 
2305 	/* create the serialized buffer */
2306 	Istatbufptr = Istatbuf = MALLOC(Istatsz);
2307 	lut_walk(Istats, (lut_cb)istat2str, NULL);
2308 
2309 	/* clear out current saved stats */
2310 	fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS);
2311 
2312 	/* write out the new version */
2313 	fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz);
2314 	FREE(Istatbuf);
2315 
2316 	Istat_need_save = 0;
2317 }
2318 
2319 int
2320 istat_cmp(struct istat_entry *ent1, struct istat_entry *ent2)
2321 {
2322 	if (ent1->ename != ent2->ename)
2323 		return (ent2->ename - ent1->ename);
2324 	if (ent1->ipath != ent2->ipath)
2325 		return ((char *)ent2->ipath - (char *)ent1->ipath);
2326 
2327 	return (0);
2328 }
2329 
2330 /*
2331  * istat-verify -- verify the component associated with a stat still exists
2332  *
2333  * if the component no longer exists, this routine resets the stat and
2334  * returns 0.  if the component still exists, it returns 1.
2335  */
2336 static int
2337 istat_verify(struct node *snp, struct istat_entry *entp)
2338 {
2339 	struct stats *statp;
2340 	nvlist_t *fmri;
2341 
2342 	fmri = node2fmri(snp->u.event.epname);
2343 	if (platform_path_exists(fmri)) {
2344 		nvlist_free(fmri);
2345 		return (1);
2346 	}
2347 	nvlist_free(fmri);
2348 
2349 	/* component no longer in system.  zero out the associated stats */
2350 	if ((statp = (struct stats *)
2351 	    lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL ||
2352 	    stats_counter_value(statp) == 0)
2353 		return (0);	/* stat is already reset */
2354 
2355 	Istat_need_save = 1;
2356 	stats_counter_reset(statp);
2357 	return (0);
2358 }
2359 
2360 static void
2361 istat_bump(struct node *snp, int n)
2362 {
2363 	struct stats *statp;
2364 	struct istat_entry ent;
2365 
2366 	ASSERT(snp != NULL);
2367 	ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t));
2368 	ASSERT(snp->u.event.epname != NULL);
2369 
2370 	/* class name should be hoisted into a single stable entry */
2371 	ASSERT(snp->u.event.ename->u.name.next == NULL);
2372 	ent.ename = snp->u.event.ename->u.name.s;
2373 	ent.ipath = ipath(snp->u.event.epname);
2374 
2375 	if (!istat_verify(snp, &ent)) {
2376 		/* component no longer exists in system, nothing to do */
2377 		return;
2378 	}
2379 
2380 	if ((statp = (struct stats *)
2381 	    lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) {
2382 		/* need to create the counter */
2383 		int cnt = 0;
2384 		struct node *np;
2385 		char *sname;
2386 		char *snamep;
2387 		struct istat_entry *newentp;
2388 
2389 		/* count up the size of the stat name */
2390 		np = snp->u.event.ename;
2391 		while (np != NULL) {
2392 			cnt += strlen(np->u.name.s);
2393 			cnt++;	/* for the '.' or '@' */
2394 			np = np->u.name.next;
2395 		}
2396 		np = snp->u.event.epname;
2397 		while (np != NULL) {
2398 			cnt += snprintf(NULL, 0, "%s%llu",
2399 			    np->u.name.s, np->u.name.child->u.ull);
2400 			cnt++;	/* for the '/' or trailing NULL byte */
2401 			np = np->u.name.next;
2402 		}
2403 
2404 		/* build the stat name */
2405 		snamep = sname = alloca(cnt);
2406 		np = snp->u.event.ename;
2407 		while (np != NULL) {
2408 			snamep += snprintf(snamep, &sname[cnt] - snamep,
2409 			    "%s", np->u.name.s);
2410 			np = np->u.name.next;
2411 			if (np)
2412 				*snamep++ = '.';
2413 		}
2414 		*snamep++ = '@';
2415 		np = snp->u.event.epname;
2416 		while (np != NULL) {
2417 			snamep += snprintf(snamep, &sname[cnt] - snamep,
2418 			    "%s%llu", np->u.name.s, np->u.name.child->u.ull);
2419 			np = np->u.name.next;
2420 			if (np)
2421 				*snamep++ = '/';
2422 		}
2423 		*snamep++ = '\0';
2424 
2425 		/* create the new stat & add it to our list */
2426 		newentp = MALLOC(sizeof (*newentp));
2427 		*newentp = ent;
2428 		statp = stats_new_counter(NULL, sname, 0);
2429 		Istats = lut_add(Istats, (void *)newentp, (void *)statp,
2430 		    (lut_cmp)istat_cmp);
2431 	}
2432 
2433 	/* if n is non-zero, set that value instead of bumping */
2434 	if (n) {
2435 		stats_counter_reset(statp);
2436 		stats_counter_add(statp, n);
2437 	} else
2438 		stats_counter_bump(statp);
2439 	Istat_need_save = 1;
2440 
2441 	ipath_print(O_ALTFP|O_VERB2, ent.ename, ent.ipath);
2442 	out(O_ALTFP|O_VERB2, " %s to value %d", n ? "set" : "incremented",
2443 	    stats_counter_value(statp));
2444 }
2445 
/*
 * istat_destructor -- lut_free() callback for Istats: free the
 *	istat_entry key ("left") and delete the stat ("right").
 */
/*ARGSUSED*/
static void
istat_destructor(void *left, void *right, void *arg)
{
	FREE(left);
	stats_delete((struct stats *)right);
}
2455 
2456 /*
2457  * Callback used in a walk of the Istats to reset matching stat counters.
2458  */
2459 static void
2460 istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp,
2461     const struct ipath *ipp)
2462 {
2463 	char *path;
2464 
2465 	if (entp->ipath == ipp) {
2466 		path = ipath2str(entp->ename, ipp);
2467 		out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path);
2468 		FREE(path);
2469 		stats_counter_reset(statp);
2470 		Istat_need_save = 1;
2471 	}
2472 }
2473 
2474 /*ARGSUSED*/
2475 static void
2476 istat_counter_topo_chg_cb(struct istat_entry *entp, struct stats *statp,
2477     void *unused)
2478 {
2479 	char *path;
2480 	nvlist_t *fmri;
2481 
2482 	fmri = ipath2fmri((struct ipath *)(entp->ipath));
2483 	if (!platform_path_exists(fmri)) {
2484 		path = ipath2str(entp->ename, entp->ipath);
2485 		out(O_ALTFP, "istat_counter_topo_chg_cb: not present %s", path);
2486 		FREE(path);
2487 		stats_counter_reset(statp);
2488 		Istat_need_save = 1;
2489 	}
2490 	nvlist_free(fmri);
2491 }
2492 
2493 void
2494 istat_fini(void)
2495 {
2496 	lut_free(Istats, istat_destructor, NULL);
2497 }
2498 
/*
 * Scratch state shared by serd_save() and its lut_walk() callbacks
 * (serdaddsize() and serd2str()) while the serd engine names are
 * serialized.
 */
static char *Serdbuf;		/* start of the serialization buffer */
static char *Serdbufptr;	/* next byte to write within Serdbuf */
static int Serdsz;		/* size in bytes of the serialized names */
2502 
2503 /*
2504  * serdaddsize -- calculate size of serd and add it to Serdsz
2505  */
2506 /*ARGSUSED*/
2507 static void
2508 serdaddsize(const struct serd_entry *lhs, struct stats *rhs, void *arg)
2509 {
2510 	ASSERT(lhs != NULL);
2511 
2512 	/* count up the size of the stat name */
2513 	Serdsz += ipath2strlen(lhs->ename, lhs->ipath);
2514 	Serdsz++;	/* for the trailing NULL byte */
2515 }
2516 
2517 /*
2518  * serd2str -- serialize a serd engine, writing result to *Serdbufptr
2519  */
2520 /*ARGSUSED*/
2521 static void
2522 serd2str(const struct serd_entry *lhs, struct stats *rhs, void *arg)
2523 {
2524 	char *str;
2525 	int len;
2526 
2527 	ASSERT(lhs != NULL);
2528 
2529 	/* serialize the serd engine name */
2530 	str = ipath2str(lhs->ename, lhs->ipath);
2531 	len = strlen(str);
2532 
2533 	ASSERT(Serdbufptr + len + 1 <= &Serdbuf[Serdsz]);
2534 	(void) strlcpy(Serdbufptr, str, &Serdbuf[Serdsz] - Serdbufptr);
2535 	Serdbufptr += len;
2536 	FREE(str);
2537 	*Serdbufptr++ = '\0';
2538 	ASSERT(Serdbufptr <= &Serdbuf[Serdsz]);
2539 }
2540 
2541 void
2542 serd_save()
2543 {
2544 	if (Serd_need_save == 0)
2545 		return;
2546 
2547 	/* figure out how big the serialzed info is */
2548 	Serdsz = 0;
2549 	lut_walk(SerdEngines, (lut_cb)serdaddsize, NULL);
2550 
2551 	if (Serdsz == 0) {
2552 		/* no serd engines to save */
2553 		fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS);
2554 		return;
2555 	}
2556 
2557 	/* create the serialized buffer */
2558 	Serdbufptr = Serdbuf = MALLOC(Serdsz);
2559 	lut_walk(SerdEngines, (lut_cb)serd2str, NULL);
2560 
2561 	/* clear out current saved stats */
2562 	fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS);
2563 
2564 	/* write out the new version */
2565 	fmd_buf_write(Hdl, NULL, WOBUF_SERDS, Serdbuf, Serdsz);
2566 	FREE(Serdbuf);
2567 	Serd_need_save = 0;
2568 }
2569 
2570 int
2571 serd_cmp(struct serd_entry *ent1, struct serd_entry *ent2)
2572 {
2573 	if (ent1->ename != ent2->ename)
2574 		return (ent2->ename - ent1->ename);
2575 	if (ent1->ipath != ent2->ipath)
2576 		return ((char *)ent2->ipath - (char *)ent1->ipath);
2577 
2578 	return (0);
2579 }
2580 
2581 void
2582 fme_serd_load(fmd_hdl_t *hdl)
2583 {
2584 	int sz;
2585 	char *sbuf;
2586 	char *sepptr;
2587 	char *ptr;
2588 	struct serd_entry *newentp;
2589 	struct node *epname;
2590 	nvlist_t *fmri;
2591 	char *namestring;
2592 
2593 	if ((sz = fmd_buf_size(hdl, NULL, WOBUF_SERDS)) == 0)
2594 		return;
2595 	sbuf = alloca(sz);
2596 	fmd_buf_read(hdl, NULL, WOBUF_SERDS, sbuf, sz);
2597 	ptr = sbuf;
2598 	while (ptr < &sbuf[sz]) {
2599 		sepptr = strchr(ptr, '@');
2600 		*sepptr = '\0';
2601 		namestring = ptr;
2602 		sepptr++;
2603 		ptr = sepptr;
2604 		ptr += strlen(ptr);
2605 		ptr++;	/* move past the '\0' separating paths */
2606 		epname = pathstring2epnamenp(sepptr);
2607 		fmri = node2fmri(epname);
2608 		if (platform_path_exists(fmri)) {
2609 			newentp = MALLOC(sizeof (*newentp));
2610 			newentp->hdl = hdl;
2611 			newentp->ipath = ipath(epname);
2612 			newentp->ename = stable(namestring);
2613 			SerdEngines = lut_add(SerdEngines, (void *)newentp,
2614 			    (void *)newentp, (lut_cmp)serd_cmp);
2615 		} else
2616 			Serd_need_save = 1;
2617 		tree_free(epname);
2618 		nvlist_free(fmri);
2619 	}
2620 	/* save it back again in case some of the paths no longer exist */
2621 	serd_save();
2622 }
2623 
/*
 * serd_destructor -- lut_free() callback for SerdEngines: free the
 *	serd_entry passed as "left".
 */
/*ARGSUSED*/
static void
serd_destructor(void *left, void *right, void *arg)
{
	FREE(left);
}
2631 
2632 /*
2633  * Callback used in a walk of the SerdEngines to reset matching serd engines.
2634  */
2635 /*ARGSUSED*/
2636 static void
2637 serd_reset_cb(struct serd_entry *entp, void *unused, const struct ipath *ipp)
2638 {
2639 	char *path;
2640 
2641 	if (entp->ipath == ipp) {
2642 		path = ipath2str(entp->ename, ipp);
2643 		out(O_ALTFP, "serd_reset_cb: resetting %s", path);
2644 		fmd_serd_reset(entp->hdl, path);
2645 		FREE(path);
2646 		Serd_need_save = 1;
2647 	}
2648 }
2649 
2650 /*ARGSUSED*/
2651 static void
2652 serd_topo_chg_cb(struct serd_entry *entp, void *unused, void *unused2)
2653 {
2654 	char *path;
2655 	nvlist_t *fmri;
2656 
2657 	fmri = ipath2fmri((struct ipath *)(entp->ipath));
2658 	if (!platform_path_exists(fmri)) {
2659 		path = ipath2str(entp->ename, entp->ipath);
2660 		out(O_ALTFP, "serd_topo_chg_cb: not present %s", path);
2661 		fmd_serd_reset(entp->hdl, path);
2662 		FREE(path);
2663 		Serd_need_save = 1;
2664 	}
2665 	nvlist_free(fmri);
2666 }
2667 
2668 void
2669 serd_fini(void)
2670 {
2671 	lut_free(SerdEngines, serd_destructor, NULL);
2672 }
2673 
2674 static void
2675 publish_suspects(struct fme *fmep, struct rsl *srl)
2676 {
2677 	struct rsl *rp;
2678 	nvlist_t *fault;
2679 	uint8_t cert;
2680 	uint_t *frs;
2681 	uint_t frsum, fr;
2682 	uint_t messval;
2683 	uint_t retireval;
2684 	uint_t responseval;
2685 	struct node *snp;
2686 	int frcnt, fridx;
2687 	boolean_t allfaulty = B_TRUE;
2688 	struct rsl *erl = srl + fmep->nsuspects - 1;
2689 
2690 	/*
2691 	 * sort the array
2692 	 */
2693 	qsort(srl, fmep->nsuspects, sizeof (struct rsl), rslcmp);
2694 
2695 	/* sum the fitrates */
2696 	frs = alloca(fmep->nsuspects * sizeof (uint_t));
2697 	fridx = frcnt = frsum = 0;
2698 
2699 	for (rp = srl; rp <= erl; rp++) {
2700 		struct node *n;
2701 
2702 		n = eventprop_lookup(rp->suspect, L_FITrate);
2703 		if (node2uint(n, &fr) != 0) {
2704 			out(O_DEBUG|O_NONL, "event ");
2705 			ipath_print(O_DEBUG|O_NONL,
2706 			    rp->suspect->enode->u.event.ename->u.name.s,
2707 			    rp->suspect->ipp);
2708 			out(O_DEBUG, " has no FITrate (using 1)");
2709 			fr = 1;
2710 		} else if (fr == 0) {
2711 			out(O_DEBUG|O_NONL, "event ");
2712 			ipath_print(O_DEBUG|O_NONL,
2713 			    rp->suspect->enode->u.event.ename->u.name.s,
2714 			    rp->suspect->ipp);
2715 			out(O_DEBUG, " has zero FITrate (using 1)");
2716 			fr = 1;
2717 		}
2718 
2719 		frs[fridx++] = fr;
2720 		frsum += fr;
2721 		frcnt++;
2722 	}
2723 
2724 	/* Add them in reverse order of our sort, as fmd reverses order */
2725 	for (rp = erl; rp >= srl; rp--) {
2726 		cert = percentof(frs[--fridx], frsum);
2727 		fault = fmd_nvl_create_fault(fmep->hdl,
2728 		    rp->suspect->enode->u.event.ename->u.name.s,
2729 		    cert,
2730 		    rp->asru,
2731 		    rp->fru,
2732 		    rp->rsrc);
2733 		if (fault == NULL)
2734 			out(O_DIE, "fault creation failed");
2735 		/* if "message" property exists, add it to the fault */
2736 		if (node2uint(eventprop_lookup(rp->suspect, L_message),
2737 		    &messval) == 0) {
2738 
2739 			out(O_ALTFP,
2740 			    "[FME%d, %s adds message=%d to suspect list]",
2741 			    fmep->id,
2742 			    rp->suspect->enode->u.event.ename->u.name.s,
2743 			    messval);
2744 			if (nvlist_add_boolean_value(fault,
2745 			    FM_SUSPECT_MESSAGE,
2746 			    (messval) ? B_TRUE : B_FALSE) != 0) {
2747 				out(O_DIE, "cannot add no-message to fault");
2748 			}
2749 		}
2750 
2751 		/* if "retire" property exists, add it to the fault */
2752 		if (node2uint(eventprop_lookup(rp->suspect, L_retire),
2753 		    &retireval) == 0) {
2754 
2755 			out(O_ALTFP,
2756 			    "[FME%d, %s adds retire=%d to suspect list]",
2757 			    fmep->id,
2758 			    rp->suspect->enode->u.event.ename->u.name.s,
2759 			    retireval);
2760 			if (nvlist_add_boolean_value(fault,
2761 			    FM_SUSPECT_RETIRE,
2762 			    (retireval) ? B_TRUE : B_FALSE) != 0) {
2763 				out(O_DIE, "cannot add no-retire to fault");
2764 			}
2765 		}
2766 
2767 		/* if "response" property exists, add it to the fault */
2768 		if (node2uint(eventprop_lookup(rp->suspect, L_response),
2769 		    &responseval) == 0) {
2770 
2771 			out(O_ALTFP,
2772 			    "[FME%d, %s adds response=%d to suspect list]",
2773 			    fmep->id,
2774 			    rp->suspect->enode->u.event.ename->u.name.s,
2775 			    responseval);
2776 			if (nvlist_add_boolean_value(fault,
2777 			    FM_SUSPECT_RESPONSE,
2778 			    (responseval) ? B_TRUE : B_FALSE) != 0) {
2779 				out(O_DIE, "cannot add no-response to fault");
2780 			}
2781 		}
2782 
2783 		/* add any payload properties */
2784 		lut_walk(rp->suspect->payloadprops,
2785 		    (lut_cb)addpayloadprop, (void *)fault);
2786 		rslfree(rp);
2787 
2788 		/*
2789 		 * If "action" property exists, evaluate it;  this must be done
2790 		 * before the allfaulty check below since some actions may
2791 		 * modify the asru to be used in fmd_nvl_fmri_has_fault.  This
2792 		 * needs to be restructured if any new actions are introduced
2793 		 * that have effects that we do not want to be visible if
2794 		 * we decide not to publish in the dupclose check below.
2795 		 */
2796 		if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) {
2797 			struct evalue evalue;
2798 
2799 			out(O_ALTFP|O_NONL,
2800 			    "[FME%d, %s action ", fmep->id,
2801 			    rp->suspect->enode->u.event.ename->u.name.s);
2802 			ptree_name_iter(O_ALTFP|O_NONL, snp);
2803 			out(O_ALTFP, "]");
2804 			Action_nvl = fault;
2805 			(void) eval_expr(snp, NULL, NULL, NULL, NULL,
2806 			    NULL, 0, &evalue);
2807 		}
2808 
2809 		fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault);
2810 
2811 		/*
2812 		 * check if the asru is already marked as "faulty".
2813 		 */
2814 		if (allfaulty) {
2815 			nvlist_t *asru;
2816 
2817 			out(O_ALTFP|O_VERB, "FME%d dup check ", fmep->id);
2818 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect);
2819 			out(O_ALTFP|O_VERB|O_NONL, " ");
2820 			if (nvlist_lookup_nvlist(fault,
2821 			    FM_FAULT_ASRU, &asru) != 0) {
2822 				out(O_ALTFP|O_VERB, "NULL asru");
2823 				allfaulty = B_FALSE;
2824 			} else if (fmd_nvl_fmri_has_fault(fmep->hdl, asru,
2825 			    FMD_HAS_FAULT_ASRU, NULL)) {
2826 				out(O_ALTFP|O_VERB, "faulty");
2827 			} else {
2828 				out(O_ALTFP|O_VERB, "not faulty");
2829 				allfaulty = B_FALSE;
2830 			}
2831 		}
2832 
2833 	}
2834 
2835 	if (!allfaulty) {
2836 		/*
2837 		 * don't update the count stat if all asrus are already
2838 		 * present and unrepaired in the asru cache
2839 		 */
2840 		for (rp = erl; rp >= srl; rp--) {
2841 			struct event *suspect = rp->suspect;
2842 
2843 			if (suspect == NULL)
2844 				continue;
2845 
2846 			/* if "count" exists, increment the appropriate stat */
2847 			if ((snp = eventprop_lookup(suspect,
2848 			    L_count)) != NULL) {
2849 				out(O_ALTFP|O_NONL,
2850 				    "[FME%d, %s count ", fmep->id,
2851 				    suspect->enode->u.event.ename->u.name.s);
2852 				ptree_name_iter(O_ALTFP|O_NONL, snp);
2853 				out(O_ALTFP, "]");
2854 				istat_bump(snp, 0);
2855 
2856 			}
2857 		}
2858 		istat_save();	/* write out any istat changes */
2859 	}
2860 }
2861 
2862 static const char *
2863 undiag_2defect_str(int ud)
2864 {
2865 	switch (ud) {
2866 	case UD_VAL_MISSINGINFO:
2867 	case UD_VAL_MISSINGOBS:
2868 	case UD_VAL_MISSINGPATH:
2869 	case UD_VAL_MISSINGZERO:
2870 	case UD_VAL_BADOBS:
2871 	case UD_VAL_CFGMISMATCH:
2872 		return (UNDIAG_DEFECT_CHKPT);
2873 		break;
2874 
2875 	case UD_VAL_BADEVENTI:
2876 	case UD_VAL_INSTFAIL:
2877 	case UD_VAL_NOPATH:
2878 	case UD_VAL_UNSOLVD:
2879 		return (UNDIAG_DEFECT_FME);
2880 		break;
2881 
2882 	case UD_VAL_MAXFME:
2883 		return (UNDIAG_DEFECT_LIMIT);
2884 		break;
2885 
2886 	case UD_VAL_UNKNOWN:
2887 	default:
2888 		return (UNDIAG_DEFECT_UNKNOWN);
2889 		break;
2890 	}
2891 }
2892 
2893 const char *
2894 undiag_2reason_str(int ud)
2895 {
2896 	switch (ud) {
2897 	case UD_VAL_BADEVENTI:
2898 		return (UD_STR_BADEVENTI);
2899 	case UD_VAL_BADOBS:
2900 		return (UD_STR_BADOBS);
2901 	case UD_VAL_CFGMISMATCH:
2902 		return (UD_STR_CFGMISMATCH);
2903 	case UD_VAL_INSTFAIL:
2904 		return (UD_STR_INSTFAIL);
2905 	case UD_VAL_MAXFME:
2906 		return (UD_STR_MAXFME);
2907 	case UD_VAL_MISSINGINFO:
2908 		return (UD_STR_MISSINGINFO);
2909 	case UD_VAL_MISSINGOBS:
2910 		return (UD_STR_MISSINGOBS);
2911 	case UD_VAL_MISSINGPATH:
2912 		return (UD_STR_MISSINGPATH);
2913 	case UD_VAL_MISSINGZERO:
2914 		return (UD_STR_MISSINGZERO);
2915 	case UD_VAL_NOPATH:
2916 		return (UD_STR_NOPATH);
2917 	case UD_VAL_UNSOLVD:
2918 		return (UD_STR_UNSOLVD);
2919 	case UD_VAL_UNKNOWN:
2920 	default:
2921 		return (UD_STR_UNKNOWN);
2922 	}
2923 }
2924 
2925 static void
2926 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase)
2927 {
2928 	struct case_list *newcase;
2929 	nvlist_t *defect;
2930 
2931 	out(O_ALTFP,
2932 	    "[undiagnosable ereport received, "
2933 	    "creating and closing a new case (%s)]",
2934 	    undiag_2reason_str(Undiag_reason));
2935 
2936 	newcase = MALLOC(sizeof (struct case_list));
2937 	newcase->next = NULL;
2938 	newcase->fmcase = fmcase;
2939 	if (Undiagablecaselist != NULL)
2940 		newcase->next = Undiagablecaselist;
2941 	Undiagablecaselist = newcase;
2942 
2943 	if (ffep != NULL)
2944 		fmd_case_add_ereport(hdl, newcase->fmcase, ffep);
2945 
2946 	defect = fmd_nvl_create_fault(hdl,
2947 	    undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL);
2948 	(void) nvlist_add_string(defect, UNDIAG_REASON,
2949 	    undiag_2reason_str(Undiag_reason));
2950 	fmd_case_add_suspect(hdl, newcase->fmcase, defect);
2951 
2952 	fmd_case_solve(hdl, newcase->fmcase);
2953 	fmd_case_close(hdl, newcase->fmcase);
2954 	Undiag_reason = UD_VAL_UNKNOWN;
2955 }
2956 
2957 static void
2958 fme_undiagnosable(struct fme *f)
2959 {
2960 	nvlist_t *defect;
2961 
2962 	out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]",
2963 	    f->id, fmd_case_uuid(f->hdl, f->fmcase),
2964 	    undiag_2reason_str(Undiag_reason));
2965 
2966 	defect = fmd_nvl_create_fault(f->hdl,
2967 	    undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL);
2968 	(void) nvlist_add_string(defect, UNDIAG_REASON,
2969 	    undiag_2reason_str(Undiag_reason));
2970 	fmd_case_add_suspect(f->hdl, f->fmcase, defect);
2971 	fmd_case_solve(f->hdl, f->fmcase);
2972 	fmd_case_close(f->hdl, f->fmcase);
2973 	Undiag_reason = UD_VAL_UNKNOWN;
2974 }
2975 
2976 /*
2977  * fme_close_case
2978  *
2979  *	Find the requested case amongst our fmes and close it.  Free up
2980  *	the related fme.
2981  */
2982 void
2983 fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase)
2984 {
2985 	struct case_list *ucasep, *prevcasep = NULL;
2986 	struct fme *prev = NULL;
2987 	struct fme *fmep;
2988 
2989 	for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) {
2990 		if (fmcase != ucasep->fmcase) {
2991 			prevcasep = ucasep;
2992 			continue;
2993 		}
2994 
2995 		if (prevcasep == NULL)
2996 			Undiagablecaselist = Undiagablecaselist->next;
2997 		else
2998 			prevcasep->next = ucasep->next;
2999 
3000 		FREE(ucasep);
3001 		return;
3002 	}
3003 
3004 	for (fmep = FMElist; fmep; fmep = fmep->next) {
3005 		if (fmep->hdl == hdl && fmep->fmcase == fmcase)
3006 			break;
3007 		prev = fmep;
3008 	}
3009 
3010 	if (fmep == NULL) {
3011 		out(O_WARN, "Eft asked to close unrecognized case [%s].",
3012 		    fmd_case_uuid(hdl, fmcase));
3013 		return;
3014 	}
3015 
3016 	if (EFMElist == fmep)
3017 		EFMElist = prev;
3018 
3019 	if (prev == NULL)
3020 		FMElist = FMElist->next;
3021 	else
3022 		prev->next = fmep->next;
3023 
3024 	fmep->next = NULL;
3025 
3026 	/* Get rid of any timer this fme has set */
3027 	if (fmep->wull != 0)
3028 		fmd_timer_remove(fmep->hdl, fmep->timer);
3029 
3030 	if (ClosedFMEs == NULL) {
3031 		ClosedFMEs = fmep;
3032 	} else {
3033 		fmep->next = ClosedFMEs;
3034 		ClosedFMEs = fmep;
3035 	}
3036 
3037 	Open_fme_count--;
3038 
3039 	/* See if we can close the overflow FME */
3040 	if (Open_fme_count <= Max_fme) {
3041 		for (fmep = FMElist; fmep; fmep = fmep->next) {
3042 			if (fmep->overflow && !(fmd_case_closed(fmep->hdl,
3043 			    fmep->fmcase)))
3044 				break;
3045 		}
3046 
3047 		if (fmep != NULL)
3048 			fmd_case_close(fmep->hdl, fmep->fmcase);
3049 	}
3050 }
3051 
3052 /*
3053  * fme_set_timer()
3054  *	If the time we need to wait for the given FME is less than the
3055  *	current timer, kick that old timer out and establish a new one.
3056  */
3057 static int
3058 fme_set_timer(struct fme *fmep, unsigned long long wull)
3059 {
3060 	out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait ");
3061 	ptree_timeval(O_ALTFP|O_VERB, &wull);
3062 
3063 	if (wull <= fmep->pull) {
3064 		out(O_ALTFP|O_VERB|O_NONL, "already have waited at least ");
3065 		ptree_timeval(O_ALTFP|O_VERB, &fmep->pull);
3066 		out(O_ALTFP|O_VERB, NULL);
3067 		/* we've waited at least wull already, don't need timer */
3068 		return (0);
3069 	}
3070 
3071 	out(O_ALTFP|O_VERB|O_NONL, " currently ");
3072 	if (fmep->wull != 0) {
3073 		out(O_ALTFP|O_VERB|O_NONL, "waiting ");
3074 		ptree_timeval(O_ALTFP|O_VERB, &fmep->wull);
3075 		out(O_ALTFP|O_VERB, NULL);
3076 	} else {
3077 		out(O_ALTFP|O_VERB|O_NONL, "not waiting");
3078 		out(O_ALTFP|O_VERB, NULL);
3079 	}
3080 
3081 	if (fmep->wull != 0)
3082 		if (wull >= fmep->wull)
3083 			/* New timer would fire later than established timer */
3084 			return (0);
3085 
3086 	if (fmep->wull != 0) {
3087 		fmd_timer_remove(fmep->hdl, fmep->timer);
3088 	}
3089 
3090 	fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep,
3091 	    fmep->e0r, wull);
3092 	out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer);
3093 	fmep->wull = wull;
3094 	return (1);
3095 }
3096 
3097 void
3098 fme_timer_fired(struct fme *fmep, id_t tid)
3099 {
3100 	struct fme *ffmep = NULL;
3101 
3102 	for (ffmep = FMElist; ffmep; ffmep = ffmep->next)
3103 		if (ffmep == fmep)
3104 			break;
3105 
3106 	if (ffmep == NULL) {
3107 		out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.",
3108 		    (void *)fmep);
3109 		return;
3110 	}
3111 
3112 	out(O_ALTFP|O_VERB, "Timer fired %lx", tid);
3113 	fmep->pull = fmep->wull;
3114 	fmep->wull = 0;
3115 	fmd_buf_write(fmep->hdl, fmep->fmcase,
3116 	    WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull));
3117 
3118 	fme_eval(fmep, fmep->e0r);
3119 }
3120 
3121 /*
3122  * Preserve the fme's suspect list in its psuspects list, NULLing the
3123  * suspects list in the meantime.
3124  */
3125 static void
3126 save_suspects(struct fme *fmep)
3127 {
3128 	struct event *ep;
3129 	struct event *nextep;
3130 
3131 	/* zero out the previous suspect list */
3132 	for (ep = fmep->psuspects; ep; ep = nextep) {
3133 		nextep = ep->psuspects;
3134 		ep->psuspects = NULL;
3135 	}
3136 	fmep->psuspects = NULL;
3137 
3138 	/* zero out the suspect list, copying it to previous suspect list */
3139 	fmep->psuspects = fmep->suspects;
3140 	for (ep = fmep->suspects; ep; ep = nextep) {
3141 		nextep = ep->suspects;
3142 		ep->psuspects = ep->suspects;
3143 		ep->suspects = NULL;
3144 		ep->is_suspect = 0;
3145 	}
3146 	fmep->suspects = NULL;
3147 	fmep->nsuspects = 0;
3148 }
3149 
3150 /*
3151  * Retrieve the fme's suspect list from its psuspects list.
3152  */
3153 static void
3154 restore_suspects(struct fme *fmep)
3155 {
3156 	struct event *ep;
3157 	struct event *nextep;
3158 
3159 	fmep->nsuspects = 0;
3160 	fmep->suspects = fmep->psuspects;
3161 	for (ep = fmep->psuspects; ep; ep = nextep) {
3162 		fmep->nsuspects++;
3163 		nextep = ep->psuspects;
3164 		ep->suspects = ep->psuspects;
3165 	}
3166 }
3167 
3168 /*
3169  * this is what we use to call the Emrys prototype code instead of main()
3170  */
3171 static void
3172 fme_eval(struct fme *fmep, fmd_event_t *ffep)
3173 {
3174 	struct event *ep;
3175 	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
3176 	struct rsl *srl = NULL;
3177 	struct rsl *srl2 = NULL;
3178 	int mess_zero_count;
3179 	int rpcnt;
3180 
3181 	save_suspects(fmep);
3182 
3183 	out(O_ALTFP, "Evaluate FME %d", fmep->id);
3184 	indent_set("  ");
3185 
3186 	lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
3187 	fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);
3188 
3189 	out(O_ALTFP|O_NONL, "FME%d state: %s, suspect list:", fmep->id,
3190 	    fme_state2str(fmep->state));
3191 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
3192 		out(O_ALTFP|O_NONL, " ");
3193 		itree_pevent_brief(O_ALTFP|O_NONL, ep);
3194 	}
3195 	out(O_ALTFP, NULL);
3196 
3197 	switch (fmep->state) {
3198 	case FME_CREDIBLE:
3199 		print_suspects(SLNEW, fmep);
3200 		(void) upsets_eval(fmep, ffep);
3201 
3202 		/*
3203 		 * we may have already posted suspects in upsets_eval() which
3204 		 * can recurse into fme_eval() again. If so then just return.
3205 		 */
3206 		if (fmep->posted_suspects)
3207 			return;
3208 
3209 		stats_counter_bump(fmep->diags);
3210 		rpcnt = fmep->nsuspects;
3211 		save_suspects(fmep);
3212 
3213 		/*
3214 		 * create two lists, one for "message=1" faults and one for
3215 		 * "message=0" faults. If we have a mixture we will generate
3216 		 * two separate suspect lists.
3217 		 */
3218 		srl = MALLOC(rpcnt * sizeof (struct rsl));
3219 		bzero(srl, rpcnt * sizeof (struct rsl));
3220 		srl2 = MALLOC(rpcnt * sizeof (struct rsl));
3221 		bzero(srl2, rpcnt * sizeof (struct rsl));
3222 		mess_zero_count = trim_suspects(fmep, srl, srl2, ffep);
3223 
3224 		/*
3225 		 * If the resulting suspect list has no members, we're
3226 		 * done so simply close the case. Otherwise sort and publish.
3227 		 */
3228 		if (fmep->nsuspects == 0 && mess_zero_count == 0) {
3229 			out(O_ALTFP,
3230 			    "[FME%d, case %s (all suspects are upsets)]",
3231 			    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
3232 			fmd_case_close(fmep->hdl, fmep->fmcase);
3233 		} else if (fmep->nsuspects != 0 && mess_zero_count == 0) {
3234 			publish_suspects(fmep, srl);
3235 			out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
3236 			    fmd_case_uuid(fmep->hdl, fmep->fmcase));
3237 			fmd_case_solve(fmep->hdl, fmep->fmcase);
3238 		} else if (fmep->nsuspects == 0 && mess_zero_count != 0) {
3239 			fmep->nsuspects = mess_zero_count;
3240 			publish_suspects(fmep, srl2);
3241 			out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
3242 			    fmd_case_uuid(fmep->hdl, fmep->fmcase));
3243 			fmd_case_solve(fmep->hdl, fmep->fmcase);
3244 		} else {
3245 			struct event *obsp;
3246 			struct fme *nfmep;
3247 
3248 			publish_suspects(fmep, srl);
3249 			out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
3250 			    fmd_case_uuid(fmep->hdl, fmep->fmcase));
3251 			fmd_case_solve(fmep->hdl, fmep->fmcase);
3252 
3253 			/*
3254 			 * Got both message=0 and message=1 so create a
3255 			 * duplicate case. Also need a temporary duplicate fme
3256 			 * structure for use by publish_suspects().
3257 			 */
3258 			nfmep = alloc_fme();
3259 			nfmep->id =  Nextid++;
3260 			nfmep->hdl = fmep->hdl;
3261 			nfmep->nsuspects = mess_zero_count;
3262 			nfmep->fmcase = fmd_case_open(fmep->hdl, NULL);
3263 			out(O_ALTFP|O_STAMP,
3264 			    "[creating parallel FME%d, case %s]", nfmep->id,
3265 			    fmd_case_uuid(nfmep->hdl, nfmep->fmcase));
3266 			Open_fme_count++;
3267 			if (ffep) {
3268 				fmd_case_setprincipal(nfmep->hdl,
3269 				    nfmep->fmcase, ffep);
3270 				fmd_case_add_ereport(nfmep->hdl,
3271 				    nfmep->fmcase, ffep);
3272 			}
3273 			for (obsp = fmep->observations; obsp;
3274 			    obsp = obsp->observations)
3275 				if (obsp->ffep && obsp->ffep != ffep)
3276 					fmd_case_add_ereport(nfmep->hdl,
3277 					    nfmep->fmcase, obsp->ffep);
3278 
3279 			publish_suspects(nfmep, srl2);
3280 			out(O_ALTFP, "[solving FME%d, case %s]", nfmep->id,
3281 			    fmd_case_uuid(nfmep->hdl, nfmep->fmcase));
3282 			fmd_case_solve(nfmep->hdl, nfmep->fmcase);
3283 			FREE(nfmep);
3284 		}
3285 		FREE(srl);
3286 		FREE(srl2);
3287 		restore_suspects(fmep);
3288 
3289 		fmep->posted_suspects = 1;
3290 		fmd_buf_write(fmep->hdl, fmep->fmcase,
3291 		    WOBUF_POSTD,
3292 		    (void *)&fmep->posted_suspects,
3293 		    sizeof (fmep->posted_suspects));
3294 
3295 		/*
3296 		 * Now the suspects have been posted, we can clear up
3297 		 * the instance tree as we won't be looking at it again.
3298 		 * Also cancel the timer as the case is now solved.
3299 		 */
3300 		if (fmep->wull != 0) {
3301 			fmd_timer_remove(fmep->hdl, fmep->timer);
3302 			fmep->wull = 0;
3303 		}
3304 		break;
3305 
3306 	case FME_WAIT:
3307 		ASSERT(my_delay > fmep->ull);
3308 		(void) fme_set_timer(fmep, my_delay);
3309 		print_suspects(SLWAIT, fmep);
3310 		itree_prune(fmep->eventtree);
3311 		return;
3312 
3313 	case FME_DISPROVED:
3314 		print_suspects(SLDISPROVED, fmep);
3315 		Undiag_reason = UD_VAL_UNSOLVD;
3316 		fme_undiagnosable(fmep);
3317 		break;
3318 	}
3319 
3320 	itree_free(fmep->eventtree);
3321 	fmep->eventtree = NULL;
3322 	structconfig_free(fmep->config);
3323 	fmep->config = NULL;
3324 	destroy_fme_bufs(fmep);
3325 }
3326 
/*
 * Forward declarations for routines that are used below before their
 * definitions appear.
 */
static void indent(void);
static int triggered(struct fme *fmep, struct event *ep, int mark);
static enum fme_state effects_test(struct fme *fmep,
    struct event *fault_event,
    unsigned long long at_latest_by,
    unsigned long long *pdelay);
static enum fme_state requirements_test(struct fme *fmep,
    struct event *ep,
    unsigned long long at_latest_by,
    unsigned long long *pdelay);
static enum fme_state causes_test(struct fme *fmep,
    struct event *ep,
    unsigned long long at_latest_by,
    unsigned long long *pdelay);
3336 
3337 static int
3338 checkconstraints(struct fme *fmep, struct arrow *arrowp)
3339 {
3340 	struct constraintlist *ctp;
3341 	struct evalue value;
3342 	char *sep = "";
3343 
3344 	if (arrowp->forever_false) {
3345 		indent();
3346 		out(O_ALTFP|O_VERB|O_NONL, "  Forever false constraint: ");
3347 		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
3348 			out(O_ALTFP|O_VERB|O_NONL, sep);
3349 			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3350 			sep = ", ";
3351 		}
3352 		out(O_ALTFP|O_VERB, NULL);
3353 		return (0);
3354 	}
3355 	if (arrowp->forever_true) {
3356 		indent();
3357 		out(O_ALTFP|O_VERB|O_NONL, "  Forever true constraint: ");
3358 		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
3359 			out(O_ALTFP|O_VERB|O_NONL, sep);
3360 			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3361 			sep = ", ";
3362 		}
3363 		out(O_ALTFP|O_VERB, NULL);
3364 		return (1);
3365 	}
3366 
3367 	for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
3368 		if (eval_expr(ctp->cnode, NULL, NULL,
3369 		    &fmep->globals, fmep->config,
3370 		    arrowp, 0, &value)) {
3371 			/* evaluation successful */
3372 			if (value.t == UNDEFINED || value.v == 0) {
3373 				/* known false */
3374 				arrowp->forever_false = 1;
3375 				indent();
3376 				out(O_ALTFP|O_VERB|O_NONL,
3377 				    "  False constraint: ");
3378 				ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3379 				out(O_ALTFP|O_VERB, NULL);
3380 				return (0);
3381 			}
3382 		} else {
3383 			/* evaluation unsuccessful -- unknown value */
3384 			indent();
3385 			out(O_ALTFP|O_VERB|O_NONL,
3386 			    "  Deferred constraint: ");
3387 			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3388 			out(O_ALTFP|O_VERB, NULL);
3389 			return (1);
3390 		}
3391 	}
3392 	/* known true */
3393 	arrowp->forever_true = 1;
3394 	indent();
3395 	out(O_ALTFP|O_VERB|O_NONL, "  True constraint: ");
3396 	for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
3397 		out(O_ALTFP|O_VERB|O_NONL, sep);
3398 		ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3399 		sep = ", ";
3400 	}
3401 	out(O_ALTFP|O_VERB, NULL);
3402 	return (1);
3403 }
3404 
3405 static int
3406 triggered(struct fme *fmep, struct event *ep, int mark)
3407 {
3408 	struct bubble *bp;
3409 	struct arrowlist *ap;
3410 	int count = 0;
3411 
3412 	stats_counter_bump(fmep->Tcallcount);
3413 	for (bp = itree_next_bubble(ep, NULL); bp;
3414 	    bp = itree_next_bubble(ep, bp)) {
3415 		if (bp->t != B_TO)
3416 			continue;
3417 		for (ap = itree_next_arrow(bp, NULL); ap;
3418 		    ap = itree_next_arrow(bp, ap)) {
3419 			/* check count of marks against K in the bubble */
3420 			if ((ap->arrowp->mark & mark) &&
3421 			    ++count >= bp->nork)
3422 				return (1);
3423 		}
3424 	}
3425 	return (0);
3426 }
3427 
3428 static int
3429 mark_arrows(struct fme *fmep, struct event *ep, int mark,
3430     unsigned long long at_latest_by, unsigned long long *pdelay, int keep)
3431 {
3432 	struct bubble *bp;
3433 	struct arrowlist *ap;
3434 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
3435 	unsigned long long my_delay;
3436 	enum fme_state result;
3437 	int retval = 0;
3438 
3439 	for (bp = itree_next_bubble(ep, NULL); bp;
3440 	    bp = itree_next_bubble(ep, bp)) {
3441 		if (bp->t != B_FROM)
3442 			continue;
3443 		stats_counter_bump(fmep->Marrowcount);
3444 		for (ap = itree_next_arrow(bp, NULL); ap;
3445 		    ap = itree_next_arrow(bp, ap)) {
3446 			struct event *ep2 = ap->arrowp->head->myevent;
3447 			/*
3448 			 * if we're clearing marks, we can avoid doing
3449 			 * all that work evaluating constraints.
3450 			 */
3451 			if (mark == 0) {
3452 				if (ap->arrowp->arrow_marked == 0)
3453 					continue;
3454 				ap->arrowp->arrow_marked = 0;
3455 				ap->arrowp->mark &= ~EFFECTS_COUNTER;
3456 				if (keep && (ep2->cached_state &
3457 				    (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT)))
3458 					ep2->keep_in_tree = 1;
3459 				ep2->cached_state &=
3460 				    ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT);
3461 				(void) mark_arrows(fmep, ep2, mark, 0, NULL,
3462 				    keep);
3463 				continue;
3464 			}
3465 			ap->arrowp->arrow_marked = 1;
3466 			if (ep2->cached_state & REQMNTS_DISPROVED) {
3467 				indent();
3468 				out(O_ALTFP|O_VERB|O_NONL,
3469 				    "  ALREADY DISPROVED ");
3470 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
3471 				out(O_ALTFP|O_VERB, NULL);
3472 				continue;
3473 			}
3474 			if (ep2->cached_state & WAIT_EFFECT) {
3475 				indent();
3476 				out(O_ALTFP|O_VERB|O_NONL,
3477 				    "  ALREADY EFFECTS WAIT ");
3478 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
3479 				out(O_ALTFP|O_VERB, NULL);
3480 				continue;
3481 			}
3482 			if (ep2->cached_state & CREDIBLE_EFFECT) {
3483 				indent();
3484 				out(O_ALTFP|O_VERB|O_NONL,
3485 				    "  ALREADY EFFECTS CREDIBLE ");
3486 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
3487 				out(O_ALTFP|O_VERB, NULL);
3488 				continue;
3489 			}
3490 			if ((ep2->cached_state & PARENT_WAIT) &&
3491 			    (mark & PARENT_WAIT)) {
3492 				indent();
3493 				out(O_ALTFP|O_VERB|O_NONL,
3494 				    "  ALREADY PARENT EFFECTS WAIT ");
3495 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
3496 				out(O_ALTFP|O_VERB, NULL);
3497 				continue;
3498 			}
3499 			platform_set_payloadnvp(ep2->nvp);
3500 			if (checkconstraints(fmep, ap->arrowp) == 0) {
3501 				platform_set_payloadnvp(NULL);
3502 				indent();
3503 				out(O_ALTFP|O_VERB|O_NONL,
3504 				    "  CONSTRAINTS FAIL ");
3505 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
3506 				out(O_ALTFP|O_VERB, NULL);
3507 				continue;
3508 			}
3509 			platform_set_payloadnvp(NULL);
3510 			ap->arrowp->mark |= EFFECTS_COUNTER;
3511 			if (!triggered(fmep, ep2, EFFECTS_COUNTER)) {
3512 				indent();
3513 				out(O_ALTFP|O_VERB|O_NONL,
3514 				    "  K-COUNT NOT YET MET ");
3515 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
3516 				out(O_ALTFP|O_VERB, NULL);
3517 				continue;
3518 			}
3519 			ep2->cached_state &= ~PARENT_WAIT;
3520 			/*
3521 			 * if we've reached an ereport and no propagation time
3522 			 * is specified, use the Hesitate value
3523 			 */
3524 			if (ep2->t == N_EREPORT && at_latest_by == 0ULL &&
3525 			    ap->arrowp->maxdelay == 0ULL) {
3526 				out(O_ALTFP|O_VERB|O_NONL, "  default wait ");
3527 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
3528 				out(O_ALTFP|O_VERB, NULL);
3529 				result = requirements_test(fmep, ep2, Hesitate,
3530 				    &my_delay);
3531 			} else {
3532 				result = requirements_test(fmep, ep2,
3533 				    at_latest_by + ap->arrowp->maxdelay,
3534 				    &my_delay);
3535 			}
3536 			if (result == FME_WAIT) {
3537 				retval = WAIT_EFFECT;
3538 				if (overall_delay > my_delay)
3539 					overall_delay = my_delay;
3540 				ep2->cached_state |= WAIT_EFFECT;
3541 				indent();
3542 				out(O_ALTFP|O_VERB|O_NONL, "  EFFECTS WAIT ");
3543 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
3544 				out(O_ALTFP|O_VERB, NULL);
3545 				indent_push("  E");
3546 				if (mark_arrows(fmep, ep2, PARENT_WAIT,
3547 				    at_latest_by, &my_delay, 0) ==
3548 				    WAIT_EFFECT) {
3549 					retval = WAIT_EFFECT;
3550 					if (overall_delay > my_delay)
3551 						overall_delay = my_delay;
3552 				}
3553 				indent_pop();
3554 			} else if (result == FME_DISPROVED) {
3555 				indent();
3556 				out(O_ALTFP|O_VERB|O_NONL,
3557 				    "  EFFECTS DISPROVED ");
3558 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
3559 				out(O_ALTFP|O_VERB, NULL);
3560 			} else {
3561 				ep2->cached_state |= mark;
3562 				indent();
3563 				if (mark == CREDIBLE_EFFECT)
3564 					out(O_ALTFP|O_VERB|O_NONL,
3565 					    "  EFFECTS CREDIBLE ");
3566 				else
3567 					out(O_ALTFP|O_VERB|O_NONL,
3568 					    "  PARENT EFFECTS WAIT ");
3569 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
3570 				out(O_ALTFP|O_VERB, NULL);
3571 				indent_push("  E");
3572 				if (mark_arrows(fmep, ep2, mark, at_latest_by,
3573 				    &my_delay, 0) == WAIT_EFFECT) {
3574 					retval = WAIT_EFFECT;
3575 					if (overall_delay > my_delay)
3576 						overall_delay = my_delay;
3577 				}
3578 				indent_pop();
3579 			}
3580 		}
3581 	}
3582 	if (retval == WAIT_EFFECT)
3583 		*pdelay = overall_delay;
3584 	return (retval);
3585 }
3586 
3587 static enum fme_state
3588 effects_test(struct fme *fmep, struct event *fault_event,
3589     unsigned long long at_latest_by, unsigned long long *pdelay)
3590 {
3591 	struct event *error_event;
3592 	enum fme_state return_value = FME_CREDIBLE;
3593 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
3594 	unsigned long long my_delay;
3595 
3596 	stats_counter_bump(fmep->Ecallcount);
3597 	indent_push("  E");
3598 	indent();
3599 	out(O_ALTFP|O_VERB|O_NONL, "->");
3600 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
3601 	out(O_ALTFP|O_VERB, NULL);
3602 
3603 	if (mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by,
3604 	    &my_delay, 0) == WAIT_EFFECT) {
3605 		return_value = FME_WAIT;
3606 		if (overall_delay > my_delay)
3607 			overall_delay = my_delay;
3608 	}
3609 	for (error_event = fmep->observations;
3610 	    error_event; error_event = error_event->observations) {
3611 		indent();
3612 		out(O_ALTFP|O_VERB|O_NONL, " ");
3613 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event);
3614 		if (!(error_event->cached_state & CREDIBLE_EFFECT)) {
3615 			if (error_event->cached_state &
3616 			    (PARENT_WAIT|WAIT_EFFECT)) {
3617 				out(O_ALTFP|O_VERB, " NOT YET triggered");
3618 				continue;
3619 			}
3620 			return_value = FME_DISPROVED;
3621 			out(O_ALTFP|O_VERB, " NOT triggered");
3622 			break;
3623 		} else {
3624 			out(O_ALTFP|O_VERB, " triggered");
3625 		}
3626 	}
3627 	if (return_value == FME_DISPROVED) {
3628 		(void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0);
3629 	} else {
3630 		fault_event->keep_in_tree = 1;
3631 		(void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1);
3632 	}
3633 
3634 	indent();
3635 	out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ",
3636 	    fme_state2str(return_value));
3637 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
3638 	out(O_ALTFP|O_VERB, NULL);
3639 	indent_pop();
3640 	if (return_value == FME_WAIT)
3641 		*pdelay = overall_delay;
3642 	return (return_value);
3643 }
3644 
3645 static enum fme_state
3646 requirements_test(struct fme *fmep, struct event *ep,
3647     unsigned long long at_latest_by, unsigned long long *pdelay)
3648 {
3649 	int waiting_events;
3650 	int credible_events;
3651 	int deferred_events;
3652 	enum fme_state return_value = FME_CREDIBLE;
3653 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
3654 	unsigned long long arrow_delay;
3655 	unsigned long long my_delay;
3656 	struct event *ep2;
3657 	struct bubble *bp;
3658 	struct arrowlist *ap;
3659 
3660 	if (ep->cached_state & REQMNTS_CREDIBLE) {
3661 		indent();
3662 		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY CREDIBLE ");
3663 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3664 		out(O_ALTFP|O_VERB, NULL);
3665 		return (FME_CREDIBLE);
3666 	}
3667 	if (ep->cached_state & REQMNTS_DISPROVED) {
3668 		indent();
3669 		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY DISPROVED ");
3670 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3671 		out(O_ALTFP|O_VERB, NULL);
3672 		return (FME_DISPROVED);
3673 	}
3674 	if (ep->cached_state & REQMNTS_WAIT) {
3675 		indent();
3676 		*pdelay = ep->cached_delay;
3677 		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY WAIT ");
3678 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3679 		out(O_ALTFP|O_VERB|O_NONL, ", wait for: ");
3680 		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
3681 		out(O_ALTFP|O_VERB, NULL);
3682 		return (FME_WAIT);
3683 	}
3684 	stats_counter_bump(fmep->Rcallcount);
3685 	indent_push("  R");
3686 	indent();
3687 	out(O_ALTFP|O_VERB|O_NONL, "->");
3688 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3689 	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
3690 	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
3691 	out(O_ALTFP|O_VERB, NULL);
3692 
3693 	if (ep->t == N_EREPORT) {
3694 		if (ep->count == 0) {
3695 			if (fmep->pull >= at_latest_by) {
3696 				return_value = FME_DISPROVED;
3697 			} else {
3698 				ep->cached_delay = *pdelay = at_latest_by;
3699 				return_value = FME_WAIT;
3700 			}
3701 		}
3702 
3703 		indent();
3704 		switch (return_value) {
3705 		case FME_CREDIBLE:
3706 			ep->cached_state |= REQMNTS_CREDIBLE;
3707 			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE ");
3708 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3709 			break;
3710 		case FME_DISPROVED:
3711 			ep->cached_state |= REQMNTS_DISPROVED;
3712 			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED ");
3713 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3714 			break;
3715 		case FME_WAIT:
3716 			ep->cached_state |= REQMNTS_WAIT;
3717 			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT ");
3718 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3719 			out(O_ALTFP|O_VERB|O_NONL, " to ");
3720 			ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
3721 			break;
3722 		default:
3723 			out(O_DIE, "requirements_test: unexpected fme_state");
3724 			break;
3725 		}
3726 		out(O_ALTFP|O_VERB, NULL);
3727 		indent_pop();
3728 
3729 		return (return_value);
3730 	}
3731 
3732 	/* this event is not a report, descend the tree */
3733 	for (bp = itree_next_bubble(ep, NULL); bp;
3734 	    bp = itree_next_bubble(ep, bp)) {
3735 		int n;
3736 
3737 		if (bp->t != B_FROM)
3738 			continue;
3739 
3740 		n = bp->nork;
3741 
3742 		credible_events = 0;
3743 		waiting_events = 0;
3744 		deferred_events = 0;
3745 		arrow_delay = TIMEVAL_EVENTUALLY;
3746 		/*
3747 		 * n is -1 for 'A' so adjust it.
3748 		 * XXX just count up the arrows for now.
3749 		 */
3750 		if (n < 0) {
3751 			n = 0;
3752 			for (ap = itree_next_arrow(bp, NULL); ap;
3753 			    ap = itree_next_arrow(bp, ap))
3754 				n++;
3755 			indent();
3756 			out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n);
3757 		} else {
3758 			indent();
3759 			out(O_ALTFP|O_VERB, " Bubble N=%d", n);
3760 		}
3761 
3762 		if (n == 0)
3763 			continue;
3764 		if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) {
3765 			for (ap = itree_next_arrow(bp, NULL); ap;
3766 			    ap = itree_next_arrow(bp, ap)) {
3767 				ep2 = ap->arrowp->head->myevent;
3768 				platform_set_payloadnvp(ep2->nvp);
3769 				(void) checkconstraints(fmep, ap->arrowp);
3770 				if (!ap->arrowp->forever_false) {
3771 					/*
3772 					 * if all arrows are invalidated by the
3773 					 * constraints, then we should elide the
3774 					 * whole bubble to be consistant with
3775 					 * the tree creation time behaviour
3776 					 */
3777 					bp->mark |= BUBBLE_OK;
3778 					platform_set_payloadnvp(NULL);
3779 					break;
3780 				}
3781 				platform_set_payloadnvp(NULL);
3782 			}
3783 		}
3784 		for (ap = itree_next_arrow(bp, NULL); ap;
3785 		    ap = itree_next_arrow(bp, ap)) {
3786 			ep2 = ap->arrowp->head->myevent;
3787 			if (n <= credible_events)
3788 				break;
3789 
3790 			ap->arrowp->mark |= REQMNTS_COUNTER;
3791 			if (triggered(fmep, ep2, REQMNTS_COUNTER))
3792 				/* XXX adding max timevals! */
3793 				switch (requirements_test(fmep, ep2,
3794 				    at_latest_by + ap->arrowp->maxdelay,
3795 				    &my_delay)) {
3796 				case FME_DEFERRED:
3797 					deferred_events++;
3798 					break;
3799 				case FME_CREDIBLE:
3800 					credible_events++;
3801 					break;
3802 				case FME_DISPROVED:
3803 					break;
3804 				case FME_WAIT:
3805 					if (my_delay < arrow_delay)
3806 						arrow_delay = my_delay;
3807 					waiting_events++;
3808 					break;
3809 				default:
3810 					out(O_DIE,
3811 					"Bug in requirements_test.");
3812 				}
3813 			else
3814 				deferred_events++;
3815 		}
3816 		if (!(bp->mark & BUBBLE_OK) && waiting_events == 0) {
3817 			bp->mark |= BUBBLE_ELIDED;
3818 			continue;
3819 		}
3820 		indent();
3821 		out(O_ALTFP|O_VERB, " Credible: %d Waiting %d",
3822 		    credible_events + deferred_events, waiting_events);
3823 		if (credible_events + deferred_events + waiting_events < n) {
3824 			/* Can never meet requirements */
3825 			ep->cached_state |= REQMNTS_DISPROVED;
3826 			indent();
3827 			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED ");
3828 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3829 			out(O_ALTFP|O_VERB, NULL);
3830 			indent_pop();
3831 			return (FME_DISPROVED);
3832 		}
3833 		if (credible_events + deferred_events < n) {
3834 			/* will have to wait */
3835 			/* wait time is shortest known */
3836 			if (arrow_delay < overall_delay)
3837 				overall_delay = arrow_delay;
3838 			return_value = FME_WAIT;
3839 		} else if (credible_events < n) {
3840 			if (return_value != FME_WAIT)
3841 				return_value = FME_DEFERRED;
3842 		}
3843 	}
3844 
3845 	/*
3846 	 * don't mark as FME_DEFERRED. If this event isn't reached by another
3847 	 * path, then this will be considered FME_CREDIBLE. But if it is
3848 	 * reached by a different path so the K-count is met, then might
3849 	 * get overridden by FME_WAIT or FME_DISPROVED.
3850 	 */
3851 	if (return_value == FME_WAIT) {
3852 		ep->cached_state |= REQMNTS_WAIT;
3853 		ep->cached_delay = *pdelay = overall_delay;
3854 	} else if (return_value == FME_CREDIBLE) {
3855 		ep->cached_state |= REQMNTS_CREDIBLE;
3856 	}
3857 	indent();
3858 	out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ",
3859 	    fme_state2str(return_value));
3860 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3861 	out(O_ALTFP|O_VERB, NULL);
3862 	indent_pop();
3863 	return (return_value);
3864 }
3865 
3866 static enum fme_state
3867 causes_test(struct fme *fmep, struct event *ep,
3868     unsigned long long at_latest_by, unsigned long long *pdelay)
3869 {
3870 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
3871 	unsigned long long my_delay;
3872 	int credible_results = 0;
3873 	int waiting_results = 0;
3874 	enum fme_state fstate;
3875 	struct event *tail_event;
3876 	struct bubble *bp;
3877 	struct arrowlist *ap;
3878 	int k = 1;
3879 
3880 	stats_counter_bump(fmep->Ccallcount);
3881 	indent_push("  C");
3882 	indent();
3883 	out(O_ALTFP|O_VERB|O_NONL, "->");
3884 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3885 	out(O_ALTFP|O_VERB, NULL);
3886 
3887 	for (bp = itree_next_bubble(ep, NULL); bp;
3888 	    bp = itree_next_bubble(ep, bp)) {
3889 		if (bp->t != B_TO)
3890 			continue;
3891 		k = bp->nork;	/* remember the K value */
3892 		for (ap = itree_next_arrow(bp, NULL); ap;
3893 		    ap = itree_next_arrow(bp, ap)) {
3894 			int do_not_follow = 0;
3895 
3896 			/*
3897 			 * if we get to the same event multiple times
3898 			 * only worry about the first one.
3899 			 */
3900 			if (ap->arrowp->tail->myevent->cached_state &
3901 			    CAUSES_TESTED) {
3902 				indent();
3903 				out(O_ALTFP|O_VERB|O_NONL,
3904 				    "  causes test already run for ");
3905 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
3906 				    ap->arrowp->tail->myevent);
3907 				out(O_ALTFP|O_VERB, NULL);
3908 				continue;
3909 			}
3910 
3911 			/*
3912 			 * see if false constraint prevents us
3913 			 * from traversing this arrow
3914 			 */
3915 			platform_set_payloadnvp(ep->nvp);
3916 			if (checkconstraints(fmep, ap->arrowp) == 0)
3917 				do_not_follow = 1;
3918 			platform_set_payloadnvp(NULL);
3919 			if (do_not_follow) {
3920 				indent();
3921 				out(O_ALTFP|O_VERB|O_NONL,
3922 				    "  False arrow from ");
3923 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
3924 				    ap->arrowp->tail->myevent);
3925 				out(O_ALTFP|O_VERB, NULL);
3926 				continue;
3927 			}
3928 
3929 			ap->arrowp->tail->myevent->cached_state |=
3930 			    CAUSES_TESTED;
3931 			tail_event = ap->arrowp->tail->myevent;
3932 			fstate = hypothesise(fmep, tail_event, at_latest_by,
3933 			    &my_delay);
3934 
3935 			switch (fstate) {
3936 			case FME_WAIT:
3937 				if (my_delay < overall_delay)
3938 					overall_delay = my_delay;
3939 				waiting_results++;
3940 				break;
3941 			case FME_CREDIBLE:
3942 				credible_results++;
3943 				break;
3944 			case FME_DISPROVED:
3945 				break;
3946 			default:
3947 				out(O_DIE, "Bug in causes_test");
3948 			}
3949 		}
3950 	}
3951 	/* compare against K */
3952 	if (credible_results + waiting_results < k) {
3953 		indent();
3954 		out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED ");
3955 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3956 		out(O_ALTFP|O_VERB, NULL);
3957 		indent_pop();
3958 		return (FME_DISPROVED);
3959 	}
3960 	if (waiting_results != 0) {
3961 		*pdelay = overall_delay;
3962 		indent();
3963 		out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT ");
3964 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3965 		out(O_ALTFP|O_VERB|O_NONL, " to ");
3966 		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
3967 		out(O_ALTFP|O_VERB, NULL);
3968 		indent_pop();
3969 		return (FME_WAIT);
3970 	}
3971 	indent();
3972 	out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES CREDIBLE ");
3973 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3974 	out(O_ALTFP|O_VERB, NULL);
3975 	indent_pop();
3976 	return (FME_CREDIBLE);
3977 }
3978 
3979 static enum fme_state
3980 hypothesise(struct fme *fmep, struct event *ep,
3981 	unsigned long long at_latest_by, unsigned long long *pdelay)
3982 {
3983 	enum fme_state rtr, otr;
3984 	unsigned long long my_delay;
3985 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
3986 
3987 	stats_counter_bump(fmep->Hcallcount);
3988 	indent_push("  H");
3989 	indent();
3990 	out(O_ALTFP|O_VERB|O_NONL, "->");
3991 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3992 	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
3993 	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
3994 	out(O_ALTFP|O_VERB, NULL);
3995 
3996 	rtr = requirements_test(fmep, ep, at_latest_by, &my_delay);
3997 	if ((rtr == FME_WAIT) && (my_delay < overall_delay))
3998 		overall_delay = my_delay;
3999 	if (rtr != FME_DISPROVED) {
4000 		if (is_problem(ep->t)) {
4001 			otr = effects_test(fmep, ep, at_latest_by, &my_delay);
4002 			if (otr != FME_DISPROVED) {
4003 				if (fmep->peek == 0 && ep->is_suspect == 0) {
4004 					ep->suspects = fmep->suspects;
4005 					ep->is_suspect = 1;
4006 					fmep->suspects = ep;
4007 					fmep->nsuspects++;
4008 				}
4009 			}
4010 		} else
4011 			otr = causes_test(fmep, ep, at_latest_by, &my_delay);
4012 		if ((otr == FME_WAIT) && (my_delay < overall_delay))
4013 			overall_delay = my_delay;
4014 		if ((otr != FME_DISPROVED) &&
4015 		    ((rtr == FME_WAIT) || (otr == FME_WAIT)))
4016 			*pdelay = overall_delay;
4017 	}
4018 	if (rtr == FME_DISPROVED) {
4019 		indent();
4020 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
4021 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4022 		out(O_ALTFP|O_VERB, " (doesn't meet requirements)");
4023 		indent_pop();
4024 		return (FME_DISPROVED);
4025 	}
4026 	if ((otr == FME_DISPROVED) && is_problem(ep->t)) {
4027 		indent();
4028 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
4029 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4030 		out(O_ALTFP|O_VERB, " (doesn't explain all reports)");
4031 		indent_pop();
4032 		return (FME_DISPROVED);
4033 	}
4034 	if (otr == FME_DISPROVED) {
4035 		indent();
4036 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
4037 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4038 		out(O_ALTFP|O_VERB, " (causes are not credible)");
4039 		indent_pop();
4040 		return (FME_DISPROVED);
4041 	}
4042 	if ((rtr == FME_WAIT) || (otr == FME_WAIT)) {
4043 		indent();
4044 		out(O_ALTFP|O_VERB|O_NONL, "<-WAIT ");
4045 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4046 		out(O_ALTFP|O_VERB|O_NONL, " to ");
4047 		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay);
4048 		out(O_ALTFP|O_VERB, NULL);
4049 		indent_pop();
4050 		return (FME_WAIT);
4051 	}
4052 	indent();
4053 	out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE ");
4054 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4055 	out(O_ALTFP|O_VERB, NULL);
4056 	indent_pop();
4057 	return (FME_CREDIBLE);
4058 }
4059 
4060 /*
4061  * fme_istat_load -- reconstitute any persistent istats
4062  */
4063 void
4064 fme_istat_load(fmd_hdl_t *hdl)
4065 {
4066 	int sz;
4067 	char *sbuf;
4068 	char *ptr;
4069 
4070 	if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) {
4071 		out(O_ALTFP, "fme_istat_load: No stats");
4072 		return;
4073 	}
4074 
4075 	sbuf = alloca(sz);
4076 
4077 	fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz);
4078 
4079 	/*
4080 	 * pick apart the serialized stats
4081 	 *
4082 	 * format is:
4083 	 *	<class-name>, '@', <path>, '\0', <value>, '\0'
4084 	 * for example:
4085 	 *	"stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0"
4086 	 *
4087 	 * since this is parsing our own serialized data, any parsing issues
4088 	 * are fatal, so we check for them all with ASSERT() below.
4089 	 */
4090 	ptr = sbuf;
4091 	while (ptr < &sbuf[sz]) {
4092 		char *sepptr;
4093 		struct node *np;
4094 		int val;
4095 
4096 		sepptr = strchr(ptr, '@');
4097 		ASSERT(sepptr != NULL);
4098 		*sepptr = '\0';
4099 
4100 		/* construct the event */
4101 		np = newnode(T_EVENT, NULL, 0);
4102 		np->u.event.ename = newnode(T_NAME, NULL, 0);
4103 		np->u.event.ename->u.name.t = N_STAT;
4104 		np->u.event.ename->u.name.s = stable(ptr);
4105 		np->u.event.ename->u.name.it = IT_ENAME;
4106 		np->u.event.ename->u.name.last = np->u.event.ename;
4107 
4108 		ptr = sepptr + 1;
4109 		ASSERT(ptr < &sbuf[sz]);
4110 		ptr += strlen(ptr);
4111 		ptr++;	/* move past the '\0' separating path from value */
4112 		ASSERT(ptr < &sbuf[sz]);
4113 		ASSERT(isdigit(*ptr));
4114 		val = atoi(ptr);
4115 		ASSERT(val > 0);
4116 		ptr += strlen(ptr);
4117 		ptr++;	/* move past the final '\0' for this entry */
4118 
4119 		np->u.event.epname = pathstring2epnamenp(sepptr + 1);
4120 		ASSERT(np->u.event.epname != NULL);
4121 
4122 		istat_bump(np, val);
4123 		tree_free(np);
4124 	}
4125 
4126 	istat_save();
4127 }
4128