xref: /illumos-gate/usr/src/cmd/fm/modules/common/eversholt/fme.c (revision cfc9ef1dcc5d6a18778b3b10d738d19df873d1a3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2012 Milan Jurik. All rights reserved.
25  * Copyright (c) 2018, Joyent, Inc.
26  *
27  * fme.c -- fault management exercise module
28  *
29  * this module provides the simulated fault management exercise.
30  */
31 
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <strings.h>
36 #include <ctype.h>
37 #include <alloca.h>
38 #include <libnvpair.h>
39 #include <sys/fm/protocol.h>
40 #include <fm/fmd_api.h>
41 #include <fm/libtopo.h>
42 #include "alloc.h"
43 #include "out.h"
44 #include "stats.h"
45 #include "stable.h"
46 #include "literals.h"
47 #include "lut.h"
48 #include "tree.h"
49 #include "ptree.h"
50 #include "itree.h"
51 #include "ipath.h"
52 #include "fme.h"
53 #include "evnv.h"
54 #include "eval.h"
55 #include "config.h"
56 #include "platform.h"
57 #include "esclex.h"
58 
/*
 * Module-wide tables.  NOTE(review): none of these three is referenced in
 * the part of the file reviewed here; judging by their names they hold the
 * istat counters, the SERD engines and an action nvlist -- confirm against
 * their users.
 */
struct lut *Istats;
struct lut *SerdEngines;
nvlist_t *Action_nvl;

/* imported from eft.c... */
extern hrtime_t Hesitate;
extern char *Serd_Override;
extern nv_alloc_t Eft_nv_hdl;
extern int Max_fme;
extern fmd_hdl_t *Hdl;

/* NOTE(review): presumably dirty flags for istat_save()/serd_save() */
static int Istat_need_save;
static int Serd_need_save;
void istat_save(void);
void serd_save(void);

/* fme under construction is global so we can free it on module abort */
static struct fme *Nfmep;

/*
 * Reason code (UD_VAL_*) set just before a case is reported as
 * undiagnosable; fme_restart() puts it back to UD_VAL_UNKNOWN after use.
 */
static int Undiag_reason = UD_VAL_UNKNOWN;

/* id given to the next FME made by newfme(); kept above any restored id */
static int Nextid = 0;

static int Open_fme_count = 0;	/* Count of open FMEs */
83 
84 /* list of fault management exercises underway */
85 static struct fme {
86 	struct fme *next;		/* next exercise */
87 	unsigned long long ull;		/* time when fme was created */
88 	int id;				/* FME id */
89 	struct config *config;		/* cooked configuration data */
90 	struct lut *eventtree;		/* propagation tree for this FME */
91 	/*
92 	 * The initial error report that created this FME is kept in
93 	 * two forms.  e0 points to the instance tree node and is used
94 	 * by fme_eval() as the starting point for the inference
95 	 * algorithm.  e0r is the event handle FMD passed to us when
96 	 * the ereport first arrived and is used when setting timers,
97 	 * which are always relative to the time of this initial
98 	 * report.
99 	 */
100 	struct event *e0;
101 	fmd_event_t *e0r;
102 
103 	id_t    timer;			/* for setting an fmd time-out */
104 
105 	struct event *ecurrent;		/* ereport under consideration */
106 	struct event *suspects;		/* current suspect list */
107 	struct event *psuspects;	/* previous suspect list */
108 	int nsuspects;			/* count of suspects */
109 	int posted_suspects;		/* true if we've posted a diagnosis */
110 	int uniqobs;			/* number of unique events observed */
111 	int peek;			/* just peeking, don't track suspects */
112 	int overflow;			/* true if overflow FME */
113 	enum fme_state {
114 		FME_NOTHING = 5000,	/* not evaluated yet */
115 		FME_WAIT,		/* need to wait for more info */
116 		FME_CREDIBLE,		/* suspect list is credible */
117 		FME_DISPROVED,		/* no valid suspects found */
118 		FME_DEFERRED		/* don't know yet (k-count not met) */
119 	} state;
120 
121 	unsigned long long pull;	/* time passed since created */
122 	unsigned long long wull;	/* wait until this time for re-eval */
123 	struct event *observations;	/* observation list */
124 	struct lut *globals;		/* values of global variables */
125 	/* fmd interfacing */
126 	fmd_hdl_t *hdl;			/* handle for talking with fmd */
127 	fmd_case_t *fmcase;		/* what fmd 'case' we associate with */
128 	/* stats */
129 	struct stats *Rcount;
130 	struct stats *Hcallcount;
131 	struct stats *Rcallcount;
132 	struct stats *Ccallcount;
133 	struct stats *Ecallcount;
134 	struct stats *Tcallcount;
135 	struct stats *Marrowcount;
136 	struct stats *diags;
137 } *FMElist, *EFMElist, *ClosedFMEs;
138 
139 static struct case_list {
140 	fmd_case_t *fmcase;
141 	struct case_list *next;
142 } *Undiagablecaselist;
143 
/* forward declarations of file-scope routines used before their definition */
static void fme_eval(struct fme *fmep, fmd_event_t *ffep);
static enum fme_state hypothesise(struct fme *fmep, struct event *ep,
	unsigned long long at_latest_by, unsigned long long *pdelay);
static struct node *eventprop_lookup(struct event *ep, const char *propname);
static struct node *pathstring2epnamenp(char *path);
static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep,
	fmd_case_t *fmcase, nvlist_t *detector, char *arg);
static char *undiag_2reason_str(int ud, char *arg);
static const char *undiag_2defect_str(int ud);
static void restore_suspects(struct fme *fmep);
static void save_suspects(struct fme *fmep);
static void destroy_fme(struct fme *f);
static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
    const char *eventstring, const struct ipath *ipp, nvlist_t *nvl);
/* callbacks; NOTE(review): their walkers are outside the reviewed section */
static void istat_counter_reset_cb(struct istat_entry *entp,
    struct stats *statp, const struct ipath *ipp);
static void istat_counter_topo_chg_cb(struct istat_entry *entp,
    struct stats *statp, void *unused);
static void serd_reset_cb(struct serd_entry *entp, void *unused,
    const struct ipath *ipp);
static void serd_topo_chg_cb(struct serd_entry *entp, void *unused,
    void *unused2);
static void destroy_fme_bufs(struct fme *fp);
167 
168 static struct fme *
alloc_fme(void)169 alloc_fme(void)
170 {
171 	struct fme *fmep;
172 
173 	fmep = MALLOC(sizeof (*fmep));
174 	bzero(fmep, sizeof (*fmep));
175 	return (fmep);
176 }
177 
178 /*
179  * fme_ready -- called when all initialization of the FME (except for
180  *	stats) has completed successfully.  Adds the fme to global lists
181  *	and establishes its stats.
182  */
183 static struct fme *
fme_ready(struct fme * fmep)184 fme_ready(struct fme *fmep)
185 {
186 	char nbuf[100];
187 
188 	Nfmep = NULL;	/* don't need to free this on module abort now */
189 
190 	if (EFMElist) {
191 		EFMElist->next = fmep;
192 		EFMElist = fmep;
193 	} else
194 		FMElist = EFMElist = fmep;
195 
196 	(void) sprintf(nbuf, "fme%d.Rcount", fmep->id);
197 	fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
198 	(void) sprintf(nbuf, "fme%d.Hcall", fmep->id);
199 	fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1);
200 	(void) sprintf(nbuf, "fme%d.Rcall", fmep->id);
201 	fmep->Rcallcount = stats_new_counter(nbuf,
202 	    "calls to requirements_test()", 1);
203 	(void) sprintf(nbuf, "fme%d.Ccall", fmep->id);
204 	fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1);
205 	(void) sprintf(nbuf, "fme%d.Ecall", fmep->id);
206 	fmep->Ecallcount =
207 	    stats_new_counter(nbuf, "calls to effects_test()", 1);
208 	(void) sprintf(nbuf, "fme%d.Tcall", fmep->id);
209 	fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
210 	(void) sprintf(nbuf, "fme%d.Marrow", fmep->id);
211 	fmep->Marrowcount = stats_new_counter(nbuf,
212 	    "arrows marked by mark_arrows()", 1);
213 	(void) sprintf(nbuf, "fme%d.diags", fmep->id);
214 	fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);
215 
216 	out(O_ALTFP|O_VERB2, "newfme: config snapshot contains...");
217 	config_print(O_ALTFP|O_VERB2, fmep->config);
218 
219 	return (fmep);
220 }
221 
/*
 * Defined outside this file: ipath_dummy_lut() is called from
 * set_needed_arrows() and itree_create_dummy() from prune_propagations().
 */
extern void ipath_dummy_lut(struct arrow *);
extern struct lut *itree_create_dummy(const char *, const struct ipath *);
224 
225 /* ARGSUSED */
226 static void
set_needed_arrows(struct event * ep,struct event * ep2,struct fme * fmep)227 set_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
228 {
229 	struct bubble *bp;
230 	struct arrowlist *ap;
231 
232 	for (bp = itree_next_bubble(ep, NULL); bp;
233 	    bp = itree_next_bubble(ep, bp)) {
234 		if (bp->t != B_FROM)
235 			continue;
236 		for (ap = itree_next_arrow(bp, NULL); ap;
237 		    ap = itree_next_arrow(bp, ap)) {
238 			ap->arrowp->pnode->u.arrow.needed = 1;
239 			ipath_dummy_lut(ap->arrowp);
240 		}
241 	}
242 }
243 
244 /* ARGSUSED */
245 static void
unset_needed_arrows(struct event * ep,struct event * ep2,struct fme * fmep)246 unset_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
247 {
248 	struct bubble *bp;
249 	struct arrowlist *ap;
250 
251 	for (bp = itree_next_bubble(ep, NULL); bp;
252 	    bp = itree_next_bubble(ep, bp)) {
253 		if (bp->t != B_FROM)
254 			continue;
255 		for (ap = itree_next_arrow(bp, NULL); ap;
256 		    ap = itree_next_arrow(bp, ap))
257 			ap->arrowp->pnode->u.arrow.needed = 0;
258 	}
259 }
260 
/* forward declarations needed by prune_propagations() below */
static void globals_destructor(void *left, void *right, void *arg);
static void clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep);
263 
264 static boolean_t
prune_propagations(const char * e0class,const struct ipath * e0ipp)265 prune_propagations(const char *e0class, const struct ipath *e0ipp)
266 {
267 	char nbuf[100];
268 	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
269 	extern struct lut *Usednames;
270 
271 	Nfmep = alloc_fme();
272 	Nfmep->id = Nextid;
273 	Nfmep->state = FME_NOTHING;
274 	Nfmep->eventtree = itree_create_dummy(e0class, e0ipp);
275 	if ((Nfmep->e0 =
276 	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
277 		itree_free(Nfmep->eventtree);
278 		FREE(Nfmep);
279 		Nfmep = NULL;
280 		return (B_FALSE);
281 	}
282 	Nfmep->ecurrent = Nfmep->observations = Nfmep->e0;
283 	Nfmep->e0->count++;
284 
285 	(void) sprintf(nbuf, "fme%d.Rcount", Nfmep->id);
286 	Nfmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
287 	(void) sprintf(nbuf, "fme%d.Hcall", Nfmep->id);
288 	Nfmep->Hcallcount =
289 	    stats_new_counter(nbuf, "calls to hypothesise()", 1);
290 	(void) sprintf(nbuf, "fme%d.Rcall", Nfmep->id);
291 	Nfmep->Rcallcount = stats_new_counter(nbuf,
292 	    "calls to requirements_test()", 1);
293 	(void) sprintf(nbuf, "fme%d.Ccall", Nfmep->id);
294 	Nfmep->Ccallcount =
295 	    stats_new_counter(nbuf, "calls to causes_test()", 1);
296 	(void) sprintf(nbuf, "fme%d.Ecall", Nfmep->id);
297 	Nfmep->Ecallcount =
298 	    stats_new_counter(nbuf, "calls to effects_test()", 1);
299 	(void) sprintf(nbuf, "fme%d.Tcall", Nfmep->id);
300 	Nfmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
301 	(void) sprintf(nbuf, "fme%d.Marrow", Nfmep->id);
302 	Nfmep->Marrowcount = stats_new_counter(nbuf,
303 	    "arrows marked by mark_arrows()", 1);
304 	(void) sprintf(nbuf, "fme%d.diags", Nfmep->id);
305 	Nfmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);
306 
307 	Nfmep->peek = 1;
308 	lut_walk(Nfmep->eventtree, (lut_cb)unset_needed_arrows, (void *)Nfmep);
309 	lut_free(Usednames, NULL, NULL);
310 	Usednames = NULL;
311 	lut_walk(Nfmep->eventtree, (lut_cb)clear_arrows, (void *)Nfmep);
312 	(void) hypothesise(Nfmep, Nfmep->e0, Nfmep->ull, &my_delay);
313 	itree_prune(Nfmep->eventtree);
314 	lut_walk(Nfmep->eventtree, (lut_cb)set_needed_arrows, (void *)Nfmep);
315 
316 	stats_delete(Nfmep->Rcount);
317 	stats_delete(Nfmep->Hcallcount);
318 	stats_delete(Nfmep->Rcallcount);
319 	stats_delete(Nfmep->Ccallcount);
320 	stats_delete(Nfmep->Ecallcount);
321 	stats_delete(Nfmep->Tcallcount);
322 	stats_delete(Nfmep->Marrowcount);
323 	stats_delete(Nfmep->diags);
324 	itree_free(Nfmep->eventtree);
325 	lut_free(Nfmep->globals, globals_destructor, NULL);
326 	FREE(Nfmep);
327 	return (B_TRUE);
328 }
329 
330 static struct fme *
newfme(const char * e0class,const struct ipath * e0ipp,fmd_hdl_t * hdl,fmd_case_t * fmcase,fmd_event_t * ffep,nvlist_t * nvl)331 newfme(const char *e0class, const struct ipath *e0ipp, fmd_hdl_t *hdl,
332     fmd_case_t *fmcase, fmd_event_t *ffep, nvlist_t *nvl)
333 {
334 	struct cfgdata *cfgdata;
335 	int init_size;
336 	extern int alloc_total();
337 	nvlist_t *detector = NULL;
338 	char *pathstr;
339 	char *arg;
340 
341 	/*
342 	 * First check if e0ipp is actually in the topology so we can give a
343 	 * more useful error message.
344 	 */
345 	ipathlastcomp(e0ipp);
346 	pathstr = ipath2str(NULL, e0ipp);
347 	cfgdata = config_snapshot();
348 	platform_unit_translate(0, cfgdata->cooked, TOPO_PROP_RESOURCE,
349 	    &detector, pathstr);
350 	FREE(pathstr);
351 	structconfig_free(cfgdata->cooked);
352 	config_free(cfgdata);
353 	if (detector == NULL) {
354 		/* See if class permits silent discard on unknown component. */
355 		if (lut_lookup(Ereportenames_discard, (void *)e0class, NULL)) {
356 			out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport "
357 			    "to component path, but silent discard allowed.",
358 			    e0class);
359 			fmd_case_close(hdl, fmcase);
360 		} else {
361 			Undiag_reason = UD_VAL_BADEVENTPATH;
362 			(void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR,
363 			    &detector);
364 			arg = ipath2str(e0class, e0ipp);
365 			publish_undiagnosable(hdl, ffep, fmcase, detector, arg);
366 			FREE(arg);
367 		}
368 		return (NULL);
369 	}
370 
371 	/*
372 	 * Next run a quick first pass of the rules with a dummy config. This
373 	 * allows us to prune those rules which can't possibly cause this
374 	 * ereport.
375 	 */
376 	if (!prune_propagations(e0class, e0ipp)) {
377 		/*
378 		 * The fault class must have been in the rules or we would
379 		 * not have registered for it (and got a "nosub"), and the
380 		 * pathname must be in the topology or we would have failed the
381 		 * previous test. So to get here means the combination of
382 		 * class and pathname in the ereport must be invalid.
383 		 */
384 		Undiag_reason = UD_VAL_BADEVENTCLASS;
385 		arg = ipath2str(e0class, e0ipp);
386 		publish_undiagnosable(hdl, ffep, fmcase, detector, arg);
387 		nvlist_free(detector);
388 		FREE(arg);
389 		return (NULL);
390 	}
391 
392 	/*
393 	 * Now go ahead and create the real fme using the pruned rules.
394 	 */
395 	init_size = alloc_total();
396 	out(O_ALTFP|O_STAMP, "start config_snapshot using %d bytes", init_size);
397 	nvlist_free(detector);
398 	pathstr = ipath2str(NULL, e0ipp);
399 	cfgdata = config_snapshot();
400 	platform_unit_translate(0, cfgdata->cooked, TOPO_PROP_RESOURCE,
401 	    &detector, pathstr);
402 	FREE(pathstr);
403 	platform_save_config(hdl, fmcase);
404 	out(O_ALTFP|O_STAMP, "config_snapshot added %d bytes",
405 	    alloc_total() - init_size);
406 
407 	Nfmep = alloc_fme();
408 
409 	Nfmep->id = Nextid++;
410 	Nfmep->config = cfgdata->cooked;
411 	config_free(cfgdata);
412 	Nfmep->posted_suspects = 0;
413 	Nfmep->uniqobs = 0;
414 	Nfmep->state = FME_NOTHING;
415 	Nfmep->pull = 0ULL;
416 	Nfmep->overflow = 0;
417 
418 	Nfmep->fmcase = fmcase;
419 	Nfmep->hdl = hdl;
420 
421 	if ((Nfmep->eventtree = itree_create(Nfmep->config)) == NULL) {
422 		Undiag_reason = UD_VAL_INSTFAIL;
423 		arg = ipath2str(e0class, e0ipp);
424 		publish_undiagnosable(hdl, ffep, fmcase, detector, arg);
425 		nvlist_free(detector);
426 		FREE(arg);
427 		structconfig_free(Nfmep->config);
428 		destroy_fme_bufs(Nfmep);
429 		FREE(Nfmep);
430 		Nfmep = NULL;
431 		return (NULL);
432 	}
433 
434 	itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree);
435 
436 	if ((Nfmep->e0 =
437 	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
438 		Undiag_reason = UD_VAL_BADEVENTI;
439 		arg = ipath2str(e0class, e0ipp);
440 		publish_undiagnosable(hdl, ffep, fmcase, detector, arg);
441 		nvlist_free(detector);
442 		FREE(arg);
443 		itree_free(Nfmep->eventtree);
444 		structconfig_free(Nfmep->config);
445 		destroy_fme_bufs(Nfmep);
446 		FREE(Nfmep);
447 		Nfmep = NULL;
448 		return (NULL);
449 	}
450 
451 	nvlist_free(detector);
452 	return (fme_ready(Nfmep));
453 }
454 
455 void
fme_fini(void)456 fme_fini(void)
457 {
458 	struct fme *sfp, *fp;
459 	struct case_list *ucasep, *nextcasep;
460 
461 	ucasep = Undiagablecaselist;
462 	while (ucasep != NULL) {
463 		nextcasep = ucasep->next;
464 		FREE(ucasep);
465 		ucasep = nextcasep;
466 	}
467 	Undiagablecaselist = NULL;
468 
469 	/* clean up closed fmes */
470 	fp = ClosedFMEs;
471 	while (fp != NULL) {
472 		sfp = fp->next;
473 		destroy_fme(fp);
474 		fp = sfp;
475 	}
476 	ClosedFMEs = NULL;
477 
478 	fp = FMElist;
479 	while (fp != NULL) {
480 		sfp = fp->next;
481 		destroy_fme(fp);
482 		fp = sfp;
483 	}
484 	FMElist = EFMElist = NULL;
485 
486 	/* if we were in the middle of creating an fme, free it now */
487 	if (Nfmep) {
488 		destroy_fme(Nfmep);
489 		Nfmep = NULL;
490 	}
491 }
492 
/*
 * Allocated space for a buffer name.  The longest name formatted into it
 * is "observed<N>.nvp": 12 fixed characters plus the terminating NUL,
 * which leaves 7 digits for <N>.  20 bytes therefore allows for
 * a ridiculous 9,999,999 unique observations.
 */
#define	OBBUFNMSZ 20
498 
499 /*
500  *  serialize_observation
501  *
502  *  Create a recoverable version of the current observation
503  *  (f->ecurrent).  We keep a serialized version of each unique
504  *  observation in order that we may resume correctly the fme in the
505  *  correct state if eft or fmd crashes and we're restarted.
506  */
507 static void
serialize_observation(struct fme * fp,const char * cls,const struct ipath * ipp)508 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp)
509 {
510 	size_t pkdlen;
511 	char tmpbuf[OBBUFNMSZ];
512 	char *pkd = NULL;
513 	char *estr;
514 
515 	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs);
516 	estr = ipath2str(cls, ipp);
517 	fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1);
518 	fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr,
519 	    strlen(estr) + 1);
520 	FREE(estr);
521 
522 	if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) {
523 		(void) snprintf(tmpbuf,
524 		    OBBUFNMSZ, "observed%d.nvp", fp->uniqobs);
525 		if (nvlist_xpack(fp->ecurrent->nvp,
526 		    &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0)
527 			out(O_DIE|O_SYS, "pack of observed nvl failed");
528 		fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen);
529 		fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen);
530 		FREE(pkd);
531 	}
532 
533 	fp->uniqobs++;
534 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
535 	    sizeof (fp->uniqobs));
536 }
537 
538 /*
539  *  init_fme_bufs -- We keep several bits of state about an fme for
540  *	use if eft or fmd crashes and we're restarted.
541  */
542 static void
init_fme_bufs(struct fme * fp)543 init_fme_bufs(struct fme *fp)
544 {
545 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull));
546 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull,
547 	    sizeof (fp->pull));
548 
549 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id));
550 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id,
551 	    sizeof (fp->id));
552 
553 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs));
554 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
555 	    sizeof (fp->uniqobs));
556 
557 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD,
558 	    sizeof (fp->posted_suspects));
559 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD,
560 	    (void *)&fp->posted_suspects, sizeof (fp->posted_suspects));
561 }
562 
563 static void
destroy_fme_bufs(struct fme * fp)564 destroy_fme_bufs(struct fme *fp)
565 {
566 	char tmpbuf[OBBUFNMSZ];
567 	int o;
568 
569 	platform_restore_config(fp->hdl, fp->fmcase);
570 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN);
571 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG);
572 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL);
573 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID);
574 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD);
575 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS);
576 
577 	for (o = 0; o < fp->uniqobs; o++) {
578 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o);
579 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
580 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o);
581 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
582 	}
583 }
584 
585 /*
586  * reconstitute_observations -- convert a case's serialized observations
587  *	back into struct events.  Returns zero if all observations are
588  *	successfully reconstituted.
589  */
590 static int
reconstitute_observations(struct fme * fmep)591 reconstitute_observations(struct fme *fmep)
592 {
593 	struct event *ep;
594 	struct node *epnamenp = NULL;
595 	size_t pkdlen;
596 	char *pkd = NULL;
597 	char *tmpbuf = alloca(OBBUFNMSZ);
598 	char *sepptr;
599 	char *estr;
600 	int ocnt;
601 	int elen;
602 
603 	for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) {
604 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt);
605 		elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
606 		if (elen == 0) {
607 			out(O_ALTFP,
608 			    "reconstitute_observation: no %s buffer found.",
609 			    tmpbuf);
610 			Undiag_reason = UD_VAL_MISSINGOBS;
611 			break;
612 		}
613 
614 		estr = MALLOC(elen);
615 		fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
616 		sepptr = strchr(estr, '@');
617 		if (sepptr == NULL) {
618 			out(O_ALTFP,
619 			    "reconstitute_observation: %s: "
620 			    "missing @ separator in %s.",
621 			    tmpbuf, estr);
622 			Undiag_reason = UD_VAL_MISSINGPATH;
623 			FREE(estr);
624 			break;
625 		}
626 
627 		*sepptr = '\0';
628 		if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
629 			out(O_ALTFP,
630 			    "reconstitute_observation: %s: "
631 			    "trouble converting path string \"%s\" "
632 			    "to internal representation.",
633 			    tmpbuf, sepptr + 1);
634 			Undiag_reason = UD_VAL_MISSINGPATH;
635 			FREE(estr);
636 			break;
637 		}
638 
639 		/* construct the event */
640 		ep = itree_lookup(fmep->eventtree,
641 		    stable(estr), ipath(epnamenp));
642 		if (ep == NULL) {
643 			out(O_ALTFP,
644 			    "reconstitute_observation: %s: "
645 			    "lookup of  \"%s\" in itree failed.",
646 			    tmpbuf, ipath2str(estr, ipath(epnamenp)));
647 			Undiag_reason = UD_VAL_BADOBS;
648 			tree_free(epnamenp);
649 			FREE(estr);
650 			break;
651 		}
652 		tree_free(epnamenp);
653 
654 		/*
655 		 * We may or may not have a saved nvlist for the observation
656 		 */
657 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt);
658 		pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
659 		if (pkdlen != 0) {
660 			pkd = MALLOC(pkdlen);
661 			fmd_buf_read(fmep->hdl,
662 			    fmep->fmcase, tmpbuf, pkd, pkdlen);
663 			ASSERT(ep->nvp == NULL);
664 			if (nvlist_xunpack(pkd,
665 			    pkdlen, &ep->nvp, &Eft_nv_hdl) != 0)
666 				out(O_DIE|O_SYS, "pack of observed nvl failed");
667 			FREE(pkd);
668 		}
669 
670 		if (ocnt == 0)
671 			fmep->e0 = ep;
672 
673 		FREE(estr);
674 		fmep->ecurrent = ep;
675 		ep->count++;
676 
677 		/* link it into list of observations seen */
678 		ep->observations = fmep->observations;
679 		fmep->observations = ep;
680 	}
681 
682 	if (ocnt == fmep->uniqobs) {
683 		(void) fme_ready(fmep);
684 		return (0);
685 	}
686 
687 	return (1);
688 }
689 
690 /*
691  * restart_fme -- called during eft initialization.  Reconstitutes
692  *	an in-progress fme.
693  */
694 void
fme_restart(fmd_hdl_t * hdl,fmd_case_t * inprogress)695 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress)
696 {
697 	nvlist_t *defect;
698 	struct case_list *bad;
699 	struct fme *fmep;
700 	struct cfgdata *cfgdata;
701 	size_t rawsz;
702 	struct event *ep;
703 	char *tmpbuf = alloca(OBBUFNMSZ);
704 	char *sepptr;
705 	char *estr;
706 	int elen;
707 	struct node *epnamenp = NULL;
708 	int init_size;
709 	extern int alloc_total();
710 	char *reason;
711 
712 	/*
713 	 * ignore solved or closed cases
714 	 */
715 	if (fmd_case_solved(hdl, inprogress) ||
716 	    fmd_case_closed(hdl, inprogress))
717 		return;
718 
719 	fmep = alloc_fme();
720 	fmep->fmcase = inprogress;
721 	fmep->hdl = hdl;
722 
723 	if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) {
724 		out(O_ALTFP, "restart_fme: no saved posted status");
725 		Undiag_reason = UD_VAL_MISSINGINFO;
726 		goto badcase;
727 	} else {
728 		fmd_buf_read(hdl, inprogress, WOBUF_POSTD,
729 		    (void *)&fmep->posted_suspects,
730 		    sizeof (fmep->posted_suspects));
731 	}
732 
733 	if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) {
734 		out(O_ALTFP, "restart_fme: no saved id");
735 		Undiag_reason = UD_VAL_MISSINGINFO;
736 		goto badcase;
737 	} else {
738 		fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id,
739 		    sizeof (fmep->id));
740 	}
741 	if (Nextid <= fmep->id)
742 		Nextid = fmep->id + 1;
743 
744 	out(O_ALTFP, "Replay FME %d", fmep->id);
745 
746 	if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) {
747 		out(O_ALTFP, "restart_fme: No config data");
748 		Undiag_reason = UD_VAL_MISSINGINFO;
749 		goto badcase;
750 	}
751 	fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz,
752 	    sizeof (size_t));
753 
754 	if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) {
755 		out(O_ALTFP, "restart_fme: No event zero");
756 		Undiag_reason = UD_VAL_MISSINGZERO;
757 		goto badcase;
758 	}
759 
760 	if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) {
761 		out(O_ALTFP, "restart_fme: no saved wait time");
762 		Undiag_reason = UD_VAL_MISSINGINFO;
763 		goto badcase;
764 	} else {
765 		fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull,
766 		    sizeof (fmep->pull));
767 	}
768 
769 	if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) {
770 		out(O_ALTFP, "restart_fme: no count of observations");
771 		Undiag_reason = UD_VAL_MISSINGINFO;
772 		goto badcase;
773 	} else {
774 		fmd_buf_read(hdl, inprogress, WOBUF_NOBS,
775 		    (void *)&fmep->uniqobs, sizeof (fmep->uniqobs));
776 	}
777 
778 	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed0");
779 	elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
780 	if (elen == 0) {
781 		out(O_ALTFP, "reconstitute_observation: no %s buffer found.",
782 		    tmpbuf);
783 		Undiag_reason = UD_VAL_MISSINGOBS;
784 		goto badcase;
785 	}
786 	estr = MALLOC(elen);
787 	fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
788 	sepptr = strchr(estr, '@');
789 	if (sepptr == NULL) {
790 		out(O_ALTFP, "reconstitute_observation: %s: "
791 		    "missing @ separator in %s.",
792 		    tmpbuf, estr);
793 		Undiag_reason = UD_VAL_MISSINGPATH;
794 		FREE(estr);
795 		goto badcase;
796 	}
797 	*sepptr = '\0';
798 	if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
799 		out(O_ALTFP, "reconstitute_observation: %s: "
800 		    "trouble converting path string \"%s\" "
801 		    "to internal representation.", tmpbuf, sepptr + 1);
802 		Undiag_reason = UD_VAL_MISSINGPATH;
803 		FREE(estr);
804 		goto badcase;
805 	}
806 	(void) prune_propagations(stable(estr), ipath(epnamenp));
807 	tree_free(epnamenp);
808 	FREE(estr);
809 
810 	init_size = alloc_total();
811 	out(O_ALTFP|O_STAMP, "start config_restore using %d bytes", init_size);
812 	cfgdata = MALLOC(sizeof (struct cfgdata));
813 	cfgdata->cooked = NULL;
814 	cfgdata->devcache = NULL;
815 	cfgdata->devidcache = NULL;
816 	cfgdata->tpcache = NULL;
817 	cfgdata->cpucache = NULL;
818 	cfgdata->raw_refcnt = 1;
819 
820 	if (rawsz > 0) {
821 		if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) {
822 			out(O_ALTFP, "restart_fme: Config data size mismatch");
823 			Undiag_reason = UD_VAL_CFGMISMATCH;
824 			goto badcase;
825 		}
826 		cfgdata->begin = MALLOC(rawsz);
827 		cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz;
828 		fmd_buf_read(hdl,
829 		    inprogress, WOBUF_CFG, cfgdata->begin, rawsz);
830 	} else {
831 		cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL;
832 	}
833 
834 	config_cook(cfgdata);
835 	fmep->config = cfgdata->cooked;
836 	config_free(cfgdata);
837 	out(O_ALTFP|O_STAMP, "config_restore added %d bytes",
838 	    alloc_total() - init_size);
839 
840 	if ((fmep->eventtree = itree_create(fmep->config)) == NULL) {
841 		/* case not properly saved or irretrievable */
842 		out(O_ALTFP, "restart_fme: NULL instance tree");
843 		Undiag_reason = UD_VAL_INSTFAIL;
844 		goto badcase;
845 	}
846 
847 	itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree);
848 
849 	if (reconstitute_observations(fmep) != 0)
850 		goto badcase;
851 
852 	out(O_ALTFP|O_NONL, "FME %d replay observations: ", fmep->id);
853 	for (ep = fmep->observations; ep; ep = ep->observations) {
854 		out(O_ALTFP|O_NONL, " ");
855 		itree_pevent_brief(O_ALTFP|O_NONL, ep);
856 	}
857 	out(O_ALTFP, NULL);
858 
859 	Open_fme_count++;
860 
861 	/* give the diagnosis algorithm a shot at the new FME state */
862 	fme_eval(fmep, fmep->e0r);
863 	return;
864 
865 badcase:
866 	if (fmep->eventtree != NULL)
867 		itree_free(fmep->eventtree);
868 	if (fmep->config)
869 		structconfig_free(fmep->config);
870 	destroy_fme_bufs(fmep);
871 	FREE(fmep);
872 
873 	/*
874 	 * Since we're unable to restart the case, add it to the undiagable
875 	 * list and solve and close it as appropriate.
876 	 */
877 	bad = MALLOC(sizeof (struct case_list));
878 	bad->next = NULL;
879 
880 	if (Undiagablecaselist != NULL)
881 		bad->next = Undiagablecaselist;
882 	Undiagablecaselist = bad;
883 	bad->fmcase = inprogress;
884 
885 	out(O_ALTFP|O_NONL, "[case %s (unable to restart), ",
886 	    fmd_case_uuid(hdl, bad->fmcase));
887 
888 	if (fmd_case_solved(hdl, bad->fmcase)) {
889 		out(O_ALTFP|O_NONL, "already solved, ");
890 	} else {
891 		out(O_ALTFP|O_NONL, "solving, ");
892 		defect = fmd_nvl_create_fault(hdl,
893 		    undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL);
894 		reason = undiag_2reason_str(Undiag_reason, NULL);
895 		(void) nvlist_add_string(defect, UNDIAG_REASON, reason);
896 		FREE(reason);
897 		fmd_case_add_suspect(hdl, bad->fmcase, defect);
898 		fmd_case_solve(hdl, bad->fmcase);
899 		Undiag_reason = UD_VAL_UNKNOWN;
900 	}
901 
902 	if (fmd_case_closed(hdl, bad->fmcase)) {
903 		out(O_ALTFP, "already closed ]");
904 	} else {
905 		out(O_ALTFP, "closing ]");
906 		fmd_case_close(hdl, bad->fmcase);
907 	}
908 }
909 
910 /*ARGSUSED*/
911 static void
globals_destructor(void * left,void * right,void * arg)912 globals_destructor(void *left, void *right, void *arg)
913 {
914 	struct evalue *evp = (struct evalue *)right;
915 	if (evp->t == NODEPTR)
916 		tree_free((struct node *)(uintptr_t)evp->v);
917 	evp->v = (uintptr_t)NULL;
918 	FREE(evp);
919 }
920 
921 void
destroy_fme(struct fme * f)922 destroy_fme(struct fme *f)
923 {
924 	stats_delete(f->Rcount);
925 	stats_delete(f->Hcallcount);
926 	stats_delete(f->Rcallcount);
927 	stats_delete(f->Ccallcount);
928 	stats_delete(f->Ecallcount);
929 	stats_delete(f->Tcallcount);
930 	stats_delete(f->Marrowcount);
931 	stats_delete(f->diags);
932 
933 	if (f->eventtree != NULL)
934 		itree_free(f->eventtree);
935 	if (f->config)
936 		structconfig_free(f->config);
937 	lut_free(f->globals, globals_destructor, NULL);
938 	FREE(f);
939 }
940 
941 static const char *
fme_state2str(enum fme_state s)942 fme_state2str(enum fme_state s)
943 {
944 	switch (s) {
945 	case FME_NOTHING:	return ("NOTHING");
946 	case FME_WAIT:		return ("WAIT");
947 	case FME_CREDIBLE:	return ("CREDIBLE");
948 	case FME_DISPROVED:	return ("DISPROVED");
949 	case FME_DEFERRED:	return ("DEFERRED");
950 	default:		return ("UNKNOWN");
951 	}
952 }
953 
954 static int
is_problem(enum nametype t)955 is_problem(enum nametype t)
956 {
957 	return (t == N_FAULT || t == N_DEFECT || t == N_UPSET);
958 }
959 
960 static int
is_defect(enum nametype t)961 is_defect(enum nametype t)
962 {
963 	return (t == N_DEFECT);
964 }
965 
966 static int
is_upset(enum nametype t)967 is_upset(enum nametype t)
968 {
969 	return (t == N_UPSET);
970 }
971 
972 static void
fme_print(int flags,struct fme * fmep)973 fme_print(int flags, struct fme *fmep)
974 {
975 	struct event *ep;
976 
977 	out(flags, "Fault Management Exercise %d", fmep->id);
978 	out(flags, "\t       State: %s", fme_state2str(fmep->state));
979 	out(flags|O_NONL, "\t  Start time: ");
980 	ptree_timeval(flags|O_NONL, &fmep->ull);
981 	out(flags, NULL);
982 	if (fmep->wull) {
983 		out(flags|O_NONL, "\t   Wait time: ");
984 		ptree_timeval(flags|O_NONL, &fmep->wull);
985 		out(flags, NULL);
986 	}
987 	out(flags|O_NONL, "\t          E0: ");
988 	if (fmep->e0)
989 		itree_pevent_brief(flags|O_NONL, fmep->e0);
990 	else
991 		out(flags|O_NONL, "NULL");
992 	out(flags, NULL);
993 	out(flags|O_NONL, "\tObservations:");
994 	for (ep = fmep->observations; ep; ep = ep->observations) {
995 		out(flags|O_NONL, " ");
996 		itree_pevent_brief(flags|O_NONL, ep);
997 	}
998 	out(flags, NULL);
999 	out(flags|O_NONL, "\tSuspect list:");
1000 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
1001 		out(flags|O_NONL, " ");
1002 		itree_pevent_brief(flags|O_NONL, ep);
1003 	}
1004 	out(flags, NULL);
1005 	if (fmep->eventtree != NULL) {
1006 		out(flags|O_VERB2, "\t        Tree:");
1007 		itree_ptree(flags|O_VERB2, fmep->eventtree);
1008 	}
1009 }
1010 
1011 static struct node *
pathstring2epnamenp(char * path)1012 pathstring2epnamenp(char *path)
1013 {
1014 	char *sep = "/";
1015 	struct node *ret;
1016 	char *ptr;
1017 
1018 	if ((ptr = strtok(path, sep)) == NULL)
1019 		out(O_DIE, "pathstring2epnamenp: invalid empty class");
1020 
1021 	ret = tree_iname(stable(ptr), NULL, 0);
1022 
1023 	while ((ptr = strtok(NULL, sep)) != NULL)
1024 		ret = tree_name_append(ret,
1025 		    tree_iname(stable(ptr), NULL, 0));
1026 
1027 	return (ret);
1028 }
1029 
1030 /*
1031  * for a given upset sp, increment the corresponding SERD engine.  if the
1032  * SERD engine trips, return the ename and ipp of the resulting ereport.
1033  * returns true if engine tripped and *enamep and *ippp were filled in.
1034  */
1035 static int
serd_eval(struct fme * fmep,fmd_hdl_t * hdl,fmd_event_t * ffep,fmd_case_t * fmcase,struct event * sp,const char ** enamep,const struct ipath ** ippp)1036 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep,
1037     fmd_case_t *fmcase, struct event *sp, const char **enamep,
1038     const struct ipath **ippp)
1039 {
1040 	struct node *serdinst;
1041 	char *serdname;
1042 	char *serdresource;
1043 	char *serdclass;
1044 	struct node *nid;
1045 	struct serd_entry *newentp;
1046 	int i, serdn = -1, serdincrement = 1, len = 0;
1047 	char *serdsuffix = NULL, *serdt = NULL;
1048 	struct evalue *ep;
1049 
1050 	ASSERT(sp->t == N_UPSET);
1051 	ASSERT(ffep != NULL);
1052 
1053 	if ((ep = (struct evalue *)lut_lookup(sp->serdprops,
1054 	    (void *)"n", (lut_cmp)strcmp)) != NULL) {
1055 		ASSERT(ep->t == UINT64);
1056 		serdn = (int)ep->v;
1057 	}
1058 	if ((ep = (struct evalue *)lut_lookup(sp->serdprops,
1059 	    (void *)"t", (lut_cmp)strcmp)) != NULL) {
1060 		ASSERT(ep->t == STRING);
1061 		serdt = (char *)(uintptr_t)ep->v;
1062 	}
1063 	if ((ep = (struct evalue *)lut_lookup(sp->serdprops,
1064 	    (void *)"suffix", (lut_cmp)strcmp)) != NULL) {
1065 		ASSERT(ep->t == STRING);
1066 		serdsuffix = (char *)(uintptr_t)ep->v;
1067 	}
1068 	if ((ep = (struct evalue *)lut_lookup(sp->serdprops,
1069 	    (void *)"increment", (lut_cmp)strcmp)) != NULL) {
1070 		ASSERT(ep->t == UINT64);
1071 		serdincrement = (int)ep->v;
1072 	}
1073 
1074 	/*
1075 	 * obtain instanced SERD engine from the upset sp.  from this
1076 	 * derive serdname, the string used to identify the SERD engine.
1077 	 */
1078 	serdinst = eventprop_lookup(sp, L_engine);
1079 
1080 	if (serdinst == NULL)
1081 		return (-1);
1082 
1083 	len = strlen(serdinst->u.stmt.np->u.event.ename->u.name.s) + 1;
1084 	if (serdsuffix != NULL)
1085 		len += strlen(serdsuffix);
1086 	serdclass = MALLOC(len);
1087 	if (serdsuffix != NULL)
1088 		(void) snprintf(serdclass, len, "%s%s",
1089 		    serdinst->u.stmt.np->u.event.ename->u.name.s, serdsuffix);
1090 	else
1091 		(void) snprintf(serdclass, len, "%s",
1092 		    serdinst->u.stmt.np->u.event.ename->u.name.s);
1093 	serdresource = ipath2str(NULL,
1094 	    ipath(serdinst->u.stmt.np->u.event.epname));
1095 	len += strlen(serdresource) + 1;
1096 	serdname = MALLOC(len);
1097 	(void) snprintf(serdname, len, "%s@%s", serdclass, serdresource);
1098 	FREE(serdresource);
1099 
1100 	/* handle serd engine "id" property, if there is one */
1101 	if ((nid =
1102 	    lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) {
1103 		struct evalue *gval;
1104 		char suffixbuf[200];
1105 		char *suffix;
1106 		char *nserdname;
1107 		size_t nname;
1108 
1109 		out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname);
1110 		ptree_name_iter(O_ALTFP|O_NONL, nid);
1111 
1112 		ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t));
1113 
1114 		if ((gval = lut_lookup(fmep->globals,
1115 		    (void *)nid->u.globid.s, NULL)) == NULL) {
1116 			out(O_ALTFP, " undefined");
1117 		} else if (gval->t == UINT64) {
1118 			out(O_ALTFP, " %llu", gval->v);
1119 			(void) sprintf(suffixbuf, "%llu", gval->v);
1120 			suffix = suffixbuf;
1121 		} else {
1122 			out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v);
1123 			suffix = (char *)(uintptr_t)gval->v;
1124 		}
1125 
1126 		nname = strlen(serdname) + strlen(suffix) + 2;
1127 		nserdname = MALLOC(nname);
1128 		(void) snprintf(nserdname, nname, "%s:%s", serdname, suffix);
1129 		FREE(serdname);
1130 		serdname = nserdname;
1131 	}
1132 
1133 	/*
1134 	 * if the engine is empty, and we have an override for n/t then
1135 	 * destroy and recreate it.
1136 	 */
1137 	if ((serdn != -1 || serdt != NULL) && fmd_serd_exists(hdl, serdname) &&
1138 	    fmd_serd_empty(hdl, serdname))
1139 		fmd_serd_destroy(hdl, serdname);
1140 
1141 	if (!fmd_serd_exists(hdl, serdname)) {
1142 		struct node *nN, *nT;
1143 		const char *s;
1144 		struct node *nodep;
1145 		struct config *cp;
1146 		char *path;
1147 		uint_t nval;
1148 		hrtime_t tval;
1149 		int i;
1150 		char *ptr;
1151 		int got_n_override = 0, got_t_override = 0;
1152 
1153 		/* no SERD engine yet, so create it */
1154 		nodep = serdinst->u.stmt.np->u.event.epname;
1155 		path = ipath2str(NULL, ipath(nodep));
1156 		cp = config_lookup(fmep->config, path, 0);
1157 		FREE((void *)path);
1158 
1159 		/*
1160 		 * We allow serd paramaters to be overridden, either from
1161 		 * eft.conf file values (if Serd_Override is set) or from
1162 		 * driver properties (for "serd.io.device" engines).
1163 		 */
1164 		if (Serd_Override != NULL) {
1165 			char *save_ptr, *ptr1, *ptr2, *ptr3;
1166 			ptr3 = save_ptr = STRDUP(Serd_Override);
1167 			while (*ptr3 != '\0') {
1168 				ptr1 = strchr(ptr3, ',');
1169 				*ptr1 = '\0';
1170 				if (strcmp(ptr3, serdclass) == 0) {
1171 					ptr2 =  strchr(ptr1 + 1, ',');
1172 					*ptr2 = '\0';
1173 					nval = atoi(ptr1 + 1);
1174 					out(O_ALTFP, "serd override %s_n %d",
1175 					    serdclass, nval);
1176 					ptr3 =  strchr(ptr2 + 1, ' ');
1177 					if (ptr3)
1178 						*ptr3 = '\0';
1179 					ptr = STRDUP(ptr2 + 1);
1180 					out(O_ALTFP, "serd override %s_t %s",
1181 					    serdclass, ptr);
1182 					got_n_override = 1;
1183 					got_t_override = 1;
1184 					break;
1185 				} else {
1186 					ptr2 =  strchr(ptr1 + 1, ',');
1187 					ptr3 =  strchr(ptr2 + 1, ' ');
1188 					if (ptr3 == NULL)
1189 						break;
1190 				}
1191 				ptr3++;
1192 			}
1193 			FREE(save_ptr);
1194 		}
1195 
1196 		if (cp && got_n_override == 0) {
1197 			/*
1198 			 * convert serd engine class into property name
1199 			 */
1200 			char *prop_name = MALLOC(strlen(serdclass) + 3);
1201 			for (i = 0; i < strlen(serdclass); i++) {
1202 				if (serdclass[i] == '.')
1203 					prop_name[i] = '_';
1204 				else
1205 					prop_name[i] = serdclass[i];
1206 			}
1207 			prop_name[i++] = '_';
1208 			prop_name[i++] = 'n';
1209 			prop_name[i] = '\0';
1210 			if (s = config_getprop(cp, prop_name)) {
1211 				nval = atoi(s);
1212 				out(O_ALTFP, "serd override %s_n %s",
1213 				    serdclass, s);
1214 				got_n_override = 1;
1215 			}
1216 			prop_name[i - 1] = 't';
1217 			if (s = config_getprop(cp, prop_name)) {
1218 				ptr = STRDUP(s);
1219 				out(O_ALTFP, "serd override %s_t %s",
1220 				    serdclass, s);
1221 				got_t_override = 1;
1222 			}
1223 			FREE(prop_name);
1224 		}
1225 
1226 		if (serdn != -1 && got_n_override == 0) {
1227 			nval = serdn;
1228 			out(O_ALTFP, "serd override %s_n %d", serdclass, serdn);
1229 			got_n_override = 1;
1230 		}
1231 		if (serdt != NULL && got_t_override == 0) {
1232 			ptr = STRDUP(serdt);
1233 			out(O_ALTFP, "serd override %s_t %s", serdclass, serdt);
1234 			got_t_override = 1;
1235 		}
1236 
1237 		if (!got_n_override) {
1238 			nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N,
1239 			    NULL);
1240 			ASSERT(nN->t == T_NUM);
1241 			nval = (uint_t)nN->u.ull;
1242 		}
1243 		if (!got_t_override) {
1244 			nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T,
1245 			    NULL);
1246 			ASSERT(nT->t == T_TIMEVAL);
1247 			tval = (hrtime_t)nT->u.ull;
1248 		} else {
1249 			const unsigned long long *ullp;
1250 			const char *suffix;
1251 			int len;
1252 
1253 			len = strspn(ptr, "0123456789");
1254 			suffix = stable(&ptr[len]);
1255 			ullp = (unsigned long long *)lut_lookup(Timesuffixlut,
1256 			    (void *)suffix, NULL);
1257 			ptr[len] = '\0';
1258 			tval = strtoull(ptr, NULL, 0) * (ullp ? *ullp : 1ll);
1259 			FREE(ptr);
1260 		}
1261 		fmd_serd_create(hdl, serdname, nval, tval);
1262 	}
1263 
1264 	newentp = MALLOC(sizeof (*newentp));
1265 	newentp->ename = stable(serdclass);
1266 	FREE(serdclass);
1267 	newentp->ipath = ipath(serdinst->u.stmt.np->u.event.epname);
1268 	newentp->hdl = hdl;
1269 	if (lut_lookup(SerdEngines, newentp, (lut_cmp)serd_cmp) == NULL) {
1270 		SerdEngines = lut_add(SerdEngines, (void *)newentp,
1271 		    (void *)newentp, (lut_cmp)serd_cmp);
1272 		Serd_need_save = 1;
1273 		serd_save();
1274 	} else {
1275 		FREE(newentp);
1276 	}
1277 
1278 
1279 	/*
1280 	 * increment SERD engine.  if engine fires, reset serd
1281 	 * engine and return trip_strcode if required.
1282 	 */
1283 	for (i = 0; i < serdincrement; i++) {
1284 		if (fmd_serd_record(hdl, serdname, ffep)) {
1285 			fmd_case_add_serd(hdl, fmcase, serdname);
1286 			fmd_serd_reset(hdl, serdname);
1287 
1288 			if (ippp) {
1289 				struct node *tripinst =
1290 				    lut_lookup(serdinst->u.stmt.lutp,
1291 				    (void *)L_trip, NULL);
1292 				ASSERT(tripinst != NULL);
1293 				*enamep = tripinst->u.event.ename->u.name.s;
1294 				*ippp = ipath(tripinst->u.event.epname);
1295 				out(O_ALTFP|O_NONL,
1296 				    "[engine fired: %s, sending: ", serdname);
1297 				ipath_print(O_ALTFP|O_NONL, *enamep, *ippp);
1298 				out(O_ALTFP, "]");
1299 			} else {
1300 				out(O_ALTFP, "[engine fired: %s, no trip]",
1301 				    serdname);
1302 			}
1303 			FREE(serdname);
1304 			return (1);
1305 		}
1306 	}
1307 
1308 	FREE(serdname);
1309 	return (0);
1310 }
1311 
1312 /*
1313  * search a suspect list for upsets.  feed each upset to serd_eval() and
1314  * build up tripped[], an array of ereports produced by the firing of
1315  * any SERD engines.  then feed each ereport back into
1316  * fme_receive_report().
1317  *
1318  * returns ntrip, the number of these ereports produced.
1319  */
1320 static int
upsets_eval(struct fme * fmep,fmd_event_t * ffep)1321 upsets_eval(struct fme *fmep, fmd_event_t *ffep)
1322 {
1323 	/* we build an array of tripped ereports that we send ourselves */
1324 	struct {
1325 		const char *ename;
1326 		const struct ipath *ipp;
1327 	} *tripped;
1328 	struct event *sp;
1329 	int ntrip, nupset, i;
1330 
1331 	/*
1332 	 * count the number of upsets to determine the upper limit on
1333 	 * expected trip ereport strings.  remember that one upset can
1334 	 * lead to at most one ereport.
1335 	 */
1336 	nupset = 0;
1337 	for (sp = fmep->suspects; sp; sp = sp->suspects) {
1338 		if (sp->t == N_UPSET)
1339 			nupset++;
1340 	}
1341 
1342 	if (nupset == 0)
1343 		return (0);
1344 
1345 	/*
1346 	 * get to this point if we have upsets and expect some trip
1347 	 * ereports
1348 	 */
1349 	tripped = alloca(sizeof (*tripped) * nupset);
1350 	bzero((void *)tripped, sizeof (*tripped) * nupset);
1351 
1352 	ntrip = 0;
1353 	for (sp = fmep->suspects; sp; sp = sp->suspects)
1354 		if (sp->t == N_UPSET &&
1355 		    serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp,
1356 		    &tripped[ntrip].ename, &tripped[ntrip].ipp) == 1)
1357 			ntrip++;
1358 
1359 	for (i = 0; i < ntrip; i++) {
1360 		struct event *ep, *nep;
1361 		struct fme *nfmep;
1362 		fmd_case_t *fmcase;
1363 		const struct ipath *ipp;
1364 		const char *eventstring;
1365 		int prev_verbose;
1366 		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
1367 		enum fme_state state;
1368 
1369 		/*
1370 		 * First try and evaluate a case with the trip ereport plus
1371 		 * all the other ereports that cause the trip. If that fails
1372 		 * to evaluate then try again with just this ereport on its own.
1373 		 */
1374 		out(O_ALTFP|O_NONL, "fme_receive_report_serd: ");
1375 		ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp);
1376 		out(O_ALTFP|O_STAMP, NULL);
1377 		ep = fmep->e0;
1378 		eventstring = ep->enode->u.event.ename->u.name.s;
1379 		ipp = ep->ipp;
1380 
1381 		/*
1382 		 * create a duplicate fme and case
1383 		 */
1384 		fmcase = fmd_case_open(fmep->hdl, NULL);
1385 		out(O_ALTFP|O_NONL, "duplicate fme for event [");
1386 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1387 		out(O_ALTFP, " ]");
1388 
1389 		if ((nfmep = newfme(eventstring, ipp, fmep->hdl,
1390 		    fmcase, ffep, ep->nvp)) == NULL) {
1391 			out(O_ALTFP|O_NONL, "[");
1392 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1393 			out(O_ALTFP, " CANNOT DIAGNOSE]");
1394 			continue;
1395 		}
1396 
1397 		Open_fme_count++;
1398 		nfmep->pull = fmep->pull;
1399 		init_fme_bufs(nfmep);
1400 		out(O_ALTFP|O_NONL, "[");
1401 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1402 		out(O_ALTFP, " created FME%d, case %s]", nfmep->id,
1403 		    fmd_case_uuid(nfmep->hdl, nfmep->fmcase));
1404 		if (ffep) {
1405 			fmd_case_setprincipal(nfmep->hdl, nfmep->fmcase, ffep);
1406 			fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, ffep);
1407 			nfmep->e0r = ffep;
1408 		}
1409 
1410 		/*
1411 		 * add the original ereports
1412 		 */
1413 		for (ep = fmep->observations; ep; ep = ep->observations) {
1414 			eventstring = ep->enode->u.event.ename->u.name.s;
1415 			ipp = ep->ipp;
1416 			out(O_ALTFP|O_NONL, "adding event [");
1417 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1418 			out(O_ALTFP, " ]");
1419 			nep = itree_lookup(nfmep->eventtree, eventstring, ipp);
1420 			if (nep->count++ == 0) {
1421 				nep->observations = nfmep->observations;
1422 				nfmep->observations = nep;
1423 				serialize_observation(nfmep, eventstring, ipp);
1424 				nep->nvp = evnv_dupnvl(ep->nvp);
1425 			}
1426 			if (ep->ffep && ep->ffep != ffep)
1427 				fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase,
1428 				    ep->ffep);
1429 			stats_counter_bump(nfmep->Rcount);
1430 		}
1431 
1432 		/*
1433 		 * add the serd trigger ereport
1434 		 */
1435 		if ((ep = itree_lookup(nfmep->eventtree, tripped[i].ename,
1436 		    tripped[i].ipp)) == NULL) {
1437 			/*
1438 			 * The trigger ereport is not in the instance tree. It
1439 			 * was presumably removed by prune_propagations() as
1440 			 * this combination of events is not present in the
1441 			 * rules.
1442 			 */
1443 			out(O_ALTFP, "upsets_eval: e0 not in instance tree");
1444 			Undiag_reason = UD_VAL_BADEVENTI;
1445 			goto retry_lone_ereport;
1446 		}
1447 		out(O_ALTFP|O_NONL, "adding event [");
1448 		ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp);
1449 		out(O_ALTFP, " ]");
1450 		nfmep->ecurrent = ep;
1451 		ep->nvp = NULL;
1452 		ep->count = 1;
1453 		ep->observations = nfmep->observations;
1454 		nfmep->observations = ep;
1455 
1456 		/*
1457 		 * just peek first.
1458 		 */
1459 		nfmep->peek = 1;
1460 		prev_verbose = Verbose;
1461 		if (Debug == 0)
1462 			Verbose = 0;
1463 		lut_walk(nfmep->eventtree, (lut_cb)clear_arrows, (void *)nfmep);
1464 		state = hypothesise(nfmep, nfmep->e0, nfmep->ull, &my_delay);
1465 		nfmep->peek = 0;
1466 		Verbose = prev_verbose;
1467 		if (state == FME_DISPROVED) {
1468 			out(O_ALTFP, "upsets_eval: hypothesis disproved");
1469 			Undiag_reason = UD_VAL_UNSOLVD;
1470 retry_lone_ereport:
1471 			/*
1472 			 * However the trigger ereport on its own might be
1473 			 * diagnosable, so check for that. Undo the new fme
1474 			 * and case we just created and call fme_receive_report.
1475 			 */
1476 			out(O_ALTFP|O_NONL, "[");
1477 			ipath_print(O_ALTFP|O_NONL, tripped[i].ename,
1478 			    tripped[i].ipp);
1479 			out(O_ALTFP, " retrying with just trigger ereport]");
1480 			itree_free(nfmep->eventtree);
1481 			nfmep->eventtree = NULL;
1482 			structconfig_free(nfmep->config);
1483 			nfmep->config = NULL;
1484 			destroy_fme_bufs(nfmep);
1485 			fmd_case_close(nfmep->hdl, nfmep->fmcase);
1486 			fme_receive_report(fmep->hdl, ffep,
1487 			    tripped[i].ename, tripped[i].ipp, NULL);
1488 			continue;
1489 		}
1490 
1491 		/*
1492 		 * and evaluate
1493 		 */
1494 		serialize_observation(nfmep, tripped[i].ename, tripped[i].ipp);
1495 		fme_eval(nfmep, ffep);
1496 	}
1497 
1498 	return (ntrip);
1499 }
1500 
1501 /*
1502  * fme_receive_external_report -- call when an external ereport comes in
1503  *
1504  * this routine just converts the relevant information from the ereport
1505  * into a format used internally and passes it on to fme_receive_report().
1506  */
1507 void
fme_receive_external_report(fmd_hdl_t * hdl,fmd_event_t * ffep,nvlist_t * nvl,const char * class)1508 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
1509     const char *class)
1510 {
1511 	struct node		*epnamenp;
1512 	fmd_case_t		*fmcase;
1513 	const struct ipath	*ipp;
1514 	nvlist_t		*detector = NULL;
1515 
1516 	class = stable(class);
1517 
1518 	/* Get the component path from the ereport */
1519 	epnamenp = platform_getpath(nvl);
1520 
1521 	/* See if we ended up without a path. */
1522 	if (epnamenp == NULL) {
1523 		/* See if class permits silent discard on unknown component. */
1524 		if (lut_lookup(Ereportenames_discard, (void *)class, NULL)) {
1525 			out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport "
1526 			    "to component path, but silent discard allowed.",
1527 			    class);
1528 		} else {
1529 			/*
1530 			 * XFILE: Failure to find a component is bad unless
1531 			 * 'discard_if_config_unknown=1' was specified in the
1532 			 * ereport definition. Indicate undiagnosable.
1533 			 */
1534 			Undiag_reason = UD_VAL_NOPATH;
1535 			fmcase = fmd_case_open(hdl, NULL);
1536 
1537 			/*
1538 			 * We don't have a component path here (which means that
1539 			 * the detector was not in hc-scheme and couldn't be
1540 			 * converted to hc-scheme. Report the raw detector as
1541 			 * the suspect resource if there is one.
1542 			 */
1543 			(void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR,
1544 			    &detector);
1545 			publish_undiagnosable(hdl, ffep, fmcase, detector,
1546 			    (char *)class);
1547 		}
1548 		return;
1549 	}
1550 
1551 	ipp = ipath(epnamenp);
1552 	tree_free(epnamenp);
1553 	fme_receive_report(hdl, ffep, class, ipp, nvl);
1554 }
1555 
1556 /*ARGSUSED*/
1557 void
fme_receive_repair_list(fmd_hdl_t * hdl,fmd_event_t * ffep,nvlist_t * nvl,const char * eventstring)1558 fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
1559     const char *eventstring)
1560 {
1561 	char *uuid;
1562 	nvlist_t **nva;
1563 	uint_t nvc;
1564 	const struct ipath *ipp;
1565 
1566 	if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 ||
1567 	    nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
1568 	    &nva, &nvc) != 0) {
1569 		out(O_ALTFP, "No uuid or fault list for list.repaired event");
1570 		return;
1571 	}
1572 
1573 	out(O_ALTFP, "Processing list.repaired from case %s", uuid);
1574 
1575 	while (nvc-- != 0) {
1576 		/*
1577 		 * Reset any istat or serd engine associated with this path.
1578 		 */
1579 		char *path;
1580 
1581 		if ((ipp = platform_fault2ipath(*nva++)) == NULL)
1582 			continue;
1583 
1584 		path = ipath2str(NULL, ipp);
1585 		out(O_ALTFP, "fme_receive_repair_list: resetting state for %s",
1586 		    path);
1587 		FREE(path);
1588 
1589 		lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp);
1590 		istat_save();
1591 
1592 		lut_walk(SerdEngines, (lut_cb)serd_reset_cb, (void *)ipp);
1593 		serd_save();
1594 	}
1595 }
1596 
1597 /*ARGSUSED*/
1598 void
fme_receive_topology_change(void)1599 fme_receive_topology_change(void)
1600 {
1601 	lut_walk(Istats, (lut_cb)istat_counter_topo_chg_cb, NULL);
1602 	istat_save();
1603 
1604 	lut_walk(SerdEngines, (lut_cb)serd_topo_chg_cb, NULL);
1605 	serd_save();
1606 }
1607 
1608 static int mark_arrows(struct fme *fmep, struct event *ep, int mark,
1609     unsigned long long at_latest_by, unsigned long long *pdelay, int keep);
1610 
1611 /* ARGSUSED */
1612 static void
clear_arrows(struct event * ep,struct event * ep2,struct fme * fmep)1613 clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
1614 {
1615 	struct bubble *bp;
1616 	struct arrowlist *ap;
1617 
1618 	ep->cached_state = 0;
1619 	ep->keep_in_tree = 0;
1620 	for (bp = itree_next_bubble(ep, NULL); bp;
1621 	    bp = itree_next_bubble(ep, bp)) {
1622 		if (bp->t != B_FROM)
1623 			continue;
1624 		bp->mark = 0;
1625 		for (ap = itree_next_arrow(bp, NULL); ap;
1626 		    ap = itree_next_arrow(bp, ap))
1627 			ap->arrowp->mark = 0;
1628 	}
1629 }
1630 
1631 static void
fme_receive_report(fmd_hdl_t * hdl,fmd_event_t * ffep,const char * eventstring,const struct ipath * ipp,nvlist_t * nvl)1632 fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
1633     const char *eventstring, const struct ipath *ipp, nvlist_t *nvl)
1634 {
1635 	struct event *ep;
1636 	struct fme *fmep = NULL;
1637 	struct fme *ofmep = NULL;
1638 	struct fme *cfmep, *svfmep;
1639 	int matched = 0;
1640 	nvlist_t *defect;
1641 	fmd_case_t *fmcase;
1642 	char *reason;
1643 
1644 	out(O_ALTFP|O_NONL, "fme_receive_report: ");
1645 	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1646 	out(O_ALTFP|O_STAMP, NULL);
1647 
1648 	/* decide which FME it goes to */
1649 	for (fmep = FMElist; fmep; fmep = fmep->next) {
1650 		int prev_verbose;
1651 		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
1652 		enum fme_state state;
1653 		nvlist_t *pre_peek_nvp = NULL;
1654 
1655 		if (fmep->overflow) {
1656 			if (!(fmd_case_closed(fmep->hdl, fmep->fmcase)))
1657 				ofmep = fmep;
1658 
1659 			continue;
1660 		}
1661 
1662 		/*
1663 		 * ignore solved or closed cases
1664 		 */
1665 		if (fmep->posted_suspects ||
1666 		    fmd_case_solved(fmep->hdl, fmep->fmcase) ||
1667 		    fmd_case_closed(fmep->hdl, fmep->fmcase))
1668 			continue;
1669 
1670 		/* look up event in event tree for this FME */
1671 		if ((ep = itree_lookup(fmep->eventtree,
1672 		    eventstring, ipp)) == NULL)
1673 			continue;
1674 
1675 		/* note observation */
1676 		fmep->ecurrent = ep;
1677 		if (ep->count++ == 0) {
1678 			/* link it into list of observations seen */
1679 			ep->observations = fmep->observations;
1680 			fmep->observations = ep;
1681 			ep->nvp = evnv_dupnvl(nvl);
1682 		} else {
1683 			/* use new payload values for peek */
1684 			pre_peek_nvp = ep->nvp;
1685 			ep->nvp = evnv_dupnvl(nvl);
1686 		}
1687 
1688 		/* tell hypothesise() not to mess with suspect list */
1689 		fmep->peek = 1;
1690 
1691 		/* don't want this to be verbose (unless Debug is set) */
1692 		prev_verbose = Verbose;
1693 		if (Debug == 0)
1694 			Verbose = 0;
1695 
1696 		lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
1697 		state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);
1698 
1699 		fmep->peek = 0;
1700 
1701 		/* put verbose flag back */
1702 		Verbose = prev_verbose;
1703 
1704 		if (state != FME_DISPROVED) {
1705 			/* found an FME that explains the ereport */
1706 			matched++;
1707 			out(O_ALTFP|O_NONL, "[");
1708 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1709 			out(O_ALTFP, " explained by FME%d]", fmep->id);
1710 
1711 			nvlist_free(pre_peek_nvp);
1712 
1713 			if (ep->count == 1)
1714 				serialize_observation(fmep, eventstring, ipp);
1715 
1716 			if (ffep) {
1717 				fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1718 				ep->ffep = ffep;
1719 			}
1720 
1721 			stats_counter_bump(fmep->Rcount);
1722 
1723 			/* re-eval FME */
1724 			fme_eval(fmep, ffep);
1725 		} else {
1726 
1727 			/* not a match, undo noting of observation */
1728 			fmep->ecurrent = NULL;
1729 			if (--ep->count == 0) {
1730 				/* unlink it from observations */
1731 				fmep->observations = ep->observations;
1732 				ep->observations = NULL;
1733 				nvlist_free(ep->nvp);
1734 				ep->nvp = NULL;
1735 			} else {
1736 				nvlist_free(ep->nvp);
1737 				ep->nvp = pre_peek_nvp;
1738 			}
1739 		}
1740 	}
1741 
1742 	if (matched)
1743 		return;	/* explained by at least one existing FME */
1744 
1745 	/* clean up closed fmes */
1746 	cfmep = ClosedFMEs;
1747 	while (cfmep != NULL) {
1748 		svfmep = cfmep->next;
1749 		destroy_fme(cfmep);
1750 		cfmep = svfmep;
1751 	}
1752 	ClosedFMEs = NULL;
1753 
1754 	if (ofmep) {
1755 		out(O_ALTFP|O_NONL, "[");
1756 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1757 		out(O_ALTFP, " ADDING TO OVERFLOW FME]");
1758 		if (ffep)
1759 			fmd_case_add_ereport(hdl, ofmep->fmcase, ffep);
1760 
1761 		return;
1762 
1763 	} else if (Max_fme && (Open_fme_count >= Max_fme)) {
1764 		out(O_ALTFP|O_NONL, "[");
1765 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1766 		out(O_ALTFP, " MAX OPEN FME REACHED]");
1767 
1768 		fmcase = fmd_case_open(hdl, NULL);
1769 
1770 		/* Create overflow fme */
1771 		if ((fmep = newfme(eventstring, ipp, hdl, fmcase, ffep,
1772 		    nvl)) == NULL) {
1773 			out(O_ALTFP|O_NONL, "[");
1774 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1775 			out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]");
1776 			return;
1777 		}
1778 
1779 		Open_fme_count++;
1780 
1781 		init_fme_bufs(fmep);
1782 		fmep->overflow = B_TRUE;
1783 
1784 		if (ffep)
1785 			fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1786 
1787 		Undiag_reason = UD_VAL_MAXFME;
1788 		defect = fmd_nvl_create_fault(hdl,
1789 		    undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL);
1790 		reason = undiag_2reason_str(Undiag_reason, NULL);
1791 		(void) nvlist_add_string(defect, UNDIAG_REASON, reason);
1792 		FREE(reason);
1793 		fmd_case_add_suspect(hdl, fmep->fmcase, defect);
1794 		fmd_case_solve(hdl, fmep->fmcase);
1795 		Undiag_reason = UD_VAL_UNKNOWN;
1796 		return;
1797 	}
1798 
1799 	/* open a case */
1800 	fmcase = fmd_case_open(hdl, NULL);
1801 
1802 	/* start a new FME */
1803 	if ((fmep = newfme(eventstring, ipp, hdl, fmcase, ffep, nvl)) == NULL) {
1804 		out(O_ALTFP|O_NONL, "[");
1805 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1806 		out(O_ALTFP, " CANNOT DIAGNOSE]");
1807 		return;
1808 	}
1809 
1810 	Open_fme_count++;
1811 
1812 	init_fme_bufs(fmep);
1813 
1814 	out(O_ALTFP|O_NONL, "[");
1815 	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1816 	out(O_ALTFP, " created FME%d, case %s]", fmep->id,
1817 	    fmd_case_uuid(hdl, fmep->fmcase));
1818 
1819 	ep = fmep->e0;
1820 	ASSERT(ep != NULL);
1821 
1822 	/* note observation */
1823 	fmep->ecurrent = ep;
1824 	if (ep->count++ == 0) {
1825 		/* link it into list of observations seen */
1826 		ep->observations = fmep->observations;
1827 		fmep->observations = ep;
1828 		ep->nvp = evnv_dupnvl(nvl);
1829 		serialize_observation(fmep, eventstring, ipp);
1830 	} else {
1831 		/* new payload overrides any previous */
1832 		nvlist_free(ep->nvp);
1833 		ep->nvp = evnv_dupnvl(nvl);
1834 	}
1835 
1836 	stats_counter_bump(fmep->Rcount);
1837 
1838 	if (ffep) {
1839 		fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1840 		fmd_case_setprincipal(hdl, fmep->fmcase, ffep);
1841 		fmep->e0r = ffep;
1842 		ep->ffep = ffep;
1843 	}
1844 
1845 	/* give the diagnosis algorithm a shot at the new FME state */
1846 	fme_eval(fmep, ffep);
1847 }
1848 
1849 void
fme_status(int flags)1850 fme_status(int flags)
1851 {
1852 	struct fme *fmep;
1853 
1854 	if (FMElist == NULL) {
1855 		out(flags, "No fault management exercises underway.");
1856 		return;
1857 	}
1858 
1859 	for (fmep = FMElist; fmep; fmep = fmep->next)
1860 		fme_print(flags, fmep);
1861 }
1862 
1863 /*
1864  * "indent" routines used mostly for nicely formatted debug output, but also
1865  * for sanity checking for infinite recursion bugs.
1866  */
1867 
1868 #define	MAX_INDENT 1024
1869 static const char *indent_s[MAX_INDENT];
1870 static int current_indent;
1871 
1872 static void
indent_push(const char * s)1873 indent_push(const char *s)
1874 {
1875 	if (current_indent < MAX_INDENT)
1876 		indent_s[current_indent++] = s;
1877 	else
1878 		out(O_DIE, "unexpected recursion depth (%d)", current_indent);
1879 }
1880 
1881 static void
indent_set(const char * s)1882 indent_set(const char *s)
1883 {
1884 	current_indent = 0;
1885 	indent_push(s);
1886 }
1887 
1888 static void
indent_pop(void)1889 indent_pop(void)
1890 {
1891 	if (current_indent > 0)
1892 		current_indent--;
1893 	else
1894 		out(O_DIE, "recursion underflow");
1895 }
1896 
1897 static void
indent(void)1898 indent(void)
1899 {
1900 	int i;
1901 	if (!Verbose)
1902 		return;
1903 	for (i = 0; i < current_indent; i++)
1904 		out(O_ALTFP|O_VERB|O_NONL, indent_s[i]);
1905 }
1906 
1907 #define	SLNEW		1
1908 #define	SLCHANGED	2
1909 #define	SLWAIT		3
1910 #define	SLDISPROVED	4
1911 
1912 static void
print_suspects(int circumstance,struct fme * fmep)1913 print_suspects(int circumstance, struct fme *fmep)
1914 {
1915 	struct event *ep;
1916 
1917 	out(O_ALTFP|O_NONL, "[");
1918 	if (circumstance == SLCHANGED) {
1919 		out(O_ALTFP|O_NONL, "FME%d diagnosis changed. state: %s, "
1920 		    "suspect list:", fmep->id, fme_state2str(fmep->state));
1921 	} else if (circumstance == SLWAIT) {
1922 		out(O_ALTFP|O_NONL, "FME%d set wait timer %ld ", fmep->id,
1923 		    fmep->timer);
1924 		ptree_timeval(O_ALTFP|O_NONL, &fmep->wull);
1925 	} else if (circumstance == SLDISPROVED) {
1926 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id);
1927 	} else {
1928 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id);
1929 	}
1930 
1931 	if (circumstance == SLWAIT || circumstance == SLDISPROVED) {
1932 		out(O_ALTFP, "]");
1933 		return;
1934 	}
1935 
1936 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
1937 		out(O_ALTFP|O_NONL, " ");
1938 		itree_pevent_brief(O_ALTFP|O_NONL, ep);
1939 	}
1940 	out(O_ALTFP, "]");
1941 }
1942 
1943 static struct node *
eventprop_lookup(struct event * ep,const char * propname)1944 eventprop_lookup(struct event *ep, const char *propname)
1945 {
1946 	return (lut_lookup(ep->props, (void *)propname, NULL));
1947 }
1948 
1949 #define	MAXDIGITIDX	23
1950 static char numbuf[MAXDIGITIDX + 1];
1951 
1952 static int
node2uint(struct node * n,uint_t * valp)1953 node2uint(struct node *n, uint_t *valp)
1954 {
1955 	struct evalue value;
1956 	struct lut *globals = NULL;
1957 
1958 	if (n == NULL)
1959 		return (1);
1960 
1961 	/*
1962 	 * check value.v since we are being asked to convert an unsigned
1963 	 * long long int to an unsigned int
1964 	 */
1965 	if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) ||
1966 	    value.t != UINT64 || value.v > (1ULL << 32))
1967 		return (1);
1968 
1969 	*valp = (uint_t)value.v;
1970 
1971 	return (0);
1972 }
1973 
1974 static nvlist_t *
node2fmri(struct node * n)1975 node2fmri(struct node *n)
1976 {
1977 	nvlist_t **pa, *f, *p;
1978 	struct node *nc;
1979 	uint_t depth = 0;
1980 	char *numstr, *nullbyte;
1981 	char *failure;
1982 	int err, i;
1983 
1984 	/* XXX do we need to be able to handle a non-T_NAME node? */
1985 	if (n == NULL || n->t != T_NAME)
1986 		return (NULL);
1987 
1988 	for (nc = n; nc != NULL; nc = nc->u.name.next) {
1989 		if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM)
1990 			break;
1991 		depth++;
1992 	}
1993 
1994 	if (nc != NULL) {
1995 		/* We bailed early, something went wrong */
1996 		return (NULL);
1997 	}
1998 
1999 	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
2000 		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
2001 	pa = alloca(depth * sizeof (nvlist_t *));
2002 	for (i = 0; i < depth; i++)
2003 		pa[i] = NULL;
2004 
2005 	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
2006 	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
2007 	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
2008 	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
2009 	if (err != 0) {
2010 		failure = "basic construction of FMRI failed";
2011 		goto boom;
2012 	}
2013 
2014 	numbuf[MAXDIGITIDX] = '\0';
2015 	nullbyte = &numbuf[MAXDIGITIDX];
2016 	i = 0;
2017 
2018 	for (nc = n; nc != NULL; nc = nc->u.name.next) {
2019 		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
2020 		if (err != 0) {
2021 			failure = "alloc of an hc-pair failed";
2022 			goto boom;
2023 		}
2024 		err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s);
2025 		numstr = ulltostr(nc->u.name.child->u.ull, nullbyte);
2026 		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
2027 		if (err != 0) {
2028 			failure = "construction of an hc-pair failed";
2029 			goto boom;
2030 		}
2031 		pa[i++] = p;
2032 	}
2033 
2034 	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
2035 	if (err == 0) {
2036 		for (i = 0; i < depth; i++)
2037 			nvlist_free(pa[i]);
2038 		return (f);
2039 	}
2040 	failure = "addition of hc-pair array to FMRI failed";
2041 
2042 boom:
2043 	for (i = 0; i < depth; i++)
2044 		nvlist_free(pa[i]);
2045 	nvlist_free(f);
2046 	out(O_DIE, "%s", failure);
2047 	/*NOTREACHED*/
2048 	return (NULL);
2049 }
2050 
2051 /* an ipath cache entry is an array of these, with s==NULL at the end */
2052 struct ipath {
2053 	const char *s;	/* component name (in stable) */
2054 	int i;		/* instance number */
2055 };
2056 
2057 static nvlist_t *
ipath2fmri(struct ipath * ipath)2058 ipath2fmri(struct ipath *ipath)
2059 {
2060 	nvlist_t **pa, *f, *p;
2061 	uint_t depth = 0;
2062 	char *numstr, *nullbyte;
2063 	char *failure;
2064 	int err, i;
2065 	struct ipath *ipp;
2066 
2067 	for (ipp = ipath; ipp->s != NULL; ipp++)
2068 		depth++;
2069 
2070 	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
2071 		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
2072 	pa = alloca(depth * sizeof (nvlist_t *));
2073 	for (i = 0; i < depth; i++)
2074 		pa[i] = NULL;
2075 
2076 	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
2077 	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
2078 	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
2079 	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
2080 	if (err != 0) {
2081 		failure = "basic construction of FMRI failed";
2082 		goto boom;
2083 	}
2084 
2085 	numbuf[MAXDIGITIDX] = '\0';
2086 	nullbyte = &numbuf[MAXDIGITIDX];
2087 	i = 0;
2088 
2089 	for (ipp = ipath; ipp->s != NULL; ipp++) {
2090 		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
2091 		if (err != 0) {
2092 			failure = "alloc of an hc-pair failed";
2093 			goto boom;
2094 		}
2095 		err = nvlist_add_string(p, FM_FMRI_HC_NAME, ipp->s);
2096 		numstr = ulltostr(ipp->i, nullbyte);
2097 		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
2098 		if (err != 0) {
2099 			failure = "construction of an hc-pair failed";
2100 			goto boom;
2101 		}
2102 		pa[i++] = p;
2103 	}
2104 
2105 	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
2106 	if (err == 0) {
2107 		for (i = 0; i < depth; i++)
2108 			nvlist_free(pa[i]);
2109 		return (f);
2110 	}
2111 	failure = "addition of hc-pair array to FMRI failed";
2112 
2113 boom:
2114 	for (i = 0; i < depth; i++)
2115 		nvlist_free(pa[i]);
2116 	nvlist_free(f);
2117 	out(O_DIE, "%s", failure);
2118 	/*NOTREACHED*/
2119 	return (NULL);
2120 }
2121 
/*
 * percentof -- return part as a percentage of whole, rounded to the
 *	nearest integer (a fractional part of .5 or more rounds up).
 *
 * The intermediate is computed in tenths of a percent.  part is widened
 * before scaling so the multiplication cannot wrap in 32 bits.  A zero
 * whole yields 0 instead of dividing by zero.  The result is narrowed
 * to uint8_t, as the return type dictates.
 */
static uint8_t
percentof(uint_t part, uint_t whole)
{
	unsigned long long tenths;

	if (whole == 0)
		return (0);

	/* widen first: part * 1000 in uint_t wraps once part > ~4.29M */
	tenths = ((unsigned long long)part * 1000) / whole;

	return ((uint8_t)((tenths / 10) + ((tenths % 10 >= 5) ? 1 : 0)));
}
2129 
2130 struct rsl {
2131 	struct event *suspect;
2132 	nvlist_t *asru;
2133 	nvlist_t *fru;
2134 	nvlist_t *rsrc;
2135 };
2136 
2137 static void publish_suspects(struct fme *fmep, struct rsl *srl);
2138 
2139 /*
2140  *  rslfree -- free internal members of struct rsl not expected to be
2141  *	freed elsewhere.
2142  */
2143 static void
rslfree(struct rsl * freeme)2144 rslfree(struct rsl *freeme)
2145 {
2146 	nvlist_free(freeme->asru);
2147 	nvlist_free(freeme->fru);
2148 	if (freeme->rsrc != freeme->asru)
2149 		nvlist_free(freeme->rsrc);
2150 }
2151 
2152 /*
2153  *  rslcmp -- compare two rsl structures.  Use the following
2154  *	comparisons to establish cardinality:
2155  *
2156  *	1. Name of the suspect's class. (simple strcmp)
2157  *	2. Name of the suspect's ASRU. (trickier, since nvlist)
2158  *
2159  */
2160 static int
rslcmp(const void * a,const void * b)2161 rslcmp(const void *a, const void *b)
2162 {
2163 	struct rsl *r1 = (struct rsl *)a;
2164 	struct rsl *r2 = (struct rsl *)b;
2165 	int rv;
2166 
2167 	rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s,
2168 	    r2->suspect->enode->u.event.ename->u.name.s);
2169 	if (rv != 0)
2170 		return (rv);
2171 
2172 	if (r1->rsrc == NULL && r2->rsrc == NULL)
2173 		return (0);
2174 	if (r1->rsrc == NULL)
2175 		return (-1);
2176 	if (r2->rsrc == NULL)
2177 		return (1);
2178 	return (evnv_cmpnvl(r1->rsrc, r2->rsrc, 0));
2179 }
2180 
2181 /*
2182  * get_resources -- for a given suspect, determine what ASRU, FRU and
2183  *     RSRC nvlists should be advertised in the final suspect list.
2184  */
2185 void
get_resources(struct event * sp,struct rsl * rsrcs,struct config * croot)2186 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot)
2187 {
2188 	struct node *asrudef, *frudef;
2189 	const struct ipath *asrupath, *frupath;
2190 	nvlist_t *asru = NULL, *fru = NULL;
2191 	nvlist_t *rsrc = NULL;
2192 	char *pathstr;
2193 
2194 	/*
2195 	 * First find any ASRU and/or FRU defined in the
2196 	 * initial fault tree.
2197 	 */
2198 	asrudef = eventprop_lookup(sp, L_ASRU);
2199 	frudef = eventprop_lookup(sp, L_FRU);
2200 
2201 	/*
2202 	 * Create ipaths based on those definitions
2203 	 */
2204 	asrupath = ipath(asrudef);
2205 	frupath = ipath(frudef);
2206 
2207 	/*
2208 	 *  Allow for platform translations of the FMRIs
2209 	 */
2210 	pathstr = ipath2str(NULL, sp->ipp);
2211 	platform_unit_translate(is_defect(sp->t), croot, TOPO_PROP_RESOURCE,
2212 	    &rsrc, pathstr);
2213 	FREE(pathstr);
2214 
2215 	pathstr = ipath2str(NULL, asrupath);
2216 	platform_unit_translate(is_defect(sp->t), croot, TOPO_PROP_ASRU,
2217 	    &asru, pathstr);
2218 	FREE(pathstr);
2219 
2220 	pathstr = ipath2str(NULL, frupath);
2221 	platform_unit_translate(is_defect(sp->t), croot, TOPO_PROP_FRU,
2222 	    &fru, pathstr);
2223 	FREE(pathstr);
2224 
2225 	rsrcs->suspect = sp;
2226 	rsrcs->asru = asru;
2227 	rsrcs->fru = fru;
2228 	rsrcs->rsrc = rsrc;
2229 }
2230 
2231 /*
2232  * trim_suspects -- prior to publishing, we may need to remove some
2233  *    suspects from the list.  If we're auto-closing upsets, we don't
2234  *    want any of those in the published list.  If the ASRUs for multiple
2235  *    defects resolve to the same ASRU (driver) we only want to publish
2236  *    that as a single suspect.
2237  */
2238 static int
trim_suspects(struct fme * fmep,struct rsl * begin,struct rsl * begin2,fmd_event_t * ffep)2239 trim_suspects(struct fme *fmep, struct rsl *begin, struct rsl *begin2,
2240     fmd_event_t *ffep)
2241 {
2242 	struct event *ep;
2243 	struct rsl *rp = begin;
2244 	struct rsl *rp2 = begin2;
2245 	int mess_zero_count = 0;
2246 	int serd_rval;
2247 	uint_t messval;
2248 
2249 	/* remove any unwanted upsets and populate our array */
2250 	for (ep = fmep->psuspects; ep; ep = ep->psuspects) {
2251 		if (is_upset(ep->t))
2252 			continue;
2253 		serd_rval = serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, ep,
2254 		    NULL, NULL);
2255 		if (serd_rval == 0)
2256 			continue;
2257 		if (node2uint(eventprop_lookup(ep, L_message),
2258 		    &messval) == 0 && messval == 0) {
2259 			get_resources(ep, rp2, fmep->config);
2260 			rp2++;
2261 			mess_zero_count++;
2262 		} else {
2263 			get_resources(ep, rp, fmep->config);
2264 			rp++;
2265 			fmep->nsuspects++;
2266 		}
2267 	}
2268 	return (mess_zero_count);
2269 }
2270 
2271 /*
2272  * addpayloadprop -- add a payload prop to a problem
2273  */
2274 static void
addpayloadprop(const char * lhs,struct evalue * rhs,nvlist_t * fault)2275 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault)
2276 {
2277 	nvlist_t *rsrc, *hcs;
2278 
2279 	ASSERT(fault != NULL);
2280 	ASSERT(lhs != NULL);
2281 	ASSERT(rhs != NULL);
2282 
2283 	if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, &rsrc) != 0)
2284 		out(O_DIE, "cannot add payloadprop \"%s\" to fault", lhs);
2285 
2286 	if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) {
2287 		out(O_ALTFP|O_VERB2, "addpayloadprop: create hc_specific");
2288 		if (nvlist_xalloc(&hcs, NV_UNIQUE_NAME, &Eft_nv_hdl) != 0)
2289 			out(O_DIE,
2290 			    "cannot add payloadprop \"%s\" to fault", lhs);
2291 		if (nvlist_add_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, hcs) != 0)
2292 			out(O_DIE,
2293 			    "cannot add payloadprop \"%s\" to fault", lhs);
2294 		nvlist_free(hcs);
2295 		if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0)
2296 			out(O_DIE,
2297 			    "cannot add payloadprop \"%s\" to fault", lhs);
2298 	} else
2299 		out(O_ALTFP|O_VERB2, "addpayloadprop: reuse hc_specific");
2300 
2301 	if (rhs->t == UINT64) {
2302 		out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v);
2303 
2304 		if (nvlist_add_uint64(hcs, lhs, rhs->v) != 0)
2305 			out(O_DIE,
2306 			    "cannot add payloadprop \"%s\" to fault", lhs);
2307 	} else {
2308 		out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"",
2309 		    lhs, (char *)(uintptr_t)rhs->v);
2310 
2311 		if (nvlist_add_string(hcs, lhs, (char *)(uintptr_t)rhs->v) != 0)
2312 			out(O_DIE,
2313 			    "cannot add payloadprop \"%s\" to fault", lhs);
2314 	}
2315 }
2316 
2317 static char *Istatbuf;
2318 static char *Istatbufptr;
2319 static int Istatsz;
2320 
2321 /*
2322  * istataddsize -- calculate size of istat and add it to Istatsz
2323  */
2324 /*ARGSUSED2*/
2325 static void
istataddsize(const struct istat_entry * lhs,struct stats * rhs,void * arg)2326 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg)
2327 {
2328 	int val;
2329 
2330 	ASSERT(lhs != NULL);
2331 	ASSERT(rhs != NULL);
2332 
2333 	if ((val = stats_counter_value(rhs)) == 0)
2334 		return;	/* skip zero-valued stats */
2335 
2336 	/* count up the size of the stat name */
2337 	Istatsz += ipath2strlen(lhs->ename, lhs->ipath);
2338 	Istatsz++;	/* for the trailing NULL byte */
2339 
2340 	/* count up the size of the stat value */
2341 	Istatsz += snprintf(NULL, 0, "%d", val);
2342 	Istatsz++;	/* for the trailing NULL byte */
2343 }
2344 
2345 /*
2346  * istat2str -- serialize an istat, writing result to *Istatbufptr
2347  */
2348 /*ARGSUSED2*/
2349 static void
istat2str(const struct istat_entry * lhs,struct stats * rhs,void * arg)2350 istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg)
2351 {
2352 	char *str;
2353 	int len;
2354 	int val;
2355 
2356 	ASSERT(lhs != NULL);
2357 	ASSERT(rhs != NULL);
2358 
2359 	if ((val = stats_counter_value(rhs)) == 0)
2360 		return;	/* skip zero-valued stats */
2361 
2362 	/* serialize the stat name */
2363 	str = ipath2str(lhs->ename, lhs->ipath);
2364 	len = strlen(str);
2365 
2366 	ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]);
2367 	(void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr);
2368 	Istatbufptr += len;
2369 	FREE(str);
2370 	*Istatbufptr++ = '\0';
2371 
2372 	/* serialize the stat value */
2373 	Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr,
2374 	    "%d", val);
2375 	*Istatbufptr++ = '\0';
2376 
2377 	ASSERT(Istatbufptr <= &Istatbuf[Istatsz]);
2378 }
2379 
2380 void
istat_save()2381 istat_save()
2382 {
2383 	if (Istat_need_save == 0)
2384 		return;
2385 
2386 	/* figure out how big the serialzed info is */
2387 	Istatsz = 0;
2388 	lut_walk(Istats, (lut_cb)istataddsize, NULL);
2389 
2390 	if (Istatsz == 0) {
2391 		/* no stats to save */
2392 		fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS);
2393 		return;
2394 	}
2395 
2396 	/* create the serialized buffer */
2397 	Istatbufptr = Istatbuf = MALLOC(Istatsz);
2398 	lut_walk(Istats, (lut_cb)istat2str, NULL);
2399 
2400 	/* clear out current saved stats */
2401 	fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS);
2402 
2403 	/* write out the new version */
2404 	fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz);
2405 	FREE(Istatbuf);
2406 
2407 	Istat_need_save = 0;
2408 }
2409 
2410 int
istat_cmp(struct istat_entry * ent1,struct istat_entry * ent2)2411 istat_cmp(struct istat_entry *ent1, struct istat_entry *ent2)
2412 {
2413 	if (ent1->ename != ent2->ename)
2414 		return (ent2->ename - ent1->ename);
2415 	if (ent1->ipath != ent2->ipath)
2416 		return ((char *)ent2->ipath - (char *)ent1->ipath);
2417 
2418 	return (0);
2419 }
2420 
2421 /*
2422  * istat-verify -- verify the component associated with a stat still exists
2423  *
2424  * if the component no longer exists, this routine resets the stat and
2425  * returns 0.  if the component still exists, it returns 1.
2426  */
2427 static int
istat_verify(struct node * snp,struct istat_entry * entp)2428 istat_verify(struct node *snp, struct istat_entry *entp)
2429 {
2430 	struct stats *statp;
2431 	nvlist_t *fmri;
2432 
2433 	fmri = node2fmri(snp->u.event.epname);
2434 	if (platform_path_exists(fmri)) {
2435 		nvlist_free(fmri);
2436 		return (1);
2437 	}
2438 	nvlist_free(fmri);
2439 
2440 	/* component no longer in system.  zero out the associated stats */
2441 	if ((statp = (struct stats *)
2442 	    lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL ||
2443 	    stats_counter_value(statp) == 0)
2444 		return (0);	/* stat is already reset */
2445 
2446 	Istat_need_save = 1;
2447 	stats_counter_reset(statp);
2448 	return (0);
2449 }
2450 
2451 static void
istat_bump(struct node * snp,int n)2452 istat_bump(struct node *snp, int n)
2453 {
2454 	struct stats *statp;
2455 	struct istat_entry ent;
2456 
2457 	ASSERT(snp != NULL);
2458 	ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t));
2459 	ASSERT(snp->u.event.epname != NULL);
2460 
2461 	/* class name should be hoisted into a single stable entry */
2462 	ASSERT(snp->u.event.ename->u.name.next == NULL);
2463 	ent.ename = snp->u.event.ename->u.name.s;
2464 	ent.ipath = ipath(snp->u.event.epname);
2465 
2466 	if (!istat_verify(snp, &ent)) {
2467 		/* component no longer exists in system, nothing to do */
2468 		return;
2469 	}
2470 
2471 	if ((statp = (struct stats *)
2472 	    lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) {
2473 		/* need to create the counter */
2474 		int cnt = 0;
2475 		struct node *np;
2476 		char *sname;
2477 		char *snamep;
2478 		struct istat_entry *newentp;
2479 
2480 		/* count up the size of the stat name */
2481 		np = snp->u.event.ename;
2482 		while (np != NULL) {
2483 			cnt += strlen(np->u.name.s);
2484 			cnt++;	/* for the '.' or '@' */
2485 			np = np->u.name.next;
2486 		}
2487 		np = snp->u.event.epname;
2488 		while (np != NULL) {
2489 			cnt += snprintf(NULL, 0, "%s%llu",
2490 			    np->u.name.s, np->u.name.child->u.ull);
2491 			cnt++;	/* for the '/' or trailing NULL byte */
2492 			np = np->u.name.next;
2493 		}
2494 
2495 		/* build the stat name */
2496 		snamep = sname = alloca(cnt);
2497 		np = snp->u.event.ename;
2498 		while (np != NULL) {
2499 			snamep += snprintf(snamep, &sname[cnt] - snamep,
2500 			    "%s", np->u.name.s);
2501 			np = np->u.name.next;
2502 			if (np)
2503 				*snamep++ = '.';
2504 		}
2505 		*snamep++ = '@';
2506 		np = snp->u.event.epname;
2507 		while (np != NULL) {
2508 			snamep += snprintf(snamep, &sname[cnt] - snamep,
2509 			    "%s%llu", np->u.name.s, np->u.name.child->u.ull);
2510 			np = np->u.name.next;
2511 			if (np)
2512 				*snamep++ = '/';
2513 		}
2514 		*snamep++ = '\0';
2515 
2516 		/* create the new stat & add it to our list */
2517 		newentp = MALLOC(sizeof (*newentp));
2518 		*newentp = ent;
2519 		statp = stats_new_counter(NULL, sname, 0);
2520 		Istats = lut_add(Istats, (void *)newentp, (void *)statp,
2521 		    (lut_cmp)istat_cmp);
2522 	}
2523 
2524 	/* if n is non-zero, set that value instead of bumping */
2525 	if (n) {
2526 		stats_counter_reset(statp);
2527 		stats_counter_add(statp, n);
2528 	} else
2529 		stats_counter_bump(statp);
2530 	Istat_need_save = 1;
2531 
2532 	ipath_print(O_ALTFP|O_VERB2, ent.ename, ent.ipath);
2533 	out(O_ALTFP|O_VERB2, " %s to value %d", n ? "set" : "incremented",
2534 	    stats_counter_value(statp));
2535 }
2536 
/*
 * istat_destructor -- lut_free() callback for Istats: frees the
 *	istat_entry key (left) and deletes the stats value (right).
 */
/*ARGSUSED*/
static void
istat_destructor(void *left, void *right, void *arg)
{
	struct stats *statp = (struct stats *)right;
	struct istat_entry *entp = (struct istat_entry *)left;

	FREE(entp);
	stats_delete(statp);
}
2546 
2547 /*
2548  * Callback used in a walk of the Istats to reset matching stat counters.
2549  */
2550 static void
istat_counter_reset_cb(struct istat_entry * entp,struct stats * statp,const struct ipath * ipp)2551 istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp,
2552     const struct ipath *ipp)
2553 {
2554 	char *path;
2555 
2556 	if (entp->ipath == ipp) {
2557 		path = ipath2str(entp->ename, ipp);
2558 		out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path);
2559 		FREE(path);
2560 		stats_counter_reset(statp);
2561 		Istat_need_save = 1;
2562 	}
2563 }
2564 
2565 /*ARGSUSED*/
2566 static void
istat_counter_topo_chg_cb(struct istat_entry * entp,struct stats * statp,void * unused)2567 istat_counter_topo_chg_cb(struct istat_entry *entp, struct stats *statp,
2568     void *unused)
2569 {
2570 	char *path;
2571 	nvlist_t *fmri;
2572 
2573 	fmri = ipath2fmri((struct ipath *)(entp->ipath));
2574 	if (!platform_path_exists(fmri)) {
2575 		path = ipath2str(entp->ename, entp->ipath);
2576 		out(O_ALTFP, "istat_counter_topo_chg_cb: not present %s", path);
2577 		FREE(path);
2578 		stats_counter_reset(statp);
2579 		Istat_need_save = 1;
2580 	}
2581 	nvlist_free(fmri);
2582 }
2583 
2584 void
istat_fini(void)2585 istat_fini(void)
2586 {
2587 	lut_free(Istats, istat_destructor, NULL);
2588 }
2589 
2590 static char *Serdbuf;
2591 static char *Serdbufptr;
2592 static int Serdsz;
2593 
2594 /*
2595  * serdaddsize -- calculate size of serd and add it to Serdsz
2596  */
2597 /*ARGSUSED*/
2598 static void
serdaddsize(const struct serd_entry * lhs,struct stats * rhs,void * arg)2599 serdaddsize(const struct serd_entry *lhs, struct stats *rhs, void *arg)
2600 {
2601 	ASSERT(lhs != NULL);
2602 
2603 	/* count up the size of the stat name */
2604 	Serdsz += ipath2strlen(lhs->ename, lhs->ipath);
2605 	Serdsz++;	/* for the trailing NULL byte */
2606 }
2607 
2608 /*
2609  * serd2str -- serialize a serd engine, writing result to *Serdbufptr
2610  */
2611 /*ARGSUSED*/
2612 static void
serd2str(const struct serd_entry * lhs,struct stats * rhs,void * arg)2613 serd2str(const struct serd_entry *lhs, struct stats *rhs, void *arg)
2614 {
2615 	char *str;
2616 	int len;
2617 
2618 	ASSERT(lhs != NULL);
2619 
2620 	/* serialize the serd engine name */
2621 	str = ipath2str(lhs->ename, lhs->ipath);
2622 	len = strlen(str);
2623 
2624 	ASSERT(Serdbufptr + len + 1 <= &Serdbuf[Serdsz]);
2625 	(void) strlcpy(Serdbufptr, str, &Serdbuf[Serdsz] - Serdbufptr);
2626 	Serdbufptr += len;
2627 	FREE(str);
2628 	*Serdbufptr++ = '\0';
2629 	ASSERT(Serdbufptr <= &Serdbuf[Serdsz]);
2630 }
2631 
2632 void
serd_save()2633 serd_save()
2634 {
2635 	if (Serd_need_save == 0)
2636 		return;
2637 
2638 	/* figure out how big the serialzed info is */
2639 	Serdsz = 0;
2640 	lut_walk(SerdEngines, (lut_cb)serdaddsize, NULL);
2641 
2642 	if (Serdsz == 0) {
2643 		/* no serd engines to save */
2644 		fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS);
2645 		return;
2646 	}
2647 
2648 	/* create the serialized buffer */
2649 	Serdbufptr = Serdbuf = MALLOC(Serdsz);
2650 	lut_walk(SerdEngines, (lut_cb)serd2str, NULL);
2651 
2652 	/* clear out current saved stats */
2653 	fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS);
2654 
2655 	/* write out the new version */
2656 	fmd_buf_write(Hdl, NULL, WOBUF_SERDS, Serdbuf, Serdsz);
2657 	FREE(Serdbuf);
2658 	Serd_need_save = 0;
2659 }
2660 
2661 int
serd_cmp(struct serd_entry * ent1,struct serd_entry * ent2)2662 serd_cmp(struct serd_entry *ent1, struct serd_entry *ent2)
2663 {
2664 	if (ent1->ename != ent2->ename)
2665 		return (ent2->ename - ent1->ename);
2666 	if (ent1->ipath != ent2->ipath)
2667 		return ((char *)ent2->ipath - (char *)ent1->ipath);
2668 
2669 	return (0);
2670 }
2671 
2672 void
fme_serd_load(fmd_hdl_t * hdl)2673 fme_serd_load(fmd_hdl_t *hdl)
2674 {
2675 	int sz;
2676 	char *sbuf;
2677 	char *sepptr;
2678 	char *ptr;
2679 	struct serd_entry *newentp;
2680 	struct node *epname;
2681 	nvlist_t *fmri;
2682 	char *namestring;
2683 
2684 	if ((sz = fmd_buf_size(hdl, NULL, WOBUF_SERDS)) == 0)
2685 		return;
2686 	sbuf = alloca(sz);
2687 	fmd_buf_read(hdl, NULL, WOBUF_SERDS, sbuf, sz);
2688 	ptr = sbuf;
2689 	while (ptr < &sbuf[sz]) {
2690 		sepptr = strchr(ptr, '@');
2691 		*sepptr = '\0';
2692 		namestring = ptr;
2693 		sepptr++;
2694 		ptr = sepptr;
2695 		ptr += strlen(ptr);
2696 		ptr++;	/* move past the '\0' separating paths */
2697 		epname = pathstring2epnamenp(sepptr);
2698 		fmri = node2fmri(epname);
2699 		if (platform_path_exists(fmri)) {
2700 			newentp = MALLOC(sizeof (*newentp));
2701 			newentp->hdl = hdl;
2702 			newentp->ipath = ipath(epname);
2703 			newentp->ename = stable(namestring);
2704 			SerdEngines = lut_add(SerdEngines, (void *)newentp,
2705 			    (void *)newentp, (lut_cmp)serd_cmp);
2706 		} else
2707 			Serd_need_save = 1;
2708 		tree_free(epname);
2709 		nvlist_free(fmri);
2710 	}
2711 	/* save it back again in case some of the paths no longer exist */
2712 	serd_save();
2713 }
2714 
/*
 * serd_destructor -- lut_free() callback for SerdEngines: frees the
 *	serd_entry key (left).  The value is not touched.
 */
/*ARGSUSED*/
static void
serd_destructor(void *left, void *right, void *arg)
{
	struct serd_entry *doomed = (struct serd_entry *)left;

	FREE(doomed);
}
2722 
2723 /*
2724  * Callback used in a walk of the SerdEngines to reset matching serd engines.
2725  */
2726 /*ARGSUSED*/
2727 static void
serd_reset_cb(struct serd_entry * entp,void * unused,const struct ipath * ipp)2728 serd_reset_cb(struct serd_entry *entp, void *unused, const struct ipath *ipp)
2729 {
2730 	char *path;
2731 
2732 	if (entp->ipath == ipp) {
2733 		path = ipath2str(entp->ename, ipp);
2734 		out(O_ALTFP, "serd_reset_cb: resetting %s", path);
2735 		fmd_serd_reset(entp->hdl, path);
2736 		FREE(path);
2737 		Serd_need_save = 1;
2738 	}
2739 }
2740 
2741 /*ARGSUSED*/
2742 static void
serd_topo_chg_cb(struct serd_entry * entp,void * unused,void * unused2)2743 serd_topo_chg_cb(struct serd_entry *entp, void *unused, void *unused2)
2744 {
2745 	char *path;
2746 	nvlist_t *fmri;
2747 
2748 	fmri = ipath2fmri((struct ipath *)(entp->ipath));
2749 	if (!platform_path_exists(fmri)) {
2750 		path = ipath2str(entp->ename, entp->ipath);
2751 		out(O_ALTFP, "serd_topo_chg_cb: not present %s", path);
2752 		fmd_serd_reset(entp->hdl, path);
2753 		FREE(path);
2754 		Serd_need_save = 1;
2755 	}
2756 	nvlist_free(fmri);
2757 }
2758 
2759 void
serd_fini(void)2760 serd_fini(void)
2761 {
2762 	lut_free(SerdEngines, serd_destructor, NULL);
2763 }
2764 
2765 static void
publish_suspects(struct fme * fmep,struct rsl * srl)2766 publish_suspects(struct fme *fmep, struct rsl *srl)
2767 {
2768 	struct rsl *rp;
2769 	nvlist_t *fault;
2770 	uint8_t cert;
2771 	uint_t *frs;
2772 	uint_t frsum, fr;
2773 	uint_t messval;
2774 	uint_t retireval;
2775 	uint_t responseval;
2776 	struct node *snp;
2777 	int frcnt, fridx;
2778 	boolean_t allfaulty = B_TRUE;
2779 	struct rsl *erl = srl + fmep->nsuspects - 1;
2780 
2781 	/*
2782 	 * sort the array
2783 	 */
2784 	qsort(srl, fmep->nsuspects, sizeof (struct rsl), rslcmp);
2785 
2786 	/* sum the fitrates */
2787 	frs = alloca(fmep->nsuspects * sizeof (uint_t));
2788 	fridx = frcnt = frsum = 0;
2789 
2790 	for (rp = srl; rp <= erl; rp++) {
2791 		struct node *n;
2792 
2793 		n = eventprop_lookup(rp->suspect, L_FITrate);
2794 		if (node2uint(n, &fr) != 0) {
2795 			out(O_DEBUG|O_NONL, "event ");
2796 			ipath_print(O_DEBUG|O_NONL,
2797 			    rp->suspect->enode->u.event.ename->u.name.s,
2798 			    rp->suspect->ipp);
2799 			out(O_VERB, " has no FITrate (using 1)");
2800 			fr = 1;
2801 		} else if (fr == 0) {
2802 			out(O_DEBUG|O_NONL, "event ");
2803 			ipath_print(O_DEBUG|O_NONL,
2804 			    rp->suspect->enode->u.event.ename->u.name.s,
2805 			    rp->suspect->ipp);
2806 			out(O_VERB, " has zero FITrate (using 1)");
2807 			fr = 1;
2808 		}
2809 
2810 		frs[fridx++] = fr;
2811 		frsum += fr;
2812 		frcnt++;
2813 	}
2814 
2815 	/* Add them in reverse order of our sort, as fmd reverses order */
2816 	for (rp = erl; rp >= srl; rp--) {
2817 		cert = percentof(frs[--fridx], frsum);
2818 		fault = fmd_nvl_create_fault(fmep->hdl,
2819 		    rp->suspect->enode->u.event.ename->u.name.s,
2820 		    cert,
2821 		    rp->asru,
2822 		    rp->fru,
2823 		    rp->rsrc);
2824 		if (fault == NULL)
2825 			out(O_DIE, "fault creation failed");
2826 		/* if "message" property exists, add it to the fault */
2827 		if (node2uint(eventprop_lookup(rp->suspect, L_message),
2828 		    &messval) == 0) {
2829 
2830 			out(O_ALTFP,
2831 			    "[FME%d, %s adds message=%d to suspect list]",
2832 			    fmep->id,
2833 			    rp->suspect->enode->u.event.ename->u.name.s,
2834 			    messval);
2835 			if (nvlist_add_boolean_value(fault,
2836 			    FM_SUSPECT_MESSAGE,
2837 			    (messval) ? B_TRUE : B_FALSE) != 0) {
2838 				out(O_DIE, "cannot add no-message to fault");
2839 			}
2840 		}
2841 
2842 		/* if "retire" property exists, add it to the fault */
2843 		if (node2uint(eventprop_lookup(rp->suspect, L_retire),
2844 		    &retireval) == 0) {
2845 
2846 			out(O_ALTFP,
2847 			    "[FME%d, %s adds retire=%d to suspect list]",
2848 			    fmep->id,
2849 			    rp->suspect->enode->u.event.ename->u.name.s,
2850 			    retireval);
2851 			if (nvlist_add_boolean_value(fault,
2852 			    FM_SUSPECT_RETIRE,
2853 			    (retireval) ? B_TRUE : B_FALSE) != 0) {
2854 				out(O_DIE, "cannot add no-retire to fault");
2855 			}
2856 		}
2857 
2858 		/* if "response" property exists, add it to the fault */
2859 		if (node2uint(eventprop_lookup(rp->suspect, L_response),
2860 		    &responseval) == 0) {
2861 
2862 			out(O_ALTFP,
2863 			    "[FME%d, %s adds response=%d to suspect list]",
2864 			    fmep->id,
2865 			    rp->suspect->enode->u.event.ename->u.name.s,
2866 			    responseval);
2867 			if (nvlist_add_boolean_value(fault,
2868 			    FM_SUSPECT_RESPONSE,
2869 			    (responseval) ? B_TRUE : B_FALSE) != 0) {
2870 				out(O_DIE, "cannot add no-response to fault");
2871 			}
2872 		}
2873 
2874 		/* add any payload properties */
2875 		lut_walk(rp->suspect->payloadprops,
2876 		    (lut_cb)addpayloadprop, (void *)fault);
2877 		rslfree(rp);
2878 
2879 		/*
2880 		 * If "action" property exists, evaluate it;  this must be done
2881 		 * before the allfaulty check below since some actions may
2882 		 * modify the asru to be used in fmd_nvl_fmri_has_fault.  This
2883 		 * needs to be restructured if any new actions are introduced
2884 		 * that have effects that we do not want to be visible if
2885 		 * we decide not to publish in the dupclose check below.
2886 		 */
2887 		if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) {
2888 			struct evalue evalue;
2889 
2890 			out(O_ALTFP|O_NONL,
2891 			    "[FME%d, %s action ", fmep->id,
2892 			    rp->suspect->enode->u.event.ename->u.name.s);
2893 			ptree_name_iter(O_ALTFP|O_NONL, snp);
2894 			out(O_ALTFP, "]");
2895 			Action_nvl = fault;
2896 			(void) eval_expr(snp, NULL, NULL, NULL, NULL,
2897 			    NULL, 0, &evalue);
2898 		}
2899 
2900 		fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault);
2901 
2902 		/*
2903 		 * check if the asru is already marked as "faulty".
2904 		 */
2905 		if (allfaulty) {
2906 			nvlist_t *asru;
2907 
2908 			out(O_ALTFP|O_VERB, "FME%d dup check ", fmep->id);
2909 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect);
2910 			out(O_ALTFP|O_VERB|O_NONL, " ");
2911 			if (nvlist_lookup_nvlist(fault,
2912 			    FM_FAULT_ASRU, &asru) != 0) {
2913 				out(O_ALTFP|O_VERB, "NULL asru");
2914 				allfaulty = B_FALSE;
2915 			} else if (fmd_nvl_fmri_has_fault(fmep->hdl, asru,
2916 			    FMD_HAS_FAULT_ASRU, NULL)) {
2917 				out(O_ALTFP|O_VERB, "faulty");
2918 			} else {
2919 				out(O_ALTFP|O_VERB, "not faulty");
2920 				allfaulty = B_FALSE;
2921 			}
2922 		}
2923 
2924 	}
2925 
2926 	if (!allfaulty) {
2927 		/*
2928 		 * don't update the count stat if all asrus are already
2929 		 * present and unrepaired in the asru cache
2930 		 */
2931 		for (rp = erl; rp >= srl; rp--) {
2932 			struct event *suspect = rp->suspect;
2933 
2934 			if (suspect == NULL)
2935 				continue;
2936 
2937 			/* if "count" exists, increment the appropriate stat */
2938 			if ((snp = eventprop_lookup(suspect,
2939 			    L_count)) != NULL) {
2940 				out(O_ALTFP|O_NONL,
2941 				    "[FME%d, %s count ", fmep->id,
2942 				    suspect->enode->u.event.ename->u.name.s);
2943 				ptree_name_iter(O_ALTFP|O_NONL, snp);
2944 				out(O_ALTFP, "]");
2945 				istat_bump(snp, 0);
2946 
2947 			}
2948 		}
2949 		istat_save();	/* write out any istat changes */
2950 	}
2951 }
2952 
2953 static const char *
undiag_2defect_str(int ud)2954 undiag_2defect_str(int ud)
2955 {
2956 	switch (ud) {
2957 	case UD_VAL_MISSINGINFO:
2958 	case UD_VAL_MISSINGOBS:
2959 	case UD_VAL_MISSINGPATH:
2960 	case UD_VAL_MISSINGZERO:
2961 	case UD_VAL_BADOBS:
2962 	case UD_VAL_CFGMISMATCH:
2963 		return (UNDIAG_DEFECT_CHKPT);
2964 
2965 	case UD_VAL_BADEVENTI:
2966 	case UD_VAL_BADEVENTPATH:
2967 	case UD_VAL_BADEVENTCLASS:
2968 	case UD_VAL_INSTFAIL:
2969 	case UD_VAL_NOPATH:
2970 	case UD_VAL_UNSOLVD:
2971 		return (UNDIAG_DEFECT_FME);
2972 
2973 	case UD_VAL_MAXFME:
2974 		return (UNDIAG_DEFECT_LIMIT);
2975 
2976 	case UD_VAL_UNKNOWN:
2977 	default:
2978 		return (UNDIAG_DEFECT_UNKNOWN);
2979 	}
2980 }
2981 
2982 static const char *
undiag_2fault_str(int ud)2983 undiag_2fault_str(int ud)
2984 {
2985 	switch (ud) {
2986 	case UD_VAL_BADEVENTI:
2987 	case UD_VAL_BADEVENTPATH:
2988 	case UD_VAL_BADEVENTCLASS:
2989 	case UD_VAL_INSTFAIL:
2990 	case UD_VAL_NOPATH:
2991 	case UD_VAL_UNSOLVD:
2992 		return (UNDIAG_FAULT_FME);
2993 	default:
2994 		return (NULL);
2995 	}
2996 }
2997 
2998 static char *
undiag_2reason_str(int ud,char * arg)2999 undiag_2reason_str(int ud, char *arg)
3000 {
3001 	const char *ptr;
3002 	char *buf;
3003 	int with_arg = 0;
3004 
3005 	switch (ud) {
3006 	case UD_VAL_BADEVENTPATH:
3007 		ptr = UD_STR_BADEVENTPATH;
3008 		with_arg = 1;
3009 		break;
3010 	case UD_VAL_BADEVENTCLASS:
3011 		ptr = UD_STR_BADEVENTCLASS;
3012 		with_arg = 1;
3013 		break;
3014 	case UD_VAL_BADEVENTI:
3015 		ptr = UD_STR_BADEVENTI;
3016 		with_arg = 1;
3017 		break;
3018 	case UD_VAL_BADOBS:
3019 		ptr = UD_STR_BADOBS;
3020 		break;
3021 	case UD_VAL_CFGMISMATCH:
3022 		ptr = UD_STR_CFGMISMATCH;
3023 		break;
3024 	case UD_VAL_INSTFAIL:
3025 		ptr = UD_STR_INSTFAIL;
3026 		with_arg = 1;
3027 		break;
3028 	case UD_VAL_MAXFME:
3029 		ptr = UD_STR_MAXFME;
3030 		break;
3031 	case UD_VAL_MISSINGINFO:
3032 		ptr = UD_STR_MISSINGINFO;
3033 		break;
3034 	case UD_VAL_MISSINGOBS:
3035 		ptr = UD_STR_MISSINGOBS;
3036 		break;
3037 	case UD_VAL_MISSINGPATH:
3038 		ptr = UD_STR_MISSINGPATH;
3039 		break;
3040 	case UD_VAL_MISSINGZERO:
3041 		ptr = UD_STR_MISSINGZERO;
3042 		break;
3043 	case UD_VAL_NOPATH:
3044 		ptr = UD_STR_NOPATH;
3045 		with_arg = 1;
3046 		break;
3047 	case UD_VAL_UNSOLVD:
3048 		ptr = UD_STR_UNSOLVD;
3049 		break;
3050 	case UD_VAL_UNKNOWN:
3051 	default:
3052 		ptr = UD_STR_UNKNOWN;
3053 		break;
3054 	}
3055 	if (with_arg) {
3056 		buf = MALLOC(strlen(ptr) + strlen(arg) - 1);
3057 		(void) sprintf(buf, ptr, arg);
3058 	} else {
3059 		buf = MALLOC(strlen(ptr) + 1);
3060 		(void) sprintf(buf, ptr);
3061 	}
3062 	return (buf);
3063 }
3064 
3065 static void
publish_undiagnosable(fmd_hdl_t * hdl,fmd_event_t * ffep,fmd_case_t * fmcase,nvlist_t * detector,char * arg)3066 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase,
3067     nvlist_t *detector, char *arg)
3068 {
3069 	struct case_list *newcase;
3070 	nvlist_t *defect, *fault;
3071 	const char *faultstr;
3072 	char *reason = undiag_2reason_str(Undiag_reason, arg);
3073 
3074 	out(O_ALTFP,
3075 	    "[undiagnosable ereport received, "
3076 	    "creating and closing a new case (%s)]", reason);
3077 
3078 	newcase = MALLOC(sizeof (struct case_list));
3079 	newcase->next = NULL;
3080 	newcase->fmcase = fmcase;
3081 	if (Undiagablecaselist != NULL)
3082 		newcase->next = Undiagablecaselist;
3083 	Undiagablecaselist = newcase;
3084 
3085 	if (ffep != NULL)
3086 		fmd_case_add_ereport(hdl, newcase->fmcase, ffep);
3087 
3088 	/* add defect */
3089 	defect = fmd_nvl_create_fault(hdl,
3090 	    undiag_2defect_str(Undiag_reason), 50, NULL, NULL, detector);
3091 	(void) nvlist_add_string(defect, UNDIAG_REASON, reason);
3092 	(void) nvlist_add_boolean_value(defect, FM_SUSPECT_RETIRE, B_FALSE);
3093 	(void) nvlist_add_boolean_value(defect, FM_SUSPECT_RESPONSE, B_FALSE);
3094 	fmd_case_add_suspect(hdl, newcase->fmcase, defect);
3095 
3096 	/* add fault if appropriate */
3097 	faultstr = undiag_2fault_str(Undiag_reason);
3098 	if (faultstr != NULL) {
3099 		fault = fmd_nvl_create_fault(hdl, faultstr, 50, NULL, NULL,
3100 		    detector);
3101 		(void) nvlist_add_string(fault, UNDIAG_REASON, reason);
3102 		(void) nvlist_add_boolean_value(fault, FM_SUSPECT_RETIRE,
3103 		    B_FALSE);
3104 		(void) nvlist_add_boolean_value(fault, FM_SUSPECT_RESPONSE,
3105 		    B_FALSE);
3106 		fmd_case_add_suspect(hdl, newcase->fmcase, fault);
3107 	}
3108 	FREE(reason);
3109 
3110 	/* solve and close case */
3111 	fmd_case_solve(hdl, newcase->fmcase);
3112 	fmd_case_close(hdl, newcase->fmcase);
3113 	Undiag_reason = UD_VAL_UNKNOWN;
3114 }
3115 
3116 static void
fme_undiagnosable(struct fme * f)3117 fme_undiagnosable(struct fme *f)
3118 {
3119 	nvlist_t *defect, *fault, *detector = NULL;
3120 	struct event *ep;
3121 	char *pathstr;
3122 	const char *faultstr;
3123 	char *reason = undiag_2reason_str(Undiag_reason, NULL);
3124 
3125 	out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]",
3126 	    f->id, fmd_case_uuid(f->hdl, f->fmcase), reason);
3127 
3128 	for (ep = f->observations; ep; ep = ep->observations) {
3129 
3130 		if (ep->ffep != f->e0r)
3131 			fmd_case_add_ereport(f->hdl, f->fmcase, ep->ffep);
3132 
3133 		pathstr = ipath2str(NULL, ipath(platform_getpath(ep->nvp)));
3134 		platform_unit_translate(0, f->config, TOPO_PROP_RESOURCE,
3135 		    &detector, pathstr);
3136 		FREE(pathstr);
3137 
3138 		/* add defect */
3139 		defect = fmd_nvl_create_fault(f->hdl,
3140 		    undiag_2defect_str(Undiag_reason), 50 / f->uniqobs,
3141 		    NULL, NULL, detector);
3142 		(void) nvlist_add_string(defect, UNDIAG_REASON, reason);
3143 		(void) nvlist_add_boolean_value(defect, FM_SUSPECT_RETIRE,
3144 		    B_FALSE);
3145 		(void) nvlist_add_boolean_value(defect, FM_SUSPECT_RESPONSE,
3146 		    B_FALSE);
3147 		fmd_case_add_suspect(f->hdl, f->fmcase, defect);
3148 
3149 		/* add fault if appropriate */
3150 		faultstr = undiag_2fault_str(Undiag_reason);
3151 		if (faultstr == NULL)
3152 			continue;
3153 		fault = fmd_nvl_create_fault(f->hdl, faultstr, 50 / f->uniqobs,
3154 		    NULL, NULL, detector);
3155 		(void) nvlist_add_string(fault, UNDIAG_REASON, reason);
3156 		(void) nvlist_add_boolean_value(fault, FM_SUSPECT_RETIRE,
3157 		    B_FALSE);
3158 		(void) nvlist_add_boolean_value(fault, FM_SUSPECT_RESPONSE,
3159 		    B_FALSE);
3160 		fmd_case_add_suspect(f->hdl, f->fmcase, fault);
3161 		nvlist_free(detector);
3162 	}
3163 	FREE(reason);
3164 	fmd_case_solve(f->hdl, f->fmcase);
3165 	fmd_case_close(f->hdl, f->fmcase);
3166 	Undiag_reason = UD_VAL_UNKNOWN;
3167 }
3168 
3169 /*
3170  * fme_close_case
3171  *
3172  *	Find the requested case amongst our fmes and close it.  Free up
3173  *	the related fme.
3174  */
3175 void
fme_close_case(fmd_hdl_t * hdl,fmd_case_t * fmcase)3176 fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase)
3177 {
3178 	struct case_list *ucasep, *prevcasep = NULL;
3179 	struct fme *prev = NULL;
3180 	struct fme *fmep;
3181 
3182 	for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) {
3183 		if (fmcase != ucasep->fmcase) {
3184 			prevcasep = ucasep;
3185 			continue;
3186 		}
3187 
3188 		if (prevcasep == NULL)
3189 			Undiagablecaselist = Undiagablecaselist->next;
3190 		else
3191 			prevcasep->next = ucasep->next;
3192 
3193 		FREE(ucasep);
3194 		return;
3195 	}
3196 
3197 	for (fmep = FMElist; fmep; fmep = fmep->next) {
3198 		if (fmep->hdl == hdl && fmep->fmcase == fmcase)
3199 			break;
3200 		prev = fmep;
3201 	}
3202 
3203 	if (fmep == NULL) {
3204 		out(O_WARN, "Eft asked to close unrecognized case [%s].",
3205 		    fmd_case_uuid(hdl, fmcase));
3206 		return;
3207 	}
3208 
3209 	if (EFMElist == fmep)
3210 		EFMElist = prev;
3211 
3212 	if (prev == NULL)
3213 		FMElist = FMElist->next;
3214 	else
3215 		prev->next = fmep->next;
3216 
3217 	fmep->next = NULL;
3218 
3219 	/* Get rid of any timer this fme has set */
3220 	if (fmep->wull != 0)
3221 		fmd_timer_remove(fmep->hdl, fmep->timer);
3222 
3223 	if (ClosedFMEs == NULL) {
3224 		ClosedFMEs = fmep;
3225 	} else {
3226 		fmep->next = ClosedFMEs;
3227 		ClosedFMEs = fmep;
3228 	}
3229 
3230 	Open_fme_count--;
3231 
3232 	/* See if we can close the overflow FME */
3233 	if (Open_fme_count <= Max_fme) {
3234 		for (fmep = FMElist; fmep; fmep = fmep->next) {
3235 			if (fmep->overflow && !(fmd_case_closed(fmep->hdl,
3236 			    fmep->fmcase)))
3237 				break;
3238 		}
3239 
3240 		if (fmep != NULL)
3241 			fmd_case_close(fmep->hdl, fmep->fmcase);
3242 	}
3243 }
3244 
3245 /*
3246  * fme_set_timer()
3247  *	If the time we need to wait for the given FME is less than the
3248  *	current timer, kick that old timer out and establish a new one.
3249  */
3250 static int
fme_set_timer(struct fme * fmep,unsigned long long wull)3251 fme_set_timer(struct fme *fmep, unsigned long long wull)
3252 {
3253 	out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait ");
3254 	ptree_timeval(O_ALTFP|O_VERB, &wull);
3255 
3256 	if (wull <= fmep->pull) {
3257 		out(O_ALTFP|O_VERB|O_NONL, "already have waited at least ");
3258 		ptree_timeval(O_ALTFP|O_VERB, &fmep->pull);
3259 		out(O_ALTFP|O_VERB, NULL);
3260 		/* we've waited at least wull already, don't need timer */
3261 		return (0);
3262 	}
3263 
3264 	out(O_ALTFP|O_VERB|O_NONL, " currently ");
3265 	if (fmep->wull != 0) {
3266 		out(O_ALTFP|O_VERB|O_NONL, "waiting ");
3267 		ptree_timeval(O_ALTFP|O_VERB, &fmep->wull);
3268 		out(O_ALTFP|O_VERB, NULL);
3269 	} else {
3270 		out(O_ALTFP|O_VERB|O_NONL, "not waiting");
3271 		out(O_ALTFP|O_VERB, NULL);
3272 	}
3273 
3274 	if (fmep->wull != 0)
3275 		if (wull >= fmep->wull)
3276 			/* New timer would fire later than established timer */
3277 			return (0);
3278 
3279 	if (fmep->wull != 0) {
3280 		fmd_timer_remove(fmep->hdl, fmep->timer);
3281 	}
3282 
3283 	fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep,
3284 	    fmep->e0r, wull);
3285 	out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer);
3286 	fmep->wull = wull;
3287 	return (1);
3288 }
3289 
3290 void
fme_timer_fired(struct fme * fmep,id_t tid)3291 fme_timer_fired(struct fme *fmep, id_t tid)
3292 {
3293 	struct fme *ffmep = NULL;
3294 
3295 	for (ffmep = FMElist; ffmep; ffmep = ffmep->next)
3296 		if (ffmep == fmep)
3297 			break;
3298 
3299 	if (ffmep == NULL) {
3300 		out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.",
3301 		    (void *)fmep);
3302 		return;
3303 	}
3304 
3305 	out(O_ALTFP|O_VERB, "Timer fired %lx", tid);
3306 	fmep->pull = fmep->wull;
3307 	fmep->wull = 0;
3308 	fmd_buf_write(fmep->hdl, fmep->fmcase,
3309 	    WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull));
3310 
3311 	fme_eval(fmep, fmep->e0r);
3312 }
3313 
3314 /*
3315  * Preserve the fme's suspect list in its psuspects list, NULLing the
3316  * suspects list in the meantime.
3317  */
3318 static void
save_suspects(struct fme * fmep)3319 save_suspects(struct fme *fmep)
3320 {
3321 	struct event *ep;
3322 	struct event *nextep;
3323 
3324 	/* zero out the previous suspect list */
3325 	for (ep = fmep->psuspects; ep; ep = nextep) {
3326 		nextep = ep->psuspects;
3327 		ep->psuspects = NULL;
3328 	}
3329 	fmep->psuspects = NULL;
3330 
3331 	/* zero out the suspect list, copying it to previous suspect list */
3332 	fmep->psuspects = fmep->suspects;
3333 	for (ep = fmep->suspects; ep; ep = nextep) {
3334 		nextep = ep->suspects;
3335 		ep->psuspects = ep->suspects;
3336 		ep->suspects = NULL;
3337 		ep->is_suspect = 0;
3338 	}
3339 	fmep->suspects = NULL;
3340 	fmep->nsuspects = 0;
3341 }
3342 
3343 /*
3344  * Retrieve the fme's suspect list from its psuspects list.
3345  */
3346 static void
restore_suspects(struct fme * fmep)3347 restore_suspects(struct fme *fmep)
3348 {
3349 	struct event *ep;
3350 	struct event *nextep;
3351 
3352 	fmep->nsuspects = 0;
3353 	fmep->suspects = fmep->psuspects;
3354 	for (ep = fmep->psuspects; ep; ep = nextep) {
3355 		fmep->nsuspects++;
3356 		nextep = ep->psuspects;
3357 		ep->suspects = ep->psuspects;
3358 	}
3359 }
3360 
/*
 * this is what we use to call the Emrys prototype code instead of main()
 *
 * fme_eval -- (re)evaluate an FME.
 *
 * The current suspect list is saved, the arrows in the event tree are
 * cleared, and hypothesise() is run from the FME's e0 event.  What happens
 * next depends on the resulting state:
 *
 *   FME_CREDIBLE	upsets are evaluated, the suspects are trimmed into
 *			two lists (srl and srl2) and published, and the
 *			case is solved -- or closed when no suspects
 *			remain.  If both lists are populated, a second
 *			("parallel") case is opened for the srl2 list.
 *   FME_WAIT		a timer is set for the delay reported by
 *			hypothesise(), the event tree is pruned, and the
 *			function returns with the tree kept.
 *   FME_DISPROVED	the FME is closed out via fme_undiagnosable() with
 *			reason UD_VAL_UNSOLVD.
 *
 * On the CREDIBLE and DISPROVED paths the event tree, the config and the
 * FME's buffers are released before returning.
 *
 * fmep		the FME to evaluate
 * ffep		the fmd event passed through to upsets_eval() and
 *		trim_suspects(); when non-NULL it also becomes the principal
 *		ereport of any parallel case
 */
static void
fme_eval(struct fme *fmep, fmd_event_t *ffep)
{
	struct event *ep;
	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
	struct rsl *srl = NULL;
	struct rsl *srl2 = NULL;
	int mess_zero_count;
	int rpcnt;

	save_suspects(fmep);

	out(O_ALTFP, "Evaluate FME %d", fmep->id);
	indent_set("  ");

	lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
	fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);

	out(O_ALTFP|O_NONL, "FME%d state: %s, suspect list:", fmep->id,
	    fme_state2str(fmep->state));
	for (ep = fmep->suspects; ep; ep = ep->suspects) {
		out(O_ALTFP|O_NONL, " ");
		itree_pevent_brief(O_ALTFP|O_NONL, ep);
	}
	out(O_ALTFP, NULL);

	switch (fmep->state) {
	case FME_CREDIBLE:
		print_suspects(SLNEW, fmep);
		(void) upsets_eval(fmep, ffep);

		/*
		 * we may have already posted suspects in upsets_eval() which
		 * can recurse into fme_eval() again. If so then just return.
		 */
		if (fmep->posted_suspects)
			return;

		stats_counter_bump(fmep->diags);
		/* remember the count: save_suspects() zeroes nsuspects */
		rpcnt = fmep->nsuspects;
		save_suspects(fmep);

		/*
		 * create two lists, one for "message=1" faults and one for
		 * "message=0" faults. If we have a mixture we will generate
		 * two separate suspect lists.
		 */
		srl = MALLOC(rpcnt * sizeof (struct rsl));
		bzero(srl, rpcnt * sizeof (struct rsl));
		srl2 = MALLOC(rpcnt * sizeof (struct rsl));
		bzero(srl2, rpcnt * sizeof (struct rsl));
		mess_zero_count = trim_suspects(fmep, srl, srl2, ffep);

		/*
		 * If the resulting suspect list has no members, we're
		 * done so simply close the case. Otherwise sort and publish.
		 */
		if (fmep->nsuspects == 0 && mess_zero_count == 0) {
			out(O_ALTFP,
			    "[FME%d, case %s (all suspects are upsets)]",
			    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
			fmd_case_close(fmep->hdl, fmep->fmcase);
		} else if (fmep->nsuspects != 0 && mess_zero_count == 0) {
			/* only the srl list is populated */
			publish_suspects(fmep, srl);
			out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
			    fmd_case_uuid(fmep->hdl, fmep->fmcase));
			fmd_case_solve(fmep->hdl, fmep->fmcase);
		} else if (fmep->nsuspects == 0 && mess_zero_count != 0) {
			/* only the srl2 list is populated; publish it here */
			fmep->nsuspects = mess_zero_count;
			publish_suspects(fmep, srl2);
			out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
			    fmd_case_uuid(fmep->hdl, fmep->fmcase));
			fmd_case_solve(fmep->hdl, fmep->fmcase);
		} else {
			struct event *obsp;
			struct fme *nfmep;

			publish_suspects(fmep, srl);
			out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
			    fmd_case_uuid(fmep->hdl, fmep->fmcase));
			fmd_case_solve(fmep->hdl, fmep->fmcase);

			/*
			 * Got both message=0 and message=1 so create a
			 * duplicate case. Also need a temporary duplicate fme
			 * structure for use by publish_suspects().
			 */
			nfmep = alloc_fme();
			nfmep->id =  Nextid++;
			nfmep->hdl = fmep->hdl;
			nfmep->nsuspects = mess_zero_count;
			nfmep->fmcase = fmd_case_open(fmep->hdl, NULL);
			out(O_ALTFP|O_STAMP,
			    "[creating parallel FME%d, case %s]", nfmep->id,
			    fmd_case_uuid(nfmep->hdl, nfmep->fmcase));
			Open_fme_count++;
			if (ffep) {
				fmd_case_setprincipal(nfmep->hdl,
				    nfmep->fmcase, ffep);
				fmd_case_add_ereport(nfmep->hdl,
				    nfmep->fmcase, ffep);
			}
			/* copy the remaining ereports into the new case */
			for (obsp = fmep->observations; obsp;
			    obsp = obsp->observations)
				if (obsp->ffep && obsp->ffep != ffep)
					fmd_case_add_ereport(nfmep->hdl,
					    nfmep->fmcase, obsp->ffep);

			publish_suspects(nfmep, srl2);
			out(O_ALTFP, "[solving FME%d, case %s]", nfmep->id,
			    fmd_case_uuid(nfmep->hdl, nfmep->fmcase));
			fmd_case_solve(nfmep->hdl, nfmep->fmcase);
			/* the temporary fme is discarded once published */
			FREE(nfmep);
		}
		FREE(srl);
		FREE(srl2);
		restore_suspects(fmep);

		/* record in the case buffer that suspects were posted */
		fmep->posted_suspects = 1;
		fmd_buf_write(fmep->hdl, fmep->fmcase,
		    WOBUF_POSTD,
		    (void *)&fmep->posted_suspects,
		    sizeof (fmep->posted_suspects));

		/*
		 * Now the suspects have been posted, we can clear up
		 * the instance tree as we won't be looking at it again.
		 * Also cancel the timer as the case is now solved.
		 */
		if (fmep->wull != 0) {
			fmd_timer_remove(fmep->hdl, fmep->timer);
			fmep->wull = 0;
		}
		break;

	case FME_WAIT:
		ASSERT(my_delay > fmep->ull);
		(void) fme_set_timer(fmep, my_delay);
		print_suspects(SLWAIT, fmep);
		itree_prune(fmep->eventtree);
		/* return, not break: the tree is needed when the timer fires */
		return;

	case FME_DISPROVED:
		print_suspects(SLDISPROVED, fmep);
		Undiag_reason = UD_VAL_UNSOLVD;
		fme_undiagnosable(fmep);
		break;
	}

	/* reached from the CREDIBLE and DISPROVED cases: release resources */
	itree_free(fmep->eventtree);
	fmep->eventtree = NULL;
	structconfig_free(fmep->config);
	fmep->config = NULL;
	destroy_fme_bufs(fmep);
}
3519 
/*
 * Forward declarations for helpers used by the inference routines that
 * follow.
 */
static void indent(void);
static int triggered(struct fme *fmep, struct event *ep, int mark);
static enum fme_state effects_test(struct fme *fmep,
    struct event *fault_event, unsigned long long at_latest_by,
    unsigned long long *pdelay);
static enum fme_state requirements_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay);
static enum fme_state causes_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay);
3529 
3530 static int
checkconstraints(struct fme * fmep,struct arrow * arrowp)3531 checkconstraints(struct fme *fmep, struct arrow *arrowp)
3532 {
3533 	struct constraintlist *ctp;
3534 	struct evalue value;
3535 	char *sep = "";
3536 
3537 	if (arrowp->forever_false) {
3538 		indent();
3539 		out(O_ALTFP|O_VERB|O_NONL, "  Forever false constraint: ");
3540 		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
3541 			out(O_ALTFP|O_VERB|O_NONL, sep);
3542 			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3543 			sep = ", ";
3544 		}
3545 		out(O_ALTFP|O_VERB, NULL);
3546 		return (0);
3547 	}
3548 	if (arrowp->forever_true) {
3549 		indent();
3550 		out(O_ALTFP|O_VERB|O_NONL, "  Forever true constraint: ");
3551 		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
3552 			out(O_ALTFP|O_VERB|O_NONL, sep);
3553 			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3554 			sep = ", ";
3555 		}
3556 		out(O_ALTFP|O_VERB, NULL);
3557 		return (1);
3558 	}
3559 
3560 	for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
3561 		if (eval_expr(ctp->cnode, NULL, NULL,
3562 		    &fmep->globals, fmep->config,
3563 		    arrowp, 0, &value)) {
3564 			/* evaluation successful */
3565 			if (value.t == UNDEFINED || value.v == 0) {
3566 				/* known false */
3567 				arrowp->forever_false = 1;
3568 				indent();
3569 				out(O_ALTFP|O_VERB|O_NONL,
3570 				    "  False constraint: ");
3571 				ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3572 				out(O_ALTFP|O_VERB, NULL);
3573 				return (0);
3574 			}
3575 		} else {
3576 			/* evaluation unsuccessful -- unknown value */
3577 			indent();
3578 			out(O_ALTFP|O_VERB|O_NONL,
3579 			    "  Deferred constraint: ");
3580 			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3581 			out(O_ALTFP|O_VERB, NULL);
3582 			return (1);
3583 		}
3584 	}
3585 	/* known true */
3586 	arrowp->forever_true = 1;
3587 	indent();
3588 	out(O_ALTFP|O_VERB|O_NONL, "  True constraint: ");
3589 	for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
3590 		out(O_ALTFP|O_VERB|O_NONL, sep);
3591 		ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
3592 		sep = ", ";
3593 	}
3594 	out(O_ALTFP|O_VERB, NULL);
3595 	return (1);
3596 }
3597 
3598 static int
triggered(struct fme * fmep,struct event * ep,int mark)3599 triggered(struct fme *fmep, struct event *ep, int mark)
3600 {
3601 	struct bubble *bp;
3602 	struct arrowlist *ap;
3603 	int count = 0;
3604 
3605 	stats_counter_bump(fmep->Tcallcount);
3606 	for (bp = itree_next_bubble(ep, NULL); bp;
3607 	    bp = itree_next_bubble(ep, bp)) {
3608 		if (bp->t != B_TO)
3609 			continue;
3610 		for (ap = itree_next_arrow(bp, NULL); ap;
3611 		    ap = itree_next_arrow(bp, ap)) {
3612 			/* check count of marks against K in the bubble */
3613 			if ((ap->arrowp->mark & mark) &&
3614 			    ++count >= bp->nork)
3615 				return (1);
3616 		}
3617 	}
3618 	return (0);
3619 }
3620 
/*
 * mark_arrows -- propagate (or clear) effect marks along the arrows leaving
 * an event.
 *
 * For each arrow in each B_FROM bubble of ep, with ep2 the event at the
 * head of the arrow:
 *
 *   mark == 0	clearing pass.  Arrows that were marked have arrow_marked
 *		and the EFFECTS_COUNTER bit reset, ep2's WAIT_EFFECT,
 *		CREDIBLE_EFFECT and PARENT_WAIT cached-state bits are
 *		dropped, and the clear recurses from ep2.  If "keep" is set
 *		and ep2 had any of those bits, ep2 is flagged keep_in_tree.
 *		pdelay is not used and may be NULL.
 *
 *   mark != 0	marking pass.  ep2 is skipped if it is already disproved,
 *		already marked, fails its arrow constraints, or has not yet
 *		met its K-count.  Otherwise requirements_test() decides
 *		whether ep2 is waiting, disproved or credible, and unless
 *		it is disproved the marking recurses from ep2.
 *
 * Returns WAIT_EFFECT if any effect reached is still waiting, in which
 * case *pdelay is set to the shortest delay seen; otherwise returns 0 and
 * leaves *pdelay alone.
 */
static int
mark_arrows(struct fme *fmep, struct event *ep, int mark,
    unsigned long long at_latest_by, unsigned long long *pdelay, int keep)
{
	struct bubble *bp;
	struct arrowlist *ap;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long my_delay;
	enum fme_state result;
	int retval = 0;

	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_FROM)
			continue;
		stats_counter_bump(fmep->Marrowcount);
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			struct event *ep2 = ap->arrowp->head->myevent;
			/*
			 * if we're clearing marks, we can avoid doing
			 * all that work evaluating constraints.
			 */
			if (mark == 0) {
				/* never marked, so nothing below it to clear */
				if (ap->arrowp->arrow_marked == 0)
					continue;
				ap->arrowp->arrow_marked = 0;
				ap->arrowp->mark &= ~EFFECTS_COUNTER;
				if (keep && (ep2->cached_state &
				    (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT)))
					ep2->keep_in_tree = 1;
				ep2->cached_state &=
				    ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT);
				(void) mark_arrows(fmep, ep2, mark, 0, NULL,
				    keep);
				continue;
			}
			/* remember the visit so a clearing pass can undo it */
			ap->arrowp->arrow_marked = 1;
			if (ep2->cached_state & REQMNTS_DISPROVED) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  ALREADY DISPROVED ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			if (ep2->cached_state & WAIT_EFFECT) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  ALREADY EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			if (ep2->cached_state & CREDIBLE_EFFECT) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  ALREADY EFFECTS CREDIBLE ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			if ((ep2->cached_state & PARENT_WAIT) &&
			    (mark & PARENT_WAIT)) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  ALREADY PARENT EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			/* constraints are checked with ep2's payload selected */
			platform_set_payloadnvp(ep2->nvp);
			if (checkconstraints(fmep, ap->arrowp) == 0) {
				platform_set_payloadnvp(NULL);
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  CONSTRAINTS FAIL ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			platform_set_payloadnvp(NULL);
			/* count this arrow toward ep2's K-count */
			ap->arrowp->mark |= EFFECTS_COUNTER;
			if (!triggered(fmep, ep2, EFFECTS_COUNTER)) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  K-COUNT NOT YET MET ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			ep2->cached_state &= ~PARENT_WAIT;
			/*
			 * if we've reached an ereport and no propagation time
			 * is specified, use the Hesitate value
			 */
			if (ep2->t == N_EREPORT && at_latest_by == 0ULL &&
			    ap->arrowp->maxdelay == 0ULL) {
				out(O_ALTFP|O_VERB|O_NONL, "  default wait ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				result = requirements_test(fmep, ep2, Hesitate,
				    &my_delay);
			} else {
				result = requirements_test(fmep, ep2,
				    at_latest_by + ap->arrowp->maxdelay,
				    &my_delay);
			}
			if (result == FME_WAIT) {
				retval = WAIT_EFFECT;
				if (overall_delay > my_delay)
					overall_delay = my_delay;
				ep2->cached_state |= WAIT_EFFECT;
				indent();
				out(O_ALTFP|O_VERB|O_NONL, "  EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				indent_push("  E");
				/* events beyond ep2 are marked PARENT_WAIT */
				if (mark_arrows(fmep, ep2, PARENT_WAIT,
				    at_latest_by, &my_delay, 0) ==
				    WAIT_EFFECT) {
					retval = WAIT_EFFECT;
					if (overall_delay > my_delay)
						overall_delay = my_delay;
				}
				indent_pop();
			} else if (result == FME_DISPROVED) {
				/* disproved: do not propagate past ep2 */
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    "  EFFECTS DISPROVED ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
			} else {
				/* tag ep2 with the caller's mark and carry on */
				ep2->cached_state |= mark;
				indent();
				if (mark == CREDIBLE_EFFECT)
					out(O_ALTFP|O_VERB|O_NONL,
					    "  EFFECTS CREDIBLE ");
				else
					out(O_ALTFP|O_VERB|O_NONL,
					    "  PARENT EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				indent_push("  E");
				if (mark_arrows(fmep, ep2, mark, at_latest_by,
				    &my_delay, 0) == WAIT_EFFECT) {
					retval = WAIT_EFFECT;
					if (overall_delay > my_delay)
						overall_delay = my_delay;
				}
				indent_pop();
			}
		}
	}
	/* *pdelay is only written when something is actually waiting */
	if (retval == WAIT_EFFECT)
		*pdelay = overall_delay;
	return (retval);
}
3779 
3780 static enum fme_state
effects_test(struct fme * fmep,struct event * fault_event,unsigned long long at_latest_by,unsigned long long * pdelay)3781 effects_test(struct fme *fmep, struct event *fault_event,
3782     unsigned long long at_latest_by, unsigned long long *pdelay)
3783 {
3784 	struct event *error_event;
3785 	enum fme_state return_value = FME_CREDIBLE;
3786 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
3787 	unsigned long long my_delay;
3788 
3789 	stats_counter_bump(fmep->Ecallcount);
3790 	indent_push("  E");
3791 	indent();
3792 	out(O_ALTFP|O_VERB|O_NONL, "->");
3793 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
3794 	out(O_ALTFP|O_VERB, NULL);
3795 
3796 	if (mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by,
3797 	    &my_delay, 0) == WAIT_EFFECT) {
3798 		return_value = FME_WAIT;
3799 		if (overall_delay > my_delay)
3800 			overall_delay = my_delay;
3801 	}
3802 	for (error_event = fmep->observations;
3803 	    error_event; error_event = error_event->observations) {
3804 		indent();
3805 		out(O_ALTFP|O_VERB|O_NONL, " ");
3806 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event);
3807 		if (!(error_event->cached_state & CREDIBLE_EFFECT)) {
3808 			if (error_event->cached_state &
3809 			    (PARENT_WAIT|WAIT_EFFECT)) {
3810 				out(O_ALTFP|O_VERB, " NOT YET triggered");
3811 				continue;
3812 			}
3813 			return_value = FME_DISPROVED;
3814 			out(O_ALTFP|O_VERB, " NOT triggered");
3815 			break;
3816 		} else {
3817 			out(O_ALTFP|O_VERB, " triggered");
3818 		}
3819 	}
3820 	if (return_value == FME_DISPROVED) {
3821 		(void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0);
3822 	} else {
3823 		fault_event->keep_in_tree = 1;
3824 		(void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1);
3825 	}
3826 
3827 	indent();
3828 	out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ",
3829 	    fme_state2str(return_value));
3830 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
3831 	out(O_ALTFP|O_VERB, NULL);
3832 	indent_pop();
3833 	if (return_value == FME_WAIT)
3834 		*pdelay = overall_delay;
3835 	return (return_value);
3836 }
3837 
3838 static enum fme_state
requirements_test(struct fme * fmep,struct event * ep,unsigned long long at_latest_by,unsigned long long * pdelay)3839 requirements_test(struct fme *fmep, struct event *ep,
3840     unsigned long long at_latest_by, unsigned long long *pdelay)
3841 {
3842 	int waiting_events;
3843 	int credible_events;
3844 	int deferred_events;
3845 	enum fme_state return_value = FME_CREDIBLE;
3846 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
3847 	unsigned long long arrow_delay;
3848 	unsigned long long my_delay;
3849 	struct event *ep2;
3850 	struct bubble *bp;
3851 	struct arrowlist *ap;
3852 
3853 	if (ep->cached_state & REQMNTS_CREDIBLE) {
3854 		indent();
3855 		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY CREDIBLE ");
3856 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3857 		out(O_ALTFP|O_VERB, NULL);
3858 		return (FME_CREDIBLE);
3859 	}
3860 	if (ep->cached_state & REQMNTS_DISPROVED) {
3861 		indent();
3862 		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY DISPROVED ");
3863 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3864 		out(O_ALTFP|O_VERB, NULL);
3865 		return (FME_DISPROVED);
3866 	}
3867 	if (ep->cached_state & REQMNTS_WAIT) {
3868 		indent();
3869 		*pdelay = ep->cached_delay;
3870 		out(O_ALTFP|O_VERB|O_NONL, "  REQMNTS ALREADY WAIT ");
3871 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3872 		out(O_ALTFP|O_VERB|O_NONL, ", wait for: ");
3873 		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
3874 		out(O_ALTFP|O_VERB, NULL);
3875 		return (FME_WAIT);
3876 	}
3877 	stats_counter_bump(fmep->Rcallcount);
3878 	indent_push("  R");
3879 	indent();
3880 	out(O_ALTFP|O_VERB|O_NONL, "->");
3881 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3882 	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
3883 	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
3884 	out(O_ALTFP|O_VERB, NULL);
3885 
3886 	if (ep->t == N_EREPORT) {
3887 		if (ep->count == 0) {
3888 			if (fmep->pull >= at_latest_by) {
3889 				return_value = FME_DISPROVED;
3890 			} else {
3891 				ep->cached_delay = *pdelay = at_latest_by;
3892 				return_value = FME_WAIT;
3893 			}
3894 		}
3895 
3896 		indent();
3897 		switch (return_value) {
3898 		case FME_CREDIBLE:
3899 			ep->cached_state |= REQMNTS_CREDIBLE;
3900 			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE ");
3901 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3902 			break;
3903 		case FME_DISPROVED:
3904 			ep->cached_state |= REQMNTS_DISPROVED;
3905 			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED ");
3906 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3907 			break;
3908 		case FME_WAIT:
3909 			ep->cached_state |= REQMNTS_WAIT;
3910 			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT ");
3911 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
3912 			out(O_ALTFP|O_VERB|O_NONL, " to ");
3913 			ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
3914 			break;
3915 		default:
3916 			out(O_DIE, "requirements_test: unexpected fme_state");
3917 			break;
3918 		}
3919 		out(O_ALTFP|O_VERB, NULL);
3920 		indent_pop();
3921 
3922 		return (return_value);
3923 	}
3924 
3925 	/* this event is not a report, descend the tree */
3926 	for (bp = itree_next_bubble(ep, NULL); bp;
3927 	    bp = itree_next_bubble(ep, bp)) {
3928 		int n;
3929 
3930 		if (bp->t != B_FROM)
3931 			continue;
3932 
3933 		n = bp->nork;
3934 
3935 		credible_events = 0;
3936 		waiting_events = 0;
3937 		deferred_events = 0;
3938 		arrow_delay = TIMEVAL_EVENTUALLY;
3939 		/*
3940 		 * n is -1 for 'A' so adjust it.
3941 		 * XXX just count up the arrows for now.
3942 		 */
3943 		if (n < 0) {
3944 			n = 0;
3945 			for (ap = itree_next_arrow(bp, NULL); ap;
3946 			    ap = itree_next_arrow(bp, ap))
3947 				n++;
3948 			indent();
3949 			out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n);
3950 		} else {
3951 			indent();
3952 			out(O_ALTFP|O_VERB, " Bubble N=%d", n);
3953 		}
3954 
3955 		if (n == 0)
3956 			continue;
3957 		if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) {
3958 			for (ap = itree_next_arrow(bp, NULL); ap;
3959 			    ap = itree_next_arrow(bp, ap)) {
3960 				ep2 = ap->arrowp->head->myevent;
3961 				platform_set_payloadnvp(ep2->nvp);
3962 				(void) checkconstraints(fmep, ap->arrowp);
3963 				if (!ap->arrowp->forever_false) {
3964 					/*
3965 					 * if all arrows are invalidated by the
3966 					 * constraints, then we should elide the
3967 					 * whole bubble to be consistant with
3968 					 * the tree creation time behaviour
3969 					 */
3970 					bp->mark |= BUBBLE_OK;
3971 					platform_set_payloadnvp(NULL);
3972 					break;
3973 				}
3974 				platform_set_payloadnvp(NULL);
3975 			}
3976 		}
3977 		for (ap = itree_next_arrow(bp, NULL); ap;
3978 		    ap = itree_next_arrow(bp, ap)) {
3979 			ep2 = ap->arrowp->head->myevent;
3980 			if (n <= credible_events)
3981 				break;
3982 
3983 			ap->arrowp->mark |= REQMNTS_COUNTER;
3984 			if (triggered(fmep, ep2, REQMNTS_COUNTER))
3985 				/* XXX adding max timevals! */
3986 				switch (requirements_test(fmep, ep2,
3987 				    at_latest_by + ap->arrowp->maxdelay,
3988 				    &my_delay)) {
3989 				case FME_DEFERRED:
3990 					deferred_events++;
3991 					break;
3992 				case FME_CREDIBLE:
3993 					credible_events++;
3994 					break;
3995 				case FME_DISPROVED:
3996 					break;
3997 				case FME_WAIT:
3998 					if (my_delay < arrow_delay)
3999 						arrow_delay = my_delay;
4000 					waiting_events++;
4001 					break;
4002 				default:
4003 					out(O_DIE,
4004 					"Bug in requirements_test.");
4005 				}
4006 			else
4007 				deferred_events++;
4008 		}
4009 		if (!(bp->mark & BUBBLE_OK) && waiting_events == 0) {
4010 			bp->mark |= BUBBLE_ELIDED;
4011 			continue;
4012 		}
4013 		indent();
4014 		out(O_ALTFP|O_VERB, " Credible: %d Waiting %d",
4015 		    credible_events + deferred_events, waiting_events);
4016 		if (credible_events + deferred_events + waiting_events < n) {
4017 			/* Can never meet requirements */
4018 			ep->cached_state |= REQMNTS_DISPROVED;
4019 			indent();
4020 			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED ");
4021 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4022 			out(O_ALTFP|O_VERB, NULL);
4023 			indent_pop();
4024 			return (FME_DISPROVED);
4025 		}
4026 		if (credible_events + deferred_events < n) {
4027 			/* will have to wait */
4028 			/* wait time is shortest known */
4029 			if (arrow_delay < overall_delay)
4030 				overall_delay = arrow_delay;
4031 			return_value = FME_WAIT;
4032 		} else if (credible_events < n) {
4033 			if (return_value != FME_WAIT)
4034 				return_value = FME_DEFERRED;
4035 		}
4036 	}
4037 
4038 	/*
4039 	 * don't mark as FME_DEFERRED. If this event isn't reached by another
4040 	 * path, then this will be considered FME_CREDIBLE. But if it is
4041 	 * reached by a different path so the K-count is met, then might
4042 	 * get overridden by FME_WAIT or FME_DISPROVED.
4043 	 */
4044 	if (return_value == FME_WAIT) {
4045 		ep->cached_state |= REQMNTS_WAIT;
4046 		ep->cached_delay = *pdelay = overall_delay;
4047 	} else if (return_value == FME_CREDIBLE) {
4048 		ep->cached_state |= REQMNTS_CREDIBLE;
4049 	}
4050 	indent();
4051 	out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ",
4052 	    fme_state2str(return_value));
4053 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4054 	out(O_ALTFP|O_VERB, NULL);
4055 	indent_pop();
4056 	return (return_value);
4057 }
4058 
4059 static enum fme_state
causes_test(struct fme * fmep,struct event * ep,unsigned long long at_latest_by,unsigned long long * pdelay)4060 causes_test(struct fme *fmep, struct event *ep,
4061     unsigned long long at_latest_by, unsigned long long *pdelay)
4062 {
4063 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
4064 	unsigned long long my_delay;
4065 	int credible_results = 0;
4066 	int waiting_results = 0;
4067 	enum fme_state fstate;
4068 	struct event *tail_event;
4069 	struct bubble *bp;
4070 	struct arrowlist *ap;
4071 	int k = 1;
4072 
4073 	stats_counter_bump(fmep->Ccallcount);
4074 	indent_push("  C");
4075 	indent();
4076 	out(O_ALTFP|O_VERB|O_NONL, "->");
4077 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4078 	out(O_ALTFP|O_VERB, NULL);
4079 
4080 	for (bp = itree_next_bubble(ep, NULL); bp;
4081 	    bp = itree_next_bubble(ep, bp)) {
4082 		if (bp->t != B_TO)
4083 			continue;
4084 		k = bp->nork;	/* remember the K value */
4085 		for (ap = itree_next_arrow(bp, NULL); ap;
4086 		    ap = itree_next_arrow(bp, ap)) {
4087 			int do_not_follow = 0;
4088 
4089 			/*
4090 			 * if we get to the same event multiple times
4091 			 * only worry about the first one.
4092 			 */
4093 			if (ap->arrowp->tail->myevent->cached_state &
4094 			    CAUSES_TESTED) {
4095 				indent();
4096 				out(O_ALTFP|O_VERB|O_NONL,
4097 				    "  causes test already run for ");
4098 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
4099 				    ap->arrowp->tail->myevent);
4100 				out(O_ALTFP|O_VERB, NULL);
4101 				continue;
4102 			}
4103 
4104 			/*
4105 			 * see if false constraint prevents us
4106 			 * from traversing this arrow
4107 			 */
4108 			platform_set_payloadnvp(ep->nvp);
4109 			if (checkconstraints(fmep, ap->arrowp) == 0)
4110 				do_not_follow = 1;
4111 			platform_set_payloadnvp(NULL);
4112 			if (do_not_follow) {
4113 				indent();
4114 				out(O_ALTFP|O_VERB|O_NONL,
4115 				    "  False arrow from ");
4116 				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
4117 				    ap->arrowp->tail->myevent);
4118 				out(O_ALTFP|O_VERB, NULL);
4119 				continue;
4120 			}
4121 
4122 			ap->arrowp->tail->myevent->cached_state |=
4123 			    CAUSES_TESTED;
4124 			tail_event = ap->arrowp->tail->myevent;
4125 			fstate = hypothesise(fmep, tail_event, at_latest_by,
4126 			    &my_delay);
4127 
4128 			switch (fstate) {
4129 			case FME_WAIT:
4130 				if (my_delay < overall_delay)
4131 					overall_delay = my_delay;
4132 				waiting_results++;
4133 				break;
4134 			case FME_CREDIBLE:
4135 				credible_results++;
4136 				break;
4137 			case FME_DISPROVED:
4138 				break;
4139 			default:
4140 				out(O_DIE, "Bug in causes_test");
4141 			}
4142 		}
4143 	}
4144 	/* compare against K */
4145 	if (credible_results + waiting_results < k) {
4146 		indent();
4147 		out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED ");
4148 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4149 		out(O_ALTFP|O_VERB, NULL);
4150 		indent_pop();
4151 		return (FME_DISPROVED);
4152 	}
4153 	if (waiting_results != 0) {
4154 		*pdelay = overall_delay;
4155 		indent();
4156 		out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT ");
4157 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4158 		out(O_ALTFP|O_VERB|O_NONL, " to ");
4159 		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
4160 		out(O_ALTFP|O_VERB, NULL);
4161 		indent_pop();
4162 		return (FME_WAIT);
4163 	}
4164 	indent();
4165 	out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES CREDIBLE ");
4166 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4167 	out(O_ALTFP|O_VERB, NULL);
4168 	indent_pop();
4169 	return (FME_CREDIBLE);
4170 }
4171 
4172 static enum fme_state
hypothesise(struct fme * fmep,struct event * ep,unsigned long long at_latest_by,unsigned long long * pdelay)4173 hypothesise(struct fme *fmep, struct event *ep,
4174     unsigned long long at_latest_by, unsigned long long *pdelay)
4175 {
4176 	enum fme_state rtr, otr;
4177 	unsigned long long my_delay;
4178 	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
4179 
4180 	stats_counter_bump(fmep->Hcallcount);
4181 	indent_push("  H");
4182 	indent();
4183 	out(O_ALTFP|O_VERB|O_NONL, "->");
4184 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4185 	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
4186 	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
4187 	out(O_ALTFP|O_VERB, NULL);
4188 
4189 	rtr = requirements_test(fmep, ep, at_latest_by, &my_delay);
4190 	if ((rtr == FME_WAIT) && (my_delay < overall_delay))
4191 		overall_delay = my_delay;
4192 	if (rtr != FME_DISPROVED) {
4193 		if (is_problem(ep->t)) {
4194 			otr = effects_test(fmep, ep, at_latest_by, &my_delay);
4195 			if (otr != FME_DISPROVED) {
4196 				if (fmep->peek == 0 && ep->is_suspect == 0) {
4197 					ep->suspects = fmep->suspects;
4198 					ep->is_suspect = 1;
4199 					fmep->suspects = ep;
4200 					fmep->nsuspects++;
4201 				}
4202 			}
4203 		} else
4204 			otr = causes_test(fmep, ep, at_latest_by, &my_delay);
4205 		if ((otr == FME_WAIT) && (my_delay < overall_delay))
4206 			overall_delay = my_delay;
4207 		if ((otr != FME_DISPROVED) &&
4208 		    ((rtr == FME_WAIT) || (otr == FME_WAIT)))
4209 			*pdelay = overall_delay;
4210 	}
4211 	if (rtr == FME_DISPROVED) {
4212 		indent();
4213 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
4214 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4215 		out(O_ALTFP|O_VERB, " (doesn't meet requirements)");
4216 		indent_pop();
4217 		return (FME_DISPROVED);
4218 	}
4219 	if ((otr == FME_DISPROVED) && is_problem(ep->t)) {
4220 		indent();
4221 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
4222 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4223 		out(O_ALTFP|O_VERB, " (doesn't explain all reports)");
4224 		indent_pop();
4225 		return (FME_DISPROVED);
4226 	}
4227 	if (otr == FME_DISPROVED) {
4228 		indent();
4229 		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
4230 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4231 		out(O_ALTFP|O_VERB, " (causes are not credible)");
4232 		indent_pop();
4233 		return (FME_DISPROVED);
4234 	}
4235 	if ((rtr == FME_WAIT) || (otr == FME_WAIT)) {
4236 		indent();
4237 		out(O_ALTFP|O_VERB|O_NONL, "<-WAIT ");
4238 		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4239 		out(O_ALTFP|O_VERB|O_NONL, " to ");
4240 		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay);
4241 		out(O_ALTFP|O_VERB, NULL);
4242 		indent_pop();
4243 		return (FME_WAIT);
4244 	}
4245 	indent();
4246 	out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE ");
4247 	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
4248 	out(O_ALTFP|O_VERB, NULL);
4249 	indent_pop();
4250 	return (FME_CREDIBLE);
4251 }
4252 
4253 /*
4254  * fme_istat_load -- reconstitute any persistent istats
4255  */
4256 void
fme_istat_load(fmd_hdl_t * hdl)4257 fme_istat_load(fmd_hdl_t *hdl)
4258 {
4259 	int sz;
4260 	char *sbuf;
4261 	char *ptr;
4262 
4263 	if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) {
4264 		out(O_ALTFP, "fme_istat_load: No stats");
4265 		return;
4266 	}
4267 
4268 	sbuf = alloca(sz);
4269 
4270 	fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz);
4271 
4272 	/*
4273 	 * pick apart the serialized stats
4274 	 *
4275 	 * format is:
4276 	 *	<class-name>, '@', <path>, '\0', <value>, '\0'
4277 	 * for example:
4278 	 *	"stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0"
4279 	 *
4280 	 * since this is parsing our own serialized data, any parsing issues
4281 	 * are fatal, so we check for them all with ASSERT() below.
4282 	 */
4283 	ptr = sbuf;
4284 	while (ptr < &sbuf[sz]) {
4285 		char *sepptr;
4286 		struct node *np;
4287 		int val;
4288 
4289 		sepptr = strchr(ptr, '@');
4290 		ASSERT(sepptr != NULL);
4291 		*sepptr = '\0';
4292 
4293 		/* construct the event */
4294 		np = newnode(T_EVENT, NULL, 0);
4295 		np->u.event.ename = newnode(T_NAME, NULL, 0);
4296 		np->u.event.ename->u.name.t = N_STAT;
4297 		np->u.event.ename->u.name.s = stable(ptr);
4298 		np->u.event.ename->u.name.it = IT_ENAME;
4299 		np->u.event.ename->u.name.last = np->u.event.ename;
4300 
4301 		ptr = sepptr + 1;
4302 		ASSERT(ptr < &sbuf[sz]);
4303 		ptr += strlen(ptr);
4304 		ptr++;	/* move past the '\0' separating path from value */
4305 		ASSERT(ptr < &sbuf[sz]);
4306 		ASSERT(isdigit(*ptr));
4307 		val = atoi(ptr);
4308 		ASSERT(val > 0);
4309 		ptr += strlen(ptr);
4310 		ptr++;	/* move past the final '\0' for this entry */
4311 
4312 		np->u.event.epname = pathstring2epnamenp(sepptr + 1);
4313 		ASSERT(np->u.event.epname != NULL);
4314 
4315 		istat_bump(np, val);
4316 		tree_free(np);
4317 	}
4318 
4319 	istat_save();
4320 }
4321