/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * fme.c -- fault management exercise module
 *
 * this module provides the simulated fault management exercise.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>
#include <alloca.h>
#include <libnvpair.h>
#include <sys/fm/protocol.h>
#include <fm/fmd_api.h>
#include "alloc.h"
#include "out.h"
#include "stats.h"
#include "stable.h"
#include "literals.h"
#include "lut.h"
#include "tree.h"
#include "ptree.h"
#include "itree.h"
#include "ipath.h"
#include "fme.h"
#include "evnv.h"
#include "eval.h"
#include "config.h"
#include "platform.h"
#include "esclex.h"

/* imported from eft.c... */
extern hrtime_t Hesitate;
extern char *Serd_Override;
extern nv_alloc_t Eft_nv_hdl;
extern int Max_fme;
extern fmd_hdl_t *Hdl;

static int Istat_need_save;
static int Serd_need_save;
void istat_save(void);
void serd_save(void);

/* fme under construction is global so we can free it on module abort */
static struct fme *Nfmep;

static const char *Undiag_reason;

static int Nextid = 0;

static int Open_fme_count = 0;	/* Count of open FMEs */

/* list of fault management exercises underway */
static struct fme {
	struct fme *next;		/* next exercise */
	unsigned long long ull;		/* time when fme was created */
	int id;				/* FME id */
	struct config *config;		/* cooked configuration data */
	struct lut *eventtree;		/* propagation tree for this FME */
	/*
	 * The initial error report that created this FME is kept in
	 * two forms.  e0 points to the instance tree node and is used
	 * by fme_eval() as the starting point for the inference
	 * algorithm.  e0r is the event handle FMD passed to us when
	 * the ereport first arrived and is used when setting timers,
	 * which are always relative to the time of this initial
	 * report.
	 */
	struct event *e0;
	fmd_event_t *e0r;

	id_t timer;			/* for setting an fmd time-out */

	struct event *ecurrent;		/* ereport under consideration */
	struct event *suspects;		/* current suspect list */
	struct event *psuspects;	/* previous suspect list */
	int nsuspects;			/* count of suspects */
	int nonfault;			/* zero if all suspects T_FAULT */
	int posted_suspects;		/* true if we've posted a diagnosis */
	int uniqobs;			/* number of unique events observed */
	int peek;			/* just peeking, don't track suspects */
	int overflow;			/* true if overflow FME */
	enum fme_state {
		FME_NOTHING = 5000,	/* not evaluated yet */
		FME_WAIT,		/* need to wait for more info */
		FME_CREDIBLE,		/* suspect list is credible */
		FME_DISPROVED,		/* no valid suspects found */
		FME_DEFERRED		/* don't know yet (k-count not met) */
	} state;

	unsigned long long pull;	/* time passed since created */
	unsigned long long wull;	/* wait until this time for re-eval */
	struct event *observations;	/* observation list */
	struct lut *globals;		/* values of global variables */
	/* fmd interfacing */
	fmd_hdl_t *hdl;			/* handle for talking with fmd */
	fmd_case_t *fmcase;		/* what fmd 'case' we associate with */
	/* stats */
	struct stats *Rcount;
	struct stats *Hcallcount;
	struct stats *Rcallcount;
	struct stats *Ccallcount;
	struct stats *Ecallcount;
	struct stats *Tcallcount;
	struct stats *Marrowcount;
	struct stats *diags;
} *FMElist, *EFMElist, *ClosedFMEs;

static struct case_list {
	fmd_case_t *fmcase;
	struct case_list *next;
} *Undiagablecaselist;

static void fme_eval(struct fme *fmep, fmd_event_t *ffep);
static enum fme_state hypothesise(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay);
static struct node *eventprop_lookup(struct event *ep, const char *propname);
static struct node *pathstring2epnamenp(char *path);
static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep,
    fmd_case_t *fmcase);
static void restore_suspects(struct fme *fmep);
static void save_suspects(struct fme *fmep);
static void destroy_fme(struct fme *f);
static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
    const char *eventstring, const struct ipath *ipp, nvlist_t *nvl);
static void istat_counter_reset_cb(struct istat_entry *entp,
    struct stats *statp, const struct ipath *ipp);
static void istat_counter_topo_chg_cb(struct istat_entry *entp,
    struct stats *statp, void *unused);
static void serd_reset_cb(struct serd_entry *entp, void *unused,
    const struct ipath *ipp);
static void serd_topo_chg_cb(struct serd_entry *entp, void *unused,
    void *unused2);
static void destroy_fme_bufs(struct fme *fp);

static struct fme *
alloc_fme(void)
{
	struct fme *fmep;

	fmep = MALLOC(sizeof (*fmep));
	bzero(fmep, sizeof (*fmep));
	return (fmep);
}

/*
 * fme_ready -- called when all initialization of the FME (except for
 *	stats) has completed successfully.  Adds the fme to global lists
 *	and establishes its stats.
 */
static struct fme *
fme_ready(struct fme *fmep)
{
	char nbuf[100];

	Nfmep = NULL;	/* don't need to free this on module abort now */

	if (EFMElist) {
		EFMElist->next = fmep;
		EFMElist = fmep;
	} else
		FMElist = EFMElist = fmep;

	(void) sprintf(nbuf, "fme%d.Rcount", fmep->id);
	fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
	(void) sprintf(nbuf, "fme%d.Hcall", fmep->id);
	fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1);
	(void) sprintf(nbuf, "fme%d.Rcall", fmep->id);
	fmep->Rcallcount = stats_new_counter(nbuf,
	    "calls to requirements_test()", 1);
	(void) sprintf(nbuf, "fme%d.Ccall", fmep->id);
	fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1);
	(void) sprintf(nbuf, "fme%d.Ecall", fmep->id);
	fmep->Ecallcount =
	    stats_new_counter(nbuf, "calls to effects_test()", 1);
	(void) sprintf(nbuf, "fme%d.Tcall", fmep->id);
	fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
	(void) sprintf(nbuf, "fme%d.Marrow", fmep->id);
	fmep->Marrowcount = stats_new_counter(nbuf,
	    "arrows marked by mark_arrows()", 1);
	(void) sprintf(nbuf, "fme%d.diags", fmep->id);
	fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);

	out(O_ALTFP|O_VERB2, "newfme: config snapshot contains...");
	config_print(O_ALTFP|O_VERB2, fmep->config);

	return (fmep);
}

extern void ipath_dummy_lut(struct arrow *);
extern struct lut *itree_create_dummy(const char *, const struct ipath *);

/* ARGSUSED */
static void
set_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
{
	struct bubble *bp;
	struct arrowlist *ap;

	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_FROM)
			continue;
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			ap->arrowp->pnode->u.arrow.needed = 1;
			ipath_dummy_lut(ap->arrowp);
		}
	}
}

/* ARGSUSED */
static void
unset_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
{
	struct bubble *bp;
	struct arrowlist *ap;

	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_FROM)
			continue;
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap))
			ap->arrowp->pnode->u.arrow.needed = 0;
	}
}

static void globals_destructor(void *left, void *right, void *arg);
static void clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep);

static void
prune_propagations(const char *e0class, const struct ipath *e0ipp)
{
	char nbuf[100];
	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
	extern struct lut *Usednames;

	Nfmep = alloc_fme();
	Nfmep->id = Nextid;
	Nfmep->state = FME_NOTHING;
	Nfmep->eventtree = itree_create_dummy(e0class, e0ipp);
	if ((Nfmep->e0 =
	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
		out(O_ALTFP, "prune_propagations: e0 not in instance tree");
		itree_free(Nfmep->eventtree);
		FREE(Nfmep);
		Nfmep = NULL;
		return;
	}
	Nfmep->ecurrent = Nfmep->observations = Nfmep->e0;
	Nfmep->e0->count++;

	(void) sprintf(nbuf, "fme%d.Rcount", Nfmep->id);
	Nfmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
	(void) sprintf(nbuf, "fme%d.Hcall", Nfmep->id);
	Nfmep->Hcallcount =
	    stats_new_counter(nbuf, "calls to hypothesise()", 1);
	(void) sprintf(nbuf, "fme%d.Rcall", Nfmep->id);
	Nfmep->Rcallcount = stats_new_counter(nbuf,
	    "calls to requirements_test()", 1);
	(void) sprintf(nbuf, "fme%d.Ccall", Nfmep->id);
	Nfmep->Ccallcount =
	    stats_new_counter(nbuf, "calls to causes_test()", 1);
	(void) sprintf(nbuf, "fme%d.Ecall", Nfmep->id);
	Nfmep->Ecallcount =
	    stats_new_counter(nbuf, "calls to effects_test()", 1);
	(void) sprintf(nbuf, "fme%d.Tcall", Nfmep->id);
	Nfmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
	(void) sprintf(nbuf, "fme%d.Marrow", Nfmep->id);
	Nfmep->Marrowcount = stats_new_counter(nbuf,
	    "arrows marked by mark_arrows()", 1);
	(void) sprintf(nbuf, "fme%d.diags", Nfmep->id);
	Nfmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);

	Nfmep->peek = 1;
	lut_walk(Nfmep->eventtree, (lut_cb)unset_needed_arrows, (void *)Nfmep);
	lut_free(Usednames, NULL, NULL);
	Usednames = NULL;
	lut_walk(Nfmep->eventtree, (lut_cb)clear_arrows, (void *)Nfmep);
	(void) hypothesise(Nfmep, Nfmep->e0, Nfmep->ull, &my_delay);
	itree_prune(Nfmep->eventtree);
	lut_walk(Nfmep->eventtree, (lut_cb)set_needed_arrows, (void *)Nfmep);

	stats_delete(Nfmep->Rcount);
	stats_delete(Nfmep->Hcallcount);
	stats_delete(Nfmep->Rcallcount);
	stats_delete(Nfmep->Ccallcount);
	stats_delete(Nfmep->Ecallcount);
	stats_delete(Nfmep->Tcallcount);
	stats_delete(Nfmep->Marrowcount);
	stats_delete(Nfmep->diags);
	itree_free(Nfmep->eventtree);
	lut_free(Nfmep->globals, globals_destructor, NULL);
	FREE(Nfmep);
}

static struct fme *
newfme(const char *e0class, const struct ipath *e0ipp, fmd_hdl_t *hdl,
    fmd_case_t *fmcase)
{
	struct cfgdata *cfgdata;
	int init_size;
	extern int alloc_total();

	init_size = alloc_total();
	out(O_ALTFP|O_STAMP, "start config_snapshot using %d bytes", init_size);
	if ((cfgdata = config_snapshot()) == NULL) {
		out(O_ALTFP, "newfme: NULL configuration");
		Undiag_reason = UD_NOCONF;
		return (NULL);
	}
	platform_save_config(hdl, fmcase);
	out(O_ALTFP|O_STAMP, "config_snapshot added %d bytes",
	    alloc_total() - init_size);

	Nfmep = alloc_fme();

	Nfmep->id = Nextid++;
	Nfmep->config = cfgdata->cooked;
	config_free(cfgdata);
	Nfmep->posted_suspects = 0;
	Nfmep->uniqobs = 0;
	Nfmep->state = FME_NOTHING;
	Nfmep->pull = 0ULL;
	Nfmep->overflow = 0;

	Nfmep->fmcase = fmcase;
	Nfmep->hdl = hdl;

	if ((Nfmep->eventtree = itree_create(Nfmep->config)) == NULL) {
		out(O_ALTFP, "newfme: NULL instance tree");
		Undiag_reason = UD_INSTFAIL;
		structconfig_free(Nfmep->config);
		destroy_fme_bufs(Nfmep);
		FREE(Nfmep);
		Nfmep = NULL;
		return (NULL);
	}

	itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree);

	if ((Nfmep->e0 =
	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
		out(O_ALTFP, "newfme: e0 not in instance tree");
		Undiag_reason = UD_BADEVENTI;
		itree_free(Nfmep->eventtree);
		structconfig_free(Nfmep->config);
		destroy_fme_bufs(Nfmep);
		FREE(Nfmep);
		Nfmep = NULL;
		return (NULL);
	}

	return (fme_ready(Nfmep));
}

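/*
 * fme_fini -- free all module-global fme state when the module is torn
 *	down: the list of undiagnosable cases, closed and open exercises,
 *	and any fme still under construction.
 */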
void
fme_fini(void)
{
	struct fme *sfp, *fp;
	struct case_list *ucasep, *nextcasep;

	ucasep = Undiagablecaselist;
	while (ucasep != NULL) {
		nextcasep = ucasep->next;
		FREE(ucasep);
		ucasep = nextcasep;
	}
	Undiagablecaselist = NULL;

	/* clean up closed fmes */
	fp = ClosedFMEs;
	while (fp != NULL) {
		sfp = fp->next;
		destroy_fme(fp);
		fp = sfp;
	}
	ClosedFMEs = NULL;

	fp = FMElist;
	while (fp != NULL) {
		sfp = fp->next;
		destroy_fme(fp);
		fp = sfp;
	}
	FMElist = EFMElist = NULL;

	/* if we were in the middle of creating an fme, free it now */
	if (Nfmep) {
		destroy_fme(Nfmep);
		Nfmep = NULL;
	}
}

/*
 * Allocated space for a buffer name.  20 bytes allows for
 * a ridiculous 9,999,999 unique observations.
 */
#define	OBBUFNMSZ 20

/*
 * serialize_observation
 *
 * Create a recoverable version of the current observation
 * (fp->ecurrent).  We keep a serialized version of each unique
 * observation so that we can resume the fme in the correct state
 * if eft or fmd crashes and we're restarted.
 */
static void
serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp)
{
	size_t pkdlen;
	char tmpbuf[OBBUFNMSZ];
	char *pkd = NULL;
	char *estr;

	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs);
	estr = ipath2str(cls, ipp);
	fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1);
	fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr,
	    strlen(estr) + 1);
	FREE(estr);

	if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) {
		(void) snprintf(tmpbuf,
		    OBBUFNMSZ, "observed%d.nvp", fp->uniqobs);
		if (nvlist_xpack(fp->ecurrent->nvp,
		    &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0)
			out(O_DIE|O_SYS, "pack of observed nvl failed");
		fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen);
		fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen);
		FREE(pkd);
	}

	fp->uniqobs++;
	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
	    sizeof (fp->uniqobs));
}

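/*
 * The per-case state kept by serialize_observation() above and
 * init_fme_bufs() below consists of the fixed WOBUF_* buffers (pull
 * time, id, observation count, posted status) plus, for each unique
 * observation, an "observed%d" buffer holding the "class@path" string
 * and an optional "observed%d.nvp" buffer holding the packed ereport
 * nvlist.  destroy_fme_bufs() and reconstitute_observations() walk the
 * same naming scheme.
 */
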
/*
 * init_fme_bufs -- We keep several bits of state about an fme for
 *	use if eft or fmd crashes and we're restarted.
 */
static void
init_fme_bufs(struct fme *fp)
{
	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull));
	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull,
	    sizeof (fp->pull));

	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id));
	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id,
	    sizeof (fp->id));

	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs));
	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
	    sizeof (fp->uniqobs));

	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD,
	    sizeof (fp->posted_suspects));
	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD,
	    (void *)&fp->posted_suspects, sizeof (fp->posted_suspects));
}

static void
destroy_fme_bufs(struct fme *fp)
{
	char tmpbuf[OBBUFNMSZ];
	int o;

	platform_restore_config(fp->hdl, fp->fmcase);
	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN);
	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG);
	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL);
	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID);
	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD);
	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS);

	for (o = 0; o < fp->uniqobs; o++) {
		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o);
		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o);
		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
	}
}

/*
 * reconstitute_observations -- convert a case's serialized observations
 *	back into struct events.  Returns zero if all observations are
 *	successfully reconstituted.
 */
static int
reconstitute_observations(struct fme *fmep)
{
	struct event *ep;
	struct node *epnamenp = NULL;
	size_t pkdlen;
	char *pkd = NULL;
	char *tmpbuf = alloca(OBBUFNMSZ);
	char *sepptr;
	char *estr;
	int ocnt;
	int elen;

	for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) {
		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt);
		elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
		if (elen == 0) {
			out(O_ALTFP,
			    "reconstitute_observation: no %s buffer found.",
			    tmpbuf);
			Undiag_reason = UD_MISSINGOBS;
			break;
		}

		estr = MALLOC(elen);
		fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
		sepptr = strchr(estr, '@');
		if (sepptr == NULL) {
			out(O_ALTFP,
			    "reconstitute_observation: %s: "
			    "missing @ separator in %s.",
			    tmpbuf, estr);
			Undiag_reason = UD_MISSINGPATH;
			FREE(estr);
			break;
		}

		*sepptr = '\0';
		if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
			out(O_ALTFP,
			    "reconstitute_observation: %s: "
			    "trouble converting path string \"%s\" "
			    "to internal representation.",
			    tmpbuf, sepptr + 1);
			Undiag_reason = UD_MISSINGPATH;
			FREE(estr);
			break;
		}

		/* construct the event */
		ep = itree_lookup(fmep->eventtree,
		    stable(estr), ipath(epnamenp));
		if (ep == NULL) {
			out(O_ALTFP,
			    "reconstitute_observation: %s: "
			    "lookup of \"%s\" in itree failed.",
			    tmpbuf, ipath2str(estr, ipath(epnamenp)));
			Undiag_reason = UD_BADOBS;
			tree_free(epnamenp);
			FREE(estr);
			break;
		}
		tree_free(epnamenp);

		/*
		 * We may or may not have a saved nvlist for the observation
		 */
		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt);
		pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase,
		    tmpbuf);
		if (pkdlen != 0) {
			pkd = MALLOC(pkdlen);
			fmd_buf_read(fmep->hdl,
			    fmep->fmcase, tmpbuf, pkd, pkdlen);
			ASSERT(ep->nvp == NULL);
			if (nvlist_xunpack(pkd,
			    pkdlen, &ep->nvp, &Eft_nv_hdl) != 0)
				out(O_DIE|O_SYS,
				    "unpack of observed nvl failed");
			FREE(pkd);
		}

		if (ocnt == 0)
			fmep->e0 = ep;

		FREE(estr);
		fmep->ecurrent = ep;
		ep->count++;

		/* link it into list of observations seen */
		ep->observations = fmep->observations;
		fmep->observations = ep;
	}

	if (ocnt == fmep->uniqobs) {
		(void) fme_ready(fmep);
		return (0);
	}

	return (1);
}

/*
 * fme_restart -- called during eft initialization.  Reconstitutes
 *	an in-progress fme.
 */
void
fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress)
{
	nvlist_t *defect;
	struct case_list *bad;
	struct fme *fmep;
	struct cfgdata *cfgdata;
	size_t rawsz;
	struct event *ep;
	char *tmpbuf = alloca(OBBUFNMSZ);
	char *sepptr;
	char *estr;
	int elen;
	struct node *epnamenp = NULL;
	int init_size;
	extern int alloc_total();

	/*
	 * ignore solved or closed cases
	 */
	if (fmd_case_solved(hdl, inprogress) ||
	    fmd_case_closed(hdl, inprogress))
		return;

	fmep = alloc_fme();
	fmep->fmcase = inprogress;
	fmep->hdl = hdl;

	if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) {
		out(O_ALTFP, "restart_fme: no saved posted status");
		Undiag_reason = UD_MISSINGINFO;
		goto badcase;
	} else {
		fmd_buf_read(hdl, inprogress, WOBUF_POSTD,
		    (void *)&fmep->posted_suspects,
		    sizeof (fmep->posted_suspects));
	}

	if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) {
		out(O_ALTFP, "restart_fme: no saved id");
		Undiag_reason = UD_MISSINGINFO;
		goto badcase;
	} else {
		fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id,
		    sizeof (fmep->id));
	}
	if (Nextid <= fmep->id)
		Nextid = fmep->id + 1;

	out(O_ALTFP, "Replay FME %d", fmep->id);

	if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) {
		out(O_ALTFP, "restart_fme: No config data");
		Undiag_reason = UD_MISSINGINFO;
		goto badcase;
	}
	fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz,
	    sizeof (size_t));

	if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) {
		out(O_ALTFP, "restart_fme: No event zero");
		Undiag_reason = UD_MISSINGZERO;
		goto badcase;
	}

	if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) {
		out(O_ALTFP, "restart_fme: no saved wait time");
		Undiag_reason = UD_MISSINGINFO;
		goto badcase;
	} else {
		fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull,
		    sizeof (fmep->pull));
	}

	if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) {
		out(O_ALTFP, "restart_fme: no count of observations");
		Undiag_reason = UD_MISSINGINFO;
		goto badcase;
	} else {
		fmd_buf_read(hdl, inprogress, WOBUF_NOBS,
		    (void *)&fmep->uniqobs, sizeof (fmep->uniqobs));
	}

	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed0");
	elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
	if (elen == 0) {
		out(O_ALTFP, "reconstitute_observation: no %s buffer found.",
		    tmpbuf);
		Undiag_reason = UD_MISSINGOBS;
		goto badcase;
	}
	estr = MALLOC(elen);
	fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
	sepptr = strchr(estr, '@');
	if (sepptr == NULL) {
		out(O_ALTFP, "reconstitute_observation: %s: "
		    "missing @ separator in %s.",
		    tmpbuf, estr);
		Undiag_reason = UD_MISSINGPATH;
		FREE(estr);
		goto badcase;
	}
	*sepptr = '\0';
	if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
		out(O_ALTFP, "reconstitute_observation: %s: "
		    "trouble converting path string \"%s\" "
		    "to internal representation.", tmpbuf, sepptr + 1);
		Undiag_reason = UD_MISSINGPATH;
		FREE(estr);
		goto badcase;
	}
	prune_propagations(stable(estr), ipath(epnamenp));
	tree_free(epnamenp);
	FREE(estr);

	init_size = alloc_total();
	out(O_ALTFP|O_STAMP, "start config_restore using %d bytes", init_size);
	cfgdata = MALLOC(sizeof (struct cfgdata));
	cfgdata->cooked = NULL;
	cfgdata->devcache = NULL;
	cfgdata->devidcache = NULL;
	cfgdata->cpucache = NULL;
	cfgdata->raw_refcnt = 1;

	if (rawsz > 0) {
		if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) {
			out(O_ALTFP, "restart_fme: Config data size mismatch");
			Undiag_reason = UD_CFGMISMATCH;
			goto badcase;
		}
		cfgdata->begin = MALLOC(rawsz);
		cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz;
		fmd_buf_read(hdl,
		    inprogress, WOBUF_CFG, cfgdata->begin, rawsz);
	} else {
		cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL;
	}

	config_cook(cfgdata);
	fmep->config = cfgdata->cooked;
	config_free(cfgdata);
	out(O_ALTFP|O_STAMP, "config_restore added %d bytes",
	    alloc_total() - init_size);

	if ((fmep->eventtree = itree_create(fmep->config)) == NULL) {
		/* case not properly saved or irretrievable */
		out(O_ALTFP, "restart_fme: NULL instance tree");
		Undiag_reason = UD_INSTFAIL;
		goto badcase;
	}

	itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree);

	if (reconstitute_observations(fmep) != 0)
		goto badcase;

	out(O_ALTFP|O_NONL, "FME %d replay observations: ", fmep->id);
	for (ep = fmep->observations; ep; ep = ep->observations) {
		out(O_ALTFP|O_NONL, " ");
		itree_pevent_brief(O_ALTFP|O_NONL, ep);
	}
	out(O_ALTFP, NULL);

	Open_fme_count++;

	/* give the diagnosis algorithm a shot at the new FME state */
	fme_eval(fmep, fmep->e0r);
	return;

badcase:
	if (fmep->eventtree != NULL)
		itree_free(fmep->eventtree);
	if (fmep->config)
		structconfig_free(fmep->config);
	destroy_fme_bufs(fmep);
	FREE(fmep);

	/*
	 * Since we're unable to restart the case, add it to the undiagable
	 * list and solve and close it as appropriate.
	 */
	bad = MALLOC(sizeof (struct case_list));
	bad->next = NULL;

	if (Undiagablecaselist != NULL)
		bad->next = Undiagablecaselist;
	Undiagablecaselist = bad;
	bad->fmcase = inprogress;

	out(O_ALTFP|O_NONL, "[case %s (unable to restart), ",
	    fmd_case_uuid(hdl, bad->fmcase));

	if (fmd_case_solved(hdl, bad->fmcase)) {
		out(O_ALTFP|O_NONL, "already solved, ");
	} else {
		out(O_ALTFP|O_NONL, "solving, ");
		defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
		    NULL, NULL, NULL);
		if (Undiag_reason != NULL)
			(void) nvlist_add_string(defect,
			    UNDIAG_REASON, Undiag_reason);
		fmd_case_add_suspect(hdl, bad->fmcase, defect);
		fmd_case_solve(hdl, bad->fmcase);
	}

	if (fmd_case_closed(hdl, bad->fmcase)) {
		out(O_ALTFP, "already closed ]");
	} else {
		out(O_ALTFP, "closing ]");
		fmd_case_close(hdl, bad->fmcase);
	}
}

/*ARGSUSED*/
static void
globals_destructor(void *left, void *right, void *arg)
{
	struct evalue *evp = (struct evalue *)right;
	if (evp->t == NODEPTR)
		tree_free((struct node *)(uintptr_t)evp->v);
	evp->v = (uintptr_t)NULL;
	FREE(evp);
}

void
destroy_fme(struct fme *f)
{
	stats_delete(f->Rcount);
	stats_delete(f->Hcallcount);
	stats_delete(f->Rcallcount);
	stats_delete(f->Ccallcount);
	stats_delete(f->Ecallcount);
	stats_delete(f->Tcallcount);
	stats_delete(f->Marrowcount);
	stats_delete(f->diags);

	if (f->eventtree != NULL)
		itree_free(f->eventtree);
	if (f->config)
		structconfig_free(f->config);
	lut_free(f->globals, globals_destructor, NULL);
	FREE(f);
}

static const char *
fme_state2str(enum fme_state s)
{
	switch (s) {
	case FME_NOTHING:	return ("NOTHING");
	case FME_WAIT:		return ("WAIT");
	case FME_CREDIBLE:	return ("CREDIBLE");
	case FME_DISPROVED:	return ("DISPROVED");
	case FME_DEFERRED:	return ("DEFERRED");
	default:		return ("UNKNOWN");
	}
}

static int
is_problem(enum nametype t)
{
	return (t == N_FAULT || t == N_DEFECT || t == N_UPSET);
}

static int
is_fault(enum nametype t)
{
	return (t == N_FAULT);
}

static int
is_defect(enum nametype t)
{
	return (t == N_DEFECT);
}

static int
is_upset(enum nametype t)
{
	return (t == N_UPSET);
}

static void
fme_print(int flags, struct fme *fmep)
{
	struct event *ep;

	out(flags, "Fault Management Exercise %d", fmep->id);
	out(flags, "\t State: %s", fme_state2str(fmep->state));
	out(flags|O_NONL, "\t Start time: ");
	ptree_timeval(flags|O_NONL, &fmep->ull);
	out(flags, NULL);
	if (fmep->wull) {
		out(flags|O_NONL, "\t Wait time: ");
		ptree_timeval(flags|O_NONL, &fmep->wull);
		out(flags, NULL);
	}
	out(flags|O_NONL, "\t E0: ");
	if (fmep->e0)
		itree_pevent_brief(flags|O_NONL, fmep->e0);
	else
		out(flags|O_NONL, "NULL");
	out(flags, NULL);
	out(flags|O_NONL, "\tObservations:");
	for (ep = fmep->observations; ep; ep = ep->observations) {
		out(flags|O_NONL, " ");
		itree_pevent_brief(flags|O_NONL, ep);
	}
	out(flags, NULL);
	out(flags|O_NONL, "\tSuspect list:");
	for (ep = fmep->suspects; ep; ep = ep->suspects) {
		out(flags|O_NONL, " ");
		itree_pevent_brief(flags|O_NONL, ep);
	}
	out(flags, NULL);
	if (fmep->eventtree != NULL) {
		out(flags|O_VERB2, "\t Tree:");
		itree_ptree(flags|O_VERB2, fmep->eventtree);
	}
}

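/*
 * pathstring2epnamenp -- convert a component path string (components
 *	separated by '/', as produced by ipath2str() and stored by
 *	serialize_observation()) back into a chain of instanced name
 *	nodes suitable for ipath().
 */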
static struct node *
pathstring2epnamenp(char *path)
{
	char *sep = "/";
	struct node *ret;
	char *ptr;

	if ((ptr = strtok(path, sep)) == NULL)
		out(O_DIE, "pathstring2epnamenp: invalid empty class");

	ret = tree_iname(stable(ptr), NULL, 0);

	while ((ptr = strtok(NULL, sep)) != NULL)
		ret = tree_name_append(ret,
		    tree_iname(stable(ptr), NULL, 0));

	return (ret);
}

/*
 * for a given upset sp, increment the corresponding SERD engine.  if the
 * SERD engine trips, return the ename and ipp of the resulting ereport.
 * returns true if engine tripped and *enamep and *ippp were filled in.
 */
static int
serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep,
    fmd_case_t *fmcase, struct event *sp, const char **enamep,
    const struct ipath **ippp)
{
	struct node *serdinst;
	char *serdname;
	char *serdresource;
	struct node *nid;
	struct serd_entry *newentp;
	int i, serdn = -1, serdincrement = 1, len = 0;
	char *serdsuffix = NULL, *serdt = NULL, *ptr;
	struct evalue *ep;

	ASSERT(sp->t == N_UPSET);
	ASSERT(ffep != NULL);

	if ((ep = (struct evalue *)lut_lookup(sp->serdprops,
	    (void *)"n", (lut_cmp)strcmp)) != NULL) {
		ASSERT(ep->t == UINT64);
		serdn = (int)ep->v;
	}
	if ((ep = (struct evalue *)lut_lookup(sp->serdprops,
	    (void *)"t", (lut_cmp)strcmp)) != NULL) {
		ASSERT(ep->t == STRING);
		serdt = (char *)(uintptr_t)ep->v;
	}
	if ((ep = (struct evalue *)lut_lookup(sp->serdprops,
	    (void *)"suffix", (lut_cmp)strcmp)) != NULL) {
		ASSERT(ep->t == STRING);
		serdsuffix = (char *)(uintptr_t)ep->v;
	}
	if ((ep = (struct evalue *)lut_lookup(sp->serdprops,
	    (void *)"increment", (lut_cmp)strcmp)) != NULL) {
		ASSERT(ep->t == UINT64);
		serdincrement = (int)ep->v;
	}

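	/*
	 * serdn, serdt, serdsuffix and serdincrement now hold any
	 * per-upset overrides supplied via the "n", "t", "suffix" and
	 * "increment" serdprops; serdn == -1 and serdt == NULL mean no
	 * override, and serdincrement defaults to 1.
	 */
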
	/*
	 * obtain instanced SERD engine from the upset sp.  from this
	 * derive serdname, the string used to identify the SERD engine.
	 */
	serdinst = eventprop_lookup(sp, L_engine);

	if (serdinst == NULL)
		return (-1);

	serdname = ipath2str(serdinst->u.stmt.np->u.event.ename->u.name.s,
	    NULL);
	serdresource = ipath2str(NULL,
	    ipath(serdinst->u.stmt.np->u.event.epname));

	len = strlen(serdname) + strlen(serdresource) + 2;
	if (serdsuffix != NULL)
		len += strlen(serdsuffix);

	ptr = MALLOC(len);
	if (serdsuffix != NULL) {
		(void) snprintf(ptr, len, "%s%s@%s", serdname, serdsuffix,
		    serdresource);
	} else {
		(void) snprintf(ptr, len, "%s@%s", serdname, serdresource);
	}
	FREE(serdname);
	FREE(serdresource);
	serdname = ptr;

	/* handle serd engine "id" property, if there is one */
	if ((nid =
	    lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) {
		struct evalue *gval;
		char suffixbuf[200];
		char *suffix;
		char *nserdname;
		size_t nname;

		out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname);
		ptree_name_iter(O_ALTFP|O_NONL, nid);

		ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t));

		if ((gval = lut_lookup(fmep->globals,
		    (void *)nid->u.globid.s, NULL)) == NULL) {
			out(O_ALTFP, " undefined");
		} else if (gval->t == UINT64) {
			out(O_ALTFP, " %llu", gval->v);
			(void) sprintf(suffixbuf, "%llu", gval->v);
			suffix = suffixbuf;
		} else {
			out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v);
			suffix = (char *)(uintptr_t)gval->v;
		}

		nname = strlen(serdname) + strlen(suffix) + 2;
		nserdname = MALLOC(nname);
		(void) snprintf(nserdname, nname, "%s:%s", serdname, suffix);
		FREE(serdname);
		serdname = nserdname;
	}

	/*
	 * if the engine is empty, and we have an override for n/t then
	 * destroy and recreate it.
	 */
	if ((serdn != -1 || serdt != NULL) && fmd_serd_exists(hdl, serdname) &&
	    fmd_serd_empty(hdl, serdname))
		fmd_serd_destroy(hdl, serdname);

	if (!fmd_serd_exists(hdl, serdname)) {
		struct node *nN, *nT;
		const char *s;
		struct node *nodep;
		struct config *cp;
		char *path;
		uint_t nval;
		hrtime_t tval;
		const char *name;
		char *tptr;
		char *serd_name;
		int i;
		int tmplen;
		char *ptr;
		int got_n_override = 0, got_t_override = 0;

		/* no SERD engine yet, so create it */
		nodep = serdinst->u.stmt.np->u.event.epname;
		tmplen = strlen(serdinst->u.stmt.np->u.event.ename->u.name.s)
		    + 2;
		if (serdsuffix != NULL)
			tmplen += strlen(serdsuffix);
		tptr = MALLOC(tmplen);
		if (serdsuffix != NULL) {
			(void) snprintf(tptr, len, "%s%s",
			    serdinst->u.stmt.np->u.event.ename->u.name.s,
			    serdsuffix);
		} else {
			(void) snprintf(tptr, len, "%s",
			    serdinst->u.stmt.np->u.event.ename->u.name.s);
		}
		name = (const char *)tptr;
		path = ipath2str(NULL, ipath(nodep));
		cp = config_lookup(fmep->config, path, 0);
		FREE((void *)path);

		/*
		 * We allow serd parameters to be overridden, either from
		 * eft.conf file values (if Serd_Override is set) or from
		 * driver properties (for "serd.io.device" engines).
		 */
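		/*
		 * The parsing below expects Serd_Override to be a
		 * space-separated list of "ename,N,T" entries, e.g.
		 * "serd.io.device,5,1h" (the values in this example are
		 * illustrative only).
		 */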
		if (Serd_Override != NULL) {
			char *save_ptr, *ptr1, *ptr2, *ptr3;
			ptr3 = save_ptr = STRDUP(Serd_Override);
			while (*ptr3 != '\0') {
				ptr1 = strchr(ptr3, ',');
				*ptr1 = '\0';
				if (strcmp(ptr3, name) == 0) {
					ptr2 = strchr(ptr1 + 1, ',');
					*ptr2 = '\0';
					nval = atoi(ptr1 + 1);
					out(O_ALTFP, "serd override %s_n %d",
					    name, nval);
					ptr3 = strchr(ptr2 + 1, ' ');
					if (ptr3)
						*ptr3 = '\0';
					ptr = STRDUP(ptr2 + 1);
					out(O_ALTFP, "serd override %s_t %s",
					    name, ptr);
					got_n_override = 1;
					got_t_override = 1;
					break;
				} else {
					ptr2 = strchr(ptr1 + 1, ',');
					ptr3 = strchr(ptr2 + 1, ' ');
					if (ptr3 == NULL)
						break;
				}
				ptr3++;
			}
			FREE(save_ptr);
		}

		if (cp && got_n_override == 0) {
			/*
			 * convert serd engine name into property name
			 */
			serd_name = MALLOC(strlen(name) + 3);
			for (i = 0; i < strlen(name); i++) {
				if (name[i] == '.')
					serd_name[i] = '_';
				else
					serd_name[i] = name[i];
			}
			serd_name[i++] = '_';
			serd_name[i++] = 'n';
			serd_name[i] = '\0';
			if (s = config_getprop(cp, serd_name)) {
				nval = atoi(s);
				out(O_ALTFP, "serd override %s_n %s", name, s);
				got_n_override = 1;
			}
			serd_name[i - 1] = 't';
			if (s = config_getprop(cp, serd_name)) {
				ptr = STRDUP(s);
				out(O_ALTFP, "serd override %s_t %s", name, s);
				got_t_override = 1;
			}
			FREE(serd_name);
		}

		if (serdn != -1 && got_n_override == 0) {
			nval = serdn;
			out(O_ALTFP, "serd override %s_n %d", name, serdn);
			got_n_override = 1;
		}
		if (serdt != NULL && got_t_override == 0) {
			ptr = STRDUP(serdt);
			out(O_ALTFP, "serd override %s_t %s", name, serdt);
			got_t_override = 1;
		}

		if (!got_n_override) {
			nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N,
			    NULL);
			ASSERT(nN->t == T_NUM);
			nval = (uint_t)nN->u.ull;
		}
		if (!got_t_override) {
			nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T,
			    NULL);
			ASSERT(nT->t == T_TIMEVAL);
			tval = (hrtime_t)nT->u.ull;
		} else {
			const unsigned long long *ullp;
			const char *suffix;
			int len;

			len = strspn(ptr, "0123456789");
			suffix = stable(&ptr[len]);
			ullp = (unsigned long long *)lut_lookup(Timesuffixlut,
			    (void *)suffix, NULL);
			ptr[len] = '\0';
			tval = strtoull(ptr, NULL, 0) * (ullp ? *ullp : 1ll);
			FREE(ptr);
		}
		fmd_serd_create(hdl, serdname, nval, tval);
		FREE(tptr);
	}

	newentp = MALLOC(sizeof (*newentp));
	newentp->ename = stable(serdinst->u.stmt.np->u.event.ename->u.name.s);
	newentp->ipath = ipath(serdinst->u.stmt.np->u.event.epname);
	newentp->hdl = hdl;
	if (lut_lookup(SerdEngines, newentp, (lut_cmp)serd_cmp) == NULL) {
		SerdEngines = lut_add(SerdEngines, (void *)newentp,
		    (void *)newentp, (lut_cmp)serd_cmp);
		Serd_need_save = 1;
		serd_save();
	} else {
		FREE(newentp);
	}

	/*
	 * increment SERD engine.  if engine fires, reset serd
	 * engine and return trip_strcode if required.
	 */
	for (i = 0; i < serdincrement; i++) {
		if (fmd_serd_record(hdl, serdname, ffep)) {
			fmd_case_add_serd(hdl, fmcase, serdname);
			fmd_serd_reset(hdl, serdname);

			if (ippp) {
				struct node *tripinst =
				    lut_lookup(serdinst->u.stmt.lutp,
				    (void *)L_trip, NULL);
				ASSERT(tripinst != NULL);
				*enamep = tripinst->u.event.ename->u.name.s;
				*ippp = ipath(tripinst->u.event.epname);
				out(O_ALTFP|O_NONL,
				    "[engine fired: %s, sending: ", serdname);
				ipath_print(O_ALTFP|O_NONL, *enamep, *ippp);
				out(O_ALTFP, "]");
			} else {
				out(O_ALTFP, "[engine fired: %s, no trip]",
				    serdname);
			}
			FREE(serdname);
			return (1);
		}
	}

	FREE(serdname);
	return (0);
}

/*
 * search a suspect list for upsets.  feed each upset to serd_eval() and
 * build up tripped[], an array of ereports produced by the firing of
 * any SERD engines.  then feed each ereport back into
 * fme_receive_report().
 *
 * returns ntrip, the number of these ereports produced.
 */
static int
upsets_eval(struct fme *fmep, fmd_event_t *ffep)
{
	/* we build an array of tripped ereports that we send ourselves */
	struct {
		const char *ename;
		const struct ipath *ipp;
	} *tripped;
	struct event *sp;
	int ntrip, nupset, i;

	/*
	 * count the number of upsets to determine the upper limit on
	 * expected trip ereport strings.  remember that one upset can
	 * lead to at most one ereport.
	 */
	nupset = 0;
	for (sp = fmep->suspects; sp; sp = sp->suspects) {
		if (sp->t == N_UPSET)
			nupset++;
	}

	if (nupset == 0)
		return (0);

	/*
	 * get to this point if we have upsets and expect some trip
	 * ereports
	 */
	tripped = alloca(sizeof (*tripped) * nupset);
	bzero((void *)tripped, sizeof (*tripped) * nupset);

	ntrip = 0;
	for (sp = fmep->suspects; sp; sp = sp->suspects)
		if (sp->t == N_UPSET &&
		    serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp,
		    &tripped[ntrip].ename, &tripped[ntrip].ipp) == 1)
			ntrip++;

	for (i = 0; i < ntrip; i++) {
		struct event *ep, *nep;
		struct fme *nfmep;
		fmd_case_t *fmcase;
		const struct ipath *ipp;
		const char *eventstring;
		int prev_verbose;
		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
		enum fme_state state;

		/*
		 * First try and evaluate a case with the trip ereport plus
		 * all the other ereports that cause the trip.  If that fails
		 * to evaluate then try again with just this ereport on its
		 * own.
		 */
		out(O_ALTFP|O_NONL, "fme_receive_report_serd: ");
		ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp);
		out(O_ALTFP|O_STAMP, NULL);
		ep = fmep->e0;
		eventstring = ep->enode->u.event.ename->u.name.s;
		ipp = ep->ipp;
		prune_propagations(eventstring, ipp);

		/*
		 * create a duplicate fme and case
		 */
		fmcase = fmd_case_open(fmep->hdl, NULL);
		out(O_ALTFP|O_NONL, "duplicate fme for event [");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " ]");
		if ((nfmep = newfme(eventstring, ipp, fmep->hdl,
		    fmcase)) == NULL) {
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " CANNOT DIAGNOSE]");
			publish_undiagnosable(fmep->hdl, ffep, fmcase);
			continue;
		}
		Open_fme_count++;
		nfmep->pull = fmep->pull;
		init_fme_bufs(nfmep);
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " created FME%d, case %s]", nfmep->id,
		    fmd_case_uuid(nfmep->hdl, nfmep->fmcase));
		if (ffep) {
			fmd_case_setprincipal(nfmep->hdl, nfmep->fmcase, ffep);
			fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, ffep);
			nfmep->e0r = ffep;
		}

		/*
		 * add the original ereports
		 */
		for (ep = fmep->observations; ep; ep = ep->observations) {
			eventstring = ep->enode->u.event.ename->u.name.s;
			ipp = ep->ipp;
			out(O_ALTFP|O_NONL, "adding event [");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " ]");
			nep = itree_lookup(nfmep->eventtree, eventstring, ipp);
			if (nep->count++ == 0) {
				nep->observations = nfmep->observations;
				nfmep->observations = nep;
				serialize_observation(nfmep, eventstring, ipp);
				nep->nvp = evnv_dupnvl(ep->nvp);
			}
			if (ep->ffep && ep->ffep != ffep)
				fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase,
				    ep->ffep);
			stats_counter_bump(nfmep->Rcount);
		}

		/*
		 * add the serd trigger ereport
		 */
		if ((ep = itree_lookup(nfmep->eventtree, tripped[i].ename,
		    tripped[i].ipp)) == NULL) {
			/*
			 * The trigger ereport is not in the instance tree.  It
			 * was presumably removed by prune_propagations() as
			 * this combination of events is not present in the
			 * rules.
			 */
			out(O_ALTFP, "upsets_eval: e0 not in instance tree");
			Undiag_reason = UD_BADEVENTI;
			goto retry_lone_ereport;
		}
		out(O_ALTFP|O_NONL, "adding event [");
		ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp);
		out(O_ALTFP, " ]");
		nfmep->ecurrent = ep;
		ep->nvp = NULL;
		ep->count = 1;
		ep->observations = nfmep->observations;
		nfmep->observations = ep;

		/*
		 * just peek first.
		 */
		nfmep->peek = 1;
		prev_verbose = Verbose;
		if (Debug == 0)
			Verbose = 0;
		lut_walk(nfmep->eventtree, (lut_cb)clear_arrows, (void *)nfmep);
		state = hypothesise(nfmep, nfmep->e0, nfmep->ull, &my_delay);
		nfmep->peek = 0;
		Verbose = prev_verbose;
		if (state == FME_DISPROVED) {
			out(O_ALTFP, "upsets_eval: hypothesis disproved");
			Undiag_reason = UD_UNSOLVD;
retry_lone_ereport:
			/*
			 * However the trigger ereport on its own might be
			 * diagnosable, so check for that.  Undo the new fme
			 * and case we just created and call
			 * fme_receive_report.
			 */
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, tripped[i].ename,
			    tripped[i].ipp);
			out(O_ALTFP, " retrying with just trigger ereport]");
			itree_free(nfmep->eventtree);
			nfmep->eventtree = NULL;
			structconfig_free(nfmep->config);
			nfmep->config = NULL;
			destroy_fme_bufs(nfmep);
			fmd_case_close(nfmep->hdl, nfmep->fmcase);
			fme_receive_report(fmep->hdl, ffep,
			    tripped[i].ename, tripped[i].ipp, NULL);
			continue;
		}

		/*
		 * and evaluate
		 */
		serialize_observation(nfmep, tripped[i].ename, tripped[i].ipp);
		fme_eval(nfmep, ffep);
	}

	return (ntrip);
}

/*
 * fme_receive_external_report -- call when an external ereport comes in
 *
 * this routine just converts the relevant information from the ereport
 * into a format used internally and passes it on to fme_receive_report().
 */
void
fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
    const char *class)
{
	struct node *epnamenp;
	fmd_case_t *fmcase;
	const struct ipath *ipp;

	class = stable(class);

	/* Get the component path from the ereport */
	epnamenp = platform_getpath(nvl);

	/* See if we ended up without a path. */
	if (epnamenp == NULL) {
		/* See if class permits silent discard on unknown component. */
		if (lut_lookup(Ereportenames_discard, (void *)class, NULL)) {
			out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport "
			    "to component path, but silent discard allowed.",
			    class);
		} else {
			/*
			 * XFILE: Failure to find a component is bad unless
			 * 'discard_if_config_unknown=1' was specified in the
			 * ereport definition.  Indicate undiagnosable.
			 */
			out(O_ALTFP, "XFILE: Unable to map \"%s\" ereport "
			    "to component path.", class);
			Undiag_reason = UD_NOPATH;
			fmcase = fmd_case_open(hdl, NULL);
			publish_undiagnosable(hdl, ffep, fmcase);
		}
		return;
	}

	ipp = ipath(epnamenp);
	tree_free(epnamenp);
	fme_receive_report(hdl, ffep, class, ipp, nvl);
}

/*ARGSUSED*/
void
fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
    const char *eventstring)
{
	char *uuid;
	nvlist_t **nva;
	uint_t nvc;
	const struct ipath *ipp;

	if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 ||
	    nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
	    &nva, &nvc) != 0) {
		out(O_ALTFP, "No uuid or fault list for list.repaired event");
		return;
	}

	out(O_ALTFP, "Processing list.repaired from case %s", uuid);

	while (nvc-- != 0) {
		/*
		 * Reset any istat or serd engine associated with this path.
		 */
		char *path;

		if ((ipp = platform_fault2ipath(*nva++)) == NULL)
			continue;

		path = ipath2str(NULL, ipp);
		out(O_ALTFP, "fme_receive_repair_list: resetting state for %s",
		    path);
		FREE(path);

		lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp);
		istat_save();

		lut_walk(SerdEngines, (lut_cb)serd_reset_cb, (void *)ipp);
		serd_save();
	}
}

/*ARGSUSED*/
void
fme_receive_topology_change(void)
{
	lut_walk(Istats, (lut_cb)istat_counter_topo_chg_cb, NULL);
	istat_save();

	lut_walk(SerdEngines, (lut_cb)serd_topo_chg_cb, NULL);
	serd_save();
}

static int mark_arrows(struct fme *fmep, struct event *ep, int mark,
    unsigned long long at_latest_by, unsigned long long *pdelay, int keep);

/* ARGSUSED */
static void
clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
{
	struct bubble *bp;
	struct arrowlist *ap;

	ep->cached_state = 0;
	ep->keep_in_tree = 0;
	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_FROM)
			continue;
		bp->mark = 0;
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap))
			ap->arrowp->mark = 0;
	}
}

static void
fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
    const char *eventstring, const struct ipath *ipp, nvlist_t *nvl)
{
	struct event *ep;
	struct fme *fmep = NULL;
	struct fme *ofmep = NULL;
	struct fme *cfmep, *svfmep;
	int matched = 0;
	nvlist_t *defect;
	fmd_case_t *fmcase;

	out(O_ALTFP|O_NONL, "fme_receive_report: ");
	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
	out(O_ALTFP|O_STAMP, NULL);

	/* decide which FME it goes to */
	for (fmep = FMElist; fmep; fmep = fmep->next) {
		int prev_verbose;
		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
		enum fme_state state;
		nvlist_t *pre_peek_nvp = NULL;

		if (fmep->overflow) {
			if (!(fmd_case_closed(fmep->hdl, fmep->fmcase)))
				ofmep = fmep;

			continue;
		}

		/*
		 * ignore solved or closed cases
		 */
		if (fmep->posted_suspects ||
		    fmd_case_solved(fmep->hdl, fmep->fmcase) ||
		    fmd_case_closed(fmep->hdl, fmep->fmcase))
			continue;

		/* look up event in event tree for this FME */
		if ((ep = itree_lookup(fmep->eventtree,
		    eventstring, ipp)) == NULL)
			continue;

		/* note observation */
		fmep->ecurrent = ep;
		if (ep->count++ == 0) {
			/* link it into list of observations seen */
			ep->observations = fmep->observations;
			fmep->observations = ep;
			ep->nvp = evnv_dupnvl(nvl);
		} else {
			/* use new payload values for peek */
			pre_peek_nvp = ep->nvp;
			ep->nvp = evnv_dupnvl(nvl);
		}

		/* tell hypothesise() not to mess with suspect list */
		fmep->peek = 1;

		/* don't want this to be verbose (unless Debug is set) */
		prev_verbose = Verbose;
		if (Debug == 0)
			Verbose = 0;

		lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
		state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);

		fmep->peek = 0;

		/* put verbose flag back */
		Verbose = prev_verbose;

		if (state != FME_DISPROVED) {
			/* found an FME that explains the ereport */
			matched++;
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, eventstring,
			    ipp);
			out(O_ALTFP, " explained by FME%d]", fmep->id);

			if (pre_peek_nvp)
				nvlist_free(pre_peek_nvp);

			if (ep->count == 1)
				serialize_observation(fmep, eventstring, ipp);

			if (ffep) {
				fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
				ep->ffep = ffep;
			}

			stats_counter_bump(fmep->Rcount);

			/* re-eval FME */
			fme_eval(fmep, ffep);
		} else {

			/* not a match, undo noting of observation */
			fmep->ecurrent = NULL;
			if (--ep->count == 0) {
				/* unlink it from observations */
				fmep->observations = ep->observations;
				ep->observations = NULL;
				nvlist_free(ep->nvp);
				ep->nvp = NULL;
			} else {
				nvlist_free(ep->nvp);
				ep->nvp = pre_peek_nvp;
			}
		}
	}

	if (matched)
		return;	/* explained by at least one existing FME */

	/* clean up closed fmes */
	cfmep = ClosedFMEs;
	while (cfmep != NULL) {
		svfmep = cfmep->next;
		destroy_fme(cfmep);
		cfmep = svfmep;
	}
	ClosedFMEs = NULL;
	prune_propagations(eventstring, ipp);

	if (ofmep) {
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " ADDING TO OVERFLOW FME]");
		if (ffep)
			fmd_case_add_ereport(hdl, ofmep->fmcase, ffep);

		return;

	} else if (Max_fme && (Open_fme_count >= Max_fme)) {
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " MAX OPEN FME REACHED]");

		fmcase = fmd_case_open(hdl, NULL);

		/* Create overflow fme */
		if ((fmep = newfme(eventstring, ipp, hdl, fmcase)) == NULL) {
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]");
			publish_undiagnosable(hdl, ffep, fmcase);
			return;
		}

		Open_fme_count++;

		init_fme_bufs(fmep);
		fmep->overflow = B_TRUE;

		if (ffep)
			fmd_case_add_ereport(hdl, fmep->fmcase, ffep);

		defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
		    NULL, NULL, NULL);
		(void) nvlist_add_string(defect, UNDIAG_REASON, UD_MAXFME);
		fmd_case_add_suspect(hdl, fmep->fmcase, defect);
		fmd_case_solve(hdl, fmep->fmcase);
		return;
	}

	/* open a case */
	fmcase = fmd_case_open(hdl, NULL);

	/* start a new FME */
	if ((fmep = newfme(eventstring, ipp, hdl, fmcase)) == NULL) {
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " CANNOT DIAGNOSE]");
		publish_undiagnosable(hdl, ffep, fmcase);
		return;
	}

	Open_fme_count++;

	init_fme_bufs(fmep);

	out(O_ALTFP|O_NONL, "[");
	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
	out(O_ALTFP, " created FME%d, case %s]", fmep->id,
	    fmd_case_uuid(hdl, fmep->fmcase));

	ep = fmep->e0;
	ASSERT(ep != NULL);

	/* note observation */
	fmep->ecurrent = ep;
	if (ep->count++ == 0) {
		/* link it into list of observations seen */
		ep->observations = fmep->observations;
		fmep->observations = ep;
		ep->nvp = evnv_dupnvl(nvl);
		serialize_observation(fmep, eventstring, ipp);
	} else {
		/* new payload overrides any previous */
		nvlist_free(ep->nvp);
		ep->nvp = evnv_dupnvl(nvl);
	}

	stats_counter_bump(fmep->Rcount);

	if (ffep) {
		fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
		fmd_case_setprincipal(hdl, fmep->fmcase,
		    ffep);
		fmep->e0r = ffep;
		ep->ffep = ffep;
	}

	/* give the diagnosis algorithm a shot at the new FME state */
	fme_eval(fmep, ffep);
}

void
fme_status(int flags)
{
	struct fme *fmep;

	if (FMElist == NULL) {
		out(flags, "No fault management exercises underway.");
		return;
	}

	for (fmep = FMElist; fmep; fmep = fmep->next)
		fme_print(flags, fmep);
}

/*
 * "indent" routines used mostly for nicely formatted debug output, but also
 * for sanity checking for infinite recursion bugs.
 */

#define	MAX_INDENT 1024
static const char *indent_s[MAX_INDENT];
static int current_indent;

static void
indent_push(const char *s)
{
	if (current_indent < MAX_INDENT)
		indent_s[current_indent++] = s;
	else
		out(O_DIE, "unexpected recursion depth (%d)", current_indent);
}

static void
indent_set(const char *s)
{
	current_indent = 0;
	indent_push(s);
}

static void
indent_pop(void)
{
	if (current_indent > 0)
		current_indent--;
	else
		out(O_DIE, "recursion underflow");
}

static void
indent(void)
{
	int i;
	if (!Verbose)
		return;
	for (i = 0; i < current_indent; i++)
		out(O_ALTFP|O_VERB|O_NONL, indent_s[i]);
}

#define	SLNEW		1
#define	SLCHANGED	2
#define	SLWAIT		3
#define	SLDISPROVED	4

static void
print_suspects(int circumstance, struct fme *fmep)
{
	struct event *ep;

	out(O_ALTFP|O_NONL, "[");
	if (circumstance == SLCHANGED) {
		out(O_ALTFP|O_NONL, "FME%d diagnosis changed. state: %s, "
		    "suspect list:", fmep->id, fme_state2str(fmep->state));
	} else if (circumstance == SLWAIT) {
		out(O_ALTFP|O_NONL, "FME%d set wait timer %ld ", fmep->id,
		    fmep->timer);
		ptree_timeval(O_ALTFP|O_NONL, &fmep->wull);
	} else if (circumstance == SLDISPROVED) {
		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id);
	} else {
		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id);
	}

	if (circumstance == SLWAIT || circumstance == SLDISPROVED) {
		out(O_ALTFP, "]");
		return;
	}

	for (ep = fmep->suspects; ep; ep = ep->suspects) {
		out(O_ALTFP|O_NONL, " ");
		itree_pevent_brief(O_ALTFP|O_NONL, ep);
	}
	out(O_ALTFP, "]");
}

static struct node *
eventprop_lookup(struct event *ep, const char *propname)
{
	return (lut_lookup(ep->props, (void *)propname, NULL));
}

#define	MAXDIGITIDX	23
static char numbuf[MAXDIGITIDX + 1];

static int
node2uint(struct node *n, uint_t *valp)
{
	struct evalue value;
	struct lut *globals = NULL;

	if (n == NULL)
		return (1);

	/*
	 * check value.v since we are being asked to convert an unsigned
	 * long long int to an unsigned int
	 */
	if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) ||
	    value.t != UINT64 || value.v > (1ULL << 32))
		return (1);

	*valp = (uint_t)value.v;

	return (0);
}

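/*
 * node2fmri -- build an hc-scheme FMRI nvlist from an instanced
 *	name/number chain (e.g. an ASRU or FRU declared in the fault
 *	tree).  Returns NULL if the node is missing or not fully
 *	instanced; allocation failures are fatal via out(O_DIE).
 */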
static nvlist_t *
node2fmri(struct node *n)
{
	nvlist_t **pa, *f, *p;
	struct node *nc;
	uint_t depth = 0;
	char *numstr, *nullbyte;
	char *failure;
	int err, i;

	/* XXX do we need to be able to handle a non-T_NAME node? */
	if (n == NULL || n->t != T_NAME)
		return (NULL);

	for (nc = n; nc != NULL; nc = nc->u.name.next) {
		if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM)
			break;
		depth++;
	}

	if (nc != NULL) {
		/* We bailed early, something went wrong */
		return (NULL);
	}

	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
	pa = alloca(depth * sizeof (nvlist_t *));
	for (i = 0; i < depth; i++)
		pa[i] = NULL;

	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
	if (err != 0) {
		failure = "basic construction of FMRI failed";
		goto boom;
	}

	numbuf[MAXDIGITIDX] = '\0';
	nullbyte = &numbuf[MAXDIGITIDX];
	i = 0;

	for (nc = n; nc != NULL; nc = nc->u.name.next) {
		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
		if (err != 0) {
			failure = "alloc of an hc-pair failed";
			goto boom;
		}
		err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s);
		numstr = ulltostr(nc->u.name.child->u.ull, nullbyte);
		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
		if (err != 0) {
			failure = "construction of an hc-pair failed";
			goto boom;
		}
		pa[i++] = p;
	}

	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
	if (err == 0) {
		for (i = 0; i < depth; i++)
			if (pa[i] != NULL)
				nvlist_free(pa[i]);
		return (f);
	}
	failure = "addition of hc-pair array to FMRI failed";

boom:
	for (i = 0; i < depth; i++)
		if (pa[i] != NULL)
			nvlist_free(pa[i]);
	nvlist_free(f);
	out(O_DIE, "%s", failure);
	/*NOTREACHED*/
	return (NULL);
}

/* an ipath cache entry is an array of these, with s==NULL at the end */
struct ipath {
	const char *s;	/* component name (in stable) */
	int i;		/* instance number */
};

static nvlist_t *
ipath2fmri(struct ipath *ipath)
{
	nvlist_t **pa, *f, *p;
	uint_t depth = 0;
	char *numstr, *nullbyte;
	char *failure;
	int err, i;
	struct ipath *ipp;

	for (ipp = ipath; ipp->s != NULL; ipp++)
		depth++;

	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
	pa = alloca(depth * sizeof (nvlist_t *));
	for (i = 0; i < depth; i++)
		pa[i] = NULL;

	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
	if (err != 0) {
		failure = "basic construction of FMRI failed";
		goto boom;
	}

	numbuf[MAXDIGITIDX] = '\0';
	nullbyte = &numbuf[MAXDIGITIDX];
	i = 0;

	for (ipp = ipath; ipp->s != NULL; ipp++) {
		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
		if (err != 0) {
			failure = "alloc of an hc-pair failed";
			goto boom;
		}
		err = nvlist_add_string(p, FM_FMRI_HC_NAME, ipp->s);
		numstr = ulltostr(ipp->i, nullbyte);
		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
		if (err != 0) {
			failure = "construction of an hc-pair failed";
			goto boom;
		}
pa[i++] = p; 2043 } 2044 2045 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 2046 if (err == 0) { 2047 for (i = 0; i < depth; i++) 2048 if (pa[i] != NULL) 2049 nvlist_free(pa[i]); 2050 return (f); 2051 } 2052 failure = "addition of hc-pair array to FMRI failed"; 2053 2054 boom: 2055 for (i = 0; i < depth; i++) 2056 if (pa[i] != NULL) 2057 nvlist_free(pa[i]); 2058 nvlist_free(f); 2059 out(O_DIE, "%s", failure); 2060 /*NOTREACHED*/ 2061 return (NULL); 2062 } 2063 2064 static uint_t 2065 avg(uint_t sum, uint_t cnt) 2066 { 2067 unsigned long long s = sum * 10; 2068 2069 return ((s / cnt / 10) + (((s / cnt % 10) >= 5) ? 1 : 0)); 2070 } 2071 2072 static uint8_t 2073 percentof(uint_t part, uint_t whole) 2074 { 2075 unsigned long long p = part * 1000; 2076 2077 return ((p / whole / 10) + (((p / whole % 10) >= 5) ? 1 : 0)); 2078 } 2079 2080 struct rsl { 2081 struct event *suspect; 2082 nvlist_t *asru; 2083 nvlist_t *fru; 2084 nvlist_t *rsrc; 2085 }; 2086 2087 static void publish_suspects(struct fme *fmep, struct rsl *srl); 2088 2089 /* 2090 * rslfree -- free internal members of struct rsl not expected to be 2091 * freed elsewhere. 2092 */ 2093 static void 2094 rslfree(struct rsl *freeme) 2095 { 2096 if (freeme->asru != NULL) 2097 nvlist_free(freeme->asru); 2098 if (freeme->fru != NULL) 2099 nvlist_free(freeme->fru); 2100 if (freeme->rsrc != NULL && freeme->rsrc != freeme->asru) 2101 nvlist_free(freeme->rsrc); 2102 } 2103 2104 /* 2105 * rslcmp -- compare two rsl structures. Use the following 2106 * comparisons to establish cardinality: 2107 * 2108 * 1. Name of the suspect's class. (simple strcmp) 2109 * 2. Name of the suspect's ASRU. (trickier, since nvlist) 2110 * 2111 */ 2112 static int 2113 rslcmp(const void *a, const void *b) 2114 { 2115 struct rsl *r1 = (struct rsl *)a; 2116 struct rsl *r2 = (struct rsl *)b; 2117 int rv; 2118 2119 rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s, 2120 r2->suspect->enode->u.event.ename->u.name.s); 2121 if (rv != 0) 2122 return (rv); 2123 2124 if (r1->rsrc == NULL && r2->rsrc == NULL) 2125 return (0); 2126 if (r1->rsrc == NULL) 2127 return (-1); 2128 if (r2->rsrc == NULL) 2129 return (1); 2130 return (evnv_cmpnvl(r1->rsrc, r2->rsrc, 0)); 2131 } 2132 2133 /* 2134 * rsluniq -- given an array of rsl structures, seek out and "remove" 2135 * any duplicates. Dups are "remove"d by NULLing the suspect pointer 2136 * of the array element. Removal also means updating the number of 2137 * problems and the number of problems which are not faults. User 2138 * provides the first and last element pointers. 2139 */ 2140 static void 2141 rsluniq(struct rsl *first, struct rsl *last, int *nprobs, int *nnonf) 2142 { 2143 struct rsl *cr; 2144 2145 if (*nprobs == 1) 2146 return; 2147 2148 /* 2149 * At this point, we only expect duplicate defects. 2150 * Eversholt's diagnosis algorithm prevents duplicate 2151 * suspects, but we rewrite defects in the platform code after 2152 * the diagnosis is made, and that can introduce new 2153 * duplicates. 2154 */ 2155 while (first <= last) { 2156 if (first->suspect == NULL || !is_defect(first->suspect->t)) { 2157 first++; 2158 continue; 2159 } 2160 cr = first + 1; 2161 while (cr <= last) { 2162 if (is_defect(first->suspect->t)) { 2163 if (rslcmp(first, cr) == 0) { 2164 cr->suspect = NULL; 2165 rslfree(cr); 2166 (*nprobs)--; 2167 (*nnonf)--; 2168 } 2169 } 2170 /* 2171 * assume all defects are in order after our 2172 * sort and short circuit here with "else break" ? 
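 *
 * (Note: this is a question left open by the original comment.  Because
 * the caller qsort()s the array with rslcmp(), which compares suspect
 * class names first, duplicate defects should indeed end up adjacent
 * after the sort; the inner loop here nevertheless walks the entire
 * remainder of the array conservatively.)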
2173 */ 2174 cr++; 2175 } 2176 first++; 2177 } 2178 } 2179 2180 /* 2181 * get_resources -- for a given suspect, determine what ASRU, FRU and 2182 * RSRC nvlists should be advertised in the final suspect list. 2183 */ 2184 void 2185 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot) 2186 { 2187 struct node *asrudef, *frudef; 2188 nvlist_t *asru, *fru; 2189 nvlist_t *rsrc = NULL; 2190 char *pathstr; 2191 2192 /* 2193 * First find any ASRU and/or FRU defined in the 2194 * initial fault tree. 2195 */ 2196 asrudef = eventprop_lookup(sp, L_ASRU); 2197 frudef = eventprop_lookup(sp, L_FRU); 2198 2199 /* 2200 * Create FMRIs based on those definitions 2201 */ 2202 asru = node2fmri(asrudef); 2203 fru = node2fmri(frudef); 2204 pathstr = ipath2str(NULL, sp->ipp); 2205 2206 /* 2207 * Allow for platform translations of the FMRIs 2208 */ 2209 platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc, 2210 pathstr); 2211 2212 FREE(pathstr); 2213 rsrcs->suspect = sp; 2214 rsrcs->asru = asru; 2215 rsrcs->fru = fru; 2216 rsrcs->rsrc = rsrc; 2217 } 2218 2219 /* 2220 * trim_suspects -- prior to publishing, we may need to remove some 2221 * suspects from the list. If we're auto-closing upsets, we don't 2222 * want any of those in the published list. If the ASRUs for multiple 2223 * defects resolve to the same ASRU (driver) we only want to publish 2224 * that as a single suspect. 2225 */ 2226 static int 2227 trim_suspects(struct fme *fmep, struct rsl *begin, struct rsl *begin2, 2228 fmd_event_t *ffep, int *mess_zero_nonfaultp) 2229 { 2230 struct event *ep; 2231 struct rsl *rp = begin; 2232 struct rsl *rp2 = begin2; 2233 int mess_zero_count = 0; 2234 int serd_rval; 2235 uint_t messval; 2236 2237 /* remove any unwanted upsets and populate our array */ 2238 for (ep = fmep->psuspects; ep; ep = ep->psuspects) { 2239 if (is_upset(ep->t)) 2240 continue; 2241 serd_rval = serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, ep, 2242 NULL, NULL); 2243 if (serd_rval == 0) 2244 continue; 2245 if (node2uint(eventprop_lookup(ep, L_message), 2246 &messval) == 0 && messval == 0) { 2247 get_resources(ep, rp2, fmep->config); 2248 rp2++; 2249 mess_zero_count++; 2250 if (!is_fault(ep->t)) 2251 (*mess_zero_nonfaultp)++; 2252 } else { 2253 get_resources(ep, rp, fmep->config); 2254 rp++; 2255 fmep->nsuspects++; 2256 if (!is_fault(ep->t)) 2257 fmep->nonfault++; 2258 } 2259 } 2260 return (mess_zero_count); 2261 } 2262 2263 /* 2264 * addpayloadprop -- add a payload prop to a problem 2265 */ 2266 static void 2267 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault) 2268 { 2269 nvlist_t *rsrc, *hcs; 2270 2271 ASSERT(fault != NULL); 2272 ASSERT(lhs != NULL); 2273 ASSERT(rhs != NULL); 2274 2275 if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, &rsrc) != 0) 2276 out(O_DIE, "cannot add payloadprop \"%s\" to fault", lhs); 2277 2278 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) { 2279 out(O_ALTFP|O_VERB2, "addpayloadprop: create hc_specific"); 2280 if (nvlist_xalloc(&hcs, NV_UNIQUE_NAME, &Eft_nv_hdl) != 0) 2281 out(O_DIE, 2282 "cannot add payloadprop \"%s\" to fault", lhs); 2283 if (nvlist_add_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, hcs) != 0) 2284 out(O_DIE, 2285 "cannot add payloadprop \"%s\" to fault", lhs); 2286 nvlist_free(hcs); 2287 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) 2288 out(O_DIE, 2289 "cannot add payloadprop \"%s\" to fault", lhs); 2290 } else 2291 out(O_ALTFP|O_VERB2, "addpayloadprop: reuse hc_specific"); 2292 2293 if (rhs->t == UINT64) { 2294 
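			/*
			 * rhs->v is a 64-bit container: for UINT64 evalues it
			 * holds the number itself and is added below as a
			 * uint64 payload member; for string evalues (the else
			 * branch) it carries a pointer, hence the casts
			 * through uintptr_t before nvlist_add_string().
			 */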
out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v); 2295 2296 if (nvlist_add_uint64(hcs, lhs, rhs->v) != 0) 2297 out(O_DIE, 2298 "cannot add payloadprop \"%s\" to fault", lhs); 2299 } else { 2300 out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"", 2301 lhs, (char *)(uintptr_t)rhs->v); 2302 2303 if (nvlist_add_string(hcs, lhs, (char *)(uintptr_t)rhs->v) != 0) 2304 out(O_DIE, 2305 "cannot add payloadprop \"%s\" to fault", lhs); 2306 } 2307 } 2308 2309 static char *Istatbuf; 2310 static char *Istatbufptr; 2311 static int Istatsz; 2312 2313 /* 2314 * istataddsize -- calculate size of istat and add it to Istatsz 2315 */ 2316 /*ARGSUSED2*/ 2317 static void 2318 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2319 { 2320 int val; 2321 2322 ASSERT(lhs != NULL); 2323 ASSERT(rhs != NULL); 2324 2325 if ((val = stats_counter_value(rhs)) == 0) 2326 return; /* skip zero-valued stats */ 2327 2328 /* count up the size of the stat name */ 2329 Istatsz += ipath2strlen(lhs->ename, lhs->ipath); 2330 Istatsz++; /* for the trailing NULL byte */ 2331 2332 /* count up the size of the stat value */ 2333 Istatsz += snprintf(NULL, 0, "%d", val); 2334 Istatsz++; /* for the trailing NULL byte */ 2335 } 2336 2337 /* 2338 * istat2str -- serialize an istat, writing result to *Istatbufptr 2339 */ 2340 /*ARGSUSED2*/ 2341 static void 2342 istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2343 { 2344 char *str; 2345 int len; 2346 int val; 2347 2348 ASSERT(lhs != NULL); 2349 ASSERT(rhs != NULL); 2350 2351 if ((val = stats_counter_value(rhs)) == 0) 2352 return; /* skip zero-valued stats */ 2353 2354 /* serialize the stat name */ 2355 str = ipath2str(lhs->ename, lhs->ipath); 2356 len = strlen(str); 2357 2358 ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]); 2359 (void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr); 2360 Istatbufptr += len; 2361 FREE(str); 2362 *Istatbufptr++ = '\0'; 2363 2364 /* serialize the stat value */ 2365 Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr, 2366 "%d", val); 2367 *Istatbufptr++ = '\0'; 2368 2369 ASSERT(Istatbufptr <= &Istatbuf[Istatsz]); 2370 } 2371 2372 void 2373 istat_save() 2374 { 2375 if (Istat_need_save == 0) 2376 return; 2377 2378 /* figure out how big the serialzed info is */ 2379 Istatsz = 0; 2380 lut_walk(Istats, (lut_cb)istataddsize, NULL); 2381 2382 if (Istatsz == 0) { 2383 /* no stats to save */ 2384 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2385 return; 2386 } 2387 2388 /* create the serialized buffer */ 2389 Istatbufptr = Istatbuf = MALLOC(Istatsz); 2390 lut_walk(Istats, (lut_cb)istat2str, NULL); 2391 2392 /* clear out current saved stats */ 2393 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2394 2395 /* write out the new version */ 2396 fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz); 2397 FREE(Istatbuf); 2398 2399 Istat_need_save = 0; 2400 } 2401 2402 int 2403 istat_cmp(struct istat_entry *ent1, struct istat_entry *ent2) 2404 { 2405 if (ent1->ename != ent2->ename) 2406 return (ent2->ename - ent1->ename); 2407 if (ent1->ipath != ent2->ipath) 2408 return ((char *)ent2->ipath - (char *)ent1->ipath); 2409 2410 return (0); 2411 } 2412 2413 /* 2414 * istat-verify -- verify the component associated with a stat still exists 2415 * 2416 * if the component no longer exists, this routine resets the stat and 2417 * returns 0. if the component still exists, it returns 1. 
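 *
 * note that when the component is gone, the stat is zeroed (and
 * Istat_need_save set) only if it currently holds a non-zero count; an
 * already-zero stat is left alone so repeated lookups for a missing
 * component do not force needless istat_save() work.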
2418 */ 2419 static int 2420 istat_verify(struct node *snp, struct istat_entry *entp) 2421 { 2422 struct stats *statp; 2423 nvlist_t *fmri; 2424 2425 fmri = node2fmri(snp->u.event.epname); 2426 if (platform_path_exists(fmri)) { 2427 nvlist_free(fmri); 2428 return (1); 2429 } 2430 nvlist_free(fmri); 2431 2432 /* component no longer in system. zero out the associated stats */ 2433 if ((statp = (struct stats *) 2434 lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL || 2435 stats_counter_value(statp) == 0) 2436 return (0); /* stat is already reset */ 2437 2438 Istat_need_save = 1; 2439 stats_counter_reset(statp); 2440 return (0); 2441 } 2442 2443 static void 2444 istat_bump(struct node *snp, int n) 2445 { 2446 struct stats *statp; 2447 struct istat_entry ent; 2448 2449 ASSERT(snp != NULL); 2450 ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t)); 2451 ASSERT(snp->u.event.epname != NULL); 2452 2453 /* class name should be hoisted into a single stable entry */ 2454 ASSERT(snp->u.event.ename->u.name.next == NULL); 2455 ent.ename = snp->u.event.ename->u.name.s; 2456 ent.ipath = ipath(snp->u.event.epname); 2457 2458 if (!istat_verify(snp, &ent)) { 2459 /* component no longer exists in system, nothing to do */ 2460 return; 2461 } 2462 2463 if ((statp = (struct stats *) 2464 lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) { 2465 /* need to create the counter */ 2466 int cnt = 0; 2467 struct node *np; 2468 char *sname; 2469 char *snamep; 2470 struct istat_entry *newentp; 2471 2472 /* count up the size of the stat name */ 2473 np = snp->u.event.ename; 2474 while (np != NULL) { 2475 cnt += strlen(np->u.name.s); 2476 cnt++; /* for the '.' or '@' */ 2477 np = np->u.name.next; 2478 } 2479 np = snp->u.event.epname; 2480 while (np != NULL) { 2481 cnt += snprintf(NULL, 0, "%s%llu", 2482 np->u.name.s, np->u.name.child->u.ull); 2483 cnt++; /* for the '/' or trailing NULL byte */ 2484 np = np->u.name.next; 2485 } 2486 2487 /* build the stat name */ 2488 snamep = sname = alloca(cnt); 2489 np = snp->u.event.ename; 2490 while (np != NULL) { 2491 snamep += snprintf(snamep, &sname[cnt] - snamep, 2492 "%s", np->u.name.s); 2493 np = np->u.name.next; 2494 if (np) 2495 *snamep++ = '.'; 2496 } 2497 *snamep++ = '@'; 2498 np = snp->u.event.epname; 2499 while (np != NULL) { 2500 snamep += snprintf(snamep, &sname[cnt] - snamep, 2501 "%s%llu", np->u.name.s, np->u.name.child->u.ull); 2502 np = np->u.name.next; 2503 if (np) 2504 *snamep++ = '/'; 2505 } 2506 *snamep++ = '\0'; 2507 2508 /* create the new stat & add it to our list */ 2509 newentp = MALLOC(sizeof (*newentp)); 2510 *newentp = ent; 2511 statp = stats_new_counter(NULL, sname, 0); 2512 Istats = lut_add(Istats, (void *)newentp, (void *)statp, 2513 (lut_cmp)istat_cmp); 2514 } 2515 2516 /* if n is non-zero, set that value instead of bumping */ 2517 if (n) { 2518 stats_counter_reset(statp); 2519 stats_counter_add(statp, n); 2520 } else 2521 stats_counter_bump(statp); 2522 Istat_need_save = 1; 2523 2524 ipath_print(O_ALTFP|O_VERB2, ent.ename, ent.ipath); 2525 out(O_ALTFP|O_VERB2, " %s to value %d", n ? "set" : "incremented", 2526 stats_counter_value(statp)); 2527 } 2528 2529 /*ARGSUSED*/ 2530 static void 2531 istat_destructor(void *left, void *right, void *arg) 2532 { 2533 struct istat_entry *entp = (struct istat_entry *)left; 2534 struct stats *statp = (struct stats *)right; 2535 FREE(entp); 2536 stats_delete(statp); 2537 } 2538 2539 /* 2540 * Callback used in a walk of the Istats to reset matching stat counters. 
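 * Only counters whose cached ipath matches the ipath handed to the walk
 * are reset.  The call site is not shown in this part of the module, but
 * it is presumably driven via something like
 *
 *	lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp);
 *
 * when a reset is requested for one particular component path.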
2541 */ 2542 static void 2543 istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp, 2544 const struct ipath *ipp) 2545 { 2546 char *path; 2547 2548 if (entp->ipath == ipp) { 2549 path = ipath2str(entp->ename, ipp); 2550 out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path); 2551 FREE(path); 2552 stats_counter_reset(statp); 2553 Istat_need_save = 1; 2554 } 2555 } 2556 2557 /*ARGSUSED*/ 2558 static void 2559 istat_counter_topo_chg_cb(struct istat_entry *entp, struct stats *statp, 2560 void *unused) 2561 { 2562 char *path; 2563 nvlist_t *fmri; 2564 2565 fmri = ipath2fmri((struct ipath *)(entp->ipath)); 2566 if (!platform_path_exists(fmri)) { 2567 path = ipath2str(entp->ename, entp->ipath); 2568 out(O_ALTFP, "istat_counter_topo_chg_cb: not present %s", path); 2569 FREE(path); 2570 stats_counter_reset(statp); 2571 Istat_need_save = 1; 2572 } 2573 nvlist_free(fmri); 2574 } 2575 2576 void 2577 istat_fini(void) 2578 { 2579 lut_free(Istats, istat_destructor, NULL); 2580 } 2581 2582 static char *Serdbuf; 2583 static char *Serdbufptr; 2584 static int Serdsz; 2585 2586 /* 2587 * serdaddsize -- calculate size of serd and add it to Serdsz 2588 */ 2589 /*ARGSUSED*/ 2590 static void 2591 serdaddsize(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2592 { 2593 ASSERT(lhs != NULL); 2594 2595 /* count up the size of the stat name */ 2596 Serdsz += ipath2strlen(lhs->ename, lhs->ipath); 2597 Serdsz++; /* for the trailing NULL byte */ 2598 } 2599 2600 /* 2601 * serd2str -- serialize a serd engine, writing result to *Serdbufptr 2602 */ 2603 /*ARGSUSED*/ 2604 static void 2605 serd2str(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2606 { 2607 char *str; 2608 int len; 2609 2610 ASSERT(lhs != NULL); 2611 2612 /* serialize the serd engine name */ 2613 str = ipath2str(lhs->ename, lhs->ipath); 2614 len = strlen(str); 2615 2616 ASSERT(Serdbufptr + len + 1 <= &Serdbuf[Serdsz]); 2617 (void) strlcpy(Serdbufptr, str, &Serdbuf[Serdsz] - Serdbufptr); 2618 Serdbufptr += len; 2619 FREE(str); 2620 *Serdbufptr++ = '\0'; 2621 ASSERT(Serdbufptr <= &Serdbuf[Serdsz]); 2622 } 2623 2624 void 2625 serd_save() 2626 { 2627 if (Serd_need_save == 0) 2628 return; 2629 2630 /* figure out how big the serialzed info is */ 2631 Serdsz = 0; 2632 lut_walk(SerdEngines, (lut_cb)serdaddsize, NULL); 2633 2634 if (Serdsz == 0) { 2635 /* no serd engines to save */ 2636 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2637 return; 2638 } 2639 2640 /* create the serialized buffer */ 2641 Serdbufptr = Serdbuf = MALLOC(Serdsz); 2642 lut_walk(SerdEngines, (lut_cb)serd2str, NULL); 2643 2644 /* clear out current saved stats */ 2645 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2646 2647 /* write out the new version */ 2648 fmd_buf_write(Hdl, NULL, WOBUF_SERDS, Serdbuf, Serdsz); 2649 FREE(Serdbuf); 2650 Serd_need_save = 0; 2651 } 2652 2653 int 2654 serd_cmp(struct serd_entry *ent1, struct serd_entry *ent2) 2655 { 2656 if (ent1->ename != ent2->ename) 2657 return (ent2->ename - ent1->ename); 2658 if (ent1->ipath != ent2->ipath) 2659 return ((char *)ent2->ipath - (char *)ent1->ipath); 2660 2661 return (0); 2662 } 2663 2664 void 2665 fme_serd_load(fmd_hdl_t *hdl) 2666 { 2667 int sz; 2668 char *sbuf; 2669 char *sepptr; 2670 char *ptr; 2671 struct serd_entry *newentp; 2672 struct node *epname; 2673 nvlist_t *fmri; 2674 char *namestring; 2675 2676 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_SERDS)) == 0) 2677 return; 2678 sbuf = alloca(sz); 2679 fmd_buf_read(hdl, NULL, WOBUF_SERDS, sbuf, sz); 2680 ptr = sbuf; 2681 while (ptr < &sbuf[sz]) { 
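		/*
		 * Each entry in the saved WOBUF_SERDS buffer is one
		 * NUL-terminated serd engine name as produced by
		 * serd2str()/ipath2str(), i.e. roughly
		 *
		 *	<ename> '@' <path> '\0'
		 *
		 * so we split the entry at the '@' to recover the engine
		 * name and the component path, then step past the trailing
		 * NUL to the next entry.
		 */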
2682 sepptr = strchr(ptr, '@'); 2683 *sepptr = '\0'; 2684 namestring = ptr; 2685 sepptr++; 2686 ptr = sepptr; 2687 ptr += strlen(ptr); 2688 ptr++; /* move past the '\0' separating paths */ 2689 epname = pathstring2epnamenp(sepptr); 2690 fmri = node2fmri(epname); 2691 if (platform_path_exists(fmri)) { 2692 newentp = MALLOC(sizeof (*newentp)); 2693 newentp->hdl = hdl; 2694 newentp->ipath = ipath(epname); 2695 newentp->ename = stable(namestring); 2696 SerdEngines = lut_add(SerdEngines, (void *)newentp, 2697 (void *)newentp, (lut_cmp)serd_cmp); 2698 } else 2699 Serd_need_save = 1; 2700 tree_free(epname); 2701 nvlist_free(fmri); 2702 } 2703 /* save it back again in case some of the paths no longer exist */ 2704 serd_save(); 2705 } 2706 2707 /*ARGSUSED*/ 2708 static void 2709 serd_destructor(void *left, void *right, void *arg) 2710 { 2711 struct serd_entry *entp = (struct serd_entry *)left; 2712 FREE(entp); 2713 } 2714 2715 /* 2716 * Callback used in a walk of the SerdEngines to reset matching serd engines. 2717 */ 2718 /*ARGSUSED*/ 2719 static void 2720 serd_reset_cb(struct serd_entry *entp, void *unused, const struct ipath *ipp) 2721 { 2722 char *path; 2723 2724 if (entp->ipath == ipp) { 2725 path = ipath2str(entp->ename, ipp); 2726 out(O_ALTFP, "serd_reset_cb: resetting %s", path); 2727 fmd_serd_reset(entp->hdl, path); 2728 FREE(path); 2729 Serd_need_save = 1; 2730 } 2731 } 2732 2733 /*ARGSUSED*/ 2734 static void 2735 serd_topo_chg_cb(struct serd_entry *entp, void *unused, void *unused2) 2736 { 2737 char *path; 2738 nvlist_t *fmri; 2739 2740 fmri = ipath2fmri((struct ipath *)(entp->ipath)); 2741 if (!platform_path_exists(fmri)) { 2742 path = ipath2str(entp->ename, entp->ipath); 2743 out(O_ALTFP, "serd_topo_chg_cb: not present %s", path); 2744 fmd_serd_reset(entp->hdl, path); 2745 FREE(path); 2746 Serd_need_save = 1; 2747 } 2748 nvlist_free(fmri); 2749 } 2750 2751 void 2752 serd_fini(void) 2753 { 2754 lut_free(SerdEngines, serd_destructor, NULL); 2755 } 2756 2757 static void 2758 publish_suspects(struct fme *fmep, struct rsl *srl) 2759 { 2760 struct rsl *rp; 2761 nvlist_t *fault; 2762 uint8_t cert; 2763 uint_t *frs; 2764 uint_t fravg, frsum, fr; 2765 uint_t messval; 2766 uint_t retireval; 2767 uint_t responseval; 2768 struct node *snp; 2769 int frcnt, fridx; 2770 boolean_t allfaulty = B_TRUE; 2771 struct rsl *erl = srl + fmep->nsuspects - 1; 2772 2773 /* 2774 * sort the array 2775 */ 2776 qsort(srl, fmep->nsuspects, sizeof (struct rsl), rslcmp); 2777 rsluniq(srl, erl, &fmep->nsuspects, &fmep->nonfault); 2778 2779 /* 2780 * If the suspect list is all faults, then for a given fault, 2781 * say X of N, X's certainty is computed via: 2782 * 2783 * fitrate(X) / (fitrate(1) + ... + fitrate(N)) * 100 2784 * 2785 * If none of the suspects are faults, and there are N suspects, 2786 * the certainty of a given suspect is 100/N. 2787 * 2788 * If there are are a mixture of faults and other problems in 2789 * the suspect list, we take an average of the faults' 2790 * FITrates and treat this average as the FITrate for any 2791 * non-faults. The fitrate of any given suspect is then 2792 * computed per the first formula above. 
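 *
 * For example (values illustrative only): with two faults of FITrate
 * 400 and 600 plus one non-fault suspect, the fault average is
 * avg(1000, 2) = 500, the denominator becomes 400 + 600 + 500 = 1500,
 * and percentof() yields certainties of 27%, 40% and 33% respectively
 * (the helpers round to the nearest percent, so the three add up to
 * 100).  With three suspects and no faults at all, each would simply
 * get percentof(1, 3) = 33%.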
2793 */ 2794 if (fmep->nonfault == fmep->nsuspects) { 2795 /* NO faults in the suspect list */ 2796 cert = percentof(1, fmep->nsuspects); 2797 } else { 2798 /* sum the fitrates */ 2799 frs = alloca(fmep->nsuspects * sizeof (uint_t)); 2800 fridx = frcnt = frsum = 0; 2801 2802 for (rp = srl; rp <= erl; rp++) { 2803 struct node *n; 2804 2805 if (rp->suspect == NULL) 2806 continue; 2807 if (!is_fault(rp->suspect->t)) { 2808 frs[fridx++] = 0; 2809 continue; 2810 } 2811 n = eventprop_lookup(rp->suspect, L_FITrate); 2812 if (node2uint(n, &fr) != 0) { 2813 out(O_DEBUG|O_NONL, "event "); 2814 ipath_print(O_DEBUG|O_NONL, 2815 rp->suspect->enode->u.event.ename->u.name.s, 2816 rp->suspect->ipp); 2817 out(O_DEBUG, " has no FITrate (using 1)"); 2818 fr = 1; 2819 } else if (fr == 0) { 2820 out(O_DEBUG|O_NONL, "event "); 2821 ipath_print(O_DEBUG|O_NONL, 2822 rp->suspect->enode->u.event.ename->u.name.s, 2823 rp->suspect->ipp); 2824 out(O_DEBUG, " has zero FITrate (using 1)"); 2825 fr = 1; 2826 } 2827 2828 frs[fridx++] = fr; 2829 frsum += fr; 2830 frcnt++; 2831 } 2832 fravg = avg(frsum, frcnt); 2833 for (fridx = 0; fridx < fmep->nsuspects; fridx++) 2834 if (frs[fridx] == 0) { 2835 frs[fridx] = fravg; 2836 frsum += fravg; 2837 } 2838 } 2839 2840 /* Add them in reverse order of our sort, as fmd reverses order */ 2841 for (rp = erl; rp >= srl; rp--) { 2842 if (rp->suspect == NULL) 2843 continue; 2844 if (!is_fault(rp->suspect->t)) 2845 allfaulty = B_FALSE; 2846 if (fmep->nonfault != fmep->nsuspects) 2847 cert = percentof(frs[--fridx], frsum); 2848 fault = fmd_nvl_create_fault(fmep->hdl, 2849 rp->suspect->enode->u.event.ename->u.name.s, 2850 cert, 2851 rp->asru, 2852 rp->fru, 2853 rp->rsrc); 2854 if (fault == NULL) 2855 out(O_DIE, "fault creation failed"); 2856 /* if "message" property exists, add it to the fault */ 2857 if (node2uint(eventprop_lookup(rp->suspect, L_message), 2858 &messval) == 0) { 2859 2860 out(O_ALTFP, 2861 "[FME%d, %s adds message=%d to suspect list]", 2862 fmep->id, 2863 rp->suspect->enode->u.event.ename->u.name.s, 2864 messval); 2865 if (nvlist_add_boolean_value(fault, 2866 FM_SUSPECT_MESSAGE, 2867 (messval) ? B_TRUE : B_FALSE) != 0) { 2868 out(O_DIE, "cannot add no-message to fault"); 2869 } 2870 } 2871 2872 /* if "retire" property exists, add it to the fault */ 2873 if (node2uint(eventprop_lookup(rp->suspect, L_retire), 2874 &retireval) == 0) { 2875 2876 out(O_ALTFP, 2877 "[FME%d, %s adds retire=%d to suspect list]", 2878 fmep->id, 2879 rp->suspect->enode->u.event.ename->u.name.s, 2880 retireval); 2881 if (nvlist_add_boolean_value(fault, 2882 FM_SUSPECT_RETIRE, 2883 (retireval) ? B_TRUE : B_FALSE) != 0) { 2884 out(O_DIE, "cannot add no-retire to fault"); 2885 } 2886 } 2887 2888 /* if "response" property exists, add it to the fault */ 2889 if (node2uint(eventprop_lookup(rp->suspect, L_response), 2890 &responseval) == 0) { 2891 2892 out(O_ALTFP, 2893 "[FME%d, %s adds response=%d to suspect list]", 2894 fmep->id, 2895 rp->suspect->enode->u.event.ename->u.name.s, 2896 responseval); 2897 if (nvlist_add_boolean_value(fault, 2898 FM_SUSPECT_RESPONSE, 2899 (responseval) ? 
B_TRUE : B_FALSE) != 0) { 2900 out(O_DIE, "cannot add no-response to fault"); 2901 } 2902 } 2903 2904 /* add any payload properties */ 2905 lut_walk(rp->suspect->payloadprops, 2906 (lut_cb)addpayloadprop, (void *)fault); 2907 rslfree(rp); 2908 2909 /* 2910 * If "action" property exists, evaluate it; this must be done 2911 * before the allfaulty check below since some actions may 2912 * modify the asru to be used in fmd_nvl_fmri_has_fault. This 2913 * needs to be restructured if any new actions are introduced 2914 * that have effects that we do not want to be visible if 2915 * we decide not to publish in the dupclose check below. 2916 */ 2917 if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) { 2918 struct evalue evalue; 2919 2920 out(O_ALTFP|O_NONL, 2921 "[FME%d, %s action ", fmep->id, 2922 rp->suspect->enode->u.event.ename->u.name.s); 2923 ptree_name_iter(O_ALTFP|O_NONL, snp); 2924 out(O_ALTFP, "]"); 2925 Action_nvl = fault; 2926 (void) eval_expr(snp, NULL, NULL, NULL, NULL, 2927 NULL, 0, &evalue); 2928 } 2929 2930 fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault); 2931 2932 /* 2933 * check if the asru is already marked as "faulty". 2934 */ 2935 if (allfaulty) { 2936 nvlist_t *asru; 2937 2938 out(O_ALTFP|O_VERB, "FME%d dup check ", fmep->id); 2939 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect); 2940 out(O_ALTFP|O_VERB|O_NONL, " "); 2941 if (nvlist_lookup_nvlist(fault, 2942 FM_FAULT_ASRU, &asru) != 0) { 2943 out(O_ALTFP|O_VERB, "NULL asru"); 2944 allfaulty = B_FALSE; 2945 } else if (fmd_nvl_fmri_has_fault(fmep->hdl, asru, 2946 FMD_HAS_FAULT_ASRU, NULL)) { 2947 out(O_ALTFP|O_VERB, "faulty"); 2948 } else { 2949 out(O_ALTFP|O_VERB, "not faulty"); 2950 allfaulty = B_FALSE; 2951 } 2952 } 2953 2954 } 2955 2956 if (!allfaulty) { 2957 /* 2958 * don't update the count stat if all asrus are already 2959 * present and unrepaired in the asru cache 2960 */ 2961 for (rp = erl; rp >= srl; rp--) { 2962 struct event *suspect = rp->suspect; 2963 2964 if (suspect == NULL) 2965 continue; 2966 2967 /* if "count" exists, increment the appropriate stat */ 2968 if ((snp = eventprop_lookup(suspect, 2969 L_count)) != NULL) { 2970 out(O_ALTFP|O_NONL, 2971 "[FME%d, %s count ", fmep->id, 2972 suspect->enode->u.event.ename->u.name.s); 2973 ptree_name_iter(O_ALTFP|O_NONL, snp); 2974 out(O_ALTFP, "]"); 2975 istat_bump(snp, 0); 2976 2977 } 2978 } 2979 istat_save(); /* write out any istat changes */ 2980 } 2981 } 2982 2983 static void 2984 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase) 2985 { 2986 struct case_list *newcase; 2987 nvlist_t *defect; 2988 2989 out(O_ALTFP, 2990 "[undiagnosable ereport received, " 2991 "creating and closing a new case (%s)]", 2992 Undiag_reason ? 
Undiag_reason : "reason not provided"); 2993 2994 newcase = MALLOC(sizeof (struct case_list)); 2995 newcase->next = NULL; 2996 newcase->fmcase = fmcase; 2997 if (Undiagablecaselist != NULL) 2998 newcase->next = Undiagablecaselist; 2999 Undiagablecaselist = newcase; 3000 3001 if (ffep != NULL) 3002 fmd_case_add_ereport(hdl, newcase->fmcase, ffep); 3003 3004 defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100, 3005 NULL, NULL, NULL); 3006 if (Undiag_reason != NULL) 3007 (void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason); 3008 fmd_case_add_suspect(hdl, newcase->fmcase, defect); 3009 3010 fmd_case_solve(hdl, newcase->fmcase); 3011 fmd_case_close(hdl, newcase->fmcase); 3012 } 3013 3014 static void 3015 fme_undiagnosable(struct fme *f) 3016 { 3017 nvlist_t *defect; 3018 3019 out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]", 3020 f->id, fmd_case_uuid(f->hdl, f->fmcase), 3021 Undiag_reason ? Undiag_reason : "undiagnosable"); 3022 3023 defect = fmd_nvl_create_fault(f->hdl, UNDIAGNOSABLE_DEFECT, 100, 3024 NULL, NULL, NULL); 3025 if (Undiag_reason != NULL) 3026 (void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason); 3027 fmd_case_add_suspect(f->hdl, f->fmcase, defect); 3028 fmd_case_solve(f->hdl, f->fmcase); 3029 fmd_case_close(f->hdl, f->fmcase); 3030 } 3031 3032 /* 3033 * fme_close_case 3034 * 3035 * Find the requested case amongst our fmes and close it. Free up 3036 * the related fme. 3037 */ 3038 void 3039 fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase) 3040 { 3041 struct case_list *ucasep, *prevcasep = NULL; 3042 struct fme *prev = NULL; 3043 struct fme *fmep; 3044 3045 for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) { 3046 if (fmcase != ucasep->fmcase) { 3047 prevcasep = ucasep; 3048 continue; 3049 } 3050 3051 if (prevcasep == NULL) 3052 Undiagablecaselist = Undiagablecaselist->next; 3053 else 3054 prevcasep->next = ucasep->next; 3055 3056 FREE(ucasep); 3057 return; 3058 } 3059 3060 for (fmep = FMElist; fmep; fmep = fmep->next) { 3061 if (fmep->hdl == hdl && fmep->fmcase == fmcase) 3062 break; 3063 prev = fmep; 3064 } 3065 3066 if (fmep == NULL) { 3067 out(O_WARN, "Eft asked to close unrecognized case [%s].", 3068 fmd_case_uuid(hdl, fmcase)); 3069 return; 3070 } 3071 3072 if (EFMElist == fmep) 3073 EFMElist = prev; 3074 3075 if (prev == NULL) 3076 FMElist = FMElist->next; 3077 else 3078 prev->next = fmep->next; 3079 3080 fmep->next = NULL; 3081 3082 /* Get rid of any timer this fme has set */ 3083 if (fmep->wull != 0) 3084 fmd_timer_remove(fmep->hdl, fmep->timer); 3085 3086 if (ClosedFMEs == NULL) { 3087 ClosedFMEs = fmep; 3088 } else { 3089 fmep->next = ClosedFMEs; 3090 ClosedFMEs = fmep; 3091 } 3092 3093 Open_fme_count--; 3094 3095 /* See if we can close the overflow FME */ 3096 if (Open_fme_count <= Max_fme) { 3097 for (fmep = FMElist; fmep; fmep = fmep->next) { 3098 if (fmep->overflow && !(fmd_case_closed(fmep->hdl, 3099 fmep->fmcase))) 3100 break; 3101 } 3102 3103 if (fmep != NULL) 3104 fmd_case_close(fmep->hdl, fmep->fmcase); 3105 } 3106 } 3107 3108 /* 3109 * fme_set_timer() 3110 * If the time we need to wait for the given FME is less than the 3111 * current timer, kick that old timer out and establish a new one. 
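 *
 * Returns 1 if a new timer was actually installed (recording the new
 * wait time in fmep->wull), or 0 if no timer is needed -- either we
 * have already waited at least this long (wull <= fmep->pull) or an
 * existing timer would already fire no later than the requested time.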
3112 */ 3113 static int 3114 fme_set_timer(struct fme *fmep, unsigned long long wull) 3115 { 3116 out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait "); 3117 ptree_timeval(O_ALTFP|O_VERB, &wull); 3118 3119 if (wull <= fmep->pull) { 3120 out(O_ALTFP|O_VERB|O_NONL, "already have waited at least "); 3121 ptree_timeval(O_ALTFP|O_VERB, &fmep->pull); 3122 out(O_ALTFP|O_VERB, NULL); 3123 /* we've waited at least wull already, don't need timer */ 3124 return (0); 3125 } 3126 3127 out(O_ALTFP|O_VERB|O_NONL, " currently "); 3128 if (fmep->wull != 0) { 3129 out(O_ALTFP|O_VERB|O_NONL, "waiting "); 3130 ptree_timeval(O_ALTFP|O_VERB, &fmep->wull); 3131 out(O_ALTFP|O_VERB, NULL); 3132 } else { 3133 out(O_ALTFP|O_VERB|O_NONL, "not waiting"); 3134 out(O_ALTFP|O_VERB, NULL); 3135 } 3136 3137 if (fmep->wull != 0) 3138 if (wull >= fmep->wull) 3139 /* New timer would fire later than established timer */ 3140 return (0); 3141 3142 if (fmep->wull != 0) { 3143 fmd_timer_remove(fmep->hdl, fmep->timer); 3144 } 3145 3146 fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep, 3147 fmep->e0r, wull); 3148 out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer); 3149 fmep->wull = wull; 3150 return (1); 3151 } 3152 3153 void 3154 fme_timer_fired(struct fme *fmep, id_t tid) 3155 { 3156 struct fme *ffmep = NULL; 3157 3158 for (ffmep = FMElist; ffmep; ffmep = ffmep->next) 3159 if (ffmep == fmep) 3160 break; 3161 3162 if (ffmep == NULL) { 3163 out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.", 3164 (void *)fmep); 3165 return; 3166 } 3167 3168 out(O_ALTFP|O_VERB, "Timer fired %lx", tid); 3169 fmep->pull = fmep->wull; 3170 fmep->wull = 0; 3171 fmd_buf_write(fmep->hdl, fmep->fmcase, 3172 WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull)); 3173 3174 fme_eval(fmep, fmep->e0r); 3175 } 3176 3177 /* 3178 * Preserve the fme's suspect list in its psuspects list, NULLing the 3179 * suspects list in the meantime. 3180 */ 3181 static void 3182 save_suspects(struct fme *fmep) 3183 { 3184 struct event *ep; 3185 struct event *nextep; 3186 3187 /* zero out the previous suspect list */ 3188 for (ep = fmep->psuspects; ep; ep = nextep) { 3189 nextep = ep->psuspects; 3190 ep->psuspects = NULL; 3191 } 3192 fmep->psuspects = NULL; 3193 3194 /* zero out the suspect list, copying it to previous suspect list */ 3195 fmep->psuspects = fmep->suspects; 3196 for (ep = fmep->suspects; ep; ep = nextep) { 3197 nextep = ep->suspects; 3198 ep->psuspects = ep->suspects; 3199 ep->suspects = NULL; 3200 ep->is_suspect = 0; 3201 } 3202 fmep->suspects = NULL; 3203 fmep->nsuspects = 0; 3204 fmep->nonfault = 0; 3205 } 3206 3207 /* 3208 * Retrieve the fme's suspect list from its psuspects list. 
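 * As the list is copied back from psuspects, nsuspects and nonfault are
 * recomputed so the counts stay consistent with the restored list.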
3209 */ 3210 static void 3211 restore_suspects(struct fme *fmep) 3212 { 3213 struct event *ep; 3214 struct event *nextep; 3215 3216 fmep->nsuspects = fmep->nonfault = 0; 3217 fmep->suspects = fmep->psuspects; 3218 for (ep = fmep->psuspects; ep; ep = nextep) { 3219 fmep->nsuspects++; 3220 if (!is_fault(ep->t)) 3221 fmep->nonfault++; 3222 nextep = ep->psuspects; 3223 ep->suspects = ep->psuspects; 3224 } 3225 } 3226 3227 /* 3228 * this is what we use to call the Emrys prototype code instead of main() 3229 */ 3230 static void 3231 fme_eval(struct fme *fmep, fmd_event_t *ffep) 3232 { 3233 struct event *ep; 3234 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 3235 struct rsl *srl = NULL; 3236 struct rsl *srl2 = NULL; 3237 int mess_zero_count; 3238 int mess_zero_nonfault = 0; 3239 int rpcnt; 3240 3241 save_suspects(fmep); 3242 3243 out(O_ALTFP, "Evaluate FME %d", fmep->id); 3244 indent_set(" "); 3245 3246 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 3247 fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 3248 3249 out(O_ALTFP|O_NONL, "FME%d state: %s, suspect list:", fmep->id, 3250 fme_state2str(fmep->state)); 3251 for (ep = fmep->suspects; ep; ep = ep->suspects) { 3252 out(O_ALTFP|O_NONL, " "); 3253 itree_pevent_brief(O_ALTFP|O_NONL, ep); 3254 } 3255 out(O_ALTFP, NULL); 3256 3257 switch (fmep->state) { 3258 case FME_CREDIBLE: 3259 print_suspects(SLNEW, fmep); 3260 (void) upsets_eval(fmep, ffep); 3261 3262 /* 3263 * we may have already posted suspects in upsets_eval() which 3264 * can recurse into fme_eval() again. If so then just return. 3265 */ 3266 if (fmep->posted_suspects) 3267 return; 3268 3269 stats_counter_bump(fmep->diags); 3270 rpcnt = fmep->nsuspects; 3271 save_suspects(fmep); 3272 3273 /* 3274 * create two lists, one for "message=1" faults and one for 3275 * "message=0" faults. If we have a mixture we will generate 3276 * two separate suspect lists. 3277 */ 3278 srl = MALLOC(rpcnt * sizeof (struct rsl)); 3279 bzero(srl, rpcnt * sizeof (struct rsl)); 3280 srl2 = MALLOC(rpcnt * sizeof (struct rsl)); 3281 bzero(srl2, rpcnt * sizeof (struct rsl)); 3282 mess_zero_count = trim_suspects(fmep, srl, srl2, ffep, 3283 &mess_zero_nonfault); 3284 3285 /* 3286 * If the resulting suspect list has no members, we're 3287 * done so simply close the case. Otherwise sort and publish. 3288 */ 3289 if (fmep->nsuspects == 0 && mess_zero_count == 0) { 3290 out(O_ALTFP, 3291 "[FME%d, case %s (all suspects are upsets)]", 3292 fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3293 fmd_case_close(fmep->hdl, fmep->fmcase); 3294 } else if (fmep->nsuspects != 0 && mess_zero_count == 0) { 3295 publish_suspects(fmep, srl); 3296 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3297 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3298 fmd_case_solve(fmep->hdl, fmep->fmcase); 3299 } else if (fmep->nsuspects == 0 && mess_zero_count != 0) { 3300 fmep->nsuspects = mess_zero_count; 3301 fmep->nonfault = mess_zero_nonfault; 3302 publish_suspects(fmep, srl2); 3303 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3304 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3305 fmd_case_solve(fmep->hdl, fmep->fmcase); 3306 } else { 3307 struct event *obsp; 3308 struct fme *nfmep; 3309 3310 publish_suspects(fmep, srl); 3311 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3312 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3313 fmd_case_solve(fmep->hdl, fmep->fmcase); 3314 3315 /* 3316 * Got both message=0 and message=1 so create a 3317 * duplicate case. 
Also need a temporary duplicate fme 3318 * structure for use by publish_suspects(). 3319 */ 3320 nfmep = alloc_fme(); 3321 nfmep->id = Nextid++; 3322 nfmep->hdl = fmep->hdl; 3323 nfmep->nsuspects = mess_zero_count; 3324 nfmep->nonfault = mess_zero_nonfault; 3325 nfmep->fmcase = fmd_case_open(fmep->hdl, NULL); 3326 out(O_ALTFP|O_STAMP, 3327 "[creating parallel FME%d, case %s]", nfmep->id, 3328 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 3329 Open_fme_count++; 3330 if (ffep) { 3331 fmd_case_setprincipal(nfmep->hdl, 3332 nfmep->fmcase, ffep); 3333 fmd_case_add_ereport(nfmep->hdl, 3334 nfmep->fmcase, ffep); 3335 } 3336 for (obsp = fmep->observations; obsp; 3337 obsp = obsp->observations) 3338 if (obsp->ffep && obsp->ffep != ffep) 3339 fmd_case_add_ereport(nfmep->hdl, 3340 nfmep->fmcase, obsp->ffep); 3341 3342 publish_suspects(nfmep, srl2); 3343 out(O_ALTFP, "[solving FME%d, case %s]", nfmep->id, 3344 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 3345 fmd_case_solve(nfmep->hdl, nfmep->fmcase); 3346 FREE(nfmep); 3347 } 3348 FREE(srl); 3349 FREE(srl2); 3350 restore_suspects(fmep); 3351 3352 fmep->posted_suspects = 1; 3353 fmd_buf_write(fmep->hdl, fmep->fmcase, 3354 WOBUF_POSTD, 3355 (void *)&fmep->posted_suspects, 3356 sizeof (fmep->posted_suspects)); 3357 3358 /* 3359 * Now the suspects have been posted, we can clear up 3360 * the instance tree as we won't be looking at it again. 3361 * Also cancel the timer as the case is now solved. 3362 */ 3363 if (fmep->wull != 0) { 3364 fmd_timer_remove(fmep->hdl, fmep->timer); 3365 fmep->wull = 0; 3366 } 3367 break; 3368 3369 case FME_WAIT: 3370 ASSERT(my_delay > fmep->ull); 3371 (void) fme_set_timer(fmep, my_delay); 3372 print_suspects(SLWAIT, fmep); 3373 itree_prune(fmep->eventtree); 3374 return; 3375 3376 case FME_DISPROVED: 3377 print_suspects(SLDISPROVED, fmep); 3378 Undiag_reason = UD_UNSOLVD; 3379 fme_undiagnosable(fmep); 3380 break; 3381 } 3382 3383 itree_free(fmep->eventtree); 3384 fmep->eventtree = NULL; 3385 structconfig_free(fmep->config); 3386 fmep->config = NULL; 3387 destroy_fme_bufs(fmep); 3388 } 3389 3390 static void indent(void); 3391 static int triggered(struct fme *fmep, struct event *ep, int mark); 3392 static enum fme_state effects_test(struct fme *fmep, 3393 struct event *fault_event, unsigned long long at_latest_by, 3394 unsigned long long *pdelay); 3395 static enum fme_state requirements_test(struct fme *fmep, struct event *ep, 3396 unsigned long long at_latest_by, unsigned long long *pdelay); 3397 static enum fme_state causes_test(struct fme *fmep, struct event *ep, 3398 unsigned long long at_latest_by, unsigned long long *pdelay); 3399 3400 static int 3401 checkconstraints(struct fme *fmep, struct arrow *arrowp) 3402 { 3403 struct constraintlist *ctp; 3404 struct evalue value; 3405 char *sep = ""; 3406 3407 if (arrowp->forever_false) { 3408 indent(); 3409 out(O_ALTFP|O_VERB|O_NONL, " Forever false constraint: "); 3410 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3411 out(O_ALTFP|O_VERB|O_NONL, sep); 3412 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3413 sep = ", "; 3414 } 3415 out(O_ALTFP|O_VERB, NULL); 3416 return (0); 3417 } 3418 if (arrowp->forever_true) { 3419 indent(); 3420 out(O_ALTFP|O_VERB|O_NONL, " Forever true constraint: "); 3421 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3422 out(O_ALTFP|O_VERB|O_NONL, sep); 3423 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3424 sep = ", "; 3425 } 3426 out(O_ALTFP|O_VERB, NULL); 3427 return (1); 3428 } 3429 3430 for (ctp = arrowp->constraints; 
ctp != NULL; ctp = ctp->next) { 3431 if (eval_expr(ctp->cnode, NULL, NULL, 3432 &fmep->globals, fmep->config, 3433 arrowp, 0, &value)) { 3434 /* evaluation successful */ 3435 if (value.t == UNDEFINED || value.v == 0) { 3436 /* known false */ 3437 arrowp->forever_false = 1; 3438 indent(); 3439 out(O_ALTFP|O_VERB|O_NONL, 3440 " False constraint: "); 3441 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3442 out(O_ALTFP|O_VERB, NULL); 3443 return (0); 3444 } 3445 } else { 3446 /* evaluation unsuccessful -- unknown value */ 3447 indent(); 3448 out(O_ALTFP|O_VERB|O_NONL, 3449 " Deferred constraint: "); 3450 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3451 out(O_ALTFP|O_VERB, NULL); 3452 return (1); 3453 } 3454 } 3455 /* known true */ 3456 arrowp->forever_true = 1; 3457 indent(); 3458 out(O_ALTFP|O_VERB|O_NONL, " True constraint: "); 3459 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3460 out(O_ALTFP|O_VERB|O_NONL, sep); 3461 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3462 sep = ", "; 3463 } 3464 out(O_ALTFP|O_VERB, NULL); 3465 return (1); 3466 } 3467 3468 static int 3469 triggered(struct fme *fmep, struct event *ep, int mark) 3470 { 3471 struct bubble *bp; 3472 struct arrowlist *ap; 3473 int count = 0; 3474 3475 stats_counter_bump(fmep->Tcallcount); 3476 for (bp = itree_next_bubble(ep, NULL); bp; 3477 bp = itree_next_bubble(ep, bp)) { 3478 if (bp->t != B_TO) 3479 continue; 3480 for (ap = itree_next_arrow(bp, NULL); ap; 3481 ap = itree_next_arrow(bp, ap)) { 3482 /* check count of marks against K in the bubble */ 3483 if ((ap->arrowp->mark & mark) && 3484 ++count >= bp->nork) 3485 return (1); 3486 } 3487 } 3488 return (0); 3489 } 3490 3491 static int 3492 mark_arrows(struct fme *fmep, struct event *ep, int mark, 3493 unsigned long long at_latest_by, unsigned long long *pdelay, int keep) 3494 { 3495 struct bubble *bp; 3496 struct arrowlist *ap; 3497 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3498 unsigned long long my_delay; 3499 enum fme_state result; 3500 int retval = 0; 3501 3502 for (bp = itree_next_bubble(ep, NULL); bp; 3503 bp = itree_next_bubble(ep, bp)) { 3504 if (bp->t != B_FROM) 3505 continue; 3506 stats_counter_bump(fmep->Marrowcount); 3507 for (ap = itree_next_arrow(bp, NULL); ap; 3508 ap = itree_next_arrow(bp, ap)) { 3509 struct event *ep2 = ap->arrowp->head->myevent; 3510 /* 3511 * if we're clearing marks, we can avoid doing 3512 * all that work evaluating constraints. 
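 * A mark value of 0 selects this clearing pass: previously marked
 * arrows have arrow_marked and the EFFECTS_COUNTER bit cleared, the
 * head event loses its WAIT_EFFECT/CREDIBLE_EFFECT/PARENT_WAIT cached
 * state (optionally remembering it via keep_in_tree when "keep" is
 * set), and we recurse to clear the subtree -- without re-evaluating
 * any constraints.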
3513 */ 3514 if (mark == 0) { 3515 if (ap->arrowp->arrow_marked == 0) 3516 continue; 3517 ap->arrowp->arrow_marked = 0; 3518 ap->arrowp->mark &= ~EFFECTS_COUNTER; 3519 if (keep && (ep2->cached_state & 3520 (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT))) 3521 ep2->keep_in_tree = 1; 3522 ep2->cached_state &= 3523 ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT); 3524 (void) mark_arrows(fmep, ep2, mark, 0, NULL, 3525 keep); 3526 continue; 3527 } 3528 ap->arrowp->arrow_marked = 1; 3529 if (ep2->cached_state & REQMNTS_DISPROVED) { 3530 indent(); 3531 out(O_ALTFP|O_VERB|O_NONL, 3532 " ALREADY DISPROVED "); 3533 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3534 out(O_ALTFP|O_VERB, NULL); 3535 continue; 3536 } 3537 if (ep2->cached_state & WAIT_EFFECT) { 3538 indent(); 3539 out(O_ALTFP|O_VERB|O_NONL, 3540 " ALREADY EFFECTS WAIT "); 3541 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3542 out(O_ALTFP|O_VERB, NULL); 3543 continue; 3544 } 3545 if (ep2->cached_state & CREDIBLE_EFFECT) { 3546 indent(); 3547 out(O_ALTFP|O_VERB|O_NONL, 3548 " ALREADY EFFECTS CREDIBLE "); 3549 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3550 out(O_ALTFP|O_VERB, NULL); 3551 continue; 3552 } 3553 if ((ep2->cached_state & PARENT_WAIT) && 3554 (mark & PARENT_WAIT)) { 3555 indent(); 3556 out(O_ALTFP|O_VERB|O_NONL, 3557 " ALREADY PARENT EFFECTS WAIT "); 3558 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3559 out(O_ALTFP|O_VERB, NULL); 3560 continue; 3561 } 3562 platform_set_payloadnvp(ep2->nvp); 3563 if (checkconstraints(fmep, ap->arrowp) == 0) { 3564 platform_set_payloadnvp(NULL); 3565 indent(); 3566 out(O_ALTFP|O_VERB|O_NONL, 3567 " CONSTRAINTS FAIL "); 3568 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3569 out(O_ALTFP|O_VERB, NULL); 3570 continue; 3571 } 3572 platform_set_payloadnvp(NULL); 3573 ap->arrowp->mark |= EFFECTS_COUNTER; 3574 if (!triggered(fmep, ep2, EFFECTS_COUNTER)) { 3575 indent(); 3576 out(O_ALTFP|O_VERB|O_NONL, 3577 " K-COUNT NOT YET MET "); 3578 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3579 out(O_ALTFP|O_VERB, NULL); 3580 continue; 3581 } 3582 ep2->cached_state &= ~PARENT_WAIT; 3583 /* 3584 * if we've reached an ereport and no propagation time 3585 * is specified, use the Hesitate value 3586 */ 3587 if (ep2->t == N_EREPORT && at_latest_by == 0ULL && 3588 ap->arrowp->maxdelay == 0ULL) { 3589 out(O_ALTFP|O_VERB|O_NONL, " default wait "); 3590 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3591 out(O_ALTFP|O_VERB, NULL); 3592 result = requirements_test(fmep, ep2, Hesitate, 3593 &my_delay); 3594 } else { 3595 result = requirements_test(fmep, ep2, 3596 at_latest_by + ap->arrowp->maxdelay, 3597 &my_delay); 3598 } 3599 if (result == FME_WAIT) { 3600 retval = WAIT_EFFECT; 3601 if (overall_delay > my_delay) 3602 overall_delay = my_delay; 3603 ep2->cached_state |= WAIT_EFFECT; 3604 indent(); 3605 out(O_ALTFP|O_VERB|O_NONL, " EFFECTS WAIT "); 3606 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3607 out(O_ALTFP|O_VERB, NULL); 3608 indent_push(" E"); 3609 if (mark_arrows(fmep, ep2, PARENT_WAIT, 3610 at_latest_by, &my_delay, 0) == 3611 WAIT_EFFECT) { 3612 retval = WAIT_EFFECT; 3613 if (overall_delay > my_delay) 3614 overall_delay = my_delay; 3615 } 3616 indent_pop(); 3617 } else if (result == FME_DISPROVED) { 3618 indent(); 3619 out(O_ALTFP|O_VERB|O_NONL, 3620 " EFFECTS DISPROVED "); 3621 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3622 out(O_ALTFP|O_VERB, NULL); 3623 } else { 3624 ep2->cached_state |= mark; 3625 indent(); 3626 if (mark == CREDIBLE_EFFECT) 3627 out(O_ALTFP|O_VERB|O_NONL, 3628 " EFFECTS 
CREDIBLE "); 3629 else 3630 out(O_ALTFP|O_VERB|O_NONL, 3631 " PARENT EFFECTS WAIT "); 3632 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3633 out(O_ALTFP|O_VERB, NULL); 3634 indent_push(" E"); 3635 if (mark_arrows(fmep, ep2, mark, at_latest_by, 3636 &my_delay, 0) == WAIT_EFFECT) { 3637 retval = WAIT_EFFECT; 3638 if (overall_delay > my_delay) 3639 overall_delay = my_delay; 3640 } 3641 indent_pop(); 3642 } 3643 } 3644 } 3645 if (retval == WAIT_EFFECT) 3646 *pdelay = overall_delay; 3647 return (retval); 3648 } 3649 3650 static enum fme_state 3651 effects_test(struct fme *fmep, struct event *fault_event, 3652 unsigned long long at_latest_by, unsigned long long *pdelay) 3653 { 3654 struct event *error_event; 3655 enum fme_state return_value = FME_CREDIBLE; 3656 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3657 unsigned long long my_delay; 3658 3659 stats_counter_bump(fmep->Ecallcount); 3660 indent_push(" E"); 3661 indent(); 3662 out(O_ALTFP|O_VERB|O_NONL, "->"); 3663 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3664 out(O_ALTFP|O_VERB, NULL); 3665 3666 if (mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by, 3667 &my_delay, 0) == WAIT_EFFECT) { 3668 return_value = FME_WAIT; 3669 if (overall_delay > my_delay) 3670 overall_delay = my_delay; 3671 } 3672 for (error_event = fmep->observations; 3673 error_event; error_event = error_event->observations) { 3674 indent(); 3675 out(O_ALTFP|O_VERB|O_NONL, " "); 3676 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event); 3677 if (!(error_event->cached_state & CREDIBLE_EFFECT)) { 3678 if (error_event->cached_state & 3679 (PARENT_WAIT|WAIT_EFFECT)) { 3680 out(O_ALTFP|O_VERB, " NOT YET triggered"); 3681 continue; 3682 } 3683 return_value = FME_DISPROVED; 3684 out(O_ALTFP|O_VERB, " NOT triggered"); 3685 break; 3686 } else { 3687 out(O_ALTFP|O_VERB, " triggered"); 3688 } 3689 } 3690 if (return_value == FME_DISPROVED) { 3691 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0); 3692 } else { 3693 fault_event->keep_in_tree = 1; 3694 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1); 3695 } 3696 3697 indent(); 3698 out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ", 3699 fme_state2str(return_value)); 3700 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3701 out(O_ALTFP|O_VERB, NULL); 3702 indent_pop(); 3703 if (return_value == FME_WAIT) 3704 *pdelay = overall_delay; 3705 return (return_value); 3706 } 3707 3708 static enum fme_state 3709 requirements_test(struct fme *fmep, struct event *ep, 3710 unsigned long long at_latest_by, unsigned long long *pdelay) 3711 { 3712 int waiting_events; 3713 int credible_events; 3714 int deferred_events; 3715 enum fme_state return_value = FME_CREDIBLE; 3716 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3717 unsigned long long arrow_delay; 3718 unsigned long long my_delay; 3719 struct event *ep2; 3720 struct bubble *bp; 3721 struct arrowlist *ap; 3722 3723 if (ep->cached_state & REQMNTS_CREDIBLE) { 3724 indent(); 3725 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY CREDIBLE "); 3726 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3727 out(O_ALTFP|O_VERB, NULL); 3728 return (FME_CREDIBLE); 3729 } 3730 if (ep->cached_state & REQMNTS_DISPROVED) { 3731 indent(); 3732 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY DISPROVED "); 3733 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3734 out(O_ALTFP|O_VERB, NULL); 3735 return (FME_DISPROVED); 3736 } 3737 if (ep->cached_state & REQMNTS_WAIT) { 3738 indent(); 3739 *pdelay = ep->cached_delay; 3740 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY WAIT "); 3741 
itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3742 out(O_ALTFP|O_VERB|O_NONL, ", wait for: "); 3743 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3744 out(O_ALTFP|O_VERB, NULL); 3745 return (FME_WAIT); 3746 } 3747 stats_counter_bump(fmep->Rcallcount); 3748 indent_push(" R"); 3749 indent(); 3750 out(O_ALTFP|O_VERB|O_NONL, "->"); 3751 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3752 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 3753 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3754 out(O_ALTFP|O_VERB, NULL); 3755 3756 if (ep->t == N_EREPORT) { 3757 if (ep->count == 0) { 3758 if (fmep->pull >= at_latest_by) { 3759 return_value = FME_DISPROVED; 3760 } else { 3761 ep->cached_delay = *pdelay = at_latest_by; 3762 return_value = FME_WAIT; 3763 } 3764 } 3765 3766 indent(); 3767 switch (return_value) { 3768 case FME_CREDIBLE: 3769 ep->cached_state |= REQMNTS_CREDIBLE; 3770 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE "); 3771 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3772 break; 3773 case FME_DISPROVED: 3774 ep->cached_state |= REQMNTS_DISPROVED; 3775 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 3776 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3777 break; 3778 case FME_WAIT: 3779 ep->cached_state |= REQMNTS_WAIT; 3780 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT "); 3781 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3782 out(O_ALTFP|O_VERB|O_NONL, " to "); 3783 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3784 break; 3785 default: 3786 out(O_DIE, "requirements_test: unexpected fme_state"); 3787 break; 3788 } 3789 out(O_ALTFP|O_VERB, NULL); 3790 indent_pop(); 3791 3792 return (return_value); 3793 } 3794 3795 /* this event is not a report, descend the tree */ 3796 for (bp = itree_next_bubble(ep, NULL); bp; 3797 bp = itree_next_bubble(ep, bp)) { 3798 int n; 3799 3800 if (bp->t != B_FROM) 3801 continue; 3802 3803 n = bp->nork; 3804 3805 credible_events = 0; 3806 waiting_events = 0; 3807 deferred_events = 0; 3808 arrow_delay = TIMEVAL_EVENTUALLY; 3809 /* 3810 * n is -1 for 'A' so adjust it. 3811 * XXX just count up the arrows for now. 3812 */ 3813 if (n < 0) { 3814 n = 0; 3815 for (ap = itree_next_arrow(bp, NULL); ap; 3816 ap = itree_next_arrow(bp, ap)) 3817 n++; 3818 indent(); 3819 out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n); 3820 } else { 3821 indent(); 3822 out(O_ALTFP|O_VERB, " Bubble N=%d", n); 3823 } 3824 3825 if (n == 0) 3826 continue; 3827 if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) { 3828 for (ap = itree_next_arrow(bp, NULL); ap; 3829 ap = itree_next_arrow(bp, ap)) { 3830 ep2 = ap->arrowp->head->myevent; 3831 platform_set_payloadnvp(ep2->nvp); 3832 (void) checkconstraints(fmep, ap->arrowp); 3833 if (ap->arrowp->forever_true) { 3834 /* 3835 * if all arrows are invalidated by the 3836 * constraints, then we should elide the 3837 * whole bubble to be consistant with 3838 * the tree creation time behaviour 3839 */ 3840 bp->mark |= BUBBLE_OK; 3841 platform_set_payloadnvp(NULL); 3842 break; 3843 } 3844 platform_set_payloadnvp(NULL); 3845 } 3846 } 3847 for (ap = itree_next_arrow(bp, NULL); ap; 3848 ap = itree_next_arrow(bp, ap)) { 3849 ep2 = ap->arrowp->head->myevent; 3850 if (n <= credible_events) 3851 break; 3852 3853 ap->arrowp->mark |= REQMNTS_COUNTER; 3854 if (triggered(fmep, ep2, REQMNTS_COUNTER)) 3855 /* XXX adding max timevals! 
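 * (The deadline handed to the child requirement is at_latest_by plus
 * this arrow's maxdelay, i.e. per-arrow maximum delays are simply
 * summed along the propagation path; the XXX above presumably flags
 * that as a known simplification.)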
*/ 3856 switch (requirements_test(fmep, ep2, 3857 at_latest_by + ap->arrowp->maxdelay, 3858 &my_delay)) { 3859 case FME_DEFERRED: 3860 deferred_events++; 3861 break; 3862 case FME_CREDIBLE: 3863 credible_events++; 3864 break; 3865 case FME_DISPROVED: 3866 break; 3867 case FME_WAIT: 3868 if (my_delay < arrow_delay) 3869 arrow_delay = my_delay; 3870 waiting_events++; 3871 break; 3872 default: 3873 out(O_DIE, 3874 "Bug in requirements_test."); 3875 } 3876 else 3877 deferred_events++; 3878 } 3879 if (!(bp->mark & BUBBLE_OK) && waiting_events == 0) { 3880 bp->mark |= BUBBLE_ELIDED; 3881 continue; 3882 } 3883 indent(); 3884 out(O_ALTFP|O_VERB, " Credible: %d Waiting %d", 3885 credible_events + deferred_events, waiting_events); 3886 if (credible_events + deferred_events + waiting_events < n) { 3887 /* Can never meet requirements */ 3888 ep->cached_state |= REQMNTS_DISPROVED; 3889 indent(); 3890 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 3891 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3892 out(O_ALTFP|O_VERB, NULL); 3893 indent_pop(); 3894 return (FME_DISPROVED); 3895 } 3896 if (credible_events + deferred_events < n) { 3897 /* will have to wait */ 3898 /* wait time is shortest known */ 3899 if (arrow_delay < overall_delay) 3900 overall_delay = arrow_delay; 3901 return_value = FME_WAIT; 3902 } else if (credible_events < n) { 3903 if (return_value != FME_WAIT) 3904 return_value = FME_DEFERRED; 3905 } 3906 } 3907 3908 /* 3909 * don't mark as FME_DEFERRED. If this event isn't reached by another 3910 * path, then this will be considered FME_CREDIBLE. But if it is 3911 * reached by a different path so the K-count is met, then might 3912 * get overridden by FME_WAIT or FME_DISPROVED. 3913 */ 3914 if (return_value == FME_WAIT) { 3915 ep->cached_state |= REQMNTS_WAIT; 3916 ep->cached_delay = *pdelay = overall_delay; 3917 } else if (return_value == FME_CREDIBLE) { 3918 ep->cached_state |= REQMNTS_CREDIBLE; 3919 } 3920 indent(); 3921 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ", 3922 fme_state2str(return_value)); 3923 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3924 out(O_ALTFP|O_VERB, NULL); 3925 indent_pop(); 3926 return (return_value); 3927 } 3928 3929 static enum fme_state 3930 causes_test(struct fme *fmep, struct event *ep, 3931 unsigned long long at_latest_by, unsigned long long *pdelay) 3932 { 3933 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3934 unsigned long long my_delay; 3935 int credible_results = 0; 3936 int waiting_results = 0; 3937 enum fme_state fstate; 3938 struct event *tail_event; 3939 struct bubble *bp; 3940 struct arrowlist *ap; 3941 int k = 1; 3942 3943 stats_counter_bump(fmep->Ccallcount); 3944 indent_push(" C"); 3945 indent(); 3946 out(O_ALTFP|O_VERB|O_NONL, "->"); 3947 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3948 out(O_ALTFP|O_VERB, NULL); 3949 3950 for (bp = itree_next_bubble(ep, NULL); bp; 3951 bp = itree_next_bubble(ep, bp)) { 3952 if (bp->t != B_TO) 3953 continue; 3954 k = bp->nork; /* remember the K value */ 3955 for (ap = itree_next_arrow(bp, NULL); ap; 3956 ap = itree_next_arrow(bp, ap)) { 3957 int do_not_follow = 0; 3958 3959 /* 3960 * if we get to the same event multiple times 3961 * only worry about the first one. 
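 * This is tracked with the CAUSES_TESTED bit in the tail event's
 * cached_state: the bit is tested right below, and set just before the
 * tail event is handed to hypothesise().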
3962 */ 3963 if (ap->arrowp->tail->myevent->cached_state & 3964 CAUSES_TESTED) { 3965 indent(); 3966 out(O_ALTFP|O_VERB|O_NONL, 3967 " causes test already run for "); 3968 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 3969 ap->arrowp->tail->myevent); 3970 out(O_ALTFP|O_VERB, NULL); 3971 continue; 3972 } 3973 3974 /* 3975 * see if false constraint prevents us 3976 * from traversing this arrow 3977 */ 3978 platform_set_payloadnvp(ep->nvp); 3979 if (checkconstraints(fmep, ap->arrowp) == 0) 3980 do_not_follow = 1; 3981 platform_set_payloadnvp(NULL); 3982 if (do_not_follow) { 3983 indent(); 3984 out(O_ALTFP|O_VERB|O_NONL, 3985 " False arrow from "); 3986 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 3987 ap->arrowp->tail->myevent); 3988 out(O_ALTFP|O_VERB, NULL); 3989 continue; 3990 } 3991 3992 ap->arrowp->tail->myevent->cached_state |= 3993 CAUSES_TESTED; 3994 tail_event = ap->arrowp->tail->myevent; 3995 fstate = hypothesise(fmep, tail_event, at_latest_by, 3996 &my_delay); 3997 3998 switch (fstate) { 3999 case FME_WAIT: 4000 if (my_delay < overall_delay) 4001 overall_delay = my_delay; 4002 waiting_results++; 4003 break; 4004 case FME_CREDIBLE: 4005 credible_results++; 4006 break; 4007 case FME_DISPROVED: 4008 break; 4009 default: 4010 out(O_DIE, "Bug in causes_test"); 4011 } 4012 } 4013 } 4014 /* compare against K */ 4015 if (credible_results + waiting_results < k) { 4016 indent(); 4017 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED "); 4018 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4019 out(O_ALTFP|O_VERB, NULL); 4020 indent_pop(); 4021 return (FME_DISPROVED); 4022 } 4023 if (waiting_results != 0) { 4024 *pdelay = overall_delay; 4025 indent(); 4026 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT "); 4027 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4028 out(O_ALTFP|O_VERB|O_NONL, " to "); 4029 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 4030 out(O_ALTFP|O_VERB, NULL); 4031 indent_pop(); 4032 return (FME_WAIT); 4033 } 4034 indent(); 4035 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES CREDIBLE "); 4036 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4037 out(O_ALTFP|O_VERB, NULL); 4038 indent_pop(); 4039 return (FME_CREDIBLE); 4040 } 4041 4042 static enum fme_state 4043 hypothesise(struct fme *fmep, struct event *ep, 4044 unsigned long long at_latest_by, unsigned long long *pdelay) 4045 { 4046 enum fme_state rtr, otr; 4047 unsigned long long my_delay; 4048 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 4049 4050 stats_counter_bump(fmep->Hcallcount); 4051 indent_push(" H"); 4052 indent(); 4053 out(O_ALTFP|O_VERB|O_NONL, "->"); 4054 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4055 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 4056 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 4057 out(O_ALTFP|O_VERB, NULL); 4058 4059 rtr = requirements_test(fmep, ep, at_latest_by, &my_delay); 4060 if ((rtr == FME_WAIT) && (my_delay < overall_delay)) 4061 overall_delay = my_delay; 4062 if (rtr != FME_DISPROVED) { 4063 if (is_problem(ep->t)) { 4064 otr = effects_test(fmep, ep, at_latest_by, &my_delay); 4065 if (otr != FME_DISPROVED) { 4066 if (fmep->peek == 0 && ep->is_suspect == 0) { 4067 ep->suspects = fmep->suspects; 4068 ep->is_suspect = 1; 4069 fmep->suspects = ep; 4070 fmep->nsuspects++; 4071 if (!is_fault(ep->t)) 4072 fmep->nonfault++; 4073 } 4074 } 4075 } else 4076 otr = causes_test(fmep, ep, at_latest_by, &my_delay); 4077 if ((otr == FME_WAIT) && (my_delay < overall_delay)) 4078 overall_delay = my_delay; 4079 if ((otr != FME_DISPROVED) && 4080 ((rtr == FME_WAIT) || (otr == FME_WAIT))) 4081 
*pdelay = overall_delay; 4082 } 4083 if (rtr == FME_DISPROVED) { 4084 indent(); 4085 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4086 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4087 out(O_ALTFP|O_VERB, " (doesn't meet requirements)"); 4088 indent_pop(); 4089 return (FME_DISPROVED); 4090 } 4091 if ((otr == FME_DISPROVED) && is_problem(ep->t)) { 4092 indent(); 4093 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4094 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4095 out(O_ALTFP|O_VERB, " (doesn't explain all reports)"); 4096 indent_pop(); 4097 return (FME_DISPROVED); 4098 } 4099 if (otr == FME_DISPROVED) { 4100 indent(); 4101 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4102 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4103 out(O_ALTFP|O_VERB, " (causes are not credible)"); 4104 indent_pop(); 4105 return (FME_DISPROVED); 4106 } 4107 if ((rtr == FME_WAIT) || (otr == FME_WAIT)) { 4108 indent(); 4109 out(O_ALTFP|O_VERB|O_NONL, "<-WAIT "); 4110 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4111 out(O_ALTFP|O_VERB|O_NONL, " to "); 4112 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay); 4113 out(O_ALTFP|O_VERB, NULL); 4114 indent_pop(); 4115 return (FME_WAIT); 4116 } 4117 indent(); 4118 out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE "); 4119 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4120 out(O_ALTFP|O_VERB, NULL); 4121 indent_pop(); 4122 return (FME_CREDIBLE); 4123 } 4124 4125 /* 4126 * fme_istat_load -- reconstitute any persistent istats 4127 */ 4128 void 4129 fme_istat_load(fmd_hdl_t *hdl) 4130 { 4131 int sz; 4132 char *sbuf; 4133 char *ptr; 4134 4135 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) { 4136 out(O_ALTFP, "fme_istat_load: No stats"); 4137 return; 4138 } 4139 4140 sbuf = alloca(sz); 4141 4142 fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz); 4143 4144 /* 4145 * pick apart the serialized stats 4146 * 4147 * format is: 4148 * <class-name>, '@', <path>, '\0', <value>, '\0' 4149 * for example: 4150 * "stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0" 4151 * 4152 * since this is parsing our own serialized data, any parsing issues 4153 * are fatal, so we check for them all with ASSERT() below. 4154 */ 4155 ptr = sbuf; 4156 while (ptr < &sbuf[sz]) { 4157 char *sepptr; 4158 struct node *np; 4159 int val; 4160 4161 sepptr = strchr(ptr, '@'); 4162 ASSERT(sepptr != NULL); 4163 *sepptr = '\0'; 4164 4165 /* construct the event */ 4166 np = newnode(T_EVENT, NULL, 0); 4167 np->u.event.ename = newnode(T_NAME, NULL, 0); 4168 np->u.event.ename->u.name.t = N_STAT; 4169 np->u.event.ename->u.name.s = stable(ptr); 4170 np->u.event.ename->u.name.it = IT_ENAME; 4171 np->u.event.ename->u.name.last = np->u.event.ename; 4172 4173 ptr = sepptr + 1; 4174 ASSERT(ptr < &sbuf[sz]); 4175 ptr += strlen(ptr); 4176 ptr++; /* move past the '\0' separating path from value */ 4177 ASSERT(ptr < &sbuf[sz]); 4178 ASSERT(isdigit(*ptr)); 4179 val = atoi(ptr); 4180 ASSERT(val > 0); 4181 ptr += strlen(ptr); 4182 ptr++; /* move past the final '\0' for this entry */ 4183 4184 np->u.event.epname = pathstring2epnamenp(sepptr + 1); 4185 ASSERT(np->u.event.epname != NULL); 4186 4187 istat_bump(np, val); 4188 tree_free(np); 4189 } 4190 4191 istat_save(); 4192 } 4193
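
/*
 * The fragment below is a non-compiled illustration (guarded by #if 0) of
 * the WOBUF_ISTATS buffer layout documented in fme_istat_load() above:
 *
 *	<class-name> '@' <path> '\0' <value> '\0'  ...repeated...
 *
 * It uses only standard C and reads the buffer without modifying it; the
 * function name and the printf() output format are hypothetical and exist
 * purely for illustration, not as part of this module.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
istat_buf_dump(const char *sbuf, int sz)
{
	const char *ptr = sbuf;

	while (ptr < &sbuf[sz]) {
		/* the class name runs up to the '@' separator */
		const char *at = strchr(ptr, '@');
		/* the component path follows and is NUL-terminated */
		const char *path = at + 1;
		/* the decimal value string follows the path's NUL */
		const char *valstr = path + strlen(path) + 1;

		(void) printf("%.*s @ %s = %d\n",
		    (int)(at - ptr), ptr, path, atoi(valstr));

		/* step past this entry's final NUL to the next entry */
		ptr = valstr + strlen(valstr) + 1;
	}
}
#endif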