1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * 26 * fme.c -- fault management exercise module 27 * 28 * this module provides the simulated fault management exercise. 29 */ 30 31 #pragma ident "%Z%%M% %I% %E% SMI" 32 33 #include <stdio.h> 34 #include <stdlib.h> 35 #include <string.h> 36 #include <strings.h> 37 #include <ctype.h> 38 #include <alloca.h> 39 #include <libnvpair.h> 40 #include <sys/fm/protocol.h> 41 #include <fm/fmd_api.h> 42 #include "alloc.h" 43 #include "out.h" 44 #include "stats.h" 45 #include "stable.h" 46 #include "literals.h" 47 #include "lut.h" 48 #include "tree.h" 49 #include "ptree.h" 50 #include "itree.h" 51 #include "ipath.h" 52 #include "fme.h" 53 #include "evnv.h" 54 #include "eval.h" 55 #include "config.h" 56 #include "platform.h" 57 58 /* imported from eft.c... 
*/ 59 extern int Autoconvict; 60 extern char *Autoclose; 61 extern hrtime_t Hesitate; 62 extern nv_alloc_t Eft_nv_hdl; 63 64 /* fme under construction is global so we can free it on module abort */ 65 static struct fme *Nfmep; 66 67 static const char *Undiag_reason; 68 69 static int Nextid = 0; 70 71 /* list of fault management exercises underway */ 72 static struct fme { 73 struct fme *next; /* next exercise */ 74 unsigned long long ull; /* time when fme was created */ 75 int id; /* FME id */ 76 struct cfgdata *cfgdata; /* full configuration data */ 77 struct lut *eventtree; /* propagation tree for this FME */ 78 /* 79 * The initial error report that created this FME is kept in 80 * two forms. e0 points to the instance tree node and is used 81 * by fme_eval() as the starting point for the inference 82 * algorithm. e0r is the event handle FMD passed to us when 83 * the ereport first arrived and is used when setting timers, 84 * which are always relative to the time of this initial 85 * report. 
86 */ 87 struct event *e0; 88 fmd_event_t *e0r; 89 90 id_t timer; /* for setting an fmd time-out */ 91 id_t htid; /* for setting hesitation timer */ 92 93 struct event *ecurrent; /* ereport under consideration */ 94 struct event *suspects; /* current suspect list */ 95 struct event *psuspects; /* previous suspect list */ 96 int nsuspects; /* count of suspects */ 97 int nonfault; /* zero if all suspects T_FAULT */ 98 int posted_suspects; /* true if we've posted a diagnosis */ 99 int hesitated; /* true if we hesitated */ 100 int uniqobs; /* number of unique events observed */ 101 int peek; /* just peeking, don't track suspects */ 102 enum fme_state { 103 FME_NOTHING = 5000, /* not evaluated yet */ 104 FME_WAIT, /* need to wait for more info */ 105 FME_CREDIBLE, /* suspect list is credible */ 106 FME_DISPROVED /* no valid suspects found */ 107 } state; 108 109 unsigned long long pull; /* time passed since created */ 110 unsigned long long wull; /* wait until this time for re-eval */ 111 struct event *observations; /* observation list */ 112 struct lut *globals; /* values of global variables */ 113 /* fmd interfacing */ 114 fmd_hdl_t *hdl; /* handle for talking with fmd */ 115 fmd_case_t *fmcase; /* what fmd 'case' we associate with */ 116 /* stats */ 117 struct stats *Rcount; 118 struct stats *Hcallcount; 119 struct stats *Rcallcount; 120 struct stats *Ccallcount; 121 struct stats *Ecallcount; 122 struct stats *Tcallcount; 123 struct stats *Marrowcount; 124 struct stats *diags; 125 } *FMElist, *EFMElist, *ClosedFMEs; 126 127 static struct case_list { 128 fmd_case_t *fmcase; 129 struct case_list *next; 130 } *Undiagablecaselist; 131 132 static void fme_eval(struct fme *fmep, fmd_event_t *ffep); 133 static enum fme_state hypothesise(struct fme *fmep, struct event *ep, 134 unsigned long long at_latest_by, unsigned long long *pdelay, 135 struct arrow *arrowp); 136 static struct node *eventprop_lookup(struct event *ep, const char *propname); 137 static struct node 
*pathstring2epnamenp(char *path); 138 static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep); 139 static void restore_suspects(struct fme *fmep); 140 static void save_suspects(struct fme *fmep); 141 static void destroy_fme(struct fme *f); 142 static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 143 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl); 144 145 static struct fme * 146 alloc_fme(void) 147 { 148 struct fme *fmep; 149 150 fmep = MALLOC(sizeof (*fmep)); 151 bzero(fmep, sizeof (*fmep)); 152 return (fmep); 153 } 154 155 /* 156 * fme_ready -- called when all initialization of the FME (except for 157 * stats) has completed successfully. Adds the fme to global lists 158 * and establishes its stats. 159 */ 160 static struct fme * 161 fme_ready(struct fme *fmep) 162 { 163 char nbuf[100]; 164 165 Nfmep = NULL; /* don't need to free this on module abort now */ 166 167 if (EFMElist) { 168 EFMElist->next = fmep; 169 EFMElist = fmep; 170 } else 171 FMElist = EFMElist = fmep; 172 173 (void) sprintf(nbuf, "fme%d.Rcount", fmep->id); 174 fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0); 175 (void) sprintf(nbuf, "fme%d.Hcall", fmep->id); 176 fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1); 177 (void) sprintf(nbuf, "fme%d.Rcall", fmep->id); 178 fmep->Rcallcount = stats_new_counter(nbuf, 179 "calls to requirements_test()", 1); 180 (void) sprintf(nbuf, "fme%d.Ccall", fmep->id); 181 fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1); 182 (void) sprintf(nbuf, "fme%d.Ecall", fmep->id); 183 fmep->Ecallcount = 184 stats_new_counter(nbuf, "calls to effects_test()", 1); 185 (void) sprintf(nbuf, "fme%d.Tcall", fmep->id); 186 fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1); 187 (void) sprintf(nbuf, "fme%d.Marrow", fmep->id); 188 fmep->Marrowcount = stats_new_counter(nbuf, 189 "arrows marked by mark_arrows()", 1); 190 (void) sprintf(nbuf, "fme%d.diags", fmep->id); 
191 fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0); 192 193 out(O_ALTFP|O_VERB2, "newfme: config snapshot contains..."); 194 config_print(O_ALTFP|O_VERB2, fmep->cfgdata->cooked); 195 196 return (fmep); 197 } 198 199 static struct fme * 200 newfme(const char *e0class, const struct ipath *e0ipp) 201 { 202 struct cfgdata *cfgdata; 203 204 if ((cfgdata = config_snapshot()) == NULL) { 205 out(O_ALTFP, "newfme: NULL configuration"); 206 Undiag_reason = UD_NOCONF; 207 return (NULL); 208 } 209 210 Nfmep = alloc_fme(); 211 212 Nfmep->id = Nextid++; 213 Nfmep->cfgdata = cfgdata; 214 Nfmep->posted_suspects = 0; 215 Nfmep->uniqobs = 0; 216 Nfmep->state = FME_NOTHING; 217 Nfmep->pull = 0ULL; 218 219 Nfmep->fmcase = NULL; 220 Nfmep->hdl = NULL; 221 222 if ((Nfmep->eventtree = itree_create(cfgdata->cooked)) == NULL) { 223 out(O_ALTFP, "newfme: NULL instance tree"); 224 Undiag_reason = UD_INSTFAIL; 225 config_free(cfgdata); 226 FREE(Nfmep); 227 Nfmep = NULL; 228 return (NULL); 229 } 230 231 itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree); 232 233 if ((Nfmep->e0 = 234 itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) { 235 out(O_ALTFP, "newfme: e0 not in instance tree"); 236 Undiag_reason = UD_BADEVENTI; 237 itree_free(Nfmep->eventtree); 238 config_free(cfgdata); 239 FREE(Nfmep); 240 Nfmep = NULL; 241 return (NULL); 242 } 243 244 return (fme_ready(Nfmep)); 245 } 246 247 void 248 fme_fini(void) 249 { 250 struct fme *sfp, *fp; 251 struct case_list *ucasep, *nextcasep; 252 253 ucasep = Undiagablecaselist; 254 while (ucasep != NULL) { 255 nextcasep = ucasep->next; 256 FREE(ucasep); 257 ucasep = nextcasep; 258 } 259 Undiagablecaselist = NULL; 260 261 /* clean up closed fmes */ 262 fp = ClosedFMEs; 263 while (fp != NULL) { 264 sfp = fp->next; 265 destroy_fme(fp); 266 fp = sfp; 267 } 268 ClosedFMEs = NULL; 269 270 fp = FMElist; 271 while (fp != NULL) { 272 sfp = fp->next; 273 destroy_fme(fp); 274 fp = sfp; 275 } 276 FMElist = EFMElist = NULL; 277 278 /* if we 
were in the middle of creating an fme, free it now */ 279 if (Nfmep) { 280 destroy_fme(Nfmep); 281 Nfmep = NULL; 282 } 283 } 284 285 /* 286 * Allocated space for a buffer name. 20 bytes allows for 287 * a ridiculous 9,999,999 unique observations. 288 */ 289 #define OBBUFNMSZ 20 290 291 /* 292 * serialize_observation 293 * 294 * Create a recoverable version of the current observation 295 * (f->ecurrent). We keep a serialized version of each unique 296 * observation in order that we may resume correctly the fme in the 297 * correct state if eft or fmd crashes and we're restarted. 298 */ 299 static void 300 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp) 301 { 302 size_t pkdlen; 303 char tmpbuf[OBBUFNMSZ]; 304 char *pkd = NULL; 305 char *estr; 306 307 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs); 308 estr = ipath2str(cls, ipp); 309 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1); 310 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr, 311 strlen(estr) + 1); 312 FREE(estr); 313 314 if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) { 315 (void) snprintf(tmpbuf, 316 OBBUFNMSZ, "observed%d.nvp", fp->uniqobs); 317 if (nvlist_xpack(fp->ecurrent->nvp, 318 &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0) 319 out(O_DIE|O_SYS, "pack of observed nvl failed"); 320 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen); 321 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen); 322 FREE(pkd); 323 } 324 325 fp->uniqobs++; 326 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 327 sizeof (fp->uniqobs)); 328 } 329 330 /* 331 * init_fme_bufs -- We keep several bits of state about an fme for 332 * use if eft or fmd crashes and we're restarted. 
333 */ 334 static void 335 init_fme_bufs(struct fme *fp) 336 { 337 size_t cfglen = fp->cfgdata->nextfree - fp->cfgdata->begin; 338 339 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_CFGLEN, sizeof (cfglen)); 340 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_CFGLEN, (void *)&cfglen, 341 sizeof (cfglen)); 342 if (cfglen != 0) { 343 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_CFG, cfglen); 344 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_CFG, 345 fp->cfgdata->begin, cfglen); 346 } 347 348 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull)); 349 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull, 350 sizeof (fp->pull)); 351 352 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id)); 353 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id, 354 sizeof (fp->id)); 355 356 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs)); 357 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 358 sizeof (fp->uniqobs)); 359 360 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD, 361 sizeof (fp->posted_suspects)); 362 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD, 363 (void *)&fp->posted_suspects, sizeof (fp->posted_suspects)); 364 } 365 366 static void 367 destroy_fme_bufs(struct fme *fp) 368 { 369 char tmpbuf[OBBUFNMSZ]; 370 int o; 371 372 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN); 373 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG); 374 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL); 375 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID); 376 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD); 377 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS); 378 379 for (o = 0; o < fp->uniqobs; o++) { 380 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o); 381 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 382 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o); 383 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 384 } 385 } 386 387 /* 388 * reconstitute_observations -- convert a case's serialized observations 389 * back into 
struct events. Returns zero if all observations are 390 * successfully reconstituted. 391 */ 392 static int 393 reconstitute_observations(struct fme *fmep) 394 { 395 struct event *ep; 396 struct node *epnamenp = NULL; 397 size_t pkdlen; 398 char *pkd = NULL; 399 char *tmpbuf = alloca(OBBUFNMSZ); 400 char *sepptr; 401 char *estr; 402 int ocnt; 403 int elen; 404 405 for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) { 406 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt); 407 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 408 if (elen == 0) { 409 out(O_ALTFP, 410 "reconstitute_observation: no %s buffer found.", 411 tmpbuf); 412 Undiag_reason = UD_MISSINGOBS; 413 break; 414 } 415 416 estr = MALLOC(elen); 417 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 418 sepptr = strchr(estr, '@'); 419 if (sepptr == NULL) { 420 out(O_ALTFP, 421 "reconstitute_observation: %s: " 422 "missing @ separator in %s.", 423 tmpbuf, estr); 424 Undiag_reason = UD_MISSINGPATH; 425 FREE(estr); 426 break; 427 } 428 429 *sepptr = '\0'; 430 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 431 out(O_ALTFP, 432 "reconstitute_observation: %s: " 433 "trouble converting path string \"%s\" " 434 "to internal representation.", 435 tmpbuf, sepptr + 1); 436 Undiag_reason = UD_MISSINGPATH; 437 FREE(estr); 438 break; 439 } 440 441 /* construct the event */ 442 ep = itree_lookup(fmep->eventtree, 443 stable(estr), ipath(epnamenp)); 444 if (ep == NULL) { 445 out(O_ALTFP, 446 "reconstitute_observation: %s: " 447 "lookup of \"%s\" in itree failed.", 448 tmpbuf, ipath2str(estr, ipath(epnamenp))); 449 Undiag_reason = UD_BADOBS; 450 tree_free(epnamenp); 451 FREE(estr); 452 break; 453 } 454 tree_free(epnamenp); 455 456 /* 457 * We may or may not have a saved nvlist for the observation 458 */ 459 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt); 460 pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 461 if (pkdlen != 0) { 462 pkd = MALLOC(pkdlen); 463 fmd_buf_read(fmep->hdl, 
464 fmep->fmcase, tmpbuf, pkd, pkdlen); 465 if (nvlist_xunpack(pkd, 466 pkdlen, &ep->nvp, &Eft_nv_hdl) != 0) 467 out(O_DIE|O_SYS, "pack of observed nvl failed"); 468 FREE(pkd); 469 } 470 471 if (ocnt == 0) 472 fmep->e0 = ep; 473 474 FREE(estr); 475 fmep->ecurrent = ep; 476 ep->count++; 477 478 /* link it into list of observations seen */ 479 ep->observations = fmep->observations; 480 fmep->observations = ep; 481 } 482 483 if (ocnt == fmep->uniqobs) { 484 (void) fme_ready(fmep); 485 return (0); 486 } 487 488 return (1); 489 } 490 491 /* 492 * restart_fme -- called during eft initialization. Reconstitutes 493 * an in-progress fme. 494 */ 495 void 496 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress) 497 { 498 nvlist_t *defect; 499 struct case_list *bad; 500 struct fme *fmep; 501 struct cfgdata *cfgdata = NULL; 502 size_t rawsz; 503 504 fmep = alloc_fme(); 505 fmep->fmcase = inprogress; 506 fmep->hdl = hdl; 507 508 if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) { 509 out(O_ALTFP, "restart_fme: No config data"); 510 Undiag_reason = UD_MISSINGINFO; 511 goto badcase; 512 } 513 fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz, 514 sizeof (size_t)); 515 516 if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) { 517 out(O_ALTFP, "restart_fme: No event zero"); 518 Undiag_reason = UD_MISSINGZERO; 519 goto badcase; 520 } 521 522 cfgdata = MALLOC(sizeof (struct cfgdata)); 523 cfgdata->cooked = NULL; 524 cfgdata->devcache = NULL; 525 cfgdata->cpucache = NULL; 526 cfgdata->refcnt = 1; 527 528 if (rawsz > 0) { 529 if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) { 530 out(O_ALTFP, "restart_fme: Config data size mismatch"); 531 Undiag_reason = UD_CFGMISMATCH; 532 goto badcase; 533 } 534 cfgdata->begin = MALLOC(rawsz); 535 cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz; 536 fmd_buf_read(hdl, 537 inprogress, WOBUF_CFG, cfgdata->begin, rawsz); 538 } else { 539 cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL; 540 } 
541 fmep->cfgdata = cfgdata; 542 543 config_cook(cfgdata); 544 if ((fmep->eventtree = itree_create(cfgdata->cooked)) == NULL) { 545 /* case not properly saved or irretrievable */ 546 out(O_ALTFP, "restart_fme: NULL instance tree"); 547 Undiag_reason = UD_INSTFAIL; 548 goto badcase; 549 } 550 551 itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree); 552 553 if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) { 554 out(O_ALTFP, "restart_fme: no saved wait time"); 555 Undiag_reason = UD_MISSINGINFO; 556 goto badcase; 557 } else { 558 fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull, 559 sizeof (fmep->pull)); 560 } 561 562 if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) { 563 out(O_ALTFP, "restart_fme: no saved posted status"); 564 Undiag_reason = UD_MISSINGINFO; 565 goto badcase; 566 } else { 567 fmd_buf_read(hdl, inprogress, WOBUF_POSTD, 568 (void *)&fmep->posted_suspects, 569 sizeof (fmep->posted_suspects)); 570 } 571 572 if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) { 573 out(O_ALTFP, "restart_fme: no saved id"); 574 Undiag_reason = UD_MISSINGINFO; 575 goto badcase; 576 } else { 577 fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id, 578 sizeof (fmep->id)); 579 } 580 if (Nextid <= fmep->id) 581 Nextid = fmep->id + 1; 582 583 if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) { 584 out(O_ALTFP, "restart_fme: no count of observations"); 585 Undiag_reason = UD_MISSINGINFO; 586 goto badcase; 587 } else { 588 fmd_buf_read(hdl, inprogress, WOBUF_NOBS, 589 (void *)&fmep->uniqobs, sizeof (fmep->uniqobs)); 590 } 591 592 if (reconstitute_observations(fmep) != 0) 593 goto badcase; 594 595 /* give the diagnosis algorithm a shot at the new FME state */ 596 fme_eval(fmep, NULL); 597 return; 598 599 badcase: 600 if (fmep->eventtree != NULL) 601 itree_free(fmep->eventtree); 602 config_free(cfgdata); 603 destroy_fme_bufs(fmep); 604 FREE(fmep); 605 606 /* 607 * Since we're unable to restart the case, add it to the undiagable 608 * list and solve and close it 
as appropriate. 609 */ 610 bad = MALLOC(sizeof (struct case_list)); 611 bad->next = NULL; 612 613 if (Undiagablecaselist != NULL) 614 bad->next = Undiagablecaselist; 615 Undiagablecaselist = bad; 616 bad->fmcase = inprogress; 617 618 out(O_ALTFP, "[case %s (unable to restart), ", 619 fmd_case_uuid(hdl, bad->fmcase)); 620 621 if (fmd_case_solved(hdl, bad->fmcase)) { 622 out(O_ALTFP, "already solved, "); 623 } else { 624 out(O_ALTFP, "solving, "); 625 defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100, 626 NULL, NULL, NULL); 627 if (Undiag_reason != NULL) 628 (void) nvlist_add_string(defect, 629 UNDIAG_REASON, Undiag_reason); 630 fmd_case_add_suspect(hdl, bad->fmcase, defect); 631 fmd_case_solve(hdl, bad->fmcase); 632 } 633 634 if (fmd_case_closed(hdl, bad->fmcase)) { 635 out(O_ALTFP, "already closed ]"); 636 } else { 637 out(O_ALTFP, "closing ]"); 638 fmd_case_close(hdl, bad->fmcase); 639 } 640 } 641 642 void 643 destroy_fme(struct fme *f) 644 { 645 stats_delete(f->Rcount); 646 stats_delete(f->Hcallcount); 647 stats_delete(f->Rcallcount); 648 stats_delete(f->Ccallcount); 649 stats_delete(f->Ecallcount); 650 stats_delete(f->Tcallcount); 651 stats_delete(f->Marrowcount); 652 stats_delete(f->diags); 653 654 itree_free(f->eventtree); 655 config_free(f->cfgdata); 656 FREE(f); 657 } 658 659 static const char * 660 fme_state2str(enum fme_state s) 661 { 662 switch (s) { 663 case FME_NOTHING: return ("NOTHING"); 664 case FME_WAIT: return ("WAIT"); 665 case FME_CREDIBLE: return ("CREDIBLE"); 666 case FME_DISPROVED: return ("DISPROVED"); 667 default: return ("UNKNOWN"); 668 } 669 } 670 671 static int 672 is_problem(enum nametype t) 673 { 674 return (t == N_FAULT || t == N_DEFECT || t == N_UPSET); 675 } 676 677 static int 678 is_fault(enum nametype t) 679 { 680 return (t == N_FAULT); 681 } 682 683 static int 684 is_defect(enum nametype t) 685 { 686 return (t == N_DEFECT); 687 } 688 689 static int 690 is_upset(enum nametype t) 691 { 692 return (t == N_UPSET); 693 } 694 
695 /*ARGSUSED*/ 696 static void 697 clear_causes_tested(struct event *lhs, struct event *ep, void *arg) 698 { 699 struct bubble *bp; 700 struct arrowlist *ap; 701 702 for (bp = itree_next_bubble(ep, NULL); bp; 703 bp = itree_next_bubble(ep, bp)) { 704 if (bp->t != B_FROM) 705 continue; 706 for (ap = itree_next_arrow(bp, NULL); ap; 707 ap = itree_next_arrow(bp, ap)) 708 ap->arrowp->causes_tested = 0; 709 } 710 } 711 712 /* 713 * call this function with initcode set to 0 to initialize cycle tracking 714 */ 715 static void 716 initialize_cycles(struct fme *fmep) 717 { 718 lut_walk(fmep->eventtree, (lut_cb)clear_causes_tested, NULL); 719 } 720 721 static void 722 fme_print(int flags, struct fme *fmep) 723 { 724 struct event *ep; 725 726 out(flags, "Fault Management Exercise %d", fmep->id); 727 out(flags, "\t State: %s", fme_state2str(fmep->state)); 728 out(flags|O_NONL, "\t Start time: "); 729 ptree_timeval(flags|O_NONL, &fmep->ull); 730 out(flags, NULL); 731 if (fmep->wull) { 732 out(flags|O_NONL, "\t Wait time: "); 733 ptree_timeval(flags|O_NONL, &fmep->wull); 734 out(flags, NULL); 735 } 736 out(flags|O_NONL, "\t E0: "); 737 if (fmep->e0) 738 itree_pevent_brief(flags|O_NONL, fmep->e0); 739 else 740 out(flags|O_NONL, "NULL"); 741 out(flags, NULL); 742 out(flags|O_NONL, "\tObservations:"); 743 for (ep = fmep->observations; ep; ep = ep->observations) { 744 out(flags|O_NONL, " "); 745 itree_pevent_brief(flags|O_NONL, ep); 746 } 747 out(flags, NULL); 748 out(flags|O_NONL, "\tSuspect list:"); 749 for (ep = fmep->suspects; ep; ep = ep->suspects) { 750 out(flags|O_NONL, " "); 751 itree_pevent_brief(flags|O_NONL, ep); 752 } 753 out(flags, NULL); 754 out(flags|O_VERB2, "\t Tree:"); 755 itree_ptree(flags|O_VERB2, fmep->eventtree); 756 } 757 758 static struct node * 759 pathstring2epnamenp(char *path) 760 { 761 char *sep = "/"; 762 struct node *ret; 763 char *ptr; 764 765 if ((ptr = strtok(path, sep)) == NULL) 766 out(O_DIE, "pathstring2epnamenp: invalid empty class"); 767 768 
ret = tree_iname(stable(ptr), NULL, 0); 769 770 while ((ptr = strtok(NULL, sep)) != NULL) 771 ret = tree_name_append(ret, 772 tree_iname(stable(ptr), NULL, 0)); 773 774 return (ret); 775 } 776 777 /* 778 * for a given upset sp, increment the corresponding SERD engine. if the 779 * SERD engine trips, return the ename and ipp of the resulting ereport. 780 * returns true if engine tripped and *enamep and *ippp were filled in. 781 */ 782 static int 783 serd_eval(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase, 784 struct event *sp, const char **enamep, const struct ipath **ippp) 785 { 786 struct node *serdinst; 787 char *serdname; 788 789 ASSERT(sp->t == N_UPSET); 790 ASSERT(ffep != NULL); 791 792 /* 793 * obtain instanced SERD engine from the upset sp. from this 794 * derive serdname, the string used to identify the SERD engine. 795 */ 796 serdinst = eventprop_lookup(sp, L_engine); 797 798 if (serdinst == NULL) 799 return (NULL); 800 801 serdname = ipath2str(serdinst->u.stmt.np->u.event.ename->u.name.s, 802 ipath(serdinst->u.stmt.np->u.event.epname)); 803 804 if (!fmd_serd_exists(hdl, serdname)) { 805 struct node *nN, *nT; 806 807 /* no SERD engine yet, so create it */ 808 nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N, NULL); 809 nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T, NULL); 810 811 ASSERT(nN->t == T_NUM); 812 ASSERT(nT->t == T_TIMEVAL); 813 814 fmd_serd_create(hdl, serdname, (uint_t)nN->u.ull, 815 (hrtime_t)nT->u.ull); 816 } 817 818 819 /* 820 * increment SERD engine. 
if engine fires, reset serd 821 * engine and return trip_strcode 822 */ 823 if (fmd_serd_record(hdl, serdname, ffep)) { 824 struct node *tripinst = lut_lookup(serdinst->u.stmt.lutp, 825 (void *)L_trip, NULL); 826 827 ASSERT(tripinst != NULL); 828 829 *enamep = tripinst->u.event.ename->u.name.s; 830 *ippp = ipath(tripinst->u.event.epname); 831 832 fmd_case_add_serd(hdl, fmcase, serdname); 833 fmd_serd_reset(hdl, serdname); 834 out(O_ALTFP|O_NONL, "[engine fired: %s, sending: ", serdname); 835 ipath_print(O_ALTFP|O_NONL, *enamep, *ippp); 836 out(O_ALTFP, "]"); 837 838 FREE(serdname); 839 return (1); 840 } 841 842 FREE(serdname); 843 return (0); 844 } 845 846 /* 847 * search a suspect list for upsets. feed each upset to serd_eval() and 848 * build up tripped[], an array of ereports produced by the firing of 849 * any SERD engines. then feed each ereport back into 850 * fme_receive_report(). 851 * 852 * returns ntrip, the number of these ereports produced. 853 */ 854 static int 855 upsets_eval(struct fme *fmep, fmd_event_t *ffep) 856 { 857 /* we build an array of tripped ereports that we send ourselves */ 858 struct { 859 const char *ename; 860 const struct ipath *ipp; 861 } *tripped; 862 struct event *sp; 863 int ntrip, nupset, i; 864 865 /* 866 * we avoid recursion by calling fme_receive_report() at the end of 867 * this function with a NULL ffep 868 */ 869 if (ffep == NULL) 870 return (0); 871 872 /* 873 * count the number of upsets to determine the upper limit on 874 * expected trip ereport strings. remember that one upset can 875 * lead to at most one ereport. 
876 */ 877 nupset = 0; 878 for (sp = fmep->suspects; sp; sp = sp->suspects) { 879 if (sp->t == N_UPSET) 880 nupset++; 881 } 882 883 if (nupset == 0) 884 return (0); 885 886 /* 887 * get to this point if we have upsets and expect some trip 888 * ereports 889 */ 890 tripped = alloca(sizeof (*tripped) * nupset); 891 bzero((void *)tripped, sizeof (*tripped) * nupset); 892 893 ntrip = 0; 894 for (sp = fmep->suspects; sp; sp = sp->suspects) 895 if (sp->t == N_UPSET && 896 serd_eval(fmep->hdl, ffep, fmep->fmcase, sp, 897 &tripped[ntrip].ename, &tripped[ntrip].ipp)) 898 ntrip++; 899 900 for (i = 0; i < ntrip; i++) 901 fme_receive_report(fmep->hdl, NULL, 902 tripped[i].ename, tripped[i].ipp, NULL); 903 904 return (ntrip); 905 } 906 907 /* 908 * fme_receive_external_report -- call when an external ereport comes in 909 * 910 * this routine just converts the relevant information from the ereport 911 * into a format used internally and passes it on to fme_receive_report(). 912 */ 913 void 914 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 915 const char *eventstring) 916 { 917 struct node *epnamenp = platform_getpath(nvl); 918 const struct ipath *ipp; 919 920 /* 921 * XFILE: If we ended up without a path, it's an X-file. 922 * For now, use our undiagnosable interface. 
923 */ 924 if (epnamenp == NULL) { 925 out(O_ALTFP, "XFILE: Unable to get path from ereport"); 926 Undiag_reason = UD_NOPATH; 927 publish_undiagnosable(hdl, ffep); 928 return; 929 } 930 931 ipp = ipath(epnamenp); 932 tree_free(epnamenp); 933 fme_receive_report(hdl, ffep, stable(eventstring), ipp, nvl); 934 } 935 936 static void 937 fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 938 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl) 939 { 940 struct event *ep; 941 struct fme *fmep = NULL; 942 struct fme *ofmep, *svfmep; 943 int matched = 0; 944 945 out(O_ALTFP|O_NONL, "fme_receive_report: "); 946 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 947 out(O_ALTFP|O_STAMP, NULL); 948 949 /* decide which FME it goes to */ 950 for (fmep = FMElist; fmep; fmep = fmep->next) { 951 int prev_verbose; 952 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 953 enum fme_state state; 954 955 /* look up event in event tree for this FME */ 956 if ((ep = itree_lookup(fmep->eventtree, 957 eventstring, ipp)) == NULL) 958 continue; 959 960 /* note observation */ 961 fmep->ecurrent = ep; 962 if (ep->count++ == 0) { 963 /* link it into list of observations seen */ 964 ep->observations = fmep->observations; 965 fmep->observations = ep; 966 ep->nvp = evnv_dupnvl(nvl); 967 } 968 969 /* tell hypothesise() not to mess with suspect list */ 970 fmep->peek = 1; 971 972 /* don't want this to be verbose (unless Debug is set) */ 973 prev_verbose = Verbose; 974 if (Debug == 0) 975 Verbose = 0; 976 977 initialize_cycles(fmep); 978 state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay, NULL); 979 980 fmep->peek = 0; 981 982 /* put verbose flag back */ 983 Verbose = prev_verbose; 984 985 if (state != FME_DISPROVED) { 986 /* found an FME that explains the ereport */ 987 matched++; 988 out(O_ALTFP|O_NONL, "["); 989 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 990 out(O_ALTFP, " explained by FME%d]", fmep->id); 991 992 if (ep->count == 1) 993 serialize_observation(fmep, eventstring, 
ipp); 994 995 if (ffep) 996 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 997 998 stats_counter_bump(fmep->Rcount); 999 1000 /* re-eval FME */ 1001 fme_eval(fmep, ffep); 1002 } else { 1003 1004 /* not a match, undo noting of observation */ 1005 fmep->ecurrent = NULL; 1006 if (--ep->count == 0) { 1007 /* unlink it from observations */ 1008 fmep->observations = ep->observations; 1009 ep->observations = NULL; 1010 nvlist_free(ep->nvp); 1011 ep->nvp = NULL; 1012 } 1013 } 1014 } 1015 1016 if (matched) 1017 return; /* explained by at least one existing FME */ 1018 1019 /* clean up closed fmes */ 1020 ofmep = ClosedFMEs; 1021 while (ofmep != NULL) { 1022 svfmep = ofmep->next; 1023 destroy_fme(ofmep); 1024 ofmep = svfmep; 1025 } 1026 ClosedFMEs = NULL; 1027 1028 /* start a new FME */ 1029 if ((fmep = newfme(eventstring, ipp)) == NULL) { 1030 out(O_ALTFP|O_NONL, "["); 1031 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1032 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1033 publish_undiagnosable(hdl, ffep); 1034 return; 1035 } 1036 1037 /* open a case */ 1038 fmep->fmcase = fmd_case_open(hdl, NULL); 1039 fmep->hdl = hdl; 1040 init_fme_bufs(fmep); 1041 1042 out(O_ALTFP|O_NONL, "["); 1043 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1044 out(O_ALTFP, " created FME%d, case %s]", fmep->id, 1045 fmd_case_uuid(hdl, fmep->fmcase)); 1046 1047 ep = fmep->e0; 1048 ASSERT(ep != NULL); 1049 1050 /* note observation */ 1051 fmep->ecurrent = ep; 1052 if (ep->count++ == 0) { 1053 /* link it into list of observations seen */ 1054 ep->observations = fmep->observations; 1055 fmep->observations = ep; 1056 ep->nvp = evnv_dupnvl(nvl); 1057 serialize_observation(fmep, eventstring, ipp); 1058 } 1059 1060 stats_counter_bump(fmep->Rcount); 1061 1062 if (ffep) { 1063 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1064 fmd_case_setprincipal(hdl, fmep->fmcase, ffep); 1065 fmep->e0r = ffep; 1066 } 1067 1068 /* give the diagnosis algorithm a shot at the new FME state */ 1069 fme_eval(fmep, ffep); 1070 } 1071 
1072 void 1073 fme_status(int flags) 1074 { 1075 struct fme *fmep; 1076 1077 if (FMElist == NULL) { 1078 out(flags, "No fault management exercises underway."); 1079 return; 1080 } 1081 1082 for (fmep = FMElist; fmep; fmep = fmep->next) 1083 fme_print(flags, fmep); 1084 } 1085 1086 /* 1087 * "indent" routines used mostly for nicely formatted debug output, but also 1088 * for sanity checking for infinite recursion bugs. 1089 */ 1090 1091 #define MAX_INDENT 1024 1092 static const char *indent_s[MAX_INDENT]; 1093 static int current_indent; 1094 1095 static void 1096 indent_push(const char *s) 1097 { 1098 if (current_indent < MAX_INDENT) 1099 indent_s[current_indent++] = s; 1100 else 1101 out(O_DIE, "unexpected recursion depth (%d)", current_indent); 1102 } 1103 1104 static void 1105 indent_set(const char *s) 1106 { 1107 current_indent = 0; 1108 indent_push(s); 1109 } 1110 1111 static void 1112 indent_pop(void) 1113 { 1114 if (current_indent > 0) 1115 current_indent--; 1116 else 1117 out(O_DIE, "recursion underflow"); 1118 } 1119 1120 static void 1121 indent(void) 1122 { 1123 int i; 1124 if (!Verbose) 1125 return; 1126 for (i = 0; i < current_indent; i++) 1127 out(O_ALTFP|O_VERB|O_NONL, indent_s[i]); 1128 } 1129 1130 static int 1131 suspects_changed(struct fme *fmep) 1132 { 1133 struct event *suspects = fmep->suspects; 1134 struct event *psuspects = fmep->psuspects; 1135 1136 while (suspects != NULL && psuspects != NULL) { 1137 if (suspects != psuspects) 1138 return (1); 1139 suspects = suspects->suspects; 1140 psuspects = psuspects->psuspects; 1141 } 1142 1143 return (suspects != psuspects); 1144 } 1145 1146 #define SLNEW 1 1147 #define SLCHANGED 2 1148 #define SLWAIT 3 1149 #define SLDISPROVED 4 1150 1151 static void 1152 print_suspects(int circumstance, struct fme *fmep) 1153 { 1154 struct event *ep; 1155 1156 out(O_ALTFP|O_NONL, "["); 1157 if (circumstance == SLCHANGED) { 1158 out(O_ALTFP|O_NONL, "FME%d diagnosis changed. 
state: %s, " 1159 "suspect list:", fmep->id, fme_state2str(fmep->state)); 1160 } else if (circumstance == SLWAIT) { 1161 out(O_ALTFP|O_NONL, "FME%d set wait timer ", fmep->id); 1162 ptree_timeval(O_ALTFP|O_NONL, &fmep->wull); 1163 } else if (circumstance == SLDISPROVED) { 1164 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id); 1165 } else { 1166 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id); 1167 } 1168 1169 if (circumstance == SLWAIT || circumstance == SLDISPROVED) { 1170 out(O_ALTFP, "]"); 1171 return; 1172 } 1173 1174 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1175 out(O_ALTFP|O_NONL, " "); 1176 itree_pevent_brief(O_ALTFP|O_NONL, ep); 1177 } 1178 out(O_ALTFP, "]"); 1179 } 1180 1181 static struct node * 1182 eventprop_lookup(struct event *ep, const char *propname) 1183 { 1184 return (lut_lookup(ep->props, (void *)propname, NULL)); 1185 } 1186 1187 #define MAXDIGITIDX 23 1188 static char numbuf[MAXDIGITIDX + 1]; 1189 1190 static int 1191 node2uint(struct node *n, uint_t *valp) 1192 { 1193 struct evalue value; 1194 struct lut *globals = NULL; 1195 1196 if (n == NULL) 1197 return (1); 1198 1199 /* 1200 * check value.v since we are being asked to convert an unsigned 1201 * long long int to an unsigned int 1202 */ 1203 if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) || 1204 value.t != UINT64 || value.v > (1ULL << 32)) 1205 return (1); 1206 1207 *valp = (uint_t)value.v; 1208 1209 return (0); 1210 } 1211 1212 static nvlist_t * 1213 node2fmri(struct node *n) 1214 { 1215 nvlist_t **pa, *f, *p; 1216 struct node *nc; 1217 uint_t depth = 0; 1218 char *numstr, *nullbyte; 1219 char *failure; 1220 int err, i; 1221 1222 /* XXX do we need to be able to handle a non-T_NAME node? 
/*
 * avg -- integer average of sum over cnt, rounded to the nearest whole
 * number (a fractional part of .5 or more rounds up).
 *
 * Returns 0 when cnt is 0 rather than dividing by zero.
 */
static uint_t
avg(uint_t sum, uint_t cnt)
{
	unsigned long long tenths;

	if (cnt == 0)
		return (0);

	/*
	 * Widen before scaling: multiplying in uint_t would wrap for
	 * any sum above UINT_MAX / 10.
	 */
	tenths = (unsigned long long)sum * 10 / cnt;

	return ((uint_t)(tenths / 10 + ((tenths % 10) >= 5 ? 1 : 0)));
}

/*
 * percentof -- part as a percentage of whole, rounded to the nearest
 * whole percent (a fractional part of .5 or more rounds up).
 *
 * Returns 0 when whole is 0 rather than dividing by zero.  The result is
 * capped at 100 so that part > whole cannot wrap the uint8_t return value.
 */
static uint8_t
percentof(uint_t part, uint_t whole)
{
	unsigned long long permille;
	unsigned long long pct;

	if (whole == 0)
		return (0);

	/*
	 * Widen before scaling: multiplying in uint_t would wrap for
	 * any part above UINT_MAX / 1000.
	 */
	permille = (unsigned long long)part * 1000 / whole;
	pct = permille / 10 + ((permille % 10) >= 5 ? 1 : 0);

	return ((uint8_t)(pct > 100 ? 100 : pct));
}
1363 */ 1364 static void 1365 rsluniq(struct rsl *first, struct rsl *last, int *nprobs, int *nnonf) 1366 { 1367 struct rsl *cr; 1368 1369 if (*nprobs == 1) 1370 return; 1371 1372 /* 1373 * At this point, we only expect duplicate defects. 1374 * Eversholt's diagnosis algorithm prevents duplicate 1375 * suspects, but we rewrite defects in the platform code after 1376 * the diagnosis is made, and that can introduce new 1377 * duplicates. 1378 */ 1379 while (first <= last) { 1380 if (first->suspect == NULL || !is_defect(first->suspect->t)) { 1381 first++; 1382 continue; 1383 } 1384 cr = first + 1; 1385 while (cr <= last) { 1386 if (is_defect(first->suspect->t)) { 1387 if (rslcmp(first, cr) == 0) { 1388 cr->suspect = NULL; 1389 rslfree(cr); 1390 (*nprobs)--; 1391 (*nnonf)--; 1392 } 1393 } 1394 /* 1395 * assume all defects are in order after our 1396 * sort and short circuit here with "else break" ? 1397 */ 1398 cr++; 1399 } 1400 first++; 1401 } 1402 } 1403 1404 /* 1405 * get_resources -- for a given suspect, determine what ASRU, FRU and 1406 * RSRC nvlists should be advertised in the final suspect list. 1407 */ 1408 void 1409 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot) 1410 { 1411 struct node *asrudef, *frudef; 1412 nvlist_t *asru, *fru; 1413 nvlist_t *rsrc = NULL; 1414 char *pathstr; 1415 1416 /* 1417 * First find any ASRU and/or FRU defined in the 1418 * initial fault tree. 
1419 */ 1420 asrudef = eventprop_lookup(sp, L_ASRU); 1421 frudef = eventprop_lookup(sp, L_FRU); 1422 1423 /* 1424 * Create FMRIs based on those definitions 1425 */ 1426 asru = node2fmri(asrudef); 1427 fru = node2fmri(frudef); 1428 pathstr = ipath2str(NULL, sp->ipp); 1429 1430 /* 1431 * Allow for platform translations of the FMRIs 1432 */ 1433 platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc, 1434 pathstr); 1435 1436 FREE(pathstr); 1437 rsrcs->suspect = sp; 1438 rsrcs->asru = asru; 1439 rsrcs->fru = fru; 1440 rsrcs->rsrc = rsrc; 1441 } 1442 1443 /* 1444 * trim_suspects -- prior to publishing, we may need to remove some 1445 * suspects from the list. If we're auto-closing upsets, we don't 1446 * want any of those in the published list. If the ASRUs for multiple 1447 * defects resolve to the same ASRU (driver) we only want to publish 1448 * that as a single suspect. 1449 */ 1450 static void 1451 trim_suspects(struct fme *fmep, boolean_t no_upsets, struct rsl **begin, 1452 struct rsl **end) 1453 { 1454 struct event *ep; 1455 struct rsl *rp; 1456 int rpcnt; 1457 1458 /* 1459 * First save the suspects in the psuspects, then copy back 1460 * only the ones we wish to retain. This resets nsuspects to 1461 * zero. 1462 */ 1463 rpcnt = fmep->nsuspects; 1464 save_suspects(fmep); 1465 1466 /* 1467 * allocate an array of resource pointers for the suspects. 1468 * We may end up using less than the full allocation, but this 1469 * is a very short-lived array. publish_suspects() will free 1470 * this array when it's done using it. 
1471 */ 1472 rp = *begin = MALLOC(rpcnt * sizeof (struct rsl)); 1473 bzero(rp, rpcnt * sizeof (struct rsl)); 1474 1475 /* first pass, remove any unwanted upsets and populate our array */ 1476 for (ep = fmep->psuspects; ep; ep = ep->psuspects) { 1477 if (no_upsets && is_upset(ep->t)) 1478 continue; 1479 get_resources(ep, rp, fmep->cfgdata->cooked); 1480 rp++; 1481 fmep->nsuspects++; 1482 if (!is_fault(ep->t)) 1483 fmep->nonfault++; 1484 } 1485 1486 /* if all we had was unwanted upsets, we're done */ 1487 if (fmep->nsuspects == 0) 1488 return; 1489 1490 *end = rp - 1; 1491 1492 /* sort the array */ 1493 qsort(*begin, fmep->nsuspects, sizeof (struct rsl), rslcmp); 1494 rsluniq(*begin, *end, &fmep->nsuspects, &fmep->nonfault); 1495 } 1496 1497 static void 1498 publish_suspects(struct fme *fmep) 1499 { 1500 struct event *ep; 1501 struct rsl *srl = NULL; 1502 struct rsl *erl; 1503 struct rsl *rp; 1504 nvlist_t *fault; 1505 uint8_t cert; 1506 uint_t *frs; 1507 uint_t fravg, frsum, fr; 1508 int frcnt, fridx; 1509 boolean_t no_upsets = B_FALSE; 1510 1511 stats_counter_bump(fmep->diags); 1512 1513 /* 1514 * The current fmd interfaces don't allow us to solve a case 1515 * that's already solved. If we make a new case, what of the 1516 * ereports? We don't appear to have an interface that allows 1517 * us to access the ereports attached to a case (if we wanted 1518 * to copy the original case's ereport attachments to the new 1519 * case) and it's also a bit unclear if there would be any 1520 * problems with having ereports attached to multiple cases 1521 * and/or attaching DIAGNOSED ereports to a case. For now, 1522 * we'll just output a message. 
1523 */ 1524 if (fmep->posted_suspects || 1525 fmd_case_solved(fmep->hdl, fmep->fmcase)) { 1526 out(O_ALTFP|O_NONL, "Revised diagnosis for case %s: ", 1527 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 1528 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1529 out(O_ALTFP|O_NONL, " "); 1530 itree_pevent_brief(O_ALTFP|O_NONL, ep); 1531 } 1532 out(O_ALTFP, NULL); 1533 return; 1534 } 1535 1536 /* 1537 * If we're auto-closing upsets, we don't want to include them 1538 * in any produced suspect lists or certainty accounting. 1539 */ 1540 if (Autoclose != NULL) 1541 if (strcmp(Autoclose, "true") == 0 || 1542 strcmp(Autoclose, "all") == 0 || 1543 strcmp(Autoclose, "upsets") == 0) 1544 no_upsets = B_TRUE; 1545 1546 trim_suspects(fmep, no_upsets, &srl, &erl); 1547 1548 /* 1549 * If the resulting suspect list has no members, we're 1550 * done. Returning here will simply close the case. 1551 */ 1552 if (fmep->nsuspects == 0) { 1553 out(O_ALTFP, 1554 "[FME%d, case %s (all suspects are upsets)]", 1555 fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase)); 1556 FREE(srl); 1557 restore_suspects(fmep); 1558 return; 1559 } 1560 1561 /* 1562 * If the suspect list is all faults, then for a given fault, 1563 * say X of N, X's certainty is computed via: 1564 * 1565 * fitrate(X) / (fitrate(1) + ... + fitrate(N)) * 100 1566 * 1567 * If none of the suspects are faults, and there are N suspects, 1568 * the certainty of a given suspect is 100/N. 1569 * 1570 * If there are are a mixture of faults and other problems in 1571 * the suspect list, we take an average of the faults' 1572 * FITrates and treat this average as the FITrate for any 1573 * non-faults. The fitrate of any given suspect is then 1574 * computed per the first formula above. 
1575 */ 1576 if (fmep->nonfault == fmep->nsuspects) { 1577 /* NO faults in the suspect list */ 1578 cert = percentof(1, fmep->nsuspects); 1579 } else { 1580 /* sum the fitrates */ 1581 frs = alloca(fmep->nsuspects * sizeof (uint_t)); 1582 fridx = frcnt = frsum = 0; 1583 1584 for (rp = srl; rp <= erl; rp++) { 1585 struct node *n; 1586 1587 if (rp->suspect == NULL) 1588 continue; 1589 if (!is_fault(rp->suspect->t)) { 1590 frs[fridx++] = 0; 1591 continue; 1592 } 1593 n = eventprop_lookup(rp->suspect, L_FITrate); 1594 if (node2uint(n, &fr) != 0) { 1595 out(O_DEBUG|O_NONL, "event "); 1596 ipath_print(O_DEBUG|O_NONL, 1597 ep->enode->u.event.ename->u.name.s, 1598 ep->ipp); 1599 out(O_DEBUG, " has no FITrate (using 1)"); 1600 fr = 1; 1601 } else if (fr == 0) { 1602 out(O_DEBUG|O_NONL, "event "); 1603 ipath_print(O_DEBUG|O_NONL, 1604 ep->enode->u.event.ename->u.name.s, 1605 ep->ipp); 1606 out(O_DEBUG, " has zero FITrate (using 1)"); 1607 fr = 1; 1608 } 1609 1610 frs[fridx++] = fr; 1611 frsum += fr; 1612 frcnt++; 1613 } 1614 fravg = avg(frsum, frcnt); 1615 for (fridx = 0; fridx < fmep->nsuspects; fridx++) 1616 if (frs[fridx] == 0) { 1617 frs[fridx] = fravg; 1618 frsum += fravg; 1619 } 1620 } 1621 1622 /* Add them in reverse order of our sort, as fmd reverses order */ 1623 for (rp = erl; rp >= srl; rp--) { 1624 if (rp->suspect == NULL) 1625 continue; 1626 if (fmep->nonfault != fmep->nsuspects) 1627 cert = percentof(frs[--fridx], frsum); 1628 fault = fmd_nvl_create_fault(fmep->hdl, 1629 rp->suspect->enode->u.event.ename->u.name.s, 1630 cert, 1631 rp->asru, 1632 rp->fru, 1633 rp->rsrc); 1634 if (fault == NULL) 1635 out(O_DIE, "fault creation failed"); 1636 fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault); 1637 rp->suspect->fault = fault; 1638 rslfree(rp); 1639 } 1640 fmd_case_solve(fmep->hdl, fmep->fmcase); 1641 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 1642 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 1643 1644 if (Autoconvict) { 1645 for (rp = srl; rp <= erl; rp++) { 
1646 if (rp->suspect == NULL) 1647 continue; 1648 fmd_case_convict(fmep->hdl, 1649 fmep->fmcase, rp->suspect->fault); 1650 } 1651 out(O_ALTFP, "[convicting FME%d, case %s]", fmep->id, 1652 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 1653 } 1654 1655 /* 1656 * revert to the original suspect list 1657 */ 1658 FREE(srl); 1659 restore_suspects(fmep); 1660 } 1661 1662 static void 1663 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep) 1664 { 1665 struct case_list *newcase; 1666 nvlist_t *defect; 1667 1668 out(O_ALTFP, 1669 "[undiagnosable ereport received, " 1670 "creating and closing a new case (%s)]", 1671 Undiag_reason ? Undiag_reason : "reason not provided"); 1672 1673 newcase = MALLOC(sizeof (struct case_list)); 1674 newcase->next = NULL; 1675 1676 newcase->fmcase = fmd_case_open(hdl, NULL); 1677 if (Undiagablecaselist != NULL) 1678 newcase->next = Undiagablecaselist; 1679 Undiagablecaselist = newcase; 1680 1681 if (ffep != NULL) 1682 fmd_case_add_ereport(hdl, newcase->fmcase, ffep); 1683 1684 defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100, 1685 NULL, NULL, NULL); 1686 if (Undiag_reason != NULL) 1687 (void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason); 1688 fmd_case_add_suspect(hdl, newcase->fmcase, defect); 1689 1690 fmd_case_solve(hdl, newcase->fmcase); 1691 fmd_case_close(hdl, newcase->fmcase); 1692 } 1693 1694 static void 1695 fme_undiagnosable(struct fme *f) 1696 { 1697 nvlist_t *defect; 1698 1699 out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]", 1700 f->id, fmd_case_uuid(f->hdl, f->fmcase), 1701 Undiag_reason ? 
Undiag_reason : "undiagnosable"); 1702 1703 defect = fmd_nvl_create_fault(f->hdl, UNDIAGNOSABLE_DEFECT, 100, 1704 NULL, NULL, NULL); 1705 if (Undiag_reason != NULL) 1706 (void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason); 1707 fmd_case_add_suspect(f->hdl, f->fmcase, defect); 1708 fmd_case_solve(f->hdl, f->fmcase); 1709 destroy_fme_bufs(f); 1710 fmd_case_close(f->hdl, f->fmcase); 1711 } 1712 1713 /* 1714 * fme_close_case 1715 * 1716 * Find the requested case amongst our fmes and close it. Free up 1717 * the related fme. 1718 */ 1719 void 1720 fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase) 1721 { 1722 struct case_list *ucasep, *prevcasep = NULL; 1723 struct fme *prev = NULL; 1724 struct fme *fmep; 1725 1726 for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) { 1727 if (fmcase != ucasep->fmcase) { 1728 prevcasep = ucasep; 1729 continue; 1730 } 1731 1732 if (prevcasep == NULL) 1733 Undiagablecaselist = Undiagablecaselist->next; 1734 else 1735 prevcasep->next = ucasep->next; 1736 1737 FREE(ucasep); 1738 return; 1739 } 1740 1741 for (fmep = FMElist; fmep; fmep = fmep->next) { 1742 if (fmep->hdl == hdl && fmep->fmcase == fmcase) 1743 break; 1744 prev = fmep; 1745 } 1746 1747 if (fmep == NULL) { 1748 out(O_WARN, "Eft asked to close unrecognized case [%s].", 1749 fmd_case_uuid(hdl, fmcase)); 1750 return; 1751 } 1752 1753 if (EFMElist == fmep) 1754 EFMElist = prev; 1755 1756 if (prev == NULL) 1757 FMElist = FMElist->next; 1758 else 1759 prev->next = fmep->next; 1760 1761 fmep->next = NULL; 1762 1763 /* Get rid of any timer this fme has set */ 1764 if (fmep->wull != 0) 1765 fmd_timer_remove(fmep->hdl, fmep->timer); 1766 1767 if (ClosedFMEs == NULL) { 1768 ClosedFMEs = fmep; 1769 } else { 1770 fmep->next = ClosedFMEs; 1771 ClosedFMEs = fmep; 1772 } 1773 } 1774 1775 /* 1776 * fme_set_timer() 1777 * If the time we need to wait for the given FME is less than the 1778 * current timer, kick that old timer out and establish a new one. 
1779 */ 1780 static void 1781 fme_set_timer(struct fme *fmep, unsigned long long wull) 1782 { 1783 out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait "); 1784 ptree_timeval(O_ALTFP|O_VERB, &wull); 1785 1786 if (wull <= fmep->pull) { 1787 out(O_ALTFP|O_VERB|O_NONL, "already have waited at least "); 1788 ptree_timeval(O_ALTFP|O_VERB, &fmep->pull); 1789 out(O_ALTFP|O_VERB, NULL); 1790 /* we've waited at least wull already, don't need timer */ 1791 return; 1792 } 1793 1794 out(O_ALTFP|O_VERB|O_NONL, " currently "); 1795 if (fmep->wull != 0) { 1796 out(O_ALTFP|O_VERB|O_NONL, "waiting "); 1797 ptree_timeval(O_ALTFP|O_VERB, &fmep->wull); 1798 out(O_ALTFP|O_VERB, NULL); 1799 } else { 1800 out(O_ALTFP|O_VERB|O_NONL, "not waiting"); 1801 out(O_ALTFP|O_VERB, NULL); 1802 } 1803 1804 if (fmep->wull != 0) 1805 if (wull >= fmep->wull) 1806 /* New timer would fire later than established timer */ 1807 return; 1808 1809 if (fmep->wull != 0) 1810 fmd_timer_remove(fmep->hdl, fmep->timer); 1811 1812 fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep, 1813 fmep->e0r, wull); 1814 out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer); 1815 fmep->wull = wull; 1816 } 1817 1818 void 1819 fme_timer_fired(struct fme *fmep, id_t tid) 1820 { 1821 struct fme *ffmep = NULL; 1822 1823 for (ffmep = FMElist; ffmep; ffmep = ffmep->next) 1824 if (ffmep == fmep) 1825 break; 1826 1827 if (ffmep == NULL) { 1828 out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.", 1829 (void *)fmep); 1830 return; 1831 } 1832 1833 if (tid != fmep->htid) { 1834 /* 1835 * normal timer (not the hesitation timer 1836 */ 1837 fmep->pull = fmep->wull; 1838 fmep->wull = 0; 1839 fmd_buf_write(fmep->hdl, fmep->fmcase, 1840 WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull)); 1841 } else { 1842 fmep->hesitated = 1; 1843 } 1844 fme_eval(fmep, NULL); 1845 } 1846 1847 /* 1848 * Preserve the fme's suspect list in its psuspects list, NULLing the 1849 * suspects list in the meantime. 
1850 */ 1851 static void 1852 save_suspects(struct fme *fmep) 1853 { 1854 struct event *ep; 1855 struct event *nextep; 1856 1857 /* zero out the previous suspect list */ 1858 for (ep = fmep->psuspects; ep; ep = nextep) { 1859 nextep = ep->psuspects; 1860 ep->psuspects = NULL; 1861 } 1862 fmep->psuspects = NULL; 1863 1864 /* zero out the suspect list, copying it to previous suspect list */ 1865 fmep->psuspects = fmep->suspects; 1866 for (ep = fmep->suspects; ep; ep = nextep) { 1867 nextep = ep->suspects; 1868 ep->psuspects = ep->suspects; 1869 ep->suspects = NULL; 1870 ep->is_suspect = 0; 1871 } 1872 fmep->suspects = NULL; 1873 fmep->nsuspects = 0; 1874 fmep->nonfault = 0; 1875 } 1876 1877 /* 1878 * Retrieve the fme's suspect list from its psuspects list. 1879 */ 1880 static void 1881 restore_suspects(struct fme *fmep) 1882 { 1883 struct event *ep; 1884 struct event *nextep; 1885 1886 fmep->nsuspects = fmep->nonfault = 0; 1887 fmep->suspects = fmep->psuspects; 1888 for (ep = fmep->psuspects; ep; ep = nextep) { 1889 fmep->nsuspects++; 1890 if (!is_fault(ep->t)) 1891 fmep->nonfault++; 1892 nextep = ep->psuspects; 1893 ep->suspects = ep->psuspects; 1894 } 1895 } 1896 1897 /* 1898 * this is what we use to call the Emrys prototype code instead of main() 1899 */ 1900 static void 1901 fme_eval(struct fme *fmep, fmd_event_t *ffep) 1902 { 1903 struct event *ep; 1904 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 1905 1906 save_suspects(fmep); 1907 1908 out(O_ALTFP|O_VERB, "Evaluate FME %d", fmep->id); 1909 indent_set(" "); 1910 1911 initialize_cycles(fmep); 1912 fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay, NULL); 1913 1914 out(O_ALTFP|O_VERB|O_NONL, "FME%d state: %s, suspect list:", fmep->id, 1915 fme_state2str(fmep->state)); 1916 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1917 out(O_ALTFP|O_VERB|O_NONL, " "); 1918 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 1919 } 1920 out(O_ALTFP|O_VERB, NULL); 1921 1922 if (fmep->posted_suspects) { 1923 /* 
1924 * this FME has already posted a diagnosis, so see if 1925 * the event changed the diagnosis and print a warning 1926 * if it did. 1927 * 1928 */ 1929 if (suspects_changed(fmep)) { 1930 print_suspects(SLCHANGED, fmep); 1931 publish_suspects(fmep); 1932 } 1933 } else { 1934 switch (fmep->state) { 1935 case FME_CREDIBLE: 1936 /* 1937 * if the suspect list contains any upsets, we 1938 * turn off the hesitation logic (by setting 1939 * the hesitate flag which normally indicates 1940 * we've already done the hesitate logic). 1941 * this is done because hesitating with upsets 1942 * causes us to explain away additional soft errors 1943 * while the upset FME stays open. 1944 */ 1945 if (fmep->hesitated == 0) { 1946 struct event *s; 1947 1948 for (s = fmep->suspects; s; s = s->suspects) { 1949 if (s->t == N_UPSET) { 1950 fmep->hesitated = 1; 1951 break; 1952 } 1953 } 1954 } 1955 1956 if (Hesitate && 1957 fmep->suspects != NULL && 1958 fmep->suspects->suspects != NULL && 1959 fmep->hesitated == 0) { 1960 /* 1961 * about to publish multi-entry suspect list, 1962 * set the hesitation timer if not already set. 
1963 */ 1964 if (fmep->htid == 0) { 1965 out(O_ALTFP|O_NONL, 1966 "[hesitate FME%d, case %s ", 1967 fmep->id, 1968 fmd_case_uuid(fmep->hdl, 1969 fmep->fmcase)); 1970 ptree_timeval(O_ALTFP|O_NONL, 1971 (unsigned long long *)&Hesitate); 1972 out(O_ALTFP, "]"); 1973 fme_set_timer(fmep, my_delay); 1974 fmep->htid = 1975 fmd_timer_install(fmep->hdl, 1976 (void *)fmep, NULL, Hesitate); 1977 } else { 1978 out(O_ALTFP, 1979 "[still hesitating FME%d, case %s]", 1980 fmep->id, 1981 fmd_case_uuid(fmep->hdl, 1982 fmep->fmcase)); 1983 } 1984 } else { 1985 print_suspects(SLNEW, fmep); 1986 (void) upsets_eval(fmep, ffep); 1987 publish_suspects(fmep); 1988 fmep->posted_suspects = 1; 1989 fmd_buf_write(fmep->hdl, fmep->fmcase, 1990 WOBUF_POSTD, 1991 (void *)&fmep->posted_suspects, 1992 sizeof (fmep->posted_suspects)); 1993 } 1994 break; 1995 1996 case FME_WAIT: 1997 /* 1998 * singleton suspect list implies 1999 * no point in waiting 2000 */ 2001 if (fmep->suspects && 2002 fmep->suspects->suspects == NULL) { 2003 print_suspects(SLNEW, fmep); 2004 (void) upsets_eval(fmep, ffep); 2005 publish_suspects(fmep); 2006 fmep->posted_suspects = 1; 2007 fmd_buf_write(fmep->hdl, fmep->fmcase, 2008 WOBUF_POSTD, 2009 (void *)&fmep->posted_suspects, 2010 sizeof (fmep->posted_suspects)); 2011 fmep->state = FME_CREDIBLE; 2012 } else { 2013 ASSERT(my_delay > fmep->ull); 2014 fme_set_timer(fmep, my_delay); 2015 print_suspects(SLWAIT, fmep); 2016 } 2017 break; 2018 2019 case FME_DISPROVED: 2020 print_suspects(SLDISPROVED, fmep); 2021 Undiag_reason = UD_UNSOLVD; 2022 fme_undiagnosable(fmep); 2023 break; 2024 } 2025 } 2026 2027 if (fmep->posted_suspects == 1 && Autoclose != NULL) { 2028 int doclose = 0; 2029 2030 if (strcmp(Autoclose, "true") == 0 || 2031 strcmp(Autoclose, "all") == 0) 2032 doclose = 1; 2033 2034 if (strcmp(Autoclose, "upsets") == 0) { 2035 doclose = 1; 2036 for (ep = fmep->suspects; ep; ep = ep->suspects) { 2037 if (ep->t != N_UPSET) { 2038 doclose = 0; 2039 break; 2040 } 2041 } 2042 } 
2043 2044 if (doclose) { 2045 out(O_ALTFP, "[closing FME%d, case %s (autoclose)]", 2046 fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase)); 2047 2048 destroy_fme_bufs(fmep); 2049 fmd_case_close(fmep->hdl, fmep->fmcase); 2050 } 2051 } 2052 } 2053 2054 /* 2055 * below here is the code derived from the Emrys prototype 2056 */ 2057 2058 static void indent(void); 2059 static int triggered(struct fme *fmep, struct event *ep, int mark); 2060 static void mark_arrows(struct fme *fmep, struct event *ep, int mark); 2061 static enum fme_state effects_test(struct fme *fmep, 2062 struct event *fault_event); 2063 static enum fme_state requirements_test(struct fme *fmep, struct event *ep, 2064 unsigned long long at_latest_by, unsigned long long *pdelay, 2065 struct arrow *arrowp); 2066 static enum fme_state causes_test(struct fme *fmep, struct event *ep, 2067 unsigned long long at_latest_by, unsigned long long *pdelay); 2068 2069 static int 2070 triggered(struct fme *fmep, struct event *ep, int mark) 2071 { 2072 struct bubble *bp; 2073 struct arrowlist *ap; 2074 int count = 0; 2075 2076 stats_counter_bump(fmep->Tcallcount); 2077 for (bp = itree_next_bubble(ep, NULL); bp; 2078 bp = itree_next_bubble(ep, bp)) { 2079 if (bp->t != B_TO) 2080 continue; 2081 for (ap = itree_next_arrow(bp, NULL); ap; 2082 ap = itree_next_arrow(bp, ap)) { 2083 /* check count of marks against K in the bubble */ 2084 if (ap->arrowp->tail->mark == mark && 2085 ++count >= bp->nork) 2086 return (1); 2087 } 2088 } 2089 return (0); 2090 } 2091 2092 static void 2093 mark_arrows(struct fme *fmep, struct event *ep, int mark) 2094 { 2095 struct bubble *bp; 2096 struct arrowlist *ap; 2097 2098 for (bp = itree_next_bubble(ep, NULL); bp; 2099 bp = itree_next_bubble(ep, bp)) { 2100 if (bp->t != B_FROM) 2101 continue; 2102 if (bp->mark != mark) { 2103 stats_counter_bump(fmep->Marrowcount); 2104 bp->mark = mark; 2105 for (ap = itree_next_arrow(bp, NULL); ap; 2106 ap = itree_next_arrow(bp, ap)) { 2107 struct constraintlist 
*ctp; 2108 struct evalue value; 2109 int do_not_follow = 0; 2110 /* 2111 * see if false constraint prevents us 2112 * from traversing this arrow, but don't 2113 * bother if the event is an ereport we 2114 * haven't seen 2115 */ 2116 if (ap->arrowp->head->myevent->t != N_EREPORT || 2117 ap->arrowp->head->myevent->count != 0) { 2118 platform_set_payloadnvp( 2119 ap->arrowp->head->myevent->nvp); 2120 for (ctp = ap->arrowp->constraints; 2121 ctp != NULL; ctp = ctp->next) { 2122 if (eval_expr(ctp->cnode, 2123 NULL, NULL, 2124 &fmep->globals, 2125 fmep->cfgdata->cooked, 2126 ap->arrowp, 0, 2127 &value) == 0 || 2128 value.t == UNDEFINED || 2129 value.v == 0) { 2130 do_not_follow = 1; 2131 break; 2132 } 2133 } 2134 platform_set_payloadnvp(NULL); 2135 } 2136 2137 if (do_not_follow) { 2138 indent(); 2139 out(O_ALTFP|O_VERB|O_NONL, 2140 " False arrow to "); 2141 itree_pevent_brief( 2142 O_ALTFP|O_VERB|O_NONL, 2143 ap->arrowp->head->myevent); 2144 out(O_ALTFP|O_VERB|O_NONL, " "); 2145 ptree(O_ALTFP|O_VERB|O_NONL, 2146 ctp->cnode, 1, 0); 2147 out(O_ALTFP|O_VERB, NULL); 2148 continue; 2149 } 2150 2151 if (triggered(fmep, ap->arrowp->head->myevent, 2152 mark)) 2153 mark_arrows(fmep, 2154 ap->arrowp->head->myevent, mark); 2155 } 2156 } 2157 } 2158 } 2159 2160 static enum fme_state 2161 effects_test(struct fme *fmep, struct event *fault_event) 2162 { 2163 struct event *error_event; 2164 enum fme_state return_value = FME_CREDIBLE; 2165 2166 stats_counter_bump(fmep->Ecallcount); 2167 indent_push(" E"); 2168 indent(); 2169 out(O_ALTFP|O_VERB|O_NONL, "->"); 2170 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 2171 out(O_ALTFP|O_VERB, NULL); 2172 2173 mark_arrows(fmep, fault_event, 1); 2174 for (error_event = fmep->observations; 2175 error_event; error_event = error_event->observations) { 2176 indent(); 2177 out(O_ALTFP|O_VERB|O_NONL, " "); 2178 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event); 2179 if (!triggered(fmep, error_event, 1)) { 2180 return_value = FME_DISPROVED; 
2181 out(O_ALTFP|O_VERB, " NOT triggered"); 2182 break; 2183 } else { 2184 out(O_ALTFP|O_VERB, " triggered"); 2185 } 2186 } 2187 mark_arrows(fmep, fault_event, 0); 2188 2189 indent(); 2190 out(O_ALTFP|O_VERB|O_NONL, "<-%s ", fme_state2str(return_value)); 2191 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 2192 out(O_ALTFP|O_VERB, NULL); 2193 indent_pop(); 2194 return (return_value); 2195 }

/*
 * requirements_test -- check whether what event ep requires has been seen
 *
 * fmep		exercise being evaluated (its pull, globals and cfgdata
 *		are read, and its Rcallcount statistic is bumped)
 * ep		event under test
 * at_latest_by	deadline that applies to ep
 * pdelay	out: written only when FME_WAIT is returned
 * arrowp	arrow leading into ep whose constraints are evaluated
 *		against ep's payload, or NULL to skip constraint checks
 *
 * If ep is an ereport (N_EREPORT) it is judged directly: with no
 * occurrences (ep->count == 0) it is FME_DISPROVED once fmep->pull has
 * reached at_latest_by and FME_WAIT (until at_latest_by) otherwise; with
 * occurrences it is FME_CREDIBLE unless one of arrowp's constraints is
 * false.
 *
 * Otherwise every unmarked B_FROM bubble of ep is visited and the events
 * at the heads of its arrows are tested recursively; the bubble's N
 * (bp->nork, or the arrow count when it is negative) says how many of
 * them must be credible.
 *
 * Side effect: sets bp->mark on each B_FROM bubble it visits.  The
 * caller in this file, hypothesise(), calls mark_arrows(fmep, ep, 0)
 * immediately afterwards to clean up.
 */
static enum fme_state
requirements_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay,
    struct arrow *arrowp)
{
	int waiting_events;
	int credible_events;
	enum fme_state return_value = FME_CREDIBLE;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long arrow_delay;
	unsigned long long my_delay;
	struct event *ep2;
	struct bubble *bp;
	struct arrowlist *ap;

	stats_counter_bump(fmep->Rcallcount);
	indent_push(" R");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
	out(O_ALTFP|O_VERB, NULL);

	if (ep->t == N_EREPORT) {
		if (ep->count == 0) {
			/* not observed: too late, or still time to wait? */
			if (fmep->pull >= at_latest_by) {
				return_value = FME_DISPROVED;
			} else {
				*pdelay = at_latest_by;
				return_value = FME_WAIT;
			}
		} else if (arrowp != NULL) {
			/*
			 * evaluate constraints only for current observation
			 */
			struct constraintlist *ctp;
			struct evalue value;

			platform_set_payloadnvp(ep->nvp);
			for (ctp = arrowp->constraints; ctp != NULL;
			    ctp = ctp->next) {
				if (eval_expr(ctp->cnode, NULL, NULL,
				    &fmep->globals, fmep->cfgdata->cooked,
				    arrowp, 0, &value) == 0 ||
				    value.t == UNDEFINED || value.v == 0) {
					indent();
					out(O_ALTFP|O_VERB|O_NONL,
					    " False constraint ");
					out(O_ALTFP|O_VERB|O_NONL, " ");
					ptree(O_ALTFP|O_VERB|O_NONL,
					    ctp->cnode, 1, 0);
					out(O_ALTFP|O_VERB, NULL);
					return_value = FME_DISPROVED;
					break;
				}
			}
			platform_set_payloadnvp(NULL);
		}

		indent();
		switch (return_value) {
		case FME_CREDIBLE:
			out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			break;
		case FME_DISPROVED:
			out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			break;
		case FME_WAIT:
			out(O_ALTFP|O_VERB|O_NONL, "<-WAIT ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			out(O_ALTFP|O_VERB|O_NONL, " to ");
			ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
			break;
		default:
			out(O_DIE, "requirements_test: unexpected fme_state");
			break;
		}
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();

		return (return_value);
	}

	/* this event is not a report, descend the tree */
	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_FROM)
			continue;
		if (bp->mark == 0) {
			int n = bp->nork;

			bp->mark = 1;
			credible_events = 0;
			waiting_events = 0;
			arrow_delay = TIMEVAL_EVENTUALLY;
			/*
			 * n is -1 for 'A' so adjust it.
			 * XXX just count up the arrows for now.
			 */
			if (n < 0) {
				n = 0;
				for (ap = itree_next_arrow(bp, NULL); ap;
				    ap = itree_next_arrow(bp, ap))
					n++;
				indent();
				out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n);
			} else {
				indent();
				out(O_ALTFP|O_VERB, " Bubble N=%d", n);
			}

			for (ap = itree_next_arrow(bp, NULL); ap;
			    ap = itree_next_arrow(bp, ap)) {
				ep2 = ap->arrowp->head->myevent;
				/* stop once N arrows are already credible */
				if (n <= credible_events)
					break;

				if (triggered(fmep, ep2, 1)) {
					/* XXX adding max timevals! */
					switch (requirements_test(fmep, ep2,
					    at_latest_by + ap->arrowp->maxdelay,
					    &my_delay, ap->arrowp)) {
					case FME_CREDIBLE:
						credible_events++;
						break;
					case FME_DISPROVED:
						break;
					case FME_WAIT:
						/* keep the shortest wait */
						if (my_delay < arrow_delay)
							arrow_delay = my_delay;
						waiting_events++;
						break;
					default:
						out(O_DIE,
						    "Bug in requirements_test.");
						break;
					}
				} else {
					/*
					 * head event is not triggered: the
					 * arrow counts as credible without
					 * descending into it
					 */
					credible_events++;
				}
			}
			indent();
			out(O_ALTFP|O_VERB, " Credible: %d Waiting %d",
			    credible_events, waiting_events);
			if (credible_events + waiting_events < n) {
				/* Can never meet requirements */
				indent();
				out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
				out(O_ALTFP|O_VERB, NULL);
				indent_pop();
				return (FME_DISPROVED);
			}
			if (credible_events < n) { /* will have to wait */
				/* wait time is shortest known */
				if (arrow_delay < overall_delay)
					overall_delay = arrow_delay;
				return_value = FME_WAIT;
			}
		} else {
			/* bubble already visited: just trace its arrows */
			indent();
			out(O_ALTFP|O_VERB|O_NONL, " Mark was set: ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			out(O_ALTFP|O_VERB|O_NONL, " to");
			for (ap = itree_next_arrow(bp, NULL); ap;
			    ap = itree_next_arrow(bp, ap)) {
				out(O_ALTFP|O_VERB|O_NONL, " ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
				    ap->arrowp->head->myevent);
			}
			out(O_ALTFP|O_VERB, NULL);
		}
	}

	/*
	 * evaluate constraints for ctlist, which is the list of
	 * constraints for the arrow pointing into this node of the tree
	 */
	if (return_value == FME_CREDIBLE && arrowp != NULL) {
		struct constraintlist *ctp;
		struct evalue value;

		platform_set_payloadnvp(ep->nvp);
		for (ctp = arrowp->constraints; ctp != NULL;
		    ctp = ctp->next) {
			if (eval_expr(ctp->cnode, NULL, NULL, &fmep->globals,
			    fmep->cfgdata->cooked, arrowp, 0, &value) == 0 ||
			    value.t == UNDEFINED || value.v == 0) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " False constraint ");
				out(O_ALTFP|O_VERB|O_NONL, " ");
				ptree(O_ALTFP|O_VERB|O_NONL,
				    ctp->cnode, 1, 0);
				out(O_ALTFP|O_VERB, NULL);
				return_value = FME_DISPROVED;
				break;
			}
		}
		platform_set_payloadnvp(NULL);
	}

	if (return_value == FME_WAIT)
		*pdelay = overall_delay;
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "<-%s ", fme_state2str(return_value));
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);
	indent_pop();
	return (return_value);
}

/*
 * causes_test -- check whether enough possible causes of ep are credible
 *
 * fmep		exercise being evaluated (Ccallcount statistic is bumped)
 * ep		event whose causes are examined
 * at_latest_by	deadline passed unchanged to hypothesise() for each cause
 * pdelay	out: written only when FME_WAIT is returned
 *
 * Walks every arrow of every B_TO bubble of ep.  Arrows with a false
 * constraint are skipped; for the rest the event at the arrow's tail is
 * hypothesised.  The credible and waiting tallies are accumulated over
 * all B_TO bubbles and compared against K, which is the nork of the last
 * B_TO bubble visited (1 if ep has none).
 *
 * Returns FME_DISPROVED when credible + waiting < K, FME_WAIT when any
 * cause is still waiting (with *pdelay set to the shortest such wait),
 * and FME_CREDIBLE otherwise.
 */
static enum fme_state
causes_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay)
{
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long my_delay;
	int credible_results = 0;
	int waiting_results = 0;
	enum fme_state fstate;
	struct event *tail_event;
	struct bubble *bp;
	struct arrowlist *ap;
	int k = 1;

	stats_counter_bump(fmep->Ccallcount);
	indent_push(" C");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);

	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_TO)
			continue;
		k = bp->nork;	/* remember the K value */
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			struct constraintlist *ctp;
			struct evalue value;
			int do_not_follow = 0;
			/*
			 * see if false constraint prevents us
			 * from traversing this arrow
			 */
			platform_set_payloadnvp(ep->nvp);
			for (ctp = ap->arrowp->constraints;
			    ctp != NULL; ctp = ctp->next) {
				if (eval_expr(ctp->cnode, NULL, NULL,
				    &fmep->globals,
				    fmep->cfgdata->cooked,
				    ap->arrowp, 0,
				    &value) == 0 ||
				    value.t == UNDEFINED ||
				    value.v == 0) {
					do_not_follow = 1;
					break;
				}
			}
			platform_set_payloadnvp(NULL);
			if (do_not_follow) {
				/* ctp still points at the false constraint */
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " False arrow from ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
				    ap->arrowp->tail->myevent);
				out(O_ALTFP|O_VERB|O_NONL, " ");
				ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}

			if (ap->arrowp->causes_tested++ > 0) {
				/*
				 * get to this point if this is not the
				 * first time we're going through this
				 * arrow in the causes test.  consider this
				 * branch to be credible and let the
				 * credible/noncredible outcome depend on
				 * the other branches in this cycle.
				 */
				fstate = FME_CREDIBLE;
			} else {
				/*
				 * get to this point if this is the first
				 * time we're going through this arrow.
				 */
				tail_event = ap->arrowp->tail->myevent;
				fstate = hypothesise(fmep, tail_event,
				    at_latest_by,
				    &my_delay, ap->arrowp);
			}

			switch (fstate) {
			case FME_WAIT:
				if (my_delay < overall_delay)
					overall_delay = my_delay;
				waiting_results++;
				break;
			case FME_CREDIBLE:
				credible_results++;
				break;
			case FME_DISPROVED:
				break;
			default:
				out(O_DIE, "Bug in causes_test");
				break;
			}

			/* undo the increment made above for this arrow */
			ap->arrowp->causes_tested--;
			ASSERT(ap->arrowp->causes_tested >= 0);
		}
	}
	/* compare against K */
	if (credible_results + waiting_results < k) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();
		return (FME_DISPROVED);
	}
	if (waiting_results != 0) {
		*pdelay = overall_delay;
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-WAIT ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB|O_NONL, " to ");
		/* trace the time actually handed back through *pdelay */
		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay);
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();
		return (FME_WAIT);
	}
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE ");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);
	indent_pop();
	return (FME_CREDIBLE);
}

/*
 * hypothesise -- test the hypothesis that event ep happened
 *
 * fmep		exercise being evaluated (Hcallcount statistic is bumped)
 * ep		event being hypothesised
 * at_latest_by	deadline handed to requirements_test() and causes_test()
 * pdelay	out: written only when FME_WAIT is returned
 * arrowp	arrow leading into ep, handed to requirements_test()
 *
 * First runs requirements_test() on ep and clears the marks it left.  If
 * that is not disproved, a problem event (is_problem()) is put through
 * effects_test() and, unless disproved, added once to fmep->suspects
 * (only when fmep->peek is 0); any other event is put through
 * causes_test().
 *
 * Returns FME_DISPROVED if either test disproves ep, FME_WAIT if either
 * test has to wait (with *pdelay set to the shortest wait reported), and
 * FME_CREDIBLE otherwise.
 */
static enum fme_state
hypothesise(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay,
    struct arrow *arrowp)
{
	enum fme_state rtr, otr;
	/*
	 * effects_test() has no delay out-parameter, so start from a
	 * defined value rather than comparing an indeterminate one below.
	 */
	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;

	stats_counter_bump(fmep->Hcallcount);
	indent_push(" H");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
	out(O_ALTFP|O_VERB, NULL);

	rtr = requirements_test(fmep, ep, at_latest_by, &my_delay, arrowp);
	mark_arrows(fmep, ep, 0); /* clean up after requirements test */

	/* nothing else is evaluated once the requirements are disproved */
	if (rtr == FME_DISPROVED) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, " (doesn't meet requirements)");
		indent_pop();
		return (FME_DISPROVED);
	}

	if ((rtr == FME_WAIT) && (my_delay < overall_delay))
		overall_delay = my_delay;

	if (is_problem(ep->t)) {
		otr = effects_test(fmep, ep);
		if (otr != FME_DISPROVED) {
			/* push ep onto the suspect list the first time only */
			if (fmep->peek == 0 && ep->is_suspect++ == 0) {
				ep->suspects = fmep->suspects;
				fmep->suspects = ep;
				fmep->nsuspects++;
				if (!is_fault(ep->t))
					fmep->nonfault++;
			}
		}
	} else {
		otr = causes_test(fmep, ep, at_latest_by, &my_delay);
	}
	if ((otr == FME_WAIT) && (my_delay < overall_delay))
		overall_delay = my_delay;
	if ((otr != FME_DISPROVED) &&
	    ((rtr == FME_WAIT) || (otr == FME_WAIT)))
		*pdelay = overall_delay;

	if ((otr == FME_DISPROVED) && is_problem(ep->t)) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, " (doesn't explain all reports)");
		indent_pop();
		return (FME_DISPROVED);
	}
	if (otr == FME_DISPROVED) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, " (causes are not credible)");
		indent_pop();
		return (FME_DISPROVED);
	}
	if ((rtr == FME_WAIT) || (otr == FME_WAIT)) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-WAIT ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB|O_NONL, " to ");
		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay);
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();
		return (FME_WAIT);
	}
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE ");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);
	indent_pop();
	return (FME_CREDIBLE);
}
