1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * 26 * fme.c -- fault management exercise module 27 * 28 * this module provides the simulated fault management exercise. 29 */ 30 31 #pragma ident "%Z%%M% %I% %E% SMI" 32 33 #include <stdio.h> 34 #include <stdlib.h> 35 #include <string.h> 36 #include <strings.h> 37 #include <ctype.h> 38 #include <alloca.h> 39 #include <libnvpair.h> 40 #include <sys/fm/protocol.h> 41 #include <fm/fmd_api.h> 42 #include "alloc.h" 43 #include "out.h" 44 #include "stats.h" 45 #include "stable.h" 46 #include "literals.h" 47 #include "lut.h" 48 #include "tree.h" 49 #include "ptree.h" 50 #include "itree.h" 51 #include "ipath.h" 52 #include "fme.h" 53 #include "evnv.h" 54 #include "eval.h" 55 #include "config.h" 56 #include "platform.h" 57 58 /* imported from eft.c... */ 59 extern int Autoconvict; 60 extern char *Autoclose; 61 extern hrtime_t Hesitate; 62 extern nv_alloc_t Eft_nv_hdl; 63 64 /* fme under construction is global so we can free it on module abort */ 65 static struct fme *Nfmep; 66 67 static const char *Undiag_reason; 68 69 static int Nextid = 0; 70 71 /* list of fault management exercises underway */ 72 static struct fme { 73 struct fme *next; /* next exercise */ 74 unsigned long long ull; /* time when fme was created */ 75 int id; /* FME id */ 76 struct cfgdata *cfgdata; /* full configuration data */ 77 struct lut *eventtree; /* propagation tree for this FME */ 78 /* 79 * The initial error report that created this FME is kept in 80 * two forms. e0 points to the instance tree node and is used 81 * by fme_eval() as the starting point for the inference 82 * algorithm. e0r is the event handle FMD passed to us when 83 * the ereport first arrived and is used when setting timers, 84 * which are always relative to the time of this initial 85 * report. 86 */ 87 struct event *e0; 88 fmd_event_t *e0r; 89 90 id_t timer; /* for setting an fmd time-out */ 91 id_t htid; /* for setting hesitation timer */ 92 93 struct event *ecurrent; /* ereport under consideration */ 94 struct event *suspects; /* current suspect list */ 95 struct event *psuspects; /* previous suspect list */ 96 int nsuspects; /* count of suspects */ 97 int nonfault; /* zero if all suspects T_FAULT */ 98 int posted_suspects; /* true if we've posted a diagnosis */ 99 int hesitated; /* true if we hesitated */ 100 int uniqobs; /* number of unique events observed */ 101 int peek; /* just peeking, don't track suspects */ 102 enum fme_state { 103 FME_NOTHING = 5000, /* not evaluated yet */ 104 FME_WAIT, /* need to wait for more info */ 105 FME_CREDIBLE, /* suspect list is credible */ 106 FME_DISPROVED /* no valid suspects found */ 107 } state; 108 109 unsigned long long pull; /* time passed since created */ 110 unsigned long long wull; /* wait until this time for re-eval */ 111 struct event *observations; /* observation list */ 112 struct lut *globals; /* values of global variables */ 113 /* fmd interfacing */ 114 fmd_hdl_t *hdl; /* handle for talking with fmd */ 115 fmd_case_t *fmcase; /* what fmd 'case' we associate with */ 116 /* stats */ 117 struct stats *Rcount; 118 struct stats *Hcallcount; 119 struct stats *Rcallcount; 120 struct stats *Ccallcount; 121 struct stats *Ecallcount; 122 struct stats *Tcallcount; 123 struct stats *Marrowcount; 124 struct stats *diags; 125 } *FMElist, *EFMElist, *ClosedFMEs; 126 127 static struct case_list { 128 fmd_case_t *fmcase; 129 struct case_list *next; 130 } *Undiagablecaselist; 131 132 static void fme_eval(struct fme *fmep, fmd_event_t *ffep); 133 static enum fme_state hypothesise(struct fme *fmep, struct event *ep, 134 unsigned long long at_latest_by, unsigned long long *pdelay, 135 struct arrow *arrowp); 136 static struct node *eventprop_lookup(struct event *ep, const char *propname); 137 static struct node *pathstring2epnamenp(char *path); 138 static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep); 139 static void restore_suspects(struct fme *fmep); 140 static void save_suspects(struct fme *fmep); 141 static void destroy_fme(struct fme *f); 142 static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 143 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl); 144 145 static struct fme * 146 alloc_fme(void) 147 { 148 struct fme *fmep; 149 150 fmep = MALLOC(sizeof (*fmep)); 151 bzero(fmep, sizeof (*fmep)); 152 return (fmep); 153 } 154 155 /* 156 * fme_ready -- called when all initialization of the FME (except for 157 * stats) has completed successfully. Adds the fme to global lists 158 * and establishes its stats. 159 */ 160 static struct fme * 161 fme_ready(struct fme *fmep) 162 { 163 char nbuf[100]; 164 165 Nfmep = NULL; /* don't need to free this on module abort now */ 166 167 if (EFMElist) { 168 EFMElist->next = fmep; 169 EFMElist = fmep; 170 } else 171 FMElist = EFMElist = fmep; 172 173 (void) sprintf(nbuf, "fme%d.Rcount", fmep->id); 174 fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0); 175 (void) sprintf(nbuf, "fme%d.Hcall", fmep->id); 176 fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1); 177 (void) sprintf(nbuf, "fme%d.Rcall", fmep->id); 178 fmep->Rcallcount = stats_new_counter(nbuf, 179 "calls to requirements_test()", 1); 180 (void) sprintf(nbuf, "fme%d.Ccall", fmep->id); 181 fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1); 182 (void) sprintf(nbuf, "fme%d.Ecall", fmep->id); 183 fmep->Ecallcount = 184 stats_new_counter(nbuf, "calls to effects_test()", 1); 185 (void) sprintf(nbuf, "fme%d.Tcall", fmep->id); 186 fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1); 187 (void) sprintf(nbuf, "fme%d.Marrow", fmep->id); 188 fmep->Marrowcount = stats_new_counter(nbuf, 189 "arrows marked by mark_arrows()", 1); 190 (void) sprintf(nbuf, "fme%d.diags", fmep->id); 191 fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0); 192 193 out(O_ALTFP|O_VERB2, "newfme: config snapshot contains..."); 194 config_print(O_ALTFP|O_VERB2, fmep->cfgdata->cooked); 195 196 return (fmep); 197 } 198 199 static struct fme * 200 newfme(const char *e0class, const struct ipath *e0ipp) 201 { 202 struct cfgdata *cfgdata; 203 204 if ((cfgdata = config_snapshot()) == NULL) { 205 out(O_ALTFP, "newfme: NULL configuration"); 206 Undiag_reason = UD_NOCONF; 207 return (NULL); 208 } 209 210 Nfmep = alloc_fme(); 211 212 Nfmep->id = Nextid++; 213 Nfmep->cfgdata = cfgdata; 214 Nfmep->posted_suspects = 0; 215 Nfmep->uniqobs = 0; 216 Nfmep->state = FME_NOTHING; 217 Nfmep->pull = 0ULL; 218 219 Nfmep->fmcase = NULL; 220 Nfmep->hdl = NULL; 221 222 if ((Nfmep->eventtree = itree_create(cfgdata->cooked)) == NULL) { 223 out(O_ALTFP, "newfme: NULL instance tree"); 224 Undiag_reason = UD_INSTFAIL; 225 config_free(cfgdata); 226 FREE(Nfmep); 227 Nfmep = NULL; 228 return (NULL); 229 } 230 231 itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree); 232 233 if ((Nfmep->e0 = 234 itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) { 235 out(O_ALTFP, "newfme: e0 not in instance tree"); 236 Undiag_reason = UD_BADEVENTI; 237 itree_free(Nfmep->eventtree); 238 config_free(cfgdata); 239 FREE(Nfmep); 240 Nfmep = NULL; 241 return (NULL); 242 } 243 244 return (fme_ready(Nfmep)); 245 } 246 247 void 248 fme_fini(void) 249 { 250 struct fme *sfp, *fp; 251 struct case_list *ucasep, *nextcasep; 252 253 ucasep = Undiagablecaselist; 254 while (ucasep != NULL) { 255 nextcasep = ucasep->next; 256 FREE(ucasep); 257 ucasep = nextcasep; 258 } 259 Undiagablecaselist = NULL; 260 261 /* clean up closed fmes */ 262 fp = ClosedFMEs; 263 while (fp != NULL) { 264 sfp = fp->next; 265 destroy_fme(fp); 266 fp = sfp; 267 } 268 ClosedFMEs = NULL; 269 270 fp = FMElist; 271 while (fp != NULL) { 272 sfp = fp->next; 273 destroy_fme(fp); 274 fp = sfp; 275 } 276 FMElist = EFMElist = NULL; 277 278 /* if we were in the middle of creating an fme, free it now */ 279 if (Nfmep) { 280 destroy_fme(Nfmep); 281 Nfmep = NULL; 282 } 283 } 284 285 /* 286 * Allocated space for a buffer name. 20 bytes allows for 287 * a ridiculous 9,999,999 unique observations. 288 */ 289 #define OBBUFNMSZ 20 290 291 /* 292 * serialize_observation 293 * 294 * Create a recoverable version of the current observation 295 * (f->ecurrent). We keep a serialized version of each unique 296 * observation in order that we may resume correctly the fme in the 297 * correct state if eft or fmd crashes and we're restarted. 298 */ 299 static void 300 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp) 301 { 302 size_t pkdlen; 303 char tmpbuf[OBBUFNMSZ]; 304 char *pkd = NULL; 305 char *estr; 306 307 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs); 308 estr = ipath2str(cls, ipp); 309 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1); 310 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr, 311 strlen(estr) + 1); 312 FREE(estr); 313 314 if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) { 315 (void) snprintf(tmpbuf, 316 OBBUFNMSZ, "observed%d.nvp", fp->uniqobs); 317 if (nvlist_xpack(fp->ecurrent->nvp, 318 &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0) 319 out(O_DIE|O_SYS, "pack of observed nvl failed"); 320 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen); 321 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen); 322 FREE(pkd); 323 } 324 325 fp->uniqobs++; 326 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 327 sizeof (fp->uniqobs)); 328 } 329 330 /* 331 * init_fme_bufs -- We keep several bits of state about an fme for 332 * use if eft or fmd crashes and we're restarted. 333 */ 334 static void 335 init_fme_bufs(struct fme *fp) 336 { 337 size_t cfglen = fp->cfgdata->nextfree - fp->cfgdata->begin; 338 339 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_CFGLEN, sizeof (cfglen)); 340 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_CFGLEN, (void *)&cfglen, 341 sizeof (cfglen)); 342 if (cfglen != 0) { 343 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_CFG, cfglen); 344 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_CFG, 345 fp->cfgdata->begin, cfglen); 346 } 347 348 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull)); 349 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull, 350 sizeof (fp->pull)); 351 352 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id)); 353 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id, 354 sizeof (fp->id)); 355 356 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs)); 357 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 358 sizeof (fp->uniqobs)); 359 360 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD, 361 sizeof (fp->posted_suspects)); 362 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD, 363 (void *)&fp->posted_suspects, sizeof (fp->posted_suspects)); 364 } 365 366 static void 367 destroy_fme_bufs(struct fme *fp) 368 { 369 char tmpbuf[OBBUFNMSZ]; 370 int o; 371 372 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN); 373 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG); 374 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL); 375 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID); 376 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD); 377 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS); 378 379 for (o = 0; o < fp->uniqobs; o++) { 380 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o); 381 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 382 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o); 383 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 384 } 385 } 386 387 /* 388 * reconstitute_observations -- convert a case's serialized observations 389 * back into struct events. Returns zero if all observations are 390 * successfully reconstituted. 391 */ 392 static int 393 reconstitute_observations(struct fme *fmep) 394 { 395 struct event *ep; 396 struct node *epnamenp = NULL; 397 size_t pkdlen; 398 char *pkd = NULL; 399 char *tmpbuf = alloca(OBBUFNMSZ); 400 char *sepptr; 401 char *estr; 402 int ocnt; 403 int elen; 404 405 for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) { 406 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt); 407 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 408 if (elen == 0) { 409 out(O_ALTFP, 410 "reconstitute_observation: no %s buffer found.", 411 tmpbuf); 412 Undiag_reason = UD_MISSINGOBS; 413 break; 414 } 415 416 estr = MALLOC(elen); 417 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 418 sepptr = strchr(estr, '@'); 419 if (sepptr == NULL) { 420 out(O_ALTFP, 421 "reconstitute_observation: %s: " 422 "missing @ separator in %s.", 423 tmpbuf, estr); 424 Undiag_reason = UD_MISSINGPATH; 425 FREE(estr); 426 break; 427 } 428 429 *sepptr = '\0'; 430 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 431 out(O_ALTFP, 432 "reconstitute_observation: %s: " 433 "trouble converting path string \"%s\" " 434 "to internal representation.", 435 tmpbuf, sepptr + 1); 436 Undiag_reason = UD_MISSINGPATH; 437 FREE(estr); 438 break; 439 } 440 441 /* construct the event */ 442 ep = itree_lookup(fmep->eventtree, 443 stable(estr), ipath(epnamenp)); 444 if (ep == NULL) { 445 out(O_ALTFP, 446 "reconstitute_observation: %s: " 447 "lookup of \"%s\" in itree failed.", 448 tmpbuf, ipath2str(estr, ipath(epnamenp))); 449 Undiag_reason = UD_BADOBS; 450 tree_free(epnamenp); 451 FREE(estr); 452 break; 453 } 454 tree_free(epnamenp); 455 456 /* 457 * We may or may not have a saved nvlist for the observation 458 */ 459 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt); 460 pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 461 if (pkdlen != 0) { 462 pkd = MALLOC(pkdlen); 463 fmd_buf_read(fmep->hdl, 464 fmep->fmcase, tmpbuf, pkd, pkdlen); 465 if (nvlist_xunpack(pkd, 466 pkdlen, &ep->nvp, &Eft_nv_hdl) != 0) 467 out(O_DIE|O_SYS, "pack of observed nvl failed"); 468 FREE(pkd); 469 } 470 471 if (ocnt == 0) 472 fmep->e0 = ep; 473 474 FREE(estr); 475 fmep->ecurrent = ep; 476 ep->count++; 477 478 /* link it into list of observations seen */ 479 ep->observations = fmep->observations; 480 fmep->observations = ep; 481 } 482 483 if (ocnt == fmep->uniqobs) { 484 (void) fme_ready(fmep); 485 return (0); 486 } 487 488 return (1); 489 } 490 491 /* 492 * restart_fme -- called during eft initialization. Reconstitutes 493 * an in-progress fme. 494 */ 495 void 496 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress) 497 { 498 nvlist_t *defect; 499 struct case_list *bad; 500 struct fme *fmep; 501 struct cfgdata *cfgdata = NULL; 502 size_t rawsz; 503 504 fmep = alloc_fme(); 505 fmep->fmcase = inprogress; 506 fmep->hdl = hdl; 507 508 if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) { 509 out(O_ALTFP, "restart_fme: No config data"); 510 Undiag_reason = UD_MISSINGINFO; 511 goto badcase; 512 } 513 fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz, 514 sizeof (size_t)); 515 516 if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) { 517 out(O_ALTFP, "restart_fme: No event zero"); 518 Undiag_reason = UD_MISSINGZERO; 519 goto badcase; 520 } 521 522 cfgdata = MALLOC(sizeof (struct cfgdata)); 523 cfgdata->cooked = NULL; 524 cfgdata->devcache = NULL; 525 cfgdata->cpucache = NULL; 526 cfgdata->refcnt = 1; 527 528 if (rawsz > 0) { 529 if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) { 530 out(O_ALTFP, "restart_fme: Config data size mismatch"); 531 Undiag_reason = UD_CFGMISMATCH; 532 goto badcase; 533 } 534 cfgdata->begin = MALLOC(rawsz); 535 cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz; 536 fmd_buf_read(hdl, 537 inprogress, WOBUF_CFG, cfgdata->begin, rawsz); 538 } else { 539 cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL; 540 } 541 fmep->cfgdata = cfgdata; 542 543 config_cook(cfgdata); 544 if ((fmep->eventtree = itree_create(cfgdata->cooked)) == NULL) { 545 /* case not properly saved or irretrievable */ 546 out(O_ALTFP, "restart_fme: NULL instance tree"); 547 Undiag_reason = UD_INSTFAIL; 548 goto badcase; 549 } 550 551 itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree); 552 553 if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) { 554 out(O_ALTFP, "restart_fme: no saved wait time"); 555 Undiag_reason = UD_MISSINGINFO; 556 goto badcase; 557 } else { 558 fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull, 559 sizeof (fmep->pull)); 560 } 561 562 if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) { 563 out(O_ALTFP, "restart_fme: no saved posted status"); 564 Undiag_reason = UD_MISSINGINFO; 565 goto badcase; 566 } else { 567 fmd_buf_read(hdl, inprogress, WOBUF_POSTD, 568 (void *)&fmep->posted_suspects, 569 sizeof (fmep->posted_suspects)); 570 } 571 572 if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) { 573 out(O_ALTFP, "restart_fme: no saved id"); 574 Undiag_reason = UD_MISSINGINFO; 575 goto badcase; 576 } else { 577 fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id, 578 sizeof (fmep->id)); 579 } 580 if (Nextid <= fmep->id) 581 Nextid = fmep->id + 1; 582 583 if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) { 584 out(O_ALTFP, "restart_fme: no count of observations"); 585 Undiag_reason = UD_MISSINGINFO; 586 goto badcase; 587 } else { 588 fmd_buf_read(hdl, inprogress, WOBUF_NOBS, 589 (void *)&fmep->uniqobs, sizeof (fmep->uniqobs)); 590 } 591 592 if (reconstitute_observations(fmep) != 0) 593 goto badcase; 594 595 /* give the diagnosis algorithm a shot at the new FME state */ 596 fme_eval(fmep, NULL); 597 return; 598 599 badcase: 600 if (fmep->eventtree != NULL) 601 itree_free(fmep->eventtree); 602 config_free(cfgdata); 603 destroy_fme_bufs(fmep); 604 FREE(fmep); 605 606 /* 607 * Since we're unable to restart the case, add it to the undiagable 608 * list and solve and close it as appropriate. 609 */ 610 bad = MALLOC(sizeof (struct case_list)); 611 bad->next = NULL; 612 613 if (Undiagablecaselist != NULL) 614 bad->next = Undiagablecaselist; 615 Undiagablecaselist = bad; 616 bad->fmcase = inprogress; 617 618 out(O_ALTFP, "[case %s (unable to restart), ", 619 fmd_case_uuid(hdl, bad->fmcase)); 620 621 if (fmd_case_solved(hdl, bad->fmcase)) { 622 out(O_ALTFP, "already solved, "); 623 } else { 624 out(O_ALTFP, "solving, "); 625 defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100, 626 NULL, NULL, NULL); 627 if (Undiag_reason != NULL) 628 (void) nvlist_add_string(defect, 629 UNDIAG_REASON, Undiag_reason); 630 fmd_case_add_suspect(hdl, bad->fmcase, defect); 631 fmd_case_solve(hdl, bad->fmcase); 632 } 633 634 if (fmd_case_closed(hdl, bad->fmcase)) { 635 out(O_ALTFP, "already closed ]"); 636 } else { 637 out(O_ALTFP, "closing ]"); 638 fmd_case_close(hdl, bad->fmcase); 639 } 640 } 641 642 void 643 destroy_fme(struct fme *f) 644 { 645 stats_delete(f->Rcount); 646 stats_delete(f->Hcallcount); 647 stats_delete(f->Rcallcount); 648 stats_delete(f->Ccallcount); 649 stats_delete(f->Ecallcount); 650 stats_delete(f->Tcallcount); 651 stats_delete(f->Marrowcount); 652 stats_delete(f->diags); 653 654 itree_free(f->eventtree); 655 config_free(f->cfgdata); 656 FREE(f); 657 } 658 659 static const char * 660 fme_state2str(enum fme_state s) 661 { 662 switch (s) { 663 case FME_NOTHING: return ("NOTHING"); 664 case FME_WAIT: return ("WAIT"); 665 case FME_CREDIBLE: return ("CREDIBLE"); 666 case FME_DISPROVED: return ("DISPROVED"); 667 default: return ("UNKNOWN"); 668 } 669 } 670 671 static int 672 is_problem(enum nametype t) 673 { 674 return (t == N_FAULT || t == N_DEFECT || t == N_UPSET); 675 } 676 677 static int 678 is_fault(enum nametype t) 679 { 680 return (t == N_FAULT); 681 } 682 683 static int 684 is_defect(enum nametype t) 685 { 686 return (t == N_DEFECT); 687 } 688 689 static int 690 is_upset(enum nametype t) 691 { 692 return (t == N_UPSET); 693 } 694 695 /*ARGSUSED*/ 696 static void 697 clear_causes_tested(struct event *lhs, struct event *ep, void *arg) 698 { 699 struct bubble *bp; 700 struct arrowlist *ap; 701 702 for (bp = itree_next_bubble(ep, NULL); bp; 703 bp = itree_next_bubble(ep, bp)) { 704 if (bp->t != B_FROM) 705 continue; 706 for (ap = itree_next_arrow(bp, NULL); ap; 707 ap = itree_next_arrow(bp, ap)) 708 ap->arrowp->causes_tested = 0; 709 } 710 } 711 712 /* 713 * call this function with initcode set to 0 to initialize cycle tracking 714 */ 715 static void 716 initialize_cycles(struct fme *fmep) 717 { 718 lut_walk(fmep->eventtree, (lut_cb)clear_causes_tested, NULL); 719 } 720 721 static void 722 fme_print(int flags, struct fme *fmep) 723 { 724 struct event *ep; 725 726 out(flags, "Fault Management Exercise %d", fmep->id); 727 out(flags, "\t State: %s", fme_state2str(fmep->state)); 728 out(flags|O_NONL, "\t Start time: "); 729 ptree_timeval(flags|O_NONL, &fmep->ull); 730 out(flags, NULL); 731 if (fmep->wull) { 732 out(flags|O_NONL, "\t Wait time: "); 733 ptree_timeval(flags|O_NONL, &fmep->wull); 734 out(flags, NULL); 735 } 736 out(flags|O_NONL, "\t E0: "); 737 if (fmep->e0) 738 itree_pevent_brief(flags|O_NONL, fmep->e0); 739 else 740 out(flags|O_NONL, "NULL"); 741 out(flags, NULL); 742 out(flags|O_NONL, "\tObservations:"); 743 for (ep = fmep->observations; ep; ep = ep->observations) { 744 out(flags|O_NONL, " "); 745 itree_pevent_brief(flags|O_NONL, ep); 746 } 747 out(flags, NULL); 748 out(flags|O_NONL, "\tSuspect list:"); 749 for (ep = fmep->suspects; ep; ep = ep->suspects) { 750 out(flags|O_NONL, " "); 751 itree_pevent_brief(flags|O_NONL, ep); 752 } 753 out(flags, NULL); 754 out(flags|O_VERB2, "\t Tree:"); 755 itree_ptree(flags|O_VERB2, fmep->eventtree); 756 } 757 758 static struct node * 759 pathstring2epnamenp(char *path) 760 { 761 char *sep = "/"; 762 struct node *ret; 763 char *ptr; 764 765 if ((ptr = strtok(path, sep)) == NULL) 766 out(O_DIE, "pathstring2epnamenp: invalid empty class"); 767 768 ret = tree_iname(stable(ptr), NULL, 0); 769 770 while ((ptr = strtok(NULL, sep)) != NULL) 771 ret = tree_name_append(ret, 772 tree_iname(stable(ptr), NULL, 0)); 773 774 return (ret); 775 } 776 777 /* 778 * for a given upset sp, increment the corresponding SERD engine. if the 779 * SERD engine trips, return the ename and ipp of the resulting ereport. 780 * returns true if engine tripped and *enamep and *ippp were filled in. 781 */ 782 static int 783 serd_eval(fmd_hdl_t *hdl, fmd_event_t *ffep, struct event *sp, 784 const char **enamep, const struct ipath **ippp) 785 { 786 struct node *serdinst; 787 char *serdname; 788 789 ASSERT(sp->t == N_UPSET); 790 ASSERT(ffep != NULL); 791 792 /* 793 * obtain instanced SERD engine from the upset sp. from this 794 * derive serdname, the string used to identify the SERD engine. 795 */ 796 serdinst = eventprop_lookup(sp, L_engine); 797 798 if (serdinst == NULL) 799 return (NULL); 800 801 serdname = ipath2str(serdinst->u.stmt.np->u.event.ename->u.name.s, 802 ipath(serdinst->u.stmt.np->u.event.epname)); 803 804 if (!fmd_serd_exists(hdl, serdname)) { 805 struct node *nN, *nT; 806 807 /* no SERD engine yet, so create it */ 808 nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N, NULL); 809 nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T, NULL); 810 811 ASSERT(nN->t == T_NUM); 812 ASSERT(nT->t == T_TIMEVAL); 813 814 fmd_serd_create(hdl, serdname, (uint_t)nN->u.ull, 815 (hrtime_t)nT->u.ull); 816 } 817 818 819 /* 820 * increment SERD engine. if engine fires, reset serd 821 * engine and return trip_strcode 822 */ 823 if (fmd_serd_record(hdl, serdname, ffep)) { 824 struct node *tripinst = lut_lookup(serdinst->u.stmt.lutp, 825 (void *)L_trip, NULL); 826 827 ASSERT(tripinst != NULL); 828 829 *enamep = tripinst->u.event.ename->u.name.s; 830 *ippp = ipath(tripinst->u.event.epname); 831 832 fmd_serd_reset(hdl, serdname); 833 out(O_ALTFP|O_NONL, "[engine fired: %s, sending: ", serdname); 834 ipath_print(O_ALTFP|O_NONL, *enamep, *ippp); 835 out(O_ALTFP, "]"); 836 837 FREE(serdname); 838 return (1); 839 } 840 841 FREE(serdname); 842 return (0); 843 } 844 845 /* 846 * search a suspect list for upsets. feed each upset to serd_eval() and 847 * build up tripped[], an array of ereports produced by the firing of 848 * any SERD engines. then feed each ereport back into 849 * fme_receive_report(). 850 * 851 * returns ntrip, the number of these ereports produced. 852 */ 853 static int 854 upsets_eval(struct fme *fmep, fmd_event_t *ffep) 855 { 856 /* we build an array of tripped ereports that we send ourselves */ 857 struct { 858 const char *ename; 859 const struct ipath *ipp; 860 } *tripped; 861 struct event *sp; 862 int ntrip, nupset, i; 863 864 /* 865 * we avoid recursion by calling fme_receive_report() at the end of 866 * this function with a NULL ffep 867 */ 868 if (ffep == NULL) 869 return (0); 870 871 /* 872 * count the number of upsets to determine the upper limit on 873 * expected trip ereport strings. remember that one upset can 874 * lead to at most one ereport. 875 */ 876 nupset = 0; 877 for (sp = fmep->suspects; sp; sp = sp->suspects) { 878 if (sp->t == N_UPSET) 879 nupset++; 880 } 881 882 if (nupset == 0) 883 return (0); 884 885 /* 886 * get to this point if we have upsets and expect some trip 887 * ereports 888 */ 889 tripped = alloca(sizeof (*tripped) * nupset); 890 bzero((void *)tripped, sizeof (*tripped) * nupset); 891 892 ntrip = 0; 893 for (sp = fmep->suspects; sp; sp = sp->suspects) 894 if (sp->t == N_UPSET && serd_eval(fmep->hdl, ffep, sp, 895 &tripped[ntrip].ename, &tripped[ntrip].ipp)) 896 ntrip++; 897 898 for (i = 0; i < ntrip; i++) 899 fme_receive_report(fmep->hdl, NULL, 900 tripped[i].ename, tripped[i].ipp, NULL); 901 902 return (ntrip); 903 } 904 905 /* 906 * fme_receive_external_report -- call when an external ereport comes in 907 * 908 * this routine just converts the relevant information from the ereport 909 * into a format used internally and passes it on to fme_receive_report(). 910 */ 911 void 912 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 913 const char *eventstring) 914 { 915 struct node *epnamenp = platform_getpath(nvl); 916 const struct ipath *ipp; 917 918 /* 919 * XFILE: If we ended up without a path, it's an X-file. 920 * For now, use our undiagnosable interface. 921 */ 922 if (epnamenp == NULL) { 923 out(O_ALTFP, "XFILE: Unable to get path from ereport"); 924 Undiag_reason = UD_NOPATH; 925 publish_undiagnosable(hdl, ffep); 926 return; 927 } 928 929 ipp = ipath(epnamenp); 930 tree_free(epnamenp); 931 fme_receive_report(hdl, ffep, stable(eventstring), ipp, nvl); 932 } 933 934 static void 935 fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 936 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl) 937 { 938 struct event *ep; 939 struct fme *fmep = NULL; 940 struct fme *ofmep, *svfmep; 941 int matched = 0; 942 943 out(O_ALTFP|O_NONL, "fme_receive_report: "); 944 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 945 out(O_ALTFP|O_STAMP, NULL); 946 947 /* decide which FME it goes to */ 948 for (fmep = FMElist; fmep; fmep = fmep->next) { 949 int prev_verbose; 950 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 951 enum fme_state state; 952 953 /* look up event in event tree for this FME */ 954 if ((ep = itree_lookup(fmep->eventtree, 955 eventstring, ipp)) == NULL) 956 continue; 957 958 /* note observation */ 959 fmep->ecurrent = ep; 960 if (ep->count++ == 0) { 961 /* link it into list of observations seen */ 962 ep->observations = fmep->observations; 963 fmep->observations = ep; 964 ep->nvp = evnv_dupnvl(nvl); 965 } 966 967 /* tell hypothesise() not to mess with suspect list */ 968 fmep->peek = 1; 969 970 /* don't want this to be verbose (unless Debug is set) */ 971 prev_verbose = Verbose; 972 if (Debug == 0) 973 Verbose = 0; 974 975 initialize_cycles(fmep); 976 state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay, NULL); 977 978 fmep->peek = 0; 979 980 /* put verbose flag back */ 981 Verbose = prev_verbose; 982 983 if (state != FME_DISPROVED) { 984 /* found an FME that explains the ereport */ 985 matched++; 986 out(O_ALTFP|O_NONL, "["); 987 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 988 out(O_ALTFP, " explained by FME%d]", fmep->id); 989 990 if (ep->count == 1) 991 serialize_observation(fmep, eventstring, ipp); 992 993 if (ffep) 994 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 995 996 stats_counter_bump(fmep->Rcount); 997 998 /* re-eval FME */ 999 fme_eval(fmep, ffep); 1000 } else { 1001 1002 /* not a match, undo noting of observation */ 1003 fmep->ecurrent = NULL; 1004 if (--ep->count == 0) { 1005 /* unlink it from observations */ 1006 fmep->observations = ep->observations; 1007 ep->observations = NULL; 1008 nvlist_free(ep->nvp); 1009 ep->nvp = NULL; 1010 } 1011 } 1012 } 1013 1014 if (matched) 1015 return; /* explained by at least one existing FME */ 1016 1017 /* clean up closed fmes */ 1018 ofmep = ClosedFMEs; 1019 while (ofmep != NULL) { 1020 svfmep = ofmep->next; 1021 destroy_fme(ofmep); 1022 ofmep = svfmep; 1023 } 1024 ClosedFMEs = NULL; 1025 1026 /* start a new FME */ 1027 if ((fmep = newfme(eventstring, ipp)) == NULL) { 1028 out(O_ALTFP|O_NONL, "["); 1029 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1030 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1031 publish_undiagnosable(hdl, ffep); 1032 return; 1033 } 1034 1035 /* open a case */ 1036 fmep->fmcase = fmd_case_open(hdl, NULL); 1037 fmep->hdl = hdl; 1038 init_fme_bufs(fmep); 1039 1040 out(O_ALTFP|O_NONL, "["); 1041 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1042 out(O_ALTFP, " created FME%d, case %s]", fmep->id, 1043 fmd_case_uuid(hdl, fmep->fmcase)); 1044 1045 ep = fmep->e0; 1046 ASSERT(ep != NULL); 1047 1048 /* note observation */ 1049 fmep->ecurrent = ep; 1050 if (ep->count++ == 0) { 1051 /* link it into list of observations seen */ 1052 ep->observations = fmep->observations; 1053 fmep->observations = ep; 1054 ep->nvp = evnv_dupnvl(nvl); 1055 serialize_observation(fmep, eventstring, ipp); 1056 } 1057 1058 stats_counter_bump(fmep->Rcount); 1059 1060 if (ffep) { 1061 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1062 fmd_case_setprincipal(hdl, fmep->fmcase, ffep); 1063 fmep->e0r = ffep; 1064 } 1065 1066 /* give the diagnosis algorithm a shot at the new FME state */ 1067 fme_eval(fmep, ffep); 1068 } 1069 1070 void 1071 fme_status(int flags) 1072 { 1073 struct fme *fmep; 1074 1075 if (FMElist == NULL) { 1076 out(flags, "No fault management exercises underway."); 1077 return; 1078 } 1079 1080 for (fmep = FMElist; fmep; fmep = fmep->next) 1081 fme_print(flags, fmep); 1082 } 1083 1084 /* 1085 * "indent" routines used mostly for nicely formatted debug output, but also 1086 * for sanity checking for infinite recursion bugs. 1087 */ 1088 1089 #define MAX_INDENT 1024 1090 static const char *indent_s[MAX_INDENT]; 1091 static int current_indent; 1092 1093 static void 1094 indent_push(const char *s) 1095 { 1096 if (current_indent < MAX_INDENT) 1097 indent_s[current_indent++] = s; 1098 else 1099 out(O_DIE, "unexpected recursion depth (%d)", current_indent); 1100 } 1101 1102 static void 1103 indent_set(const char *s) 1104 { 1105 current_indent = 0; 1106 indent_push(s); 1107 } 1108 1109 static void 1110 indent_pop(void) 1111 { 1112 if (current_indent > 0) 1113 current_indent--; 1114 else 1115 out(O_DIE, "recursion underflow"); 1116 } 1117 1118 static void 1119 indent(void) 1120 { 1121 int i; 1122 if (!Verbose) 1123 return; 1124 for (i = 0; i < current_indent; i++) 1125 out(O_ALTFP|O_VERB|O_NONL, indent_s[i]); 1126 } 1127 1128 static int 1129 suspects_changed(struct fme *fmep) 1130 { 1131 struct event *suspects = fmep->suspects; 1132 struct event *psuspects = fmep->psuspects; 1133 1134 while (suspects != NULL && psuspects != NULL) { 1135 if (suspects != psuspects) 1136 return (1); 1137 suspects = suspects->suspects; 1138 psuspects = psuspects->psuspects; 1139 } 1140 1141 return (suspects != psuspects); 1142 } 1143 1144 #define SLNEW 1 1145 #define SLCHANGED 2 1146 #define SLWAIT 3 1147 #define SLDISPROVED 4 1148 1149 static void 1150 print_suspects(int circumstance, struct fme *fmep) 1151 { 1152 struct event *ep; 1153 1154 out(O_ALTFP|O_NONL, "["); 1155 if (circumstance == SLCHANGED) { 1156 out(O_ALTFP|O_NONL, "FME%d diagnosis changed. state: %s, " 1157 "suspect list:", fmep->id, fme_state2str(fmep->state)); 1158 } else if (circumstance == SLWAIT) { 1159 out(O_ALTFP|O_NONL, "FME%d set wait timer ", fmep->id); 1160 ptree_timeval(O_ALTFP|O_NONL, &fmep->wull); 1161 } else if (circumstance == SLDISPROVED) { 1162 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id); 1163 } else { 1164 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id); 1165 } 1166 1167 if (circumstance == SLWAIT || circumstance == SLDISPROVED) { 1168 out(O_ALTFP, "]"); 1169 return; 1170 } 1171 1172 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1173 out(O_ALTFP|O_NONL, " "); 1174 itree_pevent_brief(O_ALTFP|O_NONL, ep); 1175 } 1176 out(O_ALTFP, "]"); 1177 } 1178 1179 static struct node * 1180 eventprop_lookup(struct event *ep, const char *propname) 1181 { 1182 return (lut_lookup(ep->props, (void *)propname, NULL)); 1183 } 1184 1185 #define MAXDIGITIDX 23 1186 static char numbuf[MAXDIGITIDX + 1]; 1187 1188 static int 1189 node2uint(struct node *n, uint_t *valp) 1190 { 1191 struct evalue value; 1192 struct lut *globals = NULL; 1193 1194 if (n == NULL) 1195 return (1); 1196 1197 /* 1198 * check value.v since we are being asked to convert an unsigned 1199 * long long int to an unsigned int 1200 */ 1201 if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) || 1202 value.t != UINT64 || value.v > (1ULL << 32)) 1203 return (1); 1204 1205 *valp = (uint_t)value.v; 1206 1207 return (0); 1208 } 1209 1210 static nvlist_t * 1211 node2fmri(struct node *n) 1212 { 1213 nvlist_t **pa, *f, *p; 1214 struct node *nc; 1215 uint_t depth = 0; 1216 char *numstr, *nullbyte; 1217 char *failure; 1218 int err, i; 1219 1220 /* XXX do we need to be able to handle a non-T_NAME node? */ 1221 if (n == NULL || n->t != T_NAME) 1222 return (NULL); 1223 1224 for (nc = n; nc != NULL; nc = nc->u.name.next) { 1225 if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM) 1226 break; 1227 depth++; 1228 } 1229 1230 if (nc != NULL) { 1231 /* We bailed early, something went wrong */ 1232 return (NULL); 1233 } 1234 1235 if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0) 1236 out(O_DIE|O_SYS, "alloc of fmri nvl failed"); 1237 pa = alloca(depth * sizeof (nvlist_t *)); 1238 for (i = 0; i < depth; i++) 1239 pa[i] = NULL; 1240 1241 err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC); 1242 err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION); 1243 err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, ""); 1244 err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth); 1245 if (err != 0) { 1246 failure = "basic construction of FMRI failed"; 1247 goto boom; 1248 } 1249 1250 numbuf[MAXDIGITIDX] = '\0'; 1251 nullbyte = &numbuf[MAXDIGITIDX]; 1252 i = 0; 1253 1254 for (nc = n; nc != NULL; nc = nc->u.name.next) { 1255 err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl); 1256 if (err != 0) { 1257 failure = "alloc of an hc-pair failed"; 1258 goto boom; 1259 } 1260 err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s); 1261 numstr = ulltostr(nc->u.name.child->u.ull, nullbyte); 1262 err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr); 1263 if (err != 0) { 1264 failure = "construction of an hc-pair failed"; 1265 goto boom; 1266 } 1267 pa[i++] = p; 1268 } 1269 1270 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 1271 if (err == 0) { 1272 for (i = 0; i < depth; i++) 1273 if (pa[i] != NULL) 1274 nvlist_free(pa[i]); 1275 return (f); 1276 } 1277 failure = "addition of hc-pair array to FMRI failed"; 1278 1279 boom: 1280 for (i = 0; i < depth; i++) 1281 if (pa[i] != NULL) 1282 nvlist_free(pa[i]); 1283 nvlist_free(f); 1284 out(O_DIE, "%s", failure); 1285 /*NOTREACHED*/ 1286 } 1287 1288 static uint_t 1289 avg(uint_t sum, uint_t cnt) 1290 { 1291 unsigned long long s = sum * 10; 1292 1293 return ((s / cnt / 10) + (((s / cnt % 10) >= 5) ? 1 : 0)); 1294 } 1295 1296 static uint8_t 1297 percentof(uint_t part, uint_t whole) 1298 { 1299 unsigned long long p = part * 1000; 1300 1301 return ((p / whole / 10) + (((p / whole % 10) >= 5) ? 1 : 0)); 1302 } 1303 1304 static struct rsl { 1305 struct event *suspect; 1306 nvlist_t *asru; 1307 nvlist_t *fru; 1308 nvlist_t *rsrc; 1309 }; 1310 1311 /* 1312 * rslfree -- free internal members of struct rsl not expected to be 1313 * freed elsewhere. 1314 */ 1315 static void 1316 rslfree(struct rsl *freeme) 1317 { 1318 if (freeme->asru != NULL) 1319 nvlist_free(freeme->asru); 1320 if (freeme->fru != NULL) 1321 nvlist_free(freeme->fru); 1322 if (freeme->rsrc != NULL && freeme->rsrc != freeme->asru) 1323 nvlist_free(freeme->rsrc); 1324 } 1325 1326 /* 1327 * rslcmp -- compare two rsl structures. Use the following 1328 * comparisons to establish cardinality: 1329 * 1330 * 1. Name of the suspect's class. (simple strcmp) 1331 * 2. Name of the suspect's ASRU. (trickier, since nvlist) 1332 * 1333 */ 1334 static int 1335 rslcmp(const void *a, const void *b) 1336 { 1337 struct rsl *r1 = (struct rsl *)a; 1338 struct rsl *r2 = (struct rsl *)b; 1339 int rv; 1340 1341 rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s, 1342 r2->suspect->enode->u.event.ename->u.name.s); 1343 if (rv != 0) 1344 return (rv); 1345 1346 if (r1->asru == NULL && r2->asru == NULL) 1347 return (0); 1348 if (r1->asru == NULL) 1349 return (-1); 1350 if (r2->asru == NULL) 1351 return (1); 1352 return (evnv_cmpnvl(r1->asru, r2->asru, 0)); 1353 } 1354 1355 /* 1356 * rsluniq -- given an array of rsl structures, seek out and "remove" 1357 * any duplicates. Dups are "remove"d by NULLing the suspect pointer 1358 * of the array element. Removal also means updating the number of 1359 * problems and the number of problems which are not faults. User 1360 * provides the first and last element pointers. 1361 */ 1362 static void 1363 rsluniq(struct rsl *first, struct rsl *last, int *nprobs, int *nnonf) 1364 { 1365 struct rsl *cr; 1366 1367 if (*nprobs == 1) 1368 return; 1369 1370 /* 1371 * At this point, we only expect duplicate defects. 1372 * Eversholt's diagnosis algorithm prevents duplicate 1373 * suspects, but we rewrite defects in the platform code after 1374 * the diagnosis is made, and that can introduce new 1375 * duplicates. 1376 */ 1377 while (first <= last) { 1378 if (first->suspect == NULL || !is_defect(first->suspect->t)) { 1379 first++; 1380 continue; 1381 } 1382 cr = first + 1; 1383 while (cr <= last) { 1384 if (is_defect(first->suspect->t)) { 1385 if (rslcmp(first, cr) == 0) { 1386 cr->suspect = NULL; 1387 rslfree(cr); 1388 (*nprobs)--; 1389 (*nnonf)--; 1390 } 1391 } 1392 /* 1393 * assume all defects are in order after our 1394 * sort and short circuit here with "else break" ? 1395 */ 1396 cr++; 1397 } 1398 first++; 1399 } 1400 } 1401 1402 /* 1403 * get_resources -- for a given suspect, determine what ASRU, FRU and 1404 * RSRC nvlists should be advertised in the final suspect list. 1405 */ 1406 void 1407 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot) 1408 { 1409 struct node *asrudef, *frudef; 1410 nvlist_t *asru, *fru; 1411 nvlist_t *rsrc = NULL; 1412 char *pathstr; 1413 1414 /* 1415 * First find any ASRU and/or FRU defined in the 1416 * initial fault tree. 1417 */ 1418 asrudef = eventprop_lookup(sp, L_ASRU); 1419 frudef = eventprop_lookup(sp, L_FRU); 1420 1421 /* 1422 * Create FMRIs based on those definitions 1423 */ 1424 asru = node2fmri(asrudef); 1425 fru = node2fmri(frudef); 1426 pathstr = ipath2str(NULL, sp->ipp); 1427 1428 /* 1429 * Allow for platform translations of the FMRIs 1430 */ 1431 platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc, 1432 pathstr); 1433 1434 FREE(pathstr); 1435 rsrcs->suspect = sp; 1436 rsrcs->asru = asru; 1437 rsrcs->fru = fru; 1438 rsrcs->rsrc = rsrc; 1439 } 1440 1441 /* 1442 * trim_suspects -- prior to publishing, we may need to remove some 1443 * suspects from the list. If we're auto-closing upsets, we don't 1444 * want any of those in the published list. If the ASRUs for multiple 1445 * defects resolve to the same ASRU (driver) we only want to publish 1446 * that as a single suspect. 1447 */ 1448 static void 1449 trim_suspects(struct fme *fmep, boolean_t no_upsets, struct rsl **begin, 1450 struct rsl **end) 1451 { 1452 struct event *ep; 1453 struct rsl *rp; 1454 int rpcnt; 1455 1456 /* 1457 * First save the suspects in the psuspects, then copy back 1458 * only the ones we wish to retain. This resets nsuspects to 1459 * zero. 1460 */ 1461 rpcnt = fmep->nsuspects; 1462 save_suspects(fmep); 1463 1464 /* 1465 * allocate an array of resource pointers for the suspects. 1466 * We may end up using less than the full allocation, but this 1467 * is a very short-lived array. publish_suspects() will free 1468 * this array when it's done using it. 1469 */ 1470 rp = *begin = MALLOC(rpcnt * sizeof (struct rsl)); 1471 bzero(rp, rpcnt * sizeof (struct rsl)); 1472 1473 /* first pass, remove any unwanted upsets and populate our array */ 1474 for (ep = fmep->psuspects; ep; ep = ep->psuspects) { 1475 if (no_upsets && is_upset(ep->t)) 1476 continue; 1477 get_resources(ep, rp, fmep->cfgdata->cooked); 1478 rp++; 1479 fmep->nsuspects++; 1480 if (!is_fault(ep->t)) 1481 fmep->nonfault++; 1482 } 1483 1484 /* if all we had was unwanted upsets, we're done */ 1485 if (fmep->nsuspects == 0) 1486 return; 1487 1488 *end = rp - 1; 1489 1490 /* sort the array */ 1491 qsort(*begin, fmep->nsuspects, sizeof (struct rsl), rslcmp); 1492 rsluniq(*begin, *end, &fmep->nsuspects, &fmep->nonfault); 1493 } 1494 1495 static void 1496 publish_suspects(struct fme *fmep) 1497 { 1498 struct event *ep; 1499 struct rsl *srl = NULL; 1500 struct rsl *erl; 1501 struct rsl *rp; 1502 nvlist_t *fault; 1503 uint8_t cert; 1504 uint_t *frs; 1505 uint_t fravg, frsum, fr; 1506 int frcnt, fridx; 1507 boolean_t no_upsets = B_FALSE; 1508 1509 stats_counter_bump(fmep->diags); 1510 1511 /* 1512 * The current fmd interfaces don't allow us to solve a case 1513 * that's already solved. If we make a new case, what of the 1514 * ereports? We don't appear to have an interface that allows 1515 * us to access the ereports attached to a case (if we wanted 1516 * to copy the original case's ereport attachments to the new 1517 * case) and it's also a bit unclear if there would be any 1518 * problems with having ereports attached to multiple cases 1519 * and/or attaching DIAGNOSED ereports to a case. For now, 1520 * we'll just output a message. 1521 */ 1522 if (fmep->posted_suspects || 1523 fmd_case_solved(fmep->hdl, fmep->fmcase)) { 1524 out(O_ALTFP|O_NONL, "Revised diagnosis for case %s: ", 1525 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 1526 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1527 out(O_ALTFP|O_NONL, " "); 1528 itree_pevent_brief(O_ALTFP|O_NONL, ep); 1529 } 1530 out(O_ALTFP, NULL); 1531 return; 1532 } 1533 1534 /* 1535 * If we're auto-closing upsets, we don't want to include them 1536 * in any produced suspect lists or certainty accounting. 1537 */ 1538 if (Autoclose != NULL) 1539 if (strcmp(Autoclose, "true") == 0 || 1540 strcmp(Autoclose, "all") == 0 || 1541 strcmp(Autoclose, "upsets") == 0) 1542 no_upsets = B_TRUE; 1543 1544 trim_suspects(fmep, no_upsets, &srl, &erl); 1545 1546 /* 1547 * If the resulting suspect list has no members, we're 1548 * done. Returning here will simply close the case. 1549 */ 1550 if (fmep->nsuspects == 0) { 1551 out(O_ALTFP, 1552 "[FME%d, case %s (all suspects are upsets)]", 1553 fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase)); 1554 FREE(srl); 1555 restore_suspects(fmep); 1556 return; 1557 } 1558 1559 /* 1560 * If the suspect list is all faults, then for a given fault, 1561 * say X of N, X's certainty is computed via: 1562 * 1563 * fitrate(X) / (fitrate(1) + ... + fitrate(N)) * 100 1564 * 1565 * If none of the suspects are faults, and there are N suspects, 1566 * the certainty of a given suspect is 100/N. 1567 * 1568 * If there are are a mixture of faults and other problems in 1569 * the suspect list, we take an average of the faults' 1570 * FITrates and treat this average as the FITrate for any 1571 * non-faults. The fitrate of any given suspect is then 1572 * computed per the first formula above. 1573 */ 1574 if (fmep->nonfault == fmep->nsuspects) { 1575 /* NO faults in the suspect list */ 1576 cert = percentof(1, fmep->nsuspects); 1577 } else { 1578 /* sum the fitrates */ 1579 frs = alloca(fmep->nsuspects * sizeof (uint_t)); 1580 fridx = frcnt = frsum = 0; 1581 1582 for (rp = srl; rp <= erl; rp++) { 1583 struct node *n; 1584 1585 if (rp->suspect == NULL) 1586 continue; 1587 if (!is_fault(rp->suspect->t)) { 1588 frs[fridx++] = 0; 1589 continue; 1590 } 1591 n = eventprop_lookup(rp->suspect, L_FITrate); 1592 if (node2uint(n, &fr) != 0) { 1593 out(O_DEBUG|O_NONL, "event "); 1594 ipath_print(O_DEBUG|O_NONL, 1595 ep->enode->u.event.ename->u.name.s, 1596 ep->ipp); 1597 out(O_DEBUG, " has no FITrate (using 1)"); 1598 fr = 1; 1599 } else if (fr == 0) { 1600 out(O_DEBUG|O_NONL, "event "); 1601 ipath_print(O_DEBUG|O_NONL, 1602 ep->enode->u.event.ename->u.name.s, 1603 ep->ipp); 1604 out(O_DEBUG, " has zero FITrate (using 1)"); 1605 fr = 1; 1606 } 1607 1608 frs[fridx++] = fr; 1609 frsum += fr; 1610 frcnt++; 1611 } 1612 fravg = avg(frsum, frcnt); 1613 for (fridx = 0; fridx < fmep->nsuspects; fridx++) 1614 if (frs[fridx] == 0) { 1615 frs[fridx] = fravg; 1616 frsum += fravg; 1617 } 1618 } 1619 1620 /* Add them in reverse order of our sort, as fmd reverses order */ 1621 for (rp = erl; rp >= srl; rp--) { 1622 if (rp->suspect == NULL) 1623 continue; 1624 if (fmep->nonfault != fmep->nsuspects) 1625 cert = percentof(frs[--fridx], frsum); 1626 fault = fmd_nvl_create_fault(fmep->hdl, 1627 rp->suspect->enode->u.event.ename->u.name.s, 1628 cert, 1629 rp->asru, 1630 rp->fru, 1631 rp->rsrc); 1632 if (fault == NULL) 1633 out(O_DIE, "fault creation failed"); 1634 fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault); 1635 rp->suspect->fault = fault; 1636 rslfree(rp); 1637 } 1638 fmd_case_solve(fmep->hdl, fmep->fmcase); 1639 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 1640 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 1641 1642 if (Autoconvict) { 1643 for (rp = srl; rp <= erl; rp++) { 1644 if (rp->suspect == NULL) 1645 continue; 1646 fmd_case_convict(fmep->hdl, 1647 fmep->fmcase, rp->suspect->fault); 1648 } 1649 out(O_ALTFP, "[convicting FME%d, case %s]", fmep->id, 1650 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 1651 } 1652 1653 /* 1654 * revert to the original suspect list 1655 */ 1656 FREE(srl); 1657 restore_suspects(fmep); 1658 } 1659 1660 static void 1661 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep) 1662 { 1663 struct case_list *newcase; 1664 nvlist_t *defect; 1665 1666 out(O_ALTFP, 1667 "[undiagnosable ereport received, " 1668 "creating and closing a new case (%s)]", 1669 Undiag_reason ? Undiag_reason : "reason not provided"); 1670 1671 newcase = MALLOC(sizeof (struct case_list)); 1672 newcase->next = NULL; 1673 1674 newcase->fmcase = fmd_case_open(hdl, NULL); 1675 if (Undiagablecaselist != NULL) 1676 newcase->next = Undiagablecaselist; 1677 Undiagablecaselist = newcase; 1678 1679 if (ffep != NULL) 1680 fmd_case_add_ereport(hdl, newcase->fmcase, ffep); 1681 1682 defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100, 1683 NULL, NULL, NULL); 1684 if (Undiag_reason != NULL) 1685 (void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason); 1686 fmd_case_add_suspect(hdl, newcase->fmcase, defect); 1687 1688 fmd_case_solve(hdl, newcase->fmcase); 1689 fmd_case_close(hdl, newcase->fmcase); 1690 } 1691 1692 static void 1693 fme_undiagnosable(struct fme *f) 1694 { 1695 nvlist_t *defect; 1696 1697 out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]", 1698 f->id, fmd_case_uuid(f->hdl, f->fmcase), 1699 Undiag_reason ? Undiag_reason : "undiagnosable"); 1700 1701 defect = fmd_nvl_create_fault(f->hdl, UNDIAGNOSABLE_DEFECT, 100, 1702 NULL, NULL, NULL); 1703 if (Undiag_reason != NULL) 1704 (void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason); 1705 fmd_case_add_suspect(f->hdl, f->fmcase, defect); 1706 fmd_case_solve(f->hdl, f->fmcase); 1707 destroy_fme_bufs(f); 1708 fmd_case_close(f->hdl, f->fmcase); 1709 } 1710 1711 /* 1712 * fme_close_case 1713 * 1714 * Find the requested case amongst our fmes and close it. Free up 1715 * the related fme. 1716 */ 1717 void 1718 fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase) 1719 { 1720 struct case_list *ucasep, *prevcasep = NULL; 1721 struct fme *prev = NULL; 1722 struct fme *fmep; 1723 1724 for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) { 1725 if (fmcase != ucasep->fmcase) { 1726 prevcasep = ucasep; 1727 continue; 1728 } 1729 1730 if (prevcasep == NULL) 1731 Undiagablecaselist = Undiagablecaselist->next; 1732 else 1733 prevcasep->next = ucasep->next; 1734 1735 FREE(ucasep); 1736 return; 1737 } 1738 1739 for (fmep = FMElist; fmep; fmep = fmep->next) { 1740 if (fmep->hdl == hdl && fmep->fmcase == fmcase) 1741 break; 1742 prev = fmep; 1743 } 1744 1745 if (fmep == NULL) { 1746 out(O_WARN, "Eft asked to close unrecognized case [%s].", 1747 fmd_case_uuid(hdl, fmcase)); 1748 return; 1749 } 1750 1751 if (EFMElist == fmep) 1752 EFMElist = prev; 1753 1754 if (prev == NULL) 1755 FMElist = FMElist->next; 1756 else 1757 prev->next = fmep->next; 1758 1759 fmep->next = NULL; 1760 1761 /* Get rid of any timer this fme has set */ 1762 if (fmep->wull != 0) 1763 fmd_timer_remove(fmep->hdl, fmep->timer); 1764 1765 if (ClosedFMEs == NULL) { 1766 ClosedFMEs = fmep; 1767 } else { 1768 fmep->next = ClosedFMEs; 1769 ClosedFMEs = fmep; 1770 } 1771 } 1772 1773 /* 1774 * fme_set_timer() 1775 * If the time we need to wait for the given FME is less than the 1776 * current timer, kick that old timer out and establish a new one. 1777 */ 1778 static void 1779 fme_set_timer(struct fme *fmep, unsigned long long wull) 1780 { 1781 out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait "); 1782 ptree_timeval(O_ALTFP|O_VERB, &wull); 1783 1784 if (wull <= fmep->pull) { 1785 out(O_ALTFP|O_VERB|O_NONL, "already have waited at least "); 1786 ptree_timeval(O_ALTFP|O_VERB, &fmep->pull); 1787 out(O_ALTFP|O_VERB, NULL); 1788 /* we've waited at least wull already, don't need timer */ 1789 return; 1790 } 1791 1792 out(O_ALTFP|O_VERB|O_NONL, " currently "); 1793 if (fmep->wull != 0) { 1794 out(O_ALTFP|O_VERB|O_NONL, "waiting "); 1795 ptree_timeval(O_ALTFP|O_VERB, &fmep->wull); 1796 out(O_ALTFP|O_VERB, NULL); 1797 } else { 1798 out(O_ALTFP|O_VERB|O_NONL, "not waiting"); 1799 out(O_ALTFP|O_VERB, NULL); 1800 } 1801 1802 if (fmep->wull != 0) 1803 if (wull >= fmep->wull) 1804 /* New timer would fire later than established timer */ 1805 return; 1806 1807 if (fmep->wull != 0) 1808 fmd_timer_remove(fmep->hdl, fmep->timer); 1809 1810 fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep, 1811 fmep->e0r, wull); 1812 out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer); 1813 fmep->wull = wull; 1814 } 1815 1816 void 1817 fme_timer_fired(struct fme *fmep, id_t tid) 1818 { 1819 struct fme *ffmep = NULL; 1820 1821 for (ffmep = FMElist; ffmep; ffmep = ffmep->next) 1822 if (ffmep == fmep) 1823 break; 1824 1825 if (ffmep == NULL) { 1826 out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.", 1827 (void *)fmep); 1828 return; 1829 } 1830 1831 if (tid != fmep->htid) { 1832 /* 1833 * normal timer (not the hesitation timer 1834 */ 1835 fmep->pull = fmep->wull; 1836 fmep->wull = 0; 1837 fmd_buf_write(fmep->hdl, fmep->fmcase, 1838 WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull)); 1839 } else { 1840 fmep->hesitated = 1; 1841 } 1842 fme_eval(fmep, NULL); 1843 } 1844 1845 /* 1846 * Preserve the fme's suspect list in its psuspects list, NULLing the 1847 * suspects list in the meantime. 1848 */ 1849 static void 1850 save_suspects(struct fme *fmep) 1851 { 1852 struct event *ep; 1853 struct event *nextep; 1854 1855 /* zero out the previous suspect list */ 1856 for (ep = fmep->psuspects; ep; ep = nextep) { 1857 nextep = ep->psuspects; 1858 ep->psuspects = NULL; 1859 } 1860 fmep->psuspects = NULL; 1861 1862 /* zero out the suspect list, copying it to previous suspect list */ 1863 fmep->psuspects = fmep->suspects; 1864 for (ep = fmep->suspects; ep; ep = nextep) { 1865 nextep = ep->suspects; 1866 ep->psuspects = ep->suspects; 1867 ep->suspects = NULL; 1868 ep->is_suspect = 0; 1869 } 1870 fmep->suspects = NULL; 1871 fmep->nsuspects = 0; 1872 fmep->nonfault = 0; 1873 } 1874 1875 /* 1876 * Retrieve the fme's suspect list from its psuspects list. 1877 */ 1878 static void 1879 restore_suspects(struct fme *fmep) 1880 { 1881 struct event *ep; 1882 struct event *nextep; 1883 1884 fmep->nsuspects = fmep->nonfault = 0; 1885 fmep->suspects = fmep->psuspects; 1886 for (ep = fmep->psuspects; ep; ep = nextep) { 1887 fmep->nsuspects++; 1888 if (!is_fault(ep->t)) 1889 fmep->nonfault++; 1890 nextep = ep->psuspects; 1891 ep->suspects = ep->psuspects; 1892 } 1893 } 1894 1895 /* 1896 * this is what we use to call the Emrys prototype code instead of main() 1897 */ 1898 static void 1899 fme_eval(struct fme *fmep, fmd_event_t *ffep) 1900 { 1901 struct event *ep; 1902 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 1903 1904 save_suspects(fmep); 1905 1906 out(O_ALTFP|O_VERB, "Evaluate FME %d", fmep->id); 1907 indent_set(" "); 1908 1909 initialize_cycles(fmep); 1910 fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay, NULL); 1911 1912 out(O_ALTFP|O_VERB|O_NONL, "FME%d state: %s, suspect list:", fmep->id, 1913 fme_state2str(fmep->state)); 1914 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1915 out(O_ALTFP|O_VERB|O_NONL, " "); 1916 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 1917 } 1918 out(O_ALTFP|O_VERB, NULL); 1919 1920 if (fmep->posted_suspects) { 1921 /* 1922 * this FME has already posted a diagnosis, so see if 1923 * the event changed the diagnosis and print a warning 1924 * if it did. 1925 * 1926 */ 1927 if (suspects_changed(fmep)) { 1928 print_suspects(SLCHANGED, fmep); 1929 publish_suspects(fmep); 1930 } 1931 } else { 1932 switch (fmep->state) { 1933 case FME_CREDIBLE: 1934 /* 1935 * if the suspect list contains any upsets, we 1936 * turn off the hesitation logic (by setting 1937 * the hesitate flag which normally indicates 1938 * we've already done the hesitate logic). 1939 * this is done because hesitating with upsets 1940 * causes us to explain away additional soft errors 1941 * while the upset FME stays open. 1942 */ 1943 if (fmep->hesitated == 0) { 1944 struct event *s; 1945 1946 for (s = fmep->suspects; s; s = s->suspects) { 1947 if (s->t == N_UPSET) { 1948 fmep->hesitated = 1; 1949 break; 1950 } 1951 } 1952 } 1953 1954 if (Hesitate && 1955 fmep->suspects != NULL && 1956 fmep->suspects->suspects != NULL && 1957 fmep->hesitated == 0) { 1958 /* 1959 * about to publish multi-entry suspect list, 1960 * set the hesitation timer if not already set. 1961 */ 1962 if (fmep->htid == 0) { 1963 out(O_ALTFP|O_NONL, 1964 "[hesitate FME%d, case %s ", 1965 fmep->id, 1966 fmd_case_uuid(fmep->hdl, 1967 fmep->fmcase)); 1968 ptree_timeval(O_ALTFP|O_NONL, 1969 (unsigned long long *)&Hesitate); 1970 out(O_ALTFP, "]"); 1971 fme_set_timer(fmep, my_delay); 1972 fmep->htid = 1973 fmd_timer_install(fmep->hdl, 1974 (void *)fmep, NULL, Hesitate); 1975 } else { 1976 out(O_ALTFP, 1977 "[still hesitating FME%d, case %s]", 1978 fmep->id, 1979 fmd_case_uuid(fmep->hdl, 1980 fmep->fmcase)); 1981 } 1982 } else { 1983 print_suspects(SLNEW, fmep); 1984 (void) upsets_eval(fmep, ffep); 1985 publish_suspects(fmep); 1986 fmep->posted_suspects = 1; 1987 fmd_buf_write(fmep->hdl, fmep->fmcase, 1988 WOBUF_POSTD, 1989 (void *)&fmep->posted_suspects, 1990 sizeof (fmep->posted_suspects)); 1991 } 1992 break; 1993 1994 case FME_WAIT: 1995 /* 1996 * singleton suspect list implies 1997 * no point in waiting 1998 */ 1999 if (fmep->suspects && 2000 fmep->suspects->suspects == NULL) { 2001 print_suspects(SLNEW, fmep); 2002 (void) upsets_eval(fmep, ffep); 2003 publish_suspects(fmep); 2004 fmep->posted_suspects = 1; 2005 fmd_buf_write(fmep->hdl, fmep->fmcase, 2006 WOBUF_POSTD, 2007 (void *)&fmep->posted_suspects, 2008 sizeof (fmep->posted_suspects)); 2009 fmep->state = FME_CREDIBLE; 2010 } else { 2011 ASSERT(my_delay > fmep->ull); 2012 fme_set_timer(fmep, my_delay); 2013 print_suspects(SLWAIT, fmep); 2014 } 2015 break; 2016 2017 case FME_DISPROVED: 2018 print_suspects(SLDISPROVED, fmep); 2019 Undiag_reason = UD_UNSOLVD; 2020 fme_undiagnosable(fmep); 2021 break; 2022 } 2023 } 2024 2025 if (fmep->posted_suspects == 1 && Autoclose != NULL) { 2026 int doclose = 0; 2027 2028 if (strcmp(Autoclose, "true") == 0 || 2029 strcmp(Autoclose, "all") == 0) 2030 doclose = 1; 2031 2032 if (strcmp(Autoclose, "upsets") == 0) { 2033 doclose = 1; 2034 for (ep = fmep->suspects; ep; ep = ep->suspects) { 2035 if (ep->t != N_UPSET) { 2036 doclose = 0; 2037 break; 2038 } 2039 } 2040 } 2041 2042 if (doclose) { 2043 out(O_ALTFP, "[closing FME%d, case %s (autoclose)]", 2044 fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase)); 2045 2046 destroy_fme_bufs(fmep); 2047 fmd_case_close(fmep->hdl, fmep->fmcase); 2048 } 2049 } 2050 } 2051 2052 /* 2053 * below here is the code derived from the Emrys prototype 2054 */ 2055 2056 static void indent(void); 2057 static int triggered(struct fme *fmep, struct event *ep, int mark); 2058 static void mark_arrows(struct fme *fmep, struct event *ep, int mark); 2059 static enum fme_state effects_test(struct fme *fmep, 2060 struct event *fault_event); 2061 static enum fme_state requirements_test(struct fme *fmep, struct event *ep, 2062 unsigned long long at_latest_by, unsigned long long *pdelay, 2063 struct arrow *arrowp); 2064 static enum fme_state causes_test(struct fme *fmep, struct event *ep, 2065 unsigned long long at_latest_by, unsigned long long *pdelay); 2066 2067 static int 2068 triggered(struct fme *fmep, struct event *ep, int mark) 2069 { 2070 struct bubble *bp; 2071 struct arrowlist *ap; 2072 int count = 0; 2073 2074 stats_counter_bump(fmep->Tcallcount); 2075 for (bp = itree_next_bubble(ep, NULL); bp; 2076 bp = itree_next_bubble(ep, bp)) { 2077 if (bp->t != B_TO) 2078 continue; 2079 for (ap = itree_next_arrow(bp, NULL); ap; 2080 ap = itree_next_arrow(bp, ap)) { 2081 /* check count of marks against K in the bubble */ 2082 if (ap->arrowp->tail->mark == mark && 2083 ++count >= bp->nork) 2084 return (1); 2085 } 2086 } 2087 return (0); 2088 } 2089 2090 static void 2091 mark_arrows(struct fme *fmep, struct event *ep, int mark) 2092 { 2093 struct bubble *bp; 2094 struct arrowlist *ap; 2095 2096 for (bp = itree_next_bubble(ep, NULL); bp; 2097 bp = itree_next_bubble(ep, bp)) { 2098 if (bp->t != B_FROM) 2099 continue; 2100 if (bp->mark != mark) { 2101 stats_counter_bump(fmep->Marrowcount); 2102 bp->mark = mark; 2103 for (ap = itree_next_arrow(bp, NULL); ap; 2104 ap = itree_next_arrow(bp, ap)) { 2105 struct constraintlist *ctp; 2106 struct evalue value; 2107 int do_not_follow = 0; 2108 /* 2109 * see if false constraint prevents us 2110 * from traversing this arrow, but don't 2111 * bother if the event is an ereport we 2112 * haven't seen 2113 */ 2114 if (ap->arrowp->head->myevent->t != N_EREPORT || 2115 ap->arrowp->head->myevent->count != 0) { 2116 platform_set_payloadnvp( 2117 ap->arrowp->head->myevent->nvp); 2118 for (ctp = ap->arrowp->constraints; 2119 ctp != NULL; ctp = ctp->next) { 2120 if (eval_expr(ctp->cnode, 2121 NULL, NULL, 2122 &fmep->globals, 2123 fmep->cfgdata->cooked, 2124 ap->arrowp, 0, 2125 &value) == 0 || 2126 value.t == UNDEFINED || 2127 value.v == 0) { 2128 do_not_follow = 1; 2129 break; 2130 } 2131 } 2132 platform_set_payloadnvp(NULL); 2133 } 2134 2135 if (do_not_follow) { 2136 indent(); 2137 out(O_ALTFP|O_VERB|O_NONL, 2138 " False arrow to "); 2139 itree_pevent_brief( 2140 O_ALTFP|O_VERB|O_NONL, 2141 ap->arrowp->head->myevent); 2142 out(O_ALTFP|O_VERB|O_NONL, " "); 2143 ptree(O_ALTFP|O_VERB|O_NONL, 2144 ctp->cnode, 1, 0); 2145 out(O_ALTFP|O_VERB, NULL); 2146 continue; 2147 } 2148 2149 if (triggered(fmep, ap->arrowp->head->myevent, 2150 mark)) 2151 mark_arrows(fmep, 2152 ap->arrowp->head->myevent, mark); 2153 } 2154 } 2155 } 2156 } 2157 2158 static enum fme_state 2159 effects_test(struct fme *fmep, struct event *fault_event) 2160 { 2161 struct event *error_event; 2162 enum fme_state return_value = FME_CREDIBLE; 2163 2164 stats_counter_bump(fmep->Ecallcount); 2165 indent_push(" E"); 2166 indent(); 2167 out(O_ALTFP|O_VERB|O_NONL, "->"); 2168 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 2169 out(O_ALTFP|O_VERB, NULL); 2170 2171 mark_arrows(fmep, fault_event, 1); 2172 for (error_event = fmep->observations; 2173 error_event; error_event = error_event->observations) { 2174 indent(); 2175 out(O_ALTFP|O_VERB|O_NONL, " "); 2176 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event); 2177 if (!triggered(fmep, error_event, 1)) { 2178 return_value = FME_DISPROVED; 2179 out(O_ALTFP|O_VERB, " NOT triggered"); 2180 break; 2181 } else { 2182 out(O_ALTFP|O_VERB, " triggered"); 2183 } 2184 } 2185 mark_arrows(fmep, fault_event, 0); 2186 2187 indent(); 2188 out(O_ALTFP|O_VERB|O_NONL, "<-%s ", fme_state2str(return_value)); 2189 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 2190 out(O_ALTFP|O_VERB, NULL); 2191 indent_pop(); 2192 return (return_value); 2193 } 2194 2195 static enum fme_state 2196 requirements_test(struct fme *fmep, struct event *ep, 2197 unsigned long long at_latest_by, unsigned long long *pdelay, 2198 struct arrow *arrowp) 2199 { 2200 int waiting_events; 2201 int credible_events; 2202 enum fme_state return_value = FME_CREDIBLE; 2203 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 2204 unsigned long long arrow_delay; 2205 unsigned long long my_delay; 2206 struct event *ep2; 2207 struct bubble *bp; 2208 struct arrowlist *ap; 2209 2210 stats_counter_bump(fmep->Rcallcount); 2211 indent_push(" R"); 2212 indent(); 2213 out(O_ALTFP|O_VERB|O_NONL, "->"); 2214 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2215 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 2216 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 2217 out(O_ALTFP|O_VERB, NULL); 2218 2219 if (ep->t == N_EREPORT) { 2220 if (ep->count == 0) { 2221 if (fmep->pull >= at_latest_by) { 2222 return_value = FME_DISPROVED; 2223 } else { 2224 *pdelay = at_latest_by; 2225 return_value = FME_WAIT; 2226 } 2227 } else if (arrowp != NULL) { 2228 /* 2229 * evaluate constraints only for current observation 2230 */ 2231 struct constraintlist *ctp; 2232 struct evalue value; 2233 2234 platform_set_payloadnvp(ep->nvp); 2235 for (ctp = arrowp->constraints; ctp != NULL; 2236 ctp = ctp->next) { 2237 if (eval_expr(ctp->cnode, NULL, NULL, 2238 &fmep->globals, fmep->cfgdata->cooked, 2239 arrowp, 0, &value) == 0 || 2240 value.t == UNDEFINED || value.v == 0) { 2241 indent(); 2242 out(O_ALTFP|O_VERB|O_NONL, 2243 " False constraint "); 2244 out(O_ALTFP|O_VERB|O_NONL, " "); 2245 ptree(O_ALTFP|O_VERB|O_NONL, 2246 ctp->cnode, 1, 0); 2247 out(O_ALTFP|O_VERB, NULL); 2248 return_value = FME_DISPROVED; 2249 break; 2250 } 2251 } 2252 platform_set_payloadnvp(NULL); 2253 } 2254 2255 indent(); 2256 switch (return_value) { 2257 case FME_CREDIBLE: 2258 out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE "); 2259 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2260 break; 2261 case FME_DISPROVED: 2262 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 2263 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2264 break; 2265 case FME_WAIT: 2266 out(O_ALTFP|O_VERB|O_NONL, "<-WAIT "); 2267 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2268 out(O_ALTFP|O_VERB|O_NONL, " to "); 2269 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 2270 break; 2271 default: 2272 out(O_DIE, "requirements_test: unexpected fme_state"); 2273 break; 2274 } 2275 out(O_ALTFP|O_VERB, NULL); 2276 indent_pop(); 2277 2278 return (return_value); 2279 } 2280 2281 /* this event is not a report, descend the tree */ 2282 for (bp = itree_next_bubble(ep, NULL); bp; 2283 bp = itree_next_bubble(ep, bp)) { 2284 if (bp->t != B_FROM) 2285 continue; 2286 if (bp->mark == 0) { 2287 int n = bp->nork; 2288 2289 bp->mark = 1; 2290 credible_events = 0; 2291 waiting_events = 0; 2292 arrow_delay = TIMEVAL_EVENTUALLY; 2293 /* 2294 * n is -1 for 'A' so adjust it. 2295 * XXX just count up the arrows for now. 2296 */ 2297 if (n < 0) { 2298 n = 0; 2299 for (ap = itree_next_arrow(bp, NULL); ap; 2300 ap = itree_next_arrow(bp, ap)) 2301 n++; 2302 indent(); 2303 out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n); 2304 } else { 2305 indent(); 2306 out(O_ALTFP|O_VERB, " Bubble N=%d", n); 2307 } 2308 2309 for (ap = itree_next_arrow(bp, NULL); ap; 2310 ap = itree_next_arrow(bp, ap)) { 2311 ep2 = ap->arrowp->head->myevent; 2312 if (n <= credible_events) 2313 break; 2314 2315 if (triggered(fmep, ep2, 1)) 2316 /* XXX adding max timevals! */ 2317 switch (requirements_test(fmep, ep2, 2318 at_latest_by + ap->arrowp->maxdelay, 2319 &my_delay, ap->arrowp)) { 2320 case FME_CREDIBLE: 2321 credible_events++; 2322 break; 2323 case FME_DISPROVED: 2324 break; 2325 case FME_WAIT: 2326 if (my_delay < arrow_delay) 2327 arrow_delay = my_delay; 2328 waiting_events++; 2329 break; 2330 default: 2331 out(O_DIE, 2332 "Bug in requirements_test."); 2333 } 2334 else 2335 credible_events++; 2336 } 2337 indent(); 2338 out(O_ALTFP|O_VERB, " Credible: %d Waiting %d", 2339 credible_events, waiting_events); 2340 if (credible_events + waiting_events < n) { 2341 /* Can never meet requirements */ 2342 indent(); 2343 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 2344 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2345 out(O_ALTFP|O_VERB, NULL); 2346 indent_pop(); 2347 return (FME_DISPROVED); 2348 } 2349 if (credible_events < n) { /* will have to wait */ 2350 /* wait time is shortest known */ 2351 if (arrow_delay < overall_delay) 2352 overall_delay = arrow_delay; 2353 return_value = FME_WAIT; 2354 } 2355 } else { 2356 indent(); 2357 out(O_ALTFP|O_VERB|O_NONL, " Mark was set: "); 2358 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2359 out(O_ALTFP|O_VERB|O_NONL, " to"); 2360 for (ap = itree_next_arrow(bp, NULL); ap; 2361 ap = itree_next_arrow(bp, ap)) { 2362 out(O_ALTFP|O_VERB|O_NONL, " "); 2363 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 2364 ap->arrowp->head->myevent); 2365 } 2366 out(O_ALTFP|O_VERB, NULL); 2367 } 2368 } 2369 2370 /* 2371 * evaluate constraints for ctlist, which is the list of 2372 * constraints for the arrow pointing into this node of the tree 2373 */ 2374 if (return_value == FME_CREDIBLE && arrowp != NULL) { 2375 struct constraintlist *ctp; 2376 struct evalue value; 2377 2378 platform_set_payloadnvp(ep->nvp); 2379 for (ctp = arrowp->constraints; ctp != NULL; 2380 ctp = ctp->next) { 2381 if (eval_expr(ctp->cnode, NULL, NULL, &fmep->globals, 2382 fmep->cfgdata->cooked, arrowp, 0, &value) == 0 || 2383 value.t == UNDEFINED || value.v == 0) { 2384 indent(); 2385 out(O_ALTFP|O_VERB|O_NONL, 2386 " False constraint "); 2387 out(O_ALTFP|O_VERB|O_NONL, " "); 2388 ptree(O_ALTFP|O_VERB|O_NONL, 2389 ctp->cnode, 1, 0); 2390 out(O_ALTFP|O_VERB, NULL); 2391 return_value = FME_DISPROVED; 2392 break; 2393 } 2394 } 2395 platform_set_payloadnvp(NULL); 2396 } 2397 2398 if (return_value == FME_WAIT) 2399 *pdelay = overall_delay; 2400 indent(); 2401 out(O_ALTFP|O_VERB|O_NONL, "<-%s ", fme_state2str(return_value)); 2402 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2403 out(O_ALTFP|O_VERB, NULL); 2404 indent_pop(); 2405 return (return_value); 2406 } 2407 2408 static enum fme_state 2409 causes_test(struct fme *fmep, struct event *ep, 2410 unsigned long long at_latest_by, unsigned long long *pdelay) 2411 { 2412 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 2413 unsigned long long my_delay; 2414 int credible_results = 0; 2415 int waiting_results = 0; 2416 enum fme_state fstate; 2417 struct event *tail_event; 2418 struct bubble *bp; 2419 struct arrowlist *ap; 2420 int k = 1; 2421 2422 stats_counter_bump(fmep->Ccallcount); 2423 indent_push(" C"); 2424 indent(); 2425 out(O_ALTFP|O_VERB|O_NONL, "->"); 2426 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2427 out(O_ALTFP|O_VERB, NULL); 2428 2429 for (bp = itree_next_bubble(ep, NULL); bp; 2430 bp = itree_next_bubble(ep, bp)) { 2431 if (bp->t != B_TO) 2432 continue; 2433 k = bp->nork; /* remember the K value */ 2434 for (ap = itree_next_arrow(bp, NULL); ap; 2435 ap = itree_next_arrow(bp, ap)) { 2436 struct constraintlist *ctp; 2437 struct evalue value; 2438 int do_not_follow = 0; 2439 /* 2440 * see if false constraint prevents us 2441 * from traversing this arrow 2442 */ 2443 platform_set_payloadnvp(ep->nvp); 2444 for (ctp = ap->arrowp->constraints; 2445 ctp != NULL; ctp = ctp->next) { 2446 if (eval_expr(ctp->cnode, NULL, NULL, 2447 &fmep->globals, 2448 fmep->cfgdata->cooked, 2449 ap->arrowp, 0, 2450 &value) == 0 || 2451 value.t == UNDEFINED || 2452 value.v == 0) { 2453 do_not_follow = 1; 2454 break; 2455 } 2456 } 2457 platform_set_payloadnvp(NULL); 2458 if (do_not_follow) { 2459 indent(); 2460 out(O_ALTFP|O_VERB|O_NONL, 2461 " False arrow from "); 2462 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 2463 ap->arrowp->tail->myevent); 2464 out(O_ALTFP|O_VERB|O_NONL, " "); 2465 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 2466 out(O_ALTFP|O_VERB, NULL); 2467 continue; 2468 } 2469 2470 if (ap->arrowp->causes_tested++ > 0) { 2471 /* 2472 * get to this point if this is not the 2473 * first time we're going through this 2474 * arrow in the causes test. consider this 2475 * branch to be credible and let the 2476 * credible/noncredible outcome depend on 2477 * the other branches in this cycle. 2478 */ 2479 fstate = FME_CREDIBLE; 2480 } else { 2481 /* 2482 * get to this point if this is the first 2483 * time we're going through this arrow. 2484 */ 2485 tail_event = ap->arrowp->tail->myevent; 2486 fstate = hypothesise(fmep, tail_event, 2487 at_latest_by, 2488 &my_delay, ap->arrowp); 2489 } 2490 2491 switch (fstate) { 2492 case FME_WAIT: 2493 if (my_delay < overall_delay) 2494 overall_delay = my_delay; 2495 waiting_results++; 2496 break; 2497 case FME_CREDIBLE: 2498 credible_results++; 2499 break; 2500 case FME_DISPROVED: 2501 break; 2502 default: 2503 out(O_DIE, "Bug in causes_test"); 2504 } 2505 2506 ap->arrowp->causes_tested--; 2507 ASSERT(ap->arrowp->causes_tested >= 0); 2508 } 2509 } 2510 /* compare against K */ 2511 if (credible_results + waiting_results < k) { 2512 indent(); 2513 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 2514 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2515 out(O_ALTFP|O_VERB, NULL); 2516 indent_pop(); 2517 return (FME_DISPROVED); 2518 } 2519 if (waiting_results != 0) { 2520 *pdelay = overall_delay; 2521 indent(); 2522 out(O_ALTFP|O_VERB|O_NONL, "<-WAIT "); 2523 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2524 out(O_ALTFP|O_VERB|O_NONL, " to "); 2525 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 2526 out(O_ALTFP|O_VERB, NULL); 2527 indent_pop(); 2528 return (FME_WAIT); 2529 } 2530 indent(); 2531 out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE "); 2532 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2533 out(O_ALTFP|O_VERB, NULL); 2534 indent_pop(); 2535 return (FME_CREDIBLE); 2536 } 2537 2538 static enum fme_state 2539 hypothesise(struct fme *fmep, struct event *ep, 2540 unsigned long long at_latest_by, unsigned long long *pdelay, 2541 struct arrow *arrowp) 2542 { 2543 enum fme_state rtr, otr; 2544 unsigned long long my_delay; 2545 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 2546 2547 stats_counter_bump(fmep->Hcallcount); 2548 indent_push(" H"); 2549 indent(); 2550 out(O_ALTFP|O_VERB|O_NONL, "->"); 2551 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2552 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 2553 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 2554 out(O_ALTFP|O_VERB, NULL); 2555 2556 rtr = requirements_test(fmep, ep, at_latest_by, &my_delay, arrowp); 2557 mark_arrows(fmep, ep, 0); /* clean up after requirements test */ 2558 if ((rtr == FME_WAIT) && (my_delay < overall_delay)) 2559 overall_delay = my_delay; 2560 if (rtr != FME_DISPROVED) { 2561 if (is_problem(ep->t)) { 2562 otr = effects_test(fmep, ep); 2563 if (otr != FME_DISPROVED) { 2564 if (fmep->peek == 0 && ep->is_suspect++ == 0) { 2565 ep->suspects = fmep->suspects; 2566 fmep->suspects = ep; 2567 fmep->nsuspects++; 2568 if (!is_fault(ep->t)) 2569 fmep->nonfault++; 2570 } 2571 } 2572 } else 2573 otr = causes_test(fmep, ep, at_latest_by, &my_delay); 2574 if ((otr == FME_WAIT) && (my_delay < overall_delay)) 2575 overall_delay = my_delay; 2576 if ((otr != FME_DISPROVED) && 2577 ((rtr == FME_WAIT) || (otr == FME_WAIT))) 2578 *pdelay = overall_delay; 2579 } 2580 if (rtr == FME_DISPROVED) { 2581 indent(); 2582 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 2583 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2584 out(O_ALTFP|O_VERB, " (doesn't meet requirements)"); 2585 indent_pop(); 2586 return (FME_DISPROVED); 2587 } 2588 if ((otr == FME_DISPROVED) && is_problem(ep->t)) { 2589 indent(); 2590 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 2591 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2592 out(O_ALTFP|O_VERB, " (doesn't explain all reports)"); 2593 indent_pop(); 2594 return (FME_DISPROVED); 2595 } 2596 if (otr == FME_DISPROVED) { 2597 indent(); 2598 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 2599 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2600 out(O_ALTFP|O_VERB, " (causes are not credible)"); 2601 indent_pop(); 2602 return (FME_DISPROVED); 2603 } 2604 if ((rtr == FME_WAIT) || (otr == FME_WAIT)) { 2605 indent(); 2606 out(O_ALTFP|O_VERB|O_NONL, "<-WAIT "); 2607 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2608 out(O_ALTFP|O_VERB|O_NONL, " to "); 2609 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay); 2610 out(O_ALTFP|O_VERB, NULL); 2611 indent_pop(); 2612 return (FME_WAIT); 2613 } 2614 indent(); 2615 out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE "); 2616 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 2617 out(O_ALTFP|O_VERB, NULL); 2618 indent_pop(); 2619 return (FME_CREDIBLE); 2620 } 2621