1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * 26 * fme.c -- fault management exercise module 27 * 28 * this module provides the simulated fault management exercise. 29 */ 30 31 #pragma ident "%Z%%M% %I% %E% SMI" 32 33 #include <stdio.h> 34 #include <stdlib.h> 35 #include <string.h> 36 #include <strings.h> 37 #include <ctype.h> 38 #include <alloca.h> 39 #include <libnvpair.h> 40 #include <sys/fm/protocol.h> 41 #include <fm/fmd_api.h> 42 #include "alloc.h" 43 #include "out.h" 44 #include "stats.h" 45 #include "stable.h" 46 #include "literals.h" 47 #include "lut.h" 48 #include "tree.h" 49 #include "ptree.h" 50 #include "itree.h" 51 #include "ipath.h" 52 #include "fme.h" 53 #include "evnv.h" 54 #include "eval.h" 55 #include "config.h" 56 #include "platform.h" 57 58 /* imported from eft.c... 
*/ 59 extern char *Autoclose; 60 extern int Dupclose; 61 extern hrtime_t Hesitate; 62 extern nv_alloc_t Eft_nv_hdl; 63 extern int Max_fme; 64 extern fmd_hdl_t *Hdl; 65 66 static int Istat_need_save; 67 static int Serd_need_save; 68 void istat_save(void); 69 void serd_save(void); 70 71 /* fme under construction is global so we can free it on module abort */ 72 static struct fme *Nfmep; 73 74 static const char *Undiag_reason; 75 76 static int Nextid = 0; 77 78 static int Open_fme_count = 0; /* Count of open FMEs */ 79 80 /* list of fault management exercises underway */ 81 static struct fme { 82 struct fme *next; /* next exercise */ 83 unsigned long long ull; /* time when fme was created */ 84 int id; /* FME id */ 85 struct cfgdata *cfgdata; /* full configuration data */ 86 struct lut *eventtree; /* propagation tree for this FME */ 87 /* 88 * The initial error report that created this FME is kept in 89 * two forms. e0 points to the instance tree node and is used 90 * by fme_eval() as the starting point for the inference 91 * algorithm. e0r is the event handle FMD passed to us when 92 * the ereport first arrived and is used when setting timers, 93 * which are always relative to the time of this initial 94 * report. 
95 */ 96 struct event *e0; 97 fmd_event_t *e0r; 98 99 id_t timer; /* for setting an fmd time-out */ 100 101 struct event *ecurrent; /* ereport under consideration */ 102 struct event *suspects; /* current suspect list */ 103 struct event *psuspects; /* previous suspect list */ 104 int nsuspects; /* count of suspects */ 105 int nonfault; /* zero if all suspects T_FAULT */ 106 int posted_suspects; /* true if we've posted a diagnosis */ 107 int uniqobs; /* number of unique events observed */ 108 int peek; /* just peeking, don't track suspects */ 109 int overflow; /* true if overflow FME */ 110 enum fme_state { 111 FME_NOTHING = 5000, /* not evaluated yet */ 112 FME_WAIT, /* need to wait for more info */ 113 FME_CREDIBLE, /* suspect list is credible */ 114 FME_DISPROVED, /* no valid suspects found */ 115 FME_DEFERRED /* don't know yet (k-count not met) */ 116 } state; 117 118 unsigned long long pull; /* time passed since created */ 119 unsigned long long wull; /* wait until this time for re-eval */ 120 struct event *observations; /* observation list */ 121 struct lut *globals; /* values of global variables */ 122 /* fmd interfacing */ 123 fmd_hdl_t *hdl; /* handle for talking with fmd */ 124 fmd_case_t *fmcase; /* what fmd 'case' we associate with */ 125 /* stats */ 126 struct stats *Rcount; 127 struct stats *Hcallcount; 128 struct stats *Rcallcount; 129 struct stats *Ccallcount; 130 struct stats *Ecallcount; 131 struct stats *Tcallcount; 132 struct stats *Marrowcount; 133 struct stats *diags; 134 } *FMElist, *EFMElist, *ClosedFMEs; 135 136 static struct case_list { 137 fmd_case_t *fmcase; 138 struct case_list *next; 139 } *Undiagablecaselist; 140 141 static void fme_eval(struct fme *fmep, fmd_event_t *ffep); 142 static enum fme_state hypothesise(struct fme *fmep, struct event *ep, 143 unsigned long long at_latest_by, unsigned long long *pdelay); 144 static struct node *eventprop_lookup(struct event *ep, const char *propname); 145 static struct node 
*pathstring2epnamenp(char *path); 146 static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, 147 fmd_case_t *fmcase); 148 static void restore_suspects(struct fme *fmep); 149 static void save_suspects(struct fme *fmep); 150 static void destroy_fme(struct fme *f); 151 static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 152 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl); 153 static void istat_counter_reset_cb(struct istat_entry *entp, 154 struct stats *statp, const struct ipath *ipp); 155 static void serd_reset_cb(struct serd_entry *entp, void *unused, 156 const struct ipath *ipp); 157 static void destroy_fme_bufs(struct fme *fp); 158 159 static struct fme * 160 alloc_fme(void) 161 { 162 struct fme *fmep; 163 164 fmep = MALLOC(sizeof (*fmep)); 165 bzero(fmep, sizeof (*fmep)); 166 return (fmep); 167 } 168 169 /* 170 * fme_ready -- called when all initialization of the FME (except for 171 * stats) has completed successfully. Adds the fme to global lists 172 * and establishes its stats. 
173 */ 174 static struct fme * 175 fme_ready(struct fme *fmep) 176 { 177 char nbuf[100]; 178 179 Nfmep = NULL; /* don't need to free this on module abort now */ 180 181 if (EFMElist) { 182 EFMElist->next = fmep; 183 EFMElist = fmep; 184 } else 185 FMElist = EFMElist = fmep; 186 187 (void) sprintf(nbuf, "fme%d.Rcount", fmep->id); 188 fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0); 189 (void) sprintf(nbuf, "fme%d.Hcall", fmep->id); 190 fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1); 191 (void) sprintf(nbuf, "fme%d.Rcall", fmep->id); 192 fmep->Rcallcount = stats_new_counter(nbuf, 193 "calls to requirements_test()", 1); 194 (void) sprintf(nbuf, "fme%d.Ccall", fmep->id); 195 fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1); 196 (void) sprintf(nbuf, "fme%d.Ecall", fmep->id); 197 fmep->Ecallcount = 198 stats_new_counter(nbuf, "calls to effects_test()", 1); 199 (void) sprintf(nbuf, "fme%d.Tcall", fmep->id); 200 fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1); 201 (void) sprintf(nbuf, "fme%d.Marrow", fmep->id); 202 fmep->Marrowcount = stats_new_counter(nbuf, 203 "arrows marked by mark_arrows()", 1); 204 (void) sprintf(nbuf, "fme%d.diags", fmep->id); 205 fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0); 206 207 out(O_ALTFP|O_VERB2, "newfme: config snapshot contains..."); 208 config_print(O_ALTFP|O_VERB2, fmep->cfgdata->cooked); 209 210 return (fmep); 211 } 212 213 extern void ipath_dummy_lut(struct arrow *); 214 extern struct lut *itree_create_dummy(const char *, const struct ipath *); 215 216 /* ARGSUSED */ 217 static void 218 set_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 219 { 220 struct bubble *bp; 221 struct arrowlist *ap; 222 223 for (bp = itree_next_bubble(ep, NULL); bp; 224 bp = itree_next_bubble(ep, bp)) { 225 if (bp->t != B_FROM) 226 continue; 227 for (ap = itree_next_arrow(bp, NULL); ap; 228 ap = itree_next_arrow(bp, ap)) { 229 
ap->arrowp->pnode->u.arrow.needed = 1; 230 ipath_dummy_lut(ap->arrowp); 231 } 232 } 233 } 234 235 /* ARGSUSED */ 236 static void 237 unset_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 238 { 239 struct bubble *bp; 240 struct arrowlist *ap; 241 242 for (bp = itree_next_bubble(ep, NULL); bp; 243 bp = itree_next_bubble(ep, bp)) { 244 if (bp->t != B_FROM) 245 continue; 246 for (ap = itree_next_arrow(bp, NULL); ap; 247 ap = itree_next_arrow(bp, ap)) 248 ap->arrowp->pnode->u.arrow.needed = 0; 249 } 250 } 251 252 static void globals_destructor(void *left, void *right, void *arg); 253 static void clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep); 254 255 static void 256 prune_propagations(const char *e0class, const struct ipath *e0ipp) 257 { 258 char nbuf[100]; 259 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 260 extern struct lut *Usednames; 261 262 Nfmep = alloc_fme(); 263 Nfmep->id = Nextid; 264 Nfmep->state = FME_NOTHING; 265 Nfmep->eventtree = itree_create_dummy(e0class, e0ipp); 266 if ((Nfmep->e0 = 267 itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) { 268 out(O_ALTFP, "prune_propagations: e0 not in instance tree"); 269 itree_free(Nfmep->eventtree); 270 FREE(Nfmep); 271 Nfmep = NULL; 272 return; 273 } 274 Nfmep->ecurrent = Nfmep->observations = Nfmep->e0; 275 Nfmep->e0->count++; 276 277 (void) sprintf(nbuf, "fme%d.Rcount", Nfmep->id); 278 Nfmep->Rcount = stats_new_counter(nbuf, "ereports received", 0); 279 (void) sprintf(nbuf, "fme%d.Hcall", Nfmep->id); 280 Nfmep->Hcallcount = 281 stats_new_counter(nbuf, "calls to hypothesise()", 1); 282 (void) sprintf(nbuf, "fme%d.Rcall", Nfmep->id); 283 Nfmep->Rcallcount = stats_new_counter(nbuf, 284 "calls to requirements_test()", 1); 285 (void) sprintf(nbuf, "fme%d.Ccall", Nfmep->id); 286 Nfmep->Ccallcount = 287 stats_new_counter(nbuf, "calls to causes_test()", 1); 288 (void) sprintf(nbuf, "fme%d.Ecall", Nfmep->id); 289 Nfmep->Ecallcount = 290 stats_new_counter(nbuf, "calls to 
effects_test()", 1); 291 (void) sprintf(nbuf, "fme%d.Tcall", Nfmep->id); 292 Nfmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1); 293 (void) sprintf(nbuf, "fme%d.Marrow", Nfmep->id); 294 Nfmep->Marrowcount = stats_new_counter(nbuf, 295 "arrows marked by mark_arrows()", 1); 296 (void) sprintf(nbuf, "fme%d.diags", Nfmep->id); 297 Nfmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0); 298 299 Nfmep->peek = 1; 300 lut_walk(Nfmep->eventtree, (lut_cb)unset_needed_arrows, (void *)Nfmep); 301 lut_free(Usednames, NULL, NULL); 302 Usednames = NULL; 303 lut_walk(Nfmep->eventtree, (lut_cb)clear_arrows, (void *)Nfmep); 304 (void) hypothesise(Nfmep, Nfmep->e0, Nfmep->ull, &my_delay); 305 itree_prune(Nfmep->eventtree); 306 lut_walk(Nfmep->eventtree, (lut_cb)set_needed_arrows, (void *)Nfmep); 307 308 stats_delete(Nfmep->Rcount); 309 stats_delete(Nfmep->Hcallcount); 310 stats_delete(Nfmep->Rcallcount); 311 stats_delete(Nfmep->Ccallcount); 312 stats_delete(Nfmep->Ecallcount); 313 stats_delete(Nfmep->Tcallcount); 314 stats_delete(Nfmep->Marrowcount); 315 stats_delete(Nfmep->diags); 316 itree_free(Nfmep->eventtree); 317 lut_free(Nfmep->globals, globals_destructor, NULL); 318 FREE(Nfmep); 319 } 320 321 static struct fme * 322 newfme(const char *e0class, const struct ipath *e0ipp, fmd_hdl_t *hdl, 323 fmd_case_t *fmcase) 324 { 325 struct cfgdata *cfgdata; 326 int init_size; 327 extern int alloc_total(); 328 329 init_size = alloc_total(); 330 out(O_ALTFP|O_STAMP, "start config_snapshot using %d bytes", init_size); 331 if ((cfgdata = config_snapshot()) == NULL) { 332 out(O_ALTFP, "newfme: NULL configuration"); 333 Undiag_reason = UD_NOCONF; 334 return (NULL); 335 } 336 platform_save_config(hdl, fmcase); 337 out(O_ALTFP|O_STAMP, "config_snapshot added %d bytes", 338 alloc_total() - init_size); 339 340 Nfmep = alloc_fme(); 341 342 Nfmep->id = Nextid++; 343 Nfmep->cfgdata = cfgdata; 344 Nfmep->posted_suspects = 0; 345 Nfmep->uniqobs = 0; 346 Nfmep->state = 
FME_NOTHING; 347 Nfmep->pull = 0ULL; 348 Nfmep->overflow = 0; 349 350 Nfmep->fmcase = fmcase; 351 Nfmep->hdl = hdl; 352 353 if ((Nfmep->eventtree = itree_create(cfgdata->cooked)) == NULL) { 354 out(O_ALTFP, "newfme: NULL instance tree"); 355 Undiag_reason = UD_INSTFAIL; 356 config_free(cfgdata); 357 destroy_fme_bufs(Nfmep); 358 FREE(Nfmep); 359 Nfmep = NULL; 360 return (NULL); 361 } 362 363 itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree); 364 365 if ((Nfmep->e0 = 366 itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) { 367 out(O_ALTFP, "newfme: e0 not in instance tree"); 368 Undiag_reason = UD_BADEVENTI; 369 itree_free(Nfmep->eventtree); 370 config_free(cfgdata); 371 destroy_fme_bufs(Nfmep); 372 FREE(Nfmep); 373 Nfmep = NULL; 374 return (NULL); 375 } 376 377 return (fme_ready(Nfmep)); 378 } 379 380 void 381 fme_fini(void) 382 { 383 struct fme *sfp, *fp; 384 struct case_list *ucasep, *nextcasep; 385 386 ucasep = Undiagablecaselist; 387 while (ucasep != NULL) { 388 nextcasep = ucasep->next; 389 FREE(ucasep); 390 ucasep = nextcasep; 391 } 392 Undiagablecaselist = NULL; 393 394 /* clean up closed fmes */ 395 fp = ClosedFMEs; 396 while (fp != NULL) { 397 sfp = fp->next; 398 destroy_fme(fp); 399 fp = sfp; 400 } 401 ClosedFMEs = NULL; 402 403 fp = FMElist; 404 while (fp != NULL) { 405 sfp = fp->next; 406 destroy_fme(fp); 407 fp = sfp; 408 } 409 FMElist = EFMElist = NULL; 410 411 /* if we were in the middle of creating an fme, free it now */ 412 if (Nfmep) { 413 destroy_fme(Nfmep); 414 Nfmep = NULL; 415 } 416 } 417 418 /* 419 * Allocated space for a buffer name. 20 bytes allows for 420 * a ridiculous 9,999,999 unique observations. 421 */ 422 #define OBBUFNMSZ 20 423 424 /* 425 * serialize_observation 426 * 427 * Create a recoverable version of the current observation 428 * (f->ecurrent). We keep a serialized version of each unique 429 * observation in order that we may resume correctly the fme in the 430 * correct state if eft or fmd crashes and we're restarted. 
431 */ 432 static void 433 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp) 434 { 435 size_t pkdlen; 436 char tmpbuf[OBBUFNMSZ]; 437 char *pkd = NULL; 438 char *estr; 439 440 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs); 441 estr = ipath2str(cls, ipp); 442 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1); 443 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr, 444 strlen(estr) + 1); 445 FREE(estr); 446 447 if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) { 448 (void) snprintf(tmpbuf, 449 OBBUFNMSZ, "observed%d.nvp", fp->uniqobs); 450 if (nvlist_xpack(fp->ecurrent->nvp, 451 &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0) 452 out(O_DIE|O_SYS, "pack of observed nvl failed"); 453 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen); 454 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen); 455 FREE(pkd); 456 } 457 458 fp->uniqobs++; 459 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 460 sizeof (fp->uniqobs)); 461 } 462 463 /* 464 * init_fme_bufs -- We keep several bits of state about an fme for 465 * use if eft or fmd crashes and we're restarted. 
466 */ 467 static void 468 init_fme_bufs(struct fme *fp) 469 { 470 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull)); 471 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull, 472 sizeof (fp->pull)); 473 474 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id)); 475 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id, 476 sizeof (fp->id)); 477 478 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs)); 479 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 480 sizeof (fp->uniqobs)); 481 482 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD, 483 sizeof (fp->posted_suspects)); 484 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD, 485 (void *)&fp->posted_suspects, sizeof (fp->posted_suspects)); 486 } 487 488 static void 489 destroy_fme_bufs(struct fme *fp) 490 { 491 char tmpbuf[OBBUFNMSZ]; 492 int o; 493 494 platform_restore_config(fp->hdl, fp->fmcase); 495 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN); 496 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG); 497 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL); 498 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID); 499 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD); 500 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS); 501 502 for (o = 0; o < fp->uniqobs; o++) { 503 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o); 504 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 505 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o); 506 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 507 } 508 } 509 510 /* 511 * reconstitute_observations -- convert a case's serialized observations 512 * back into struct events. Returns zero if all observations are 513 * successfully reconstituted. 
514 */ 515 static int 516 reconstitute_observations(struct fme *fmep) 517 { 518 struct event *ep; 519 struct node *epnamenp = NULL; 520 size_t pkdlen; 521 char *pkd = NULL; 522 char *tmpbuf = alloca(OBBUFNMSZ); 523 char *sepptr; 524 char *estr; 525 int ocnt; 526 int elen; 527 528 for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) { 529 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt); 530 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 531 if (elen == 0) { 532 out(O_ALTFP, 533 "reconstitute_observation: no %s buffer found.", 534 tmpbuf); 535 Undiag_reason = UD_MISSINGOBS; 536 break; 537 } 538 539 estr = MALLOC(elen); 540 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 541 sepptr = strchr(estr, '@'); 542 if (sepptr == NULL) { 543 out(O_ALTFP, 544 "reconstitute_observation: %s: " 545 "missing @ separator in %s.", 546 tmpbuf, estr); 547 Undiag_reason = UD_MISSINGPATH; 548 FREE(estr); 549 break; 550 } 551 552 *sepptr = '\0'; 553 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 554 out(O_ALTFP, 555 "reconstitute_observation: %s: " 556 "trouble converting path string \"%s\" " 557 "to internal representation.", 558 tmpbuf, sepptr + 1); 559 Undiag_reason = UD_MISSINGPATH; 560 FREE(estr); 561 break; 562 } 563 564 /* construct the event */ 565 ep = itree_lookup(fmep->eventtree, 566 stable(estr), ipath(epnamenp)); 567 if (ep == NULL) { 568 out(O_ALTFP, 569 "reconstitute_observation: %s: " 570 "lookup of \"%s\" in itree failed.", 571 tmpbuf, ipath2str(estr, ipath(epnamenp))); 572 Undiag_reason = UD_BADOBS; 573 tree_free(epnamenp); 574 FREE(estr); 575 break; 576 } 577 tree_free(epnamenp); 578 579 /* 580 * We may or may not have a saved nvlist for the observation 581 */ 582 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt); 583 pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 584 if (pkdlen != 0) { 585 pkd = MALLOC(pkdlen); 586 fmd_buf_read(fmep->hdl, 587 fmep->fmcase, tmpbuf, pkd, pkdlen); 588 ASSERT(ep->nvp == NULL); 589 if 
(nvlist_xunpack(pkd, 590 pkdlen, &ep->nvp, &Eft_nv_hdl) != 0) 591 out(O_DIE|O_SYS, "pack of observed nvl failed"); 592 FREE(pkd); 593 } 594 595 if (ocnt == 0) 596 fmep->e0 = ep; 597 598 FREE(estr); 599 fmep->ecurrent = ep; 600 ep->count++; 601 602 /* link it into list of observations seen */ 603 ep->observations = fmep->observations; 604 fmep->observations = ep; 605 } 606 607 if (ocnt == fmep->uniqobs) { 608 (void) fme_ready(fmep); 609 return (0); 610 } 611 612 return (1); 613 } 614 615 /* 616 * restart_fme -- called during eft initialization. Reconstitutes 617 * an in-progress fme. 618 */ 619 void 620 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress) 621 { 622 nvlist_t *defect; 623 struct case_list *bad; 624 struct fme *fmep; 625 struct cfgdata *cfgdata = NULL; 626 size_t rawsz; 627 struct event *ep; 628 char *tmpbuf = alloca(OBBUFNMSZ); 629 char *sepptr; 630 char *estr; 631 int elen; 632 struct node *epnamenp = NULL; 633 int init_size; 634 extern int alloc_total(); 635 636 /* 637 * ignore solved or closed cases 638 */ 639 if (fmd_case_solved(hdl, inprogress) || 640 fmd_case_closed(hdl, inprogress)) 641 return; 642 643 fmep = alloc_fme(); 644 fmep->fmcase = inprogress; 645 fmep->hdl = hdl; 646 647 if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) { 648 out(O_ALTFP, "restart_fme: no saved posted status"); 649 Undiag_reason = UD_MISSINGINFO; 650 goto badcase; 651 } else { 652 fmd_buf_read(hdl, inprogress, WOBUF_POSTD, 653 (void *)&fmep->posted_suspects, 654 sizeof (fmep->posted_suspects)); 655 } 656 657 if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) { 658 out(O_ALTFP, "restart_fme: no saved id"); 659 Undiag_reason = UD_MISSINGINFO; 660 goto badcase; 661 } else { 662 fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id, 663 sizeof (fmep->id)); 664 } 665 if (Nextid <= fmep->id) 666 Nextid = fmep->id + 1; 667 668 out(O_ALTFP, "Replay FME %d", fmep->id); 669 670 if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) { 671 out(O_ALTFP, 
"restart_fme: No config data"); 672 Undiag_reason = UD_MISSINGINFO; 673 goto badcase; 674 } 675 fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz, 676 sizeof (size_t)); 677 678 if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) { 679 out(O_ALTFP, "restart_fme: No event zero"); 680 Undiag_reason = UD_MISSINGZERO; 681 goto badcase; 682 } 683 684 if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) { 685 out(O_ALTFP, "restart_fme: no saved wait time"); 686 Undiag_reason = UD_MISSINGINFO; 687 goto badcase; 688 } else { 689 fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull, 690 sizeof (fmep->pull)); 691 } 692 693 if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) { 694 out(O_ALTFP, "restart_fme: no count of observations"); 695 Undiag_reason = UD_MISSINGINFO; 696 goto badcase; 697 } else { 698 fmd_buf_read(hdl, inprogress, WOBUF_NOBS, 699 (void *)&fmep->uniqobs, sizeof (fmep->uniqobs)); 700 } 701 702 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed0"); 703 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 704 if (elen == 0) { 705 out(O_ALTFP, "reconstitute_observation: no %s buffer found.", 706 tmpbuf); 707 Undiag_reason = UD_MISSINGOBS; 708 goto badcase; 709 } 710 estr = MALLOC(elen); 711 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 712 sepptr = strchr(estr, '@'); 713 if (sepptr == NULL) { 714 out(O_ALTFP, "reconstitute_observation: %s: " 715 "missing @ separator in %s.", 716 tmpbuf, estr); 717 Undiag_reason = UD_MISSINGPATH; 718 FREE(estr); 719 goto badcase; 720 } 721 *sepptr = '\0'; 722 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 723 out(O_ALTFP, "reconstitute_observation: %s: " 724 "trouble converting path string \"%s\" " 725 "to internal representation.", tmpbuf, sepptr + 1); 726 Undiag_reason = UD_MISSINGPATH; 727 FREE(estr); 728 goto badcase; 729 } 730 prune_propagations(stable(estr), ipath(epnamenp)); 731 tree_free(epnamenp); 732 FREE(estr); 733 734 init_size = alloc_total(); 735 
out(O_ALTFP|O_STAMP, "start config_restore using %d bytes", init_size); 736 cfgdata = MALLOC(sizeof (struct cfgdata)); 737 cfgdata->cooked = NULL; 738 cfgdata->devcache = NULL; 739 cfgdata->cpucache = NULL; 740 cfgdata->cooked_refcnt = 0; 741 cfgdata->raw_refcnt = 1; 742 743 if (rawsz > 0) { 744 if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) { 745 out(O_ALTFP, "restart_fme: Config data size mismatch"); 746 Undiag_reason = UD_CFGMISMATCH; 747 goto badcase; 748 } 749 cfgdata->begin = MALLOC(rawsz); 750 cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz; 751 fmd_buf_read(hdl, 752 inprogress, WOBUF_CFG, cfgdata->begin, rawsz); 753 } else { 754 cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL; 755 } 756 fmep->cfgdata = cfgdata; 757 758 config_cook(cfgdata); 759 if (cfgdata->begin) 760 FREE(cfgdata->begin); 761 cfgdata->begin = NULL; 762 cfgdata->end = NULL; 763 cfgdata->nextfree = NULL; 764 out(O_ALTFP|O_STAMP, "config_restore added %d bytes", 765 alloc_total() - init_size); 766 767 if ((fmep->eventtree = itree_create(cfgdata->cooked)) == NULL) { 768 /* case not properly saved or irretrievable */ 769 out(O_ALTFP, "restart_fme: NULL instance tree"); 770 Undiag_reason = UD_INSTFAIL; 771 goto badcase; 772 } 773 774 itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree); 775 776 if (reconstitute_observations(fmep) != 0) 777 goto badcase; 778 779 out(O_ALTFP|O_NONL, "FME %d replay observations: ", fmep->id); 780 for (ep = fmep->observations; ep; ep = ep->observations) { 781 out(O_ALTFP|O_NONL, " "); 782 itree_pevent_brief(O_ALTFP|O_NONL, ep); 783 } 784 out(O_ALTFP, NULL); 785 786 Open_fme_count++; 787 788 /* give the diagnosis algorithm a shot at the new FME state */ 789 fme_eval(fmep, fmep->e0r); 790 return; 791 792 badcase: 793 if (fmep->eventtree != NULL) 794 itree_free(fmep->eventtree); 795 config_free(cfgdata); 796 destroy_fme_bufs(fmep); 797 FREE(fmep); 798 799 /* 800 * Since we're unable to restart the case, add it to the undiagable 801 * list and solve 
and close it as appropriate. 802 */ 803 bad = MALLOC(sizeof (struct case_list)); 804 bad->next = NULL; 805 806 if (Undiagablecaselist != NULL) 807 bad->next = Undiagablecaselist; 808 Undiagablecaselist = bad; 809 bad->fmcase = inprogress; 810 811 out(O_ALTFP|O_NONL, "[case %s (unable to restart), ", 812 fmd_case_uuid(hdl, bad->fmcase)); 813 814 if (fmd_case_solved(hdl, bad->fmcase)) { 815 out(O_ALTFP|O_NONL, "already solved, "); 816 } else { 817 out(O_ALTFP|O_NONL, "solving, "); 818 defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100, 819 NULL, NULL, NULL); 820 if (Undiag_reason != NULL) 821 (void) nvlist_add_string(defect, 822 UNDIAG_REASON, Undiag_reason); 823 fmd_case_add_suspect(hdl, bad->fmcase, defect); 824 fmd_case_solve(hdl, bad->fmcase); 825 } 826 827 if (fmd_case_closed(hdl, bad->fmcase)) { 828 out(O_ALTFP, "already closed ]"); 829 } else { 830 out(O_ALTFP, "closing ]"); 831 fmd_case_close(hdl, bad->fmcase); 832 } 833 } 834 835 /*ARGSUSED*/ 836 static void 837 globals_destructor(void *left, void *right, void *arg) 838 { 839 struct evalue *evp = (struct evalue *)right; 840 if (evp->t == NODEPTR) 841 tree_free((struct node *)(uintptr_t)evp->v); 842 evp->v = NULL; 843 FREE(evp); 844 } 845 846 void 847 destroy_fme(struct fme *f) 848 { 849 stats_delete(f->Rcount); 850 stats_delete(f->Hcallcount); 851 stats_delete(f->Rcallcount); 852 stats_delete(f->Ccallcount); 853 stats_delete(f->Ecallcount); 854 stats_delete(f->Tcallcount); 855 stats_delete(f->Marrowcount); 856 stats_delete(f->diags); 857 858 if (f->eventtree != NULL) 859 itree_free(f->eventtree); 860 if (f->cfgdata != NULL) 861 config_free(f->cfgdata); 862 lut_free(f->globals, globals_destructor, NULL); 863 FREE(f); 864 } 865 866 static const char * 867 fme_state2str(enum fme_state s) 868 { 869 switch (s) { 870 case FME_NOTHING: return ("NOTHING"); 871 case FME_WAIT: return ("WAIT"); 872 case FME_CREDIBLE: return ("CREDIBLE"); 873 case FME_DISPROVED: return ("DISPROVED"); 874 case FME_DEFERRED: 
return ("DEFERRED"); 875 default: return ("UNKNOWN"); 876 } 877 } 878 879 static int 880 is_problem(enum nametype t) 881 { 882 return (t == N_FAULT || t == N_DEFECT || t == N_UPSET); 883 } 884 885 static int 886 is_fault(enum nametype t) 887 { 888 return (t == N_FAULT); 889 } 890 891 static int 892 is_defect(enum nametype t) 893 { 894 return (t == N_DEFECT); 895 } 896 897 static int 898 is_upset(enum nametype t) 899 { 900 return (t == N_UPSET); 901 } 902 903 static void 904 fme_print(int flags, struct fme *fmep) 905 { 906 struct event *ep; 907 908 out(flags, "Fault Management Exercise %d", fmep->id); 909 out(flags, "\t State: %s", fme_state2str(fmep->state)); 910 out(flags|O_NONL, "\t Start time: "); 911 ptree_timeval(flags|O_NONL, &fmep->ull); 912 out(flags, NULL); 913 if (fmep->wull) { 914 out(flags|O_NONL, "\t Wait time: "); 915 ptree_timeval(flags|O_NONL, &fmep->wull); 916 out(flags, NULL); 917 } 918 out(flags|O_NONL, "\t E0: "); 919 if (fmep->e0) 920 itree_pevent_brief(flags|O_NONL, fmep->e0); 921 else 922 out(flags|O_NONL, "NULL"); 923 out(flags, NULL); 924 out(flags|O_NONL, "\tObservations:"); 925 for (ep = fmep->observations; ep; ep = ep->observations) { 926 out(flags|O_NONL, " "); 927 itree_pevent_brief(flags|O_NONL, ep); 928 } 929 out(flags, NULL); 930 out(flags|O_NONL, "\tSuspect list:"); 931 for (ep = fmep->suspects; ep; ep = ep->suspects) { 932 out(flags|O_NONL, " "); 933 itree_pevent_brief(flags|O_NONL, ep); 934 } 935 out(flags, NULL); 936 if (fmep->eventtree != NULL) { 937 out(flags|O_VERB2, "\t Tree:"); 938 itree_ptree(flags|O_VERB2, fmep->eventtree); 939 } 940 } 941 942 static struct node * 943 pathstring2epnamenp(char *path) 944 { 945 char *sep = "/"; 946 struct node *ret; 947 char *ptr; 948 949 if ((ptr = strtok(path, sep)) == NULL) 950 out(O_DIE, "pathstring2epnamenp: invalid empty class"); 951 952 ret = tree_iname(stable(ptr), NULL, 0); 953 954 while ((ptr = strtok(NULL, sep)) != NULL) 955 ret = tree_name_append(ret, 956 tree_iname(stable(ptr), 
NULL, 0)); 957 958 return (ret); 959 } 960 961 /* 962 * for a given upset sp, increment the corresponding SERD engine. if the 963 * SERD engine trips, return the ename and ipp of the resulting ereport. 964 * returns true if engine tripped and *enamep and *ippp were filled in. 965 */ 966 static int 967 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep, 968 fmd_case_t *fmcase, struct event *sp, const char **enamep, 969 const struct ipath **ippp) 970 { 971 struct node *serdinst; 972 char *serdname; 973 struct node *nid; 974 struct serd_entry *newentp; 975 976 ASSERT(sp->t == N_UPSET); 977 ASSERT(ffep != NULL); 978 979 /* 980 * obtain instanced SERD engine from the upset sp. from this 981 * derive serdname, the string used to identify the SERD engine. 982 */ 983 serdinst = eventprop_lookup(sp, L_engine); 984 985 if (serdinst == NULL) 986 return (NULL); 987 988 serdname = ipath2str(serdinst->u.stmt.np->u.event.ename->u.name.s, 989 ipath(serdinst->u.stmt.np->u.event.epname)); 990 991 /* handle serd engine "id" property, if there is one */ 992 if ((nid = 993 lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) { 994 struct evalue *gval; 995 char suffixbuf[200]; 996 char *suffix; 997 char *nserdname; 998 size_t nname; 999 1000 out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname); 1001 ptree_name_iter(O_ALTFP|O_NONL, nid); 1002 1003 ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t)); 1004 1005 if ((gval = lut_lookup(fmep->globals, 1006 (void *)nid->u.globid.s, NULL)) == NULL) { 1007 out(O_ALTFP, " undefined"); 1008 } else if (gval->t == UINT64) { 1009 out(O_ALTFP, " %llu", gval->v); 1010 (void) sprintf(suffixbuf, "%llu", gval->v); 1011 suffix = suffixbuf; 1012 } else { 1013 out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v); 1014 suffix = (char *)(uintptr_t)gval->v; 1015 } 1016 1017 nname = strlen(serdname) + strlen(suffix) + 2; 1018 nserdname = MALLOC(nname); 1019 (void) snprintf(nserdname, nname, "%s:%s", serdname, suffix); 1020 
FREE(serdname); 1021 serdname = nserdname; 1022 } 1023 1024 if (!fmd_serd_exists(hdl, serdname)) { 1025 struct node *nN, *nT; 1026 1027 /* no SERD engine yet, so create it */ 1028 nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N, NULL); 1029 nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T, NULL); 1030 1031 ASSERT(nN->t == T_NUM); 1032 ASSERT(nT->t == T_TIMEVAL); 1033 1034 fmd_serd_create(hdl, serdname, (uint_t)nN->u.ull, 1035 (hrtime_t)nT->u.ull); 1036 } 1037 1038 newentp = MALLOC(sizeof (*newentp)); 1039 newentp->ename = serdinst->u.stmt.np->u.event.ename->u.name.s; 1040 newentp->ipath = ipath(serdinst->u.stmt.np->u.event.epname); 1041 newentp->hdl = hdl; 1042 if (lut_lookup(SerdEngines, newentp, (lut_cmp)serd_cmp) == NULL) { 1043 SerdEngines = lut_add(SerdEngines, (void *)newentp, 1044 (void *)NULL, (lut_cmp)serd_cmp); 1045 Serd_need_save = 1; 1046 serd_save(); 1047 } else { 1048 FREE(newentp); 1049 } 1050 1051 1052 /* 1053 * increment SERD engine. if engine fires, reset serd 1054 * engine and return trip_strcode 1055 */ 1056 if (fmd_serd_record(hdl, serdname, ffep)) { 1057 struct node *tripinst = lut_lookup(serdinst->u.stmt.lutp, 1058 (void *)L_trip, NULL); 1059 1060 ASSERT(tripinst != NULL); 1061 1062 *enamep = tripinst->u.event.ename->u.name.s; 1063 *ippp = ipath(tripinst->u.event.epname); 1064 1065 fmd_case_add_serd(hdl, fmcase, serdname); 1066 fmd_serd_reset(hdl, serdname); 1067 out(O_ALTFP|O_NONL, "[engine fired: %s, sending: ", serdname); 1068 ipath_print(O_ALTFP|O_NONL, *enamep, *ippp); 1069 out(O_ALTFP, "]"); 1070 1071 FREE(serdname); 1072 return (1); 1073 } 1074 1075 FREE(serdname); 1076 return (0); 1077 } 1078 1079 /* 1080 * search a suspect list for upsets. feed each upset to serd_eval() and 1081 * build up tripped[], an array of ereports produced by the firing of 1082 * any SERD engines. then feed each ereport back into 1083 * fme_receive_report(). 1084 * 1085 * returns ntrip, the number of these ereports produced. 
1086 */ 1087 static int 1088 upsets_eval(struct fme *fmep, fmd_event_t *ffep) 1089 { 1090 /* we build an array of tripped ereports that we send ourselves */ 1091 struct { 1092 const char *ename; 1093 const struct ipath *ipp; 1094 } *tripped; 1095 struct event *sp; 1096 int ntrip, nupset, i; 1097 1098 /* 1099 * count the number of upsets to determine the upper limit on 1100 * expected trip ereport strings. remember that one upset can 1101 * lead to at most one ereport. 1102 */ 1103 nupset = 0; 1104 for (sp = fmep->suspects; sp; sp = sp->suspects) { 1105 if (sp->t == N_UPSET) 1106 nupset++; 1107 } 1108 1109 if (nupset == 0) 1110 return (0); 1111 1112 /* 1113 * get to this point if we have upsets and expect some trip 1114 * ereports 1115 */ 1116 tripped = alloca(sizeof (*tripped) * nupset); 1117 bzero((void *)tripped, sizeof (*tripped) * nupset); 1118 1119 ntrip = 0; 1120 for (sp = fmep->suspects; sp; sp = sp->suspects) 1121 if (sp->t == N_UPSET && 1122 serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp, 1123 &tripped[ntrip].ename, &tripped[ntrip].ipp)) 1124 ntrip++; 1125 1126 for (i = 0; i < ntrip; i++) 1127 fme_receive_report(fmep->hdl, ffep, 1128 tripped[i].ename, tripped[i].ipp, NULL); 1129 1130 return (ntrip); 1131 } 1132 1133 /* 1134 * fme_receive_external_report -- call when an external ereport comes in 1135 * 1136 * this routine just converts the relevant information from the ereport 1137 * into a format used internally and passes it on to fme_receive_report(). 1138 */ 1139 void 1140 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 1141 const char *eventstring) 1142 { 1143 struct node *epnamenp = platform_getpath(nvl); 1144 const struct ipath *ipp; 1145 1146 /* 1147 * XFILE: If we ended up without a path, it's an X-file. 1148 * For now, use our undiagnosable interface. 
1149 */ 1150 if (epnamenp == NULL) { 1151 fmd_case_t *fmcase; 1152 1153 out(O_ALTFP, "XFILE: Unable to get path from ereport"); 1154 Undiag_reason = UD_NOPATH; 1155 fmcase = fmd_case_open(hdl, NULL); 1156 publish_undiagnosable(hdl, ffep, fmcase); 1157 return; 1158 } 1159 1160 ipp = ipath(epnamenp); 1161 tree_free(epnamenp); 1162 fme_receive_report(hdl, ffep, stable(eventstring), ipp, nvl); 1163 } 1164 1165 /*ARGSUSED*/ 1166 void 1167 fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 1168 const char *eventstring) 1169 { 1170 char *uuid; 1171 nvlist_t **nva; 1172 uint_t nvc; 1173 const struct ipath *ipp; 1174 1175 if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 || 1176 nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 1177 &nva, &nvc) != 0) { 1178 out(O_ALTFP, "No uuid or fault list for list.repaired event"); 1179 return; 1180 } 1181 1182 out(O_ALTFP, "Processing list.repaired from case %s", uuid); 1183 1184 while (nvc-- != 0) { 1185 /* 1186 * Reset any istat or serd engine associated with this path. 
1187 */ 1188 char *path; 1189 1190 if ((ipp = platform_fault2ipath(*nva++)) == NULL) 1191 continue; 1192 1193 path = ipath2str(NULL, ipp); 1194 out(O_ALTFP, "fme_receive_repair_list: resetting state for %s", 1195 path); 1196 FREE(path); 1197 1198 lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp); 1199 istat_save(); 1200 1201 lut_walk(SerdEngines, (lut_cb)serd_reset_cb, (void *)ipp); 1202 serd_save(); 1203 } 1204 } 1205 1206 static int mark_arrows(struct fme *fmep, struct event *ep, int mark, 1207 unsigned long long at_latest_by, unsigned long long *pdelay, int keep); 1208 1209 /* ARGSUSED */ 1210 static void 1211 clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 1212 { 1213 struct bubble *bp; 1214 struct arrowlist *ap; 1215 1216 ep->cached_state = 0; 1217 ep->keep_in_tree = 0; 1218 for (bp = itree_next_bubble(ep, NULL); bp; 1219 bp = itree_next_bubble(ep, bp)) { 1220 if (bp->t != B_FROM) 1221 continue; 1222 bp->mark = 0; 1223 for (ap = itree_next_arrow(bp, NULL); ap; 1224 ap = itree_next_arrow(bp, ap)) 1225 ap->arrowp->mark = 0; 1226 } 1227 } 1228 1229 static void 1230 fme_reload_cfgdata(struct fme *fmep) 1231 { 1232 size_t rawsz; 1233 1234 fmep->cfgdata = MALLOC(sizeof (struct cfgdata)); 1235 fmep->cfgdata->cooked = NULL; 1236 fmep->cfgdata->devcache = NULL; 1237 fmep->cfgdata->cpucache = NULL; 1238 fmep->cfgdata->cooked_refcnt = 0; 1239 fmep->cfgdata->raw_refcnt = 1; 1240 fmd_buf_read(fmep->hdl, fmep->fmcase, WOBUF_CFGLEN, 1241 (void *)&rawsz, sizeof (size_t)); 1242 if (rawsz > 0) { 1243 fmep->cfgdata->begin = MALLOC(rawsz); 1244 fmep->cfgdata->end = fmep->cfgdata->nextfree = 1245 fmep->cfgdata->begin + rawsz; 1246 fmd_buf_read(fmep->hdl, fmep->fmcase, WOBUF_CFG, 1247 fmep->cfgdata->begin, rawsz); 1248 config_cook(fmep->cfgdata); 1249 FREE(fmep->cfgdata->begin); 1250 } 1251 fmep->cfgdata->begin = NULL; 1252 fmep->cfgdata->end = NULL; 1253 fmep->cfgdata->nextfree = NULL; 1254 } 1255 1256 static void 1257 fme_receive_report(fmd_hdl_t 
*hdl, fmd_event_t *ffep, 1258 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl) 1259 { 1260 struct event *ep; 1261 struct fme *fmep = NULL; 1262 struct fme *ofmep = NULL; 1263 struct fme *cfmep, *svfmep; 1264 int matched = 0; 1265 nvlist_t *defect; 1266 fmd_case_t *fmcase; 1267 1268 out(O_ALTFP|O_NONL, "fme_receive_report: "); 1269 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1270 out(O_ALTFP|O_STAMP, NULL); 1271 1272 /* decide which FME it goes to */ 1273 for (fmep = FMElist; fmep; fmep = fmep->next) { 1274 int prev_verbose; 1275 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 1276 enum fme_state state; 1277 nvlist_t *pre_peek_nvp = NULL; 1278 1279 if (fmep->overflow) { 1280 if (!(fmd_case_closed(fmep->hdl, fmep->fmcase))) 1281 ofmep = fmep; 1282 1283 continue; 1284 } 1285 1286 /* 1287 * ignore solved or closed cases 1288 */ 1289 if (fmep->posted_suspects || 1290 fmd_case_solved(fmep->hdl, fmep->fmcase) || 1291 fmd_case_closed(fmep->hdl, fmep->fmcase)) 1292 continue; 1293 1294 /* look up event in event tree for this FME */ 1295 if ((ep = itree_lookup(fmep->eventtree, 1296 eventstring, ipp)) == NULL) 1297 continue; 1298 1299 /* note observation */ 1300 fmep->ecurrent = ep; 1301 if (ep->count++ == 0) { 1302 /* link it into list of observations seen */ 1303 ep->observations = fmep->observations; 1304 fmep->observations = ep; 1305 ep->nvp = evnv_dupnvl(nvl); 1306 } else { 1307 /* use new payload values for peek */ 1308 pre_peek_nvp = ep->nvp; 1309 ep->nvp = evnv_dupnvl(nvl); 1310 } 1311 1312 /* tell hypothesise() not to mess with suspect list */ 1313 fmep->peek = 1; 1314 1315 /* don't want this to be verbose (unless Debug is set) */ 1316 prev_verbose = Verbose; 1317 if (Debug == 0) 1318 Verbose = 0; 1319 1320 fme_reload_cfgdata(fmep); 1321 1322 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 1323 state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 1324 1325 fmep->peek = 0; 1326 1327 /* put verbose flag back */ 1328 Verbose = 
prev_verbose; 1329 1330 if (state != FME_DISPROVED) { 1331 /* found an FME that explains the ereport */ 1332 matched++; 1333 out(O_ALTFP|O_NONL, "["); 1334 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1335 out(O_ALTFP, " explained by FME%d]", fmep->id); 1336 1337 if (pre_peek_nvp) 1338 nvlist_free(pre_peek_nvp); 1339 1340 if (ep->count == 1) 1341 serialize_observation(fmep, eventstring, ipp); 1342 1343 if (ffep) 1344 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1345 1346 stats_counter_bump(fmep->Rcount); 1347 1348 /* re-eval FME */ 1349 fme_eval(fmep, ffep); 1350 } else { 1351 1352 /* not a match, undo noting of observation */ 1353 config_free(fmep->cfgdata); 1354 fmep->cfgdata = NULL; 1355 fmep->ecurrent = NULL; 1356 if (--ep->count == 0) { 1357 /* unlink it from observations */ 1358 fmep->observations = ep->observations; 1359 ep->observations = NULL; 1360 nvlist_free(ep->nvp); 1361 ep->nvp = NULL; 1362 } else { 1363 nvlist_free(ep->nvp); 1364 ep->nvp = pre_peek_nvp; 1365 } 1366 } 1367 } 1368 1369 if (matched) 1370 return; /* explained by at least one existing FME */ 1371 1372 /* clean up closed fmes */ 1373 cfmep = ClosedFMEs; 1374 while (cfmep != NULL) { 1375 svfmep = cfmep->next; 1376 destroy_fme(cfmep); 1377 cfmep = svfmep; 1378 } 1379 ClosedFMEs = NULL; 1380 prune_propagations(eventstring, ipp); 1381 1382 if (ofmep) { 1383 out(O_ALTFP|O_NONL, "["); 1384 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1385 out(O_ALTFP, " ADDING TO OVERFLOW FME]"); 1386 if (ffep) 1387 fmd_case_add_ereport(hdl, ofmep->fmcase, ffep); 1388 1389 return; 1390 1391 } else if (Max_fme && (Open_fme_count >= Max_fme)) { 1392 out(O_ALTFP|O_NONL, "["); 1393 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1394 out(O_ALTFP, " MAX OPEN FME REACHED]"); 1395 1396 fmcase = fmd_case_open(hdl, NULL); 1397 1398 /* Create overflow fme */ 1399 if ((fmep = newfme(eventstring, ipp, hdl, fmcase)) == NULL) { 1400 out(O_ALTFP|O_NONL, "["); 1401 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1402 
out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]"); 1403 publish_undiagnosable(hdl, ffep, fmcase); 1404 return; 1405 } 1406 1407 Open_fme_count++; 1408 1409 init_fme_bufs(fmep); 1410 fmep->overflow = B_TRUE; 1411 1412 if (ffep) 1413 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1414 1415 defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100, 1416 NULL, NULL, NULL); 1417 (void) nvlist_add_string(defect, UNDIAG_REASON, UD_MAXFME); 1418 fmd_case_add_suspect(hdl, fmep->fmcase, defect); 1419 fmd_case_solve(hdl, fmep->fmcase); 1420 return; 1421 } 1422 1423 /* open a case */ 1424 fmcase = fmd_case_open(hdl, NULL); 1425 1426 /* start a new FME */ 1427 if ((fmep = newfme(eventstring, ipp, hdl, fmcase)) == NULL) { 1428 out(O_ALTFP|O_NONL, "["); 1429 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1430 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1431 publish_undiagnosable(hdl, ffep, fmcase); 1432 return; 1433 } 1434 1435 Open_fme_count++; 1436 1437 init_fme_bufs(fmep); 1438 1439 out(O_ALTFP|O_NONL, "["); 1440 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1441 out(O_ALTFP, " created FME%d, case %s]", fmep->id, 1442 fmd_case_uuid(hdl, fmep->fmcase)); 1443 1444 ep = fmep->e0; 1445 ASSERT(ep != NULL); 1446 1447 /* note observation */ 1448 fmep->ecurrent = ep; 1449 if (ep->count++ == 0) { 1450 /* link it into list of observations seen */ 1451 ep->observations = fmep->observations; 1452 fmep->observations = ep; 1453 ep->nvp = evnv_dupnvl(nvl); 1454 serialize_observation(fmep, eventstring, ipp); 1455 } else { 1456 /* new payload overrides any previous */ 1457 nvlist_free(ep->nvp); 1458 ep->nvp = evnv_dupnvl(nvl); 1459 } 1460 1461 stats_counter_bump(fmep->Rcount); 1462 1463 if (ffep) { 1464 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1465 fmd_case_setprincipal(hdl, fmep->fmcase, ffep); 1466 fmep->e0r = ffep; 1467 } 1468 1469 /* give the diagnosis algorithm a shot at the new FME state */ 1470 fme_eval(fmep, ffep); 1471 } 1472 1473 void 1474 fme_status(int flags) 1475 { 1476 struct fme 
*fmep; 1477 1478 if (FMElist == NULL) { 1479 out(flags, "No fault management exercises underway."); 1480 return; 1481 } 1482 1483 for (fmep = FMElist; fmep; fmep = fmep->next) 1484 fme_print(flags, fmep); 1485 } 1486 1487 /* 1488 * "indent" routines used mostly for nicely formatted debug output, but also 1489 * for sanity checking for infinite recursion bugs. 1490 */ 1491 1492 #define MAX_INDENT 1024 1493 static const char *indent_s[MAX_INDENT]; 1494 static int current_indent; 1495 1496 static void 1497 indent_push(const char *s) 1498 { 1499 if (current_indent < MAX_INDENT) 1500 indent_s[current_indent++] = s; 1501 else 1502 out(O_DIE, "unexpected recursion depth (%d)", current_indent); 1503 } 1504 1505 static void 1506 indent_set(const char *s) 1507 { 1508 current_indent = 0; 1509 indent_push(s); 1510 } 1511 1512 static void 1513 indent_pop(void) 1514 { 1515 if (current_indent > 0) 1516 current_indent--; 1517 else 1518 out(O_DIE, "recursion underflow"); 1519 } 1520 1521 static void 1522 indent(void) 1523 { 1524 int i; 1525 if (!Verbose) 1526 return; 1527 for (i = 0; i < current_indent; i++) 1528 out(O_ALTFP|O_VERB|O_NONL, indent_s[i]); 1529 } 1530 1531 #define SLNEW 1 1532 #define SLCHANGED 2 1533 #define SLWAIT 3 1534 #define SLDISPROVED 4 1535 1536 static void 1537 print_suspects(int circumstance, struct fme *fmep) 1538 { 1539 struct event *ep; 1540 1541 out(O_ALTFP|O_NONL, "["); 1542 if (circumstance == SLCHANGED) { 1543 out(O_ALTFP|O_NONL, "FME%d diagnosis changed. 
state: %s, " 1544 "suspect list:", fmep->id, fme_state2str(fmep->state)); 1545 } else if (circumstance == SLWAIT) { 1546 out(O_ALTFP|O_NONL, "FME%d set wait timer %ld ", fmep->id, 1547 fmep->timer); 1548 ptree_timeval(O_ALTFP|O_NONL, &fmep->wull); 1549 } else if (circumstance == SLDISPROVED) { 1550 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id); 1551 } else { 1552 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id); 1553 } 1554 1555 if (circumstance == SLWAIT || circumstance == SLDISPROVED) { 1556 out(O_ALTFP, "]"); 1557 return; 1558 } 1559 1560 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1561 out(O_ALTFP|O_NONL, " "); 1562 itree_pevent_brief(O_ALTFP|O_NONL, ep); 1563 } 1564 out(O_ALTFP, "]"); 1565 } 1566 1567 static struct node * 1568 eventprop_lookup(struct event *ep, const char *propname) 1569 { 1570 return (lut_lookup(ep->props, (void *)propname, NULL)); 1571 } 1572 1573 #define MAXDIGITIDX 23 1574 static char numbuf[MAXDIGITIDX + 1]; 1575 1576 static int 1577 node2uint(struct node *n, uint_t *valp) 1578 { 1579 struct evalue value; 1580 struct lut *globals = NULL; 1581 1582 if (n == NULL) 1583 return (1); 1584 1585 /* 1586 * check value.v since we are being asked to convert an unsigned 1587 * long long int to an unsigned int 1588 */ 1589 if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) || 1590 value.t != UINT64 || value.v > (1ULL << 32)) 1591 return (1); 1592 1593 *valp = (uint_t)value.v; 1594 1595 return (0); 1596 } 1597 1598 static nvlist_t * 1599 node2fmri(struct node *n) 1600 { 1601 nvlist_t **pa, *f, *p; 1602 struct node *nc; 1603 uint_t depth = 0; 1604 char *numstr, *nullbyte; 1605 char *failure; 1606 int err, i; 1607 1608 /* XXX do we need to be able to handle a non-T_NAME node? 
*/ 1609 if (n == NULL || n->t != T_NAME) 1610 return (NULL); 1611 1612 for (nc = n; nc != NULL; nc = nc->u.name.next) { 1613 if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM) 1614 break; 1615 depth++; 1616 } 1617 1618 if (nc != NULL) { 1619 /* We bailed early, something went wrong */ 1620 return (NULL); 1621 } 1622 1623 if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0) 1624 out(O_DIE|O_SYS, "alloc of fmri nvl failed"); 1625 pa = alloca(depth * sizeof (nvlist_t *)); 1626 for (i = 0; i < depth; i++) 1627 pa[i] = NULL; 1628 1629 err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC); 1630 err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION); 1631 err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, ""); 1632 err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth); 1633 if (err != 0) { 1634 failure = "basic construction of FMRI failed"; 1635 goto boom; 1636 } 1637 1638 numbuf[MAXDIGITIDX] = '\0'; 1639 nullbyte = &numbuf[MAXDIGITIDX]; 1640 i = 0; 1641 1642 for (nc = n; nc != NULL; nc = nc->u.name.next) { 1643 err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl); 1644 if (err != 0) { 1645 failure = "alloc of an hc-pair failed"; 1646 goto boom; 1647 } 1648 err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s); 1649 numstr = ulltostr(nc->u.name.child->u.ull, nullbyte); 1650 err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr); 1651 if (err != 0) { 1652 failure = "construction of an hc-pair failed"; 1653 goto boom; 1654 } 1655 pa[i++] = p; 1656 } 1657 1658 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 1659 if (err == 0) { 1660 for (i = 0; i < depth; i++) 1661 if (pa[i] != NULL) 1662 nvlist_free(pa[i]); 1663 return (f); 1664 } 1665 failure = "addition of hc-pair array to FMRI failed"; 1666 1667 boom: 1668 for (i = 0; i < depth; i++) 1669 if (pa[i] != NULL) 1670 nvlist_free(pa[i]); 1671 nvlist_free(f); 1672 out(O_DIE, "%s", failure); 1673 /*NOTREACHED*/ 1674 return (NULL); 1675 } 1676 1677 static uint_t 1678 avg(uint_t sum, 
uint_t cnt) 1679 { 1680 unsigned long long s = sum * 10; 1681 1682 return ((s / cnt / 10) + (((s / cnt % 10) >= 5) ? 1 : 0)); 1683 } 1684 1685 static uint8_t 1686 percentof(uint_t part, uint_t whole) 1687 { 1688 unsigned long long p = part * 1000; 1689 1690 return ((p / whole / 10) + (((p / whole % 10) >= 5) ? 1 : 0)); 1691 } 1692 1693 struct rsl { 1694 struct event *suspect; 1695 nvlist_t *asru; 1696 nvlist_t *fru; 1697 nvlist_t *rsrc; 1698 }; 1699 1700 /* 1701 * rslfree -- free internal members of struct rsl not expected to be 1702 * freed elsewhere. 1703 */ 1704 static void 1705 rslfree(struct rsl *freeme) 1706 { 1707 if (freeme->asru != NULL) 1708 nvlist_free(freeme->asru); 1709 if (freeme->fru != NULL) 1710 nvlist_free(freeme->fru); 1711 if (freeme->rsrc != NULL && freeme->rsrc != freeme->asru) 1712 nvlist_free(freeme->rsrc); 1713 } 1714 1715 /* 1716 * rslcmp -- compare two rsl structures. Use the following 1717 * comparisons to establish cardinality: 1718 * 1719 * 1. Name of the suspect's class. (simple strcmp) 1720 * 2. Name of the suspect's ASRU. (trickier, since nvlist) 1721 * 1722 */ 1723 static int 1724 rslcmp(const void *a, const void *b) 1725 { 1726 struct rsl *r1 = (struct rsl *)a; 1727 struct rsl *r2 = (struct rsl *)b; 1728 int rv; 1729 1730 rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s, 1731 r2->suspect->enode->u.event.ename->u.name.s); 1732 if (rv != 0) 1733 return (rv); 1734 1735 if (r1->asru == NULL && r2->asru == NULL) 1736 return (0); 1737 if (r1->asru == NULL) 1738 return (-1); 1739 if (r2->asru == NULL) 1740 return (1); 1741 return (evnv_cmpnvl(r1->asru, r2->asru, 0)); 1742 } 1743 1744 /* 1745 * rsluniq -- given an array of rsl structures, seek out and "remove" 1746 * any duplicates. Dups are "remove"d by NULLing the suspect pointer 1747 * of the array element. Removal also means updating the number of 1748 * problems and the number of problems which are not faults. User 1749 * provides the first and last element pointers. 
1750 */ 1751 static void 1752 rsluniq(struct rsl *first, struct rsl *last, int *nprobs, int *nnonf) 1753 { 1754 struct rsl *cr; 1755 1756 if (*nprobs == 1) 1757 return; 1758 1759 /* 1760 * At this point, we only expect duplicate defects. 1761 * Eversholt's diagnosis algorithm prevents duplicate 1762 * suspects, but we rewrite defects in the platform code after 1763 * the diagnosis is made, and that can introduce new 1764 * duplicates. 1765 */ 1766 while (first <= last) { 1767 if (first->suspect == NULL || !is_defect(first->suspect->t)) { 1768 first++; 1769 continue; 1770 } 1771 cr = first + 1; 1772 while (cr <= last) { 1773 if (is_defect(first->suspect->t)) { 1774 if (rslcmp(first, cr) == 0) { 1775 cr->suspect = NULL; 1776 rslfree(cr); 1777 (*nprobs)--; 1778 (*nnonf)--; 1779 } 1780 } 1781 /* 1782 * assume all defects are in order after our 1783 * sort and short circuit here with "else break" ? 1784 */ 1785 cr++; 1786 } 1787 first++; 1788 } 1789 } 1790 1791 /* 1792 * get_resources -- for a given suspect, determine what ASRU, FRU and 1793 * RSRC nvlists should be advertised in the final suspect list. 1794 */ 1795 void 1796 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot) 1797 { 1798 struct node *asrudef, *frudef; 1799 nvlist_t *asru, *fru; 1800 nvlist_t *rsrc = NULL; 1801 char *pathstr; 1802 1803 /* 1804 * First find any ASRU and/or FRU defined in the 1805 * initial fault tree. 
1806 */ 1807 asrudef = eventprop_lookup(sp, L_ASRU); 1808 frudef = eventprop_lookup(sp, L_FRU); 1809 1810 /* 1811 * Create FMRIs based on those definitions 1812 */ 1813 asru = node2fmri(asrudef); 1814 fru = node2fmri(frudef); 1815 pathstr = ipath2str(NULL, sp->ipp); 1816 1817 /* 1818 * Allow for platform translations of the FMRIs 1819 */ 1820 platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc, 1821 pathstr); 1822 1823 FREE(pathstr); 1824 rsrcs->suspect = sp; 1825 rsrcs->asru = asru; 1826 rsrcs->fru = fru; 1827 rsrcs->rsrc = rsrc; 1828 } 1829 1830 /* 1831 * trim_suspects -- prior to publishing, we may need to remove some 1832 * suspects from the list. If we're auto-closing upsets, we don't 1833 * want any of those in the published list. If the ASRUs for multiple 1834 * defects resolve to the same ASRU (driver) we only want to publish 1835 * that as a single suspect. 1836 */ 1837 static void 1838 trim_suspects(struct fme *fmep, boolean_t no_upsets, struct rsl **begin, 1839 struct rsl **end) 1840 { 1841 struct event *ep; 1842 struct rsl *rp; 1843 int rpcnt; 1844 1845 /* 1846 * First save the suspects in the psuspects, then copy back 1847 * only the ones we wish to retain. This resets nsuspects to 1848 * zero. 1849 */ 1850 rpcnt = fmep->nsuspects; 1851 save_suspects(fmep); 1852 1853 /* 1854 * allocate an array of resource pointers for the suspects. 1855 * We may end up using less than the full allocation, but this 1856 * is a very short-lived array. publish_suspects() will free 1857 * this array when it's done using it. 
1858 */ 1859 rp = *begin = MALLOC(rpcnt * sizeof (struct rsl)); 1860 bzero(rp, rpcnt * sizeof (struct rsl)); 1861 1862 /* first pass, remove any unwanted upsets and populate our array */ 1863 for (ep = fmep->psuspects; ep; ep = ep->psuspects) { 1864 if (no_upsets && is_upset(ep->t)) 1865 continue; 1866 get_resources(ep, rp, fmep->cfgdata->cooked); 1867 rp++; 1868 fmep->nsuspects++; 1869 if (!is_fault(ep->t)) 1870 fmep->nonfault++; 1871 } 1872 1873 /* if all we had was unwanted upsets, we're done */ 1874 if (fmep->nsuspects == 0) 1875 return; 1876 1877 *end = rp - 1; 1878 1879 /* sort the array */ 1880 qsort(*begin, fmep->nsuspects, sizeof (struct rsl), rslcmp); 1881 rsluniq(*begin, *end, &fmep->nsuspects, &fmep->nonfault); 1882 } 1883 1884 /* 1885 * addpayloadprop -- add a payload prop to a problem 1886 */ 1887 static void 1888 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault) 1889 { 1890 ASSERT(fault != NULL); 1891 ASSERT(lhs != NULL); 1892 ASSERT(rhs != NULL); 1893 1894 if (rhs->t == UINT64) { 1895 out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v); 1896 1897 if (nvlist_add_uint64(fault, lhs, rhs->v) != 0) 1898 out(O_DIE, 1899 "cannot add payloadprop \"%s\" to fault", lhs); 1900 } else { 1901 out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"", 1902 lhs, (char *)(uintptr_t)rhs->v); 1903 1904 if (nvlist_add_string(fault, lhs, (char *)(uintptr_t)rhs->v) != 1905 0) 1906 out(O_DIE, 1907 "cannot add payloadprop \"%s\" to fault", lhs); 1908 } 1909 } 1910 1911 static char *Istatbuf; 1912 static char *Istatbufptr; 1913 static int Istatsz; 1914 1915 /* 1916 * istataddsize -- calculate size of istat and add it to Istatsz 1917 */ 1918 /*ARGSUSED2*/ 1919 static void 1920 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg) 1921 { 1922 int val; 1923 1924 ASSERT(lhs != NULL); 1925 ASSERT(rhs != NULL); 1926 1927 if ((val = stats_counter_value(rhs)) == 0) 1928 return; /* skip zero-valued stats */ 1929 1930 /* count up the size of 
the stat name */ 1931 Istatsz += ipath2strlen(lhs->ename, lhs->ipath); 1932 Istatsz++; /* for the trailing NULL byte */ 1933 1934 /* count up the size of the stat value */ 1935 Istatsz += snprintf(NULL, 0, "%d", val); 1936 Istatsz++; /* for the trailing NULL byte */ 1937 } 1938 1939 /* 1940 * istat2str -- serialize an istat, writing result to *Istatbufptr 1941 */ 1942 /*ARGSUSED2*/ 1943 static void 1944 istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg) 1945 { 1946 char *str; 1947 int len; 1948 int val; 1949 1950 ASSERT(lhs != NULL); 1951 ASSERT(rhs != NULL); 1952 1953 if ((val = stats_counter_value(rhs)) == 0) 1954 return; /* skip zero-valued stats */ 1955 1956 /* serialize the stat name */ 1957 str = ipath2str(lhs->ename, lhs->ipath); 1958 len = strlen(str); 1959 1960 ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]); 1961 (void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr); 1962 Istatbufptr += len; 1963 FREE(str); 1964 *Istatbufptr++ = '\0'; 1965 1966 /* serialize the stat value */ 1967 Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr, 1968 "%d", val); 1969 *Istatbufptr++ = '\0'; 1970 1971 ASSERT(Istatbufptr <= &Istatbuf[Istatsz]); 1972 } 1973 1974 void 1975 istat_save() 1976 { 1977 if (Istat_need_save == 0) 1978 return; 1979 1980 /* figure out how big the serialzed info is */ 1981 Istatsz = 0; 1982 lut_walk(Istats, (lut_cb)istataddsize, NULL); 1983 1984 if (Istatsz == 0) { 1985 /* no stats to save */ 1986 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 1987 return; 1988 } 1989 1990 /* create the serialized buffer */ 1991 Istatbufptr = Istatbuf = MALLOC(Istatsz); 1992 lut_walk(Istats, (lut_cb)istat2str, NULL); 1993 1994 /* clear out current saved stats */ 1995 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 1996 1997 /* write out the new version */ 1998 fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz); 1999 FREE(Istatbuf); 2000 2001 Istat_need_save = 0; 2002 } 2003 2004 int 2005 istat_cmp(struct istat_entry 
*ent1, struct istat_entry *ent2) 2006 { 2007 if (ent1->ename != ent2->ename) 2008 return (ent2->ename - ent1->ename); 2009 if (ent1->ipath != ent2->ipath) 2010 return ((char *)ent2->ipath - (char *)ent1->ipath); 2011 2012 return (0); 2013 } 2014 2015 /* 2016 * istat-verify -- verify the component associated with a stat still exists 2017 * 2018 * if the component no longer exists, this routine resets the stat and 2019 * returns 0. if the component still exists, it returns 1. 2020 */ 2021 static int 2022 istat_verify(struct node *snp, struct istat_entry *entp) 2023 { 2024 struct stats *statp; 2025 nvlist_t *fmri; 2026 2027 fmri = node2fmri(snp->u.event.epname); 2028 if (platform_path_exists(fmri)) { 2029 nvlist_free(fmri); 2030 return (1); 2031 } 2032 nvlist_free(fmri); 2033 2034 /* component no longer in system. zero out the associated stats */ 2035 if ((statp = (struct stats *) 2036 lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL || 2037 stats_counter_value(statp) == 0) 2038 return (0); /* stat is already reset */ 2039 2040 Istat_need_save = 1; 2041 stats_counter_reset(statp); 2042 return (0); 2043 } 2044 2045 static void 2046 istat_bump(struct node *snp, int n) 2047 { 2048 struct stats *statp; 2049 struct istat_entry ent; 2050 2051 ASSERT(snp != NULL); 2052 ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t)); 2053 ASSERT(snp->u.event.epname != NULL); 2054 2055 /* class name should be hoisted into a single stable entry */ 2056 ASSERT(snp->u.event.ename->u.name.next == NULL); 2057 ent.ename = snp->u.event.ename->u.name.s; 2058 ent.ipath = ipath(snp->u.event.epname); 2059 2060 if (!istat_verify(snp, &ent)) { 2061 /* component no longer exists in system, nothing to do */ 2062 return; 2063 } 2064 2065 if ((statp = (struct stats *) 2066 lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) { 2067 /* need to create the counter */ 2068 int cnt = 0; 2069 struct node *np; 2070 char *sname; 2071 char *snamep; 2072 struct istat_entry *newentp; 2073 2074 /* count 
up the size of the stat name */ 2075 np = snp->u.event.ename; 2076 while (np != NULL) { 2077 cnt += strlen(np->u.name.s); 2078 cnt++; /* for the '.' or '@' */ 2079 np = np->u.name.next; 2080 } 2081 np = snp->u.event.epname; 2082 while (np != NULL) { 2083 cnt += snprintf(NULL, 0, "%s%llu", 2084 np->u.name.s, np->u.name.child->u.ull); 2085 cnt++; /* for the '/' or trailing NULL byte */ 2086 np = np->u.name.next; 2087 } 2088 2089 /* build the stat name */ 2090 snamep = sname = alloca(cnt); 2091 np = snp->u.event.ename; 2092 while (np != NULL) { 2093 snamep += snprintf(snamep, &sname[cnt] - snamep, 2094 "%s", np->u.name.s); 2095 np = np->u.name.next; 2096 if (np) 2097 *snamep++ = '.'; 2098 } 2099 *snamep++ = '@'; 2100 np = snp->u.event.epname; 2101 while (np != NULL) { 2102 snamep += snprintf(snamep, &sname[cnt] - snamep, 2103 "%s%llu", np->u.name.s, np->u.name.child->u.ull); 2104 np = np->u.name.next; 2105 if (np) 2106 *snamep++ = '/'; 2107 } 2108 *snamep++ = '\0'; 2109 2110 /* create the new stat & add it to our list */ 2111 newentp = MALLOC(sizeof (*newentp)); 2112 *newentp = ent; 2113 statp = stats_new_counter(NULL, sname, 0); 2114 Istats = lut_add(Istats, (void *)newentp, (void *)statp, 2115 (lut_cmp)istat_cmp); 2116 } 2117 2118 /* if n is non-zero, set that value instead of bumping */ 2119 if (n) { 2120 stats_counter_reset(statp); 2121 stats_counter_add(statp, n); 2122 } else 2123 stats_counter_bump(statp); 2124 Istat_need_save = 1; 2125 2126 ipath_print(O_ALTFP|O_VERB2, ent.ename, ent.ipath); 2127 out(O_ALTFP|O_VERB2, " %s to value %d", n ? "set" : "incremented", 2128 stats_counter_value(statp)); 2129 } 2130 2131 /*ARGSUSED*/ 2132 static void 2133 istat_destructor(void *left, void *right, void *arg) 2134 { 2135 struct istat_entry *entp = (struct istat_entry *)left; 2136 struct stats *statp = (struct stats *)right; 2137 FREE(entp); 2138 stats_delete(statp); 2139 } 2140 2141 /* 2142 * Callback used in a walk of the Istats to reset matching stat counters. 
2143 */ 2144 static void 2145 istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp, 2146 const struct ipath *ipp) 2147 { 2148 char *path; 2149 2150 if (entp->ipath == ipp) { 2151 path = ipath2str(entp->ename, ipp); 2152 out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path); 2153 FREE(path); 2154 stats_counter_reset(statp); 2155 Istat_need_save = 1; 2156 } 2157 } 2158 2159 void 2160 istat_fini(void) 2161 { 2162 lut_free(Istats, istat_destructor, NULL); 2163 } 2164 2165 static char *Serdbuf; 2166 static char *Serdbufptr; 2167 static int Serdsz; 2168 2169 /* 2170 * serdaddsize -- calculate size of serd and add it to Serdsz 2171 */ 2172 /*ARGSUSED*/ 2173 static void 2174 serdaddsize(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2175 { 2176 ASSERT(lhs != NULL); 2177 2178 /* count up the size of the stat name */ 2179 Serdsz += ipath2strlen(lhs->ename, lhs->ipath); 2180 Serdsz++; /* for the trailing NULL byte */ 2181 } 2182 2183 /* 2184 * serd2str -- serialize a serd engine, writing result to *Serdbufptr 2185 */ 2186 /*ARGSUSED*/ 2187 static void 2188 serd2str(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2189 { 2190 char *str; 2191 int len; 2192 2193 ASSERT(lhs != NULL); 2194 2195 /* serialize the serd engine name */ 2196 str = ipath2str(lhs->ename, lhs->ipath); 2197 len = strlen(str); 2198 2199 ASSERT(Serdbufptr + len + 1 <= &Serdbuf[Serdsz]); 2200 (void) strlcpy(Serdbufptr, str, &Serdbuf[Serdsz] - Serdbufptr); 2201 Serdbufptr += len; 2202 FREE(str); 2203 *Serdbufptr++ = '\0'; 2204 ASSERT(Serdbufptr <= &Serdbuf[Serdsz]); 2205 } 2206 2207 void 2208 serd_save() 2209 { 2210 if (Serd_need_save == 0) 2211 return; 2212 2213 /* figure out how big the serialzed info is */ 2214 Serdsz = 0; 2215 lut_walk(SerdEngines, (lut_cb)serdaddsize, NULL); 2216 2217 if (Serdsz == 0) { 2218 /* no serd engines to save */ 2219 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2220 return; 2221 } 2222 2223 /* create the serialized buffer */ 2224 Serdbufptr 
= Serdbuf = MALLOC(Serdsz); 2225 lut_walk(SerdEngines, (lut_cb)serd2str, NULL); 2226 2227 /* clear out current saved stats */ 2228 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2229 2230 /* write out the new version */ 2231 fmd_buf_write(Hdl, NULL, WOBUF_SERDS, Serdbuf, Serdsz); 2232 FREE(Serdbuf); 2233 Serd_need_save = 0; 2234 } 2235 2236 int 2237 serd_cmp(struct serd_entry *ent1, struct serd_entry *ent2) 2238 { 2239 if (ent1->ename != ent2->ename) 2240 return (ent2->ename - ent1->ename); 2241 if (ent1->ipath != ent2->ipath) 2242 return ((char *)ent2->ipath - (char *)ent1->ipath); 2243 2244 return (0); 2245 } 2246 2247 void 2248 fme_serd_load(fmd_hdl_t *hdl) 2249 { 2250 int sz; 2251 char *sbuf; 2252 char *sepptr; 2253 char *ptr; 2254 struct serd_entry *newentp; 2255 struct node *epname; 2256 nvlist_t *fmri; 2257 char *namestring; 2258 2259 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_SERDS)) == 0) 2260 return; 2261 sbuf = alloca(sz); 2262 fmd_buf_read(hdl, NULL, WOBUF_SERDS, sbuf, sz); 2263 ptr = sbuf; 2264 while (ptr < &sbuf[sz]) { 2265 sepptr = strchr(ptr, '@'); 2266 *sepptr = '\0'; 2267 namestring = ptr; 2268 sepptr++; 2269 ptr = sepptr; 2270 ptr += strlen(ptr); 2271 ptr++; /* move past the '\0' separating paths */ 2272 epname = pathstring2epnamenp(sepptr); 2273 fmri = node2fmri(epname); 2274 if (platform_path_exists(fmri)) { 2275 newentp = MALLOC(sizeof (*newentp)); 2276 newentp->hdl = hdl; 2277 newentp->ipath = ipath(epname); 2278 newentp->ename = stable(namestring); 2279 SerdEngines = lut_add(SerdEngines, (void *)newentp, 2280 (void *)NULL, (lut_cmp)serd_cmp); 2281 } else 2282 Serd_need_save = 1; 2283 nvlist_free(fmri); 2284 } 2285 /* save it back again in case some of the paths no longer exist */ 2286 serd_save(); 2287 } 2288 2289 /*ARGSUSED*/ 2290 static void 2291 serd_destructor(void *left, void *right, void *arg) 2292 { 2293 struct serd_entry *entp = (struct serd_entry *)left; 2294 FREE(entp); 2295 } 2296 2297 /* 2298 * Callback used in a walk of the SerdEngines 
 * to reset matching serd engines.
 *
 * An engine matches when its entry's ipath is the very ipath passed as
 * the walk argument; the engine name handed to fmd_serd_reset() is the
 * ipath2str() form of the entry.
 */
/*ARGSUSED*/
static void
serd_reset_cb(struct serd_entry *entp, void *unused, const struct ipath *ipp)
{
	char *path;

	if (entp->ipath == ipp) {
		path = ipath2str(entp->ename, ipp);
		out(O_ALTFP, "serd_reset_cb: resetting %s", path);
		fmd_serd_reset(entp->hdl, path);
		FREE(path);
		Serd_need_save = 1;
	}
}

/*
 * serd_fini -- free the SerdEngines table, handing every entry to
 * serd_destructor.
 */
void
serd_fini(void)
{
	lut_free(SerdEngines, serd_destructor, NULL);
}

/*
 * publish_suspects -- turn an FME's suspect list into fault events on its
 * fmd case, then either solve the case or, when Dupclose is set and every
 * suspect's asru is already faulty, just close it.
 *
 * The suspect list is trimmed first (upsets are left out when Autoclose
 * is "true", "all" or "upsets") and is restored from the psuspects list
 * before returning.
 */
static void
publish_suspects(struct fme *fmep)
{
	struct rsl *srl = NULL;
	struct rsl *erl;
	struct rsl *rp;
	nvlist_t *fault;
	uint8_t cert;
	uint_t *frs;
	uint_t fravg, frsum, fr;
	uint_t messval;
	struct node *snp;
	int frcnt, fridx;
	boolean_t no_upsets = B_FALSE;
	boolean_t allfaulty = B_TRUE;

	stats_counter_bump(fmep->diags);

	/*
	 * If we're auto-closing upsets, we don't want to include them
	 * in any produced suspect lists or certainty accounting.
	 */
	if (Autoclose != NULL)
		if (strcmp(Autoclose, "true") == 0 ||
		    strcmp(Autoclose, "all") == 0 ||
		    strcmp(Autoclose, "upsets") == 0)
			no_upsets = B_TRUE;

	/* srl and erl come back as the first and last entries walked below */
	trim_suspects(fmep, no_upsets, &srl, &erl);

	/*
	 * If the resulting suspect list has no members, we're
	 * done.  Returning here will simply close the case.
	 */
	if (fmep->nsuspects == 0) {
		out(O_ALTFP,
		    "[FME%d, case %s (all suspects are upsets)]",
		    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
		FREE(srl);
		restore_suspects(fmep);
		return;
	}

	/*
	 * If the suspect list is all faults, then for a given fault,
	 * say X of N, X's certainty is computed via:
	 *
	 * fitrate(X) / (fitrate(1) + ... + fitrate(N)) * 100
	 *
	 * If none of the suspects are faults, and there are N suspects,
	 * the certainty of a given suspect is 100/N.
	 *
	 * If there is a mixture of faults and other problems in
	 * the suspect list, we take an average of the faults'
	 * FITrates and treat this average as the FITrate for any
	 * non-faults.  The fitrate of any given suspect is then
	 * computed per the first formula above.
	 */
	if (fmep->nonfault == fmep->nsuspects) {
		/* NO faults in the suspect list */
		cert = percentof(1, fmep->nsuspects);
	} else {
		/* sum the fitrates */
		frs = alloca(fmep->nsuspects * sizeof (uint_t));
		fridx = frcnt = frsum = 0;

		for (rp = srl; rp <= erl; rp++) {
			struct node *n;

			if (rp->suspect == NULL)
				continue;
			if (!is_fault(rp->suspect->t)) {
				/* 0 marks "fill in with the average" below */
				frs[fridx++] = 0;
				continue;
			}
			n = eventprop_lookup(rp->suspect, L_FITrate);
			if (node2uint(n, &fr) != 0) {
				out(O_DEBUG|O_NONL, "event ");
				ipath_print(O_DEBUG|O_NONL,
				    rp->suspect->enode->u.event.ename->u.name.s,
				    rp->suspect->ipp);
				out(O_DEBUG, " has no FITrate (using 1)");
				fr = 1;
			} else if (fr == 0) {
				out(O_DEBUG|O_NONL, "event ");
				ipath_print(O_DEBUG|O_NONL,
				    rp->suspect->enode->u.event.ename->u.name.s,
				    rp->suspect->ipp);
				out(O_DEBUG, " has zero FITrate (using 1)");
				fr = 1;
			}

			frs[fridx++] = fr;
			frsum += fr;
			frcnt++;
		}
		fravg = avg(frsum, frcnt);
		/*
		 * This loop leaves fridx == nsuspects, which the reverse
		 * walk below relies on when it indexes frs[--fridx].
		 * NOTE(review): that assumes the number of non-NULL rsl
		 * entries equals nsuspects -- confirm in trim_suspects().
		 */
		for (fridx = 0; fridx < fmep->nsuspects; fridx++)
			if (frs[fridx] == 0) {
				frs[fridx] = fravg;
				frsum += fravg;
			}
	}

	/* Add them in reverse order of our sort, as fmd reverses order */
	for (rp = erl; rp >= srl; rp--) {
		if (rp->suspect == NULL)
			continue;
		if (!is_fault(rp->suspect->t))
			allfaulty = B_FALSE;
		/* in the no-faults case cert was computed once, above */
		if (fmep->nonfault != fmep->nsuspects)
			cert = percentof(frs[--fridx], frsum);
		fault = fmd_nvl_create_fault(fmep->hdl,
		    rp->suspect->enode->u.event.ename->u.name.s,
		    cert,
		    rp->asru,
		    rp->fru,
		    rp->rsrc);
		if (fault == NULL)
			out(O_DIE, "fault creation failed");
		/* if "message" property exists, add it to the fault */
		if (node2uint(eventprop_lookup(rp->suspect, L_message),
		    &messval) == 0) {

			out(O_ALTFP,
			    "[FME%d, %s adds message=%d to suspect list]",
			    fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    messval);
			if (nvlist_add_boolean_value(fault,
			    FM_SUSPECT_MESSAGE,
			    (messval) ? B_TRUE : B_FALSE) != 0) {
				out(O_DIE, "cannot add no-message to fault");
			}
		}
		/* add any payload properties */
		lut_walk(rp->suspect->payloadprops,
		    (lut_cb)addpayloadprop, (void *)fault);
		fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault);
		/* rp->suspect is still read after this call */
		rslfree(rp);

		/*
		 * If "action" property exists, evaluate it;  this must be done
		 * before the dupclose check below since some actions may
		 * modify the asru to be used in fmd_nvl_fmri_faulty.  This
		 * needs to be restructured if any new actions are introduced
		 * that have effects that we do not want to be visible if
		 * we decide not to publish in the dupclose check below.
		 */
		if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) {
			struct evalue evalue;

			out(O_ALTFP|O_NONL,
			    "[FME%d, %s action ", fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s);
			ptree_name_iter(O_ALTFP|O_NONL, snp);
			out(O_ALTFP, "]");
			Action_nvl = fault;
			(void) eval_expr(snp, NULL, NULL, NULL, NULL,
			    NULL, 0, &evalue);
		}

		/*
		 * if "dupclose" tunable is set, check if the asru is
		 * already marked as "faulty".  Once one suspect fails the
		 * check allfaulty stays false and later suspects are not
		 * checked.
		 */
		if (Dupclose && allfaulty) {
			nvlist_t *asru;

			out(O_ALTFP|O_VERB, "FMD%d dupclose check ", fmep->id);
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect);
			out(O_ALTFP|O_VERB|O_NONL, " ");
			if (nvlist_lookup_nvlist(fault,
			    FM_FAULT_ASRU, &asru) != 0) {
				out(O_ALTFP|O_VERB, "NULL asru");
				allfaulty = B_FALSE;
			} else if (fmd_nvl_fmri_faulty(fmep->hdl, asru)) {
				out(O_ALTFP|O_VERB, "faulty");
			} else {
				out(O_ALTFP|O_VERB, "not faulty");
				allfaulty = B_FALSE;
			}
		}

	}

	/*
	 * Close the case if all asrus are already known to be faulty and if
	 * Dupclose is enabled.  Otherwise we are going to publish so take
	 * any pre-publication actions.
	 */
	if (Dupclose && allfaulty) {
		out(O_ALTFP, "[dupclose FME%d, case %s]", fmep->id,
		    fmd_case_uuid(fmep->hdl, fmep->fmcase));
		fmd_case_close(fmep->hdl, fmep->fmcase);
	} else {
		for (rp = erl; rp >= srl; rp--) {
			struct event *suspect = rp->suspect;

			if (suspect == NULL)
				continue;

			/* if "count" exists, increment the appropriate stat */
			if ((snp = eventprop_lookup(suspect,
			    L_count)) != NULL) {
				out(O_ALTFP|O_NONL,
				    "[FME%d, %s count ", fmep->id,
				    suspect->enode->u.event.ename->u.name.s);
				ptree_name_iter(O_ALTFP|O_NONL, snp);
				out(O_ALTFP, "]");
				istat_bump(snp, 0);

			}
		}
		istat_save();	/* write out any istat changes */

		out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
		    fmd_case_uuid(fmep->hdl, fmep->fmcase));
		fmd_case_solve(fmep->hdl, fmep->fmcase);
	}

	/*
	 * revert to the original suspect list
	 */
	FREE(srl);
	restore_suspects(fmep);
}

/*
 * publish_undiagnosable -- record fmcase on Undiagablecaselist, attach the
 * ereport (if one is given) and a single 100% UNDIAGNOSABLE_DEFECT suspect
 * carrying Undiag_reason (when set), then solve and close the case.
 */
static void
publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase)
{
	struct case_list *newcase;
	nvlist_t *defect;

	out(O_ALTFP,
	    "[undiagnosable ereport received, "
	    "creating and closing a new case (%s)]",
	    Undiag_reason ? Undiag_reason : "reason not provided");

	/* push the case onto the head of Undiagablecaselist */
	newcase = MALLOC(sizeof (struct case_list));
	newcase->next = NULL;
	newcase->fmcase = fmcase;
	if (Undiagablecaselist != NULL)
		newcase->next = Undiagablecaselist;
	Undiagablecaselist = newcase;

	if (ffep != NULL)
		fmd_case_add_ereport(hdl, newcase->fmcase, ffep);

	defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
	    NULL, NULL, NULL);
	if (Undiag_reason != NULL)
		(void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason);
	fmd_case_add_suspect(hdl, newcase->fmcase, defect);

	fmd_case_solve(hdl, newcase->fmcase);
	fmd_case_close(hdl, newcase->fmcase);
}

/*
 * fme_undiagnosable -- give up on an existing FME: add a single 100%
 * UNDIAGNOSABLE_DEFECT suspect (carrying Undiag_reason when set) to the
 * FME's own case, then solve and close that case.
 */
static void
fme_undiagnosable(struct fme *f)
{
	nvlist_t *defect;

	out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]",
	    f->id, fmd_case_uuid(f->hdl, f->fmcase),
	    Undiag_reason ? Undiag_reason : "undiagnosable");

	defect = fmd_nvl_create_fault(f->hdl, UNDIAGNOSABLE_DEFECT, 100,
	    NULL, NULL, NULL);
	if (Undiag_reason != NULL)
		(void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason);
	fmd_case_add_suspect(f->hdl, f->fmcase, defect);
	fmd_case_solve(f->hdl, f->fmcase);
	fmd_case_close(f->hdl, f->fmcase);
}

/*
 * fme_close_case
 *
 *	Find the requested case amongst our fmes and close it.  Free up
 *	the related fme.
2603 */ 2604 void 2605 fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase) 2606 { 2607 struct case_list *ucasep, *prevcasep = NULL; 2608 struct fme *prev = NULL; 2609 struct fme *fmep; 2610 2611 for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) { 2612 if (fmcase != ucasep->fmcase) { 2613 prevcasep = ucasep; 2614 continue; 2615 } 2616 2617 if (prevcasep == NULL) 2618 Undiagablecaselist = Undiagablecaselist->next; 2619 else 2620 prevcasep->next = ucasep->next; 2621 2622 FREE(ucasep); 2623 return; 2624 } 2625 2626 for (fmep = FMElist; fmep; fmep = fmep->next) { 2627 if (fmep->hdl == hdl && fmep->fmcase == fmcase) 2628 break; 2629 prev = fmep; 2630 } 2631 2632 if (fmep == NULL) { 2633 out(O_WARN, "Eft asked to close unrecognized case [%s].", 2634 fmd_case_uuid(hdl, fmcase)); 2635 return; 2636 } 2637 2638 if (EFMElist == fmep) 2639 EFMElist = prev; 2640 2641 if (prev == NULL) 2642 FMElist = FMElist->next; 2643 else 2644 prev->next = fmep->next; 2645 2646 fmep->next = NULL; 2647 2648 /* Get rid of any timer this fme has set */ 2649 if (fmep->wull != 0) 2650 fmd_timer_remove(fmep->hdl, fmep->timer); 2651 2652 if (ClosedFMEs == NULL) { 2653 ClosedFMEs = fmep; 2654 } else { 2655 fmep->next = ClosedFMEs; 2656 ClosedFMEs = fmep; 2657 } 2658 2659 Open_fme_count--; 2660 2661 /* See if we can close the overflow FME */ 2662 if (Open_fme_count <= Max_fme) { 2663 for (fmep = FMElist; fmep; fmep = fmep->next) { 2664 if (fmep->overflow && !(fmd_case_closed(fmep->hdl, 2665 fmep->fmcase))) 2666 break; 2667 } 2668 2669 if (fmep != NULL) 2670 fmd_case_close(fmep->hdl, fmep->fmcase); 2671 } 2672 } 2673 2674 /* 2675 * fme_set_timer() 2676 * If the time we need to wait for the given FME is less than the 2677 * current timer, kick that old timer out and establish a new one. 
2678 */ 2679 static int 2680 fme_set_timer(struct fme *fmep, unsigned long long wull) 2681 { 2682 out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait "); 2683 ptree_timeval(O_ALTFP|O_VERB, &wull); 2684 2685 if (wull <= fmep->pull) { 2686 out(O_ALTFP|O_VERB|O_NONL, "already have waited at least "); 2687 ptree_timeval(O_ALTFP|O_VERB, &fmep->pull); 2688 out(O_ALTFP|O_VERB, NULL); 2689 /* we've waited at least wull already, don't need timer */ 2690 return (0); 2691 } 2692 2693 out(O_ALTFP|O_VERB|O_NONL, " currently "); 2694 if (fmep->wull != 0) { 2695 out(O_ALTFP|O_VERB|O_NONL, "waiting "); 2696 ptree_timeval(O_ALTFP|O_VERB, &fmep->wull); 2697 out(O_ALTFP|O_VERB, NULL); 2698 } else { 2699 out(O_ALTFP|O_VERB|O_NONL, "not waiting"); 2700 out(O_ALTFP|O_VERB, NULL); 2701 } 2702 2703 if (fmep->wull != 0) 2704 if (wull >= fmep->wull) 2705 /* New timer would fire later than established timer */ 2706 return (0); 2707 2708 if (fmep->wull != 0) { 2709 fmd_timer_remove(fmep->hdl, fmep->timer); 2710 } 2711 2712 fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep, 2713 fmep->e0r, wull); 2714 out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer); 2715 fmep->wull = wull; 2716 return (1); 2717 } 2718 2719 void 2720 fme_timer_fired(struct fme *fmep, id_t tid) 2721 { 2722 struct fme *ffmep = NULL; 2723 2724 for (ffmep = FMElist; ffmep; ffmep = ffmep->next) 2725 if (ffmep == fmep) 2726 break; 2727 2728 if (ffmep == NULL) { 2729 out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.", 2730 (void *)fmep); 2731 return; 2732 } 2733 2734 out(O_ALTFP|O_VERB, "Timer fired %lx", tid); 2735 fmep->pull = fmep->wull; 2736 fmep->wull = 0; 2737 fmd_buf_write(fmep->hdl, fmep->fmcase, 2738 WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull)); 2739 2740 fme_reload_cfgdata(fmep); 2741 2742 fme_eval(fmep, fmep->e0r); 2743 } 2744 2745 /* 2746 * Preserve the fme's suspect list in its psuspects list, NULLing the 2747 * suspects list in the meantime. 
2748 */ 2749 static void 2750 save_suspects(struct fme *fmep) 2751 { 2752 struct event *ep; 2753 struct event *nextep; 2754 2755 /* zero out the previous suspect list */ 2756 for (ep = fmep->psuspects; ep; ep = nextep) { 2757 nextep = ep->psuspects; 2758 ep->psuspects = NULL; 2759 } 2760 fmep->psuspects = NULL; 2761 2762 /* zero out the suspect list, copying it to previous suspect list */ 2763 fmep->psuspects = fmep->suspects; 2764 for (ep = fmep->suspects; ep; ep = nextep) { 2765 nextep = ep->suspects; 2766 ep->psuspects = ep->suspects; 2767 ep->suspects = NULL; 2768 ep->is_suspect = 0; 2769 } 2770 fmep->suspects = NULL; 2771 fmep->nsuspects = 0; 2772 fmep->nonfault = 0; 2773 } 2774 2775 /* 2776 * Retrieve the fme's suspect list from its psuspects list. 2777 */ 2778 static void 2779 restore_suspects(struct fme *fmep) 2780 { 2781 struct event *ep; 2782 struct event *nextep; 2783 2784 fmep->nsuspects = fmep->nonfault = 0; 2785 fmep->suspects = fmep->psuspects; 2786 for (ep = fmep->psuspects; ep; ep = nextep) { 2787 fmep->nsuspects++; 2788 if (!is_fault(ep->t)) 2789 fmep->nonfault++; 2790 nextep = ep->psuspects; 2791 ep->suspects = ep->psuspects; 2792 } 2793 } 2794 2795 /* 2796 * this is what we use to call the Emrys prototype code instead of main() 2797 */ 2798 static void 2799 fme_eval(struct fme *fmep, fmd_event_t *ffep) 2800 { 2801 struct event *ep; 2802 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 2803 2804 save_suspects(fmep); 2805 2806 out(O_ALTFP, "Evaluate FME %d", fmep->id); 2807 indent_set(" "); 2808 2809 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 2810 fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 2811 2812 out(O_ALTFP|O_NONL, "FME%d state: %s, suspect list:", fmep->id, 2813 fme_state2str(fmep->state)); 2814 for (ep = fmep->suspects; ep; ep = ep->suspects) { 2815 out(O_ALTFP|O_NONL, " "); 2816 itree_pevent_brief(O_ALTFP|O_NONL, ep); 2817 } 2818 out(O_ALTFP, NULL); 2819 2820 switch (fmep->state) { 2821 case 
FME_CREDIBLE: 2822 print_suspects(SLNEW, fmep); 2823 (void) upsets_eval(fmep, ffep); 2824 2825 /* 2826 * we may have already posted suspects in upsets_eval() which 2827 * can recurse into fme_eval() again. If so then just return. 2828 */ 2829 if (fmep->posted_suspects) 2830 return; 2831 2832 publish_suspects(fmep); 2833 fmep->posted_suspects = 1; 2834 fmd_buf_write(fmep->hdl, fmep->fmcase, 2835 WOBUF_POSTD, 2836 (void *)&fmep->posted_suspects, 2837 sizeof (fmep->posted_suspects)); 2838 2839 /* 2840 * Now the suspects have been posted, we can clear up 2841 * the instance tree as we won't be looking at it again. 2842 * Also cancel the timer as the case is now solved. 2843 */ 2844 if (fmep->wull != 0) { 2845 fmd_timer_remove(fmep->hdl, fmep->timer); 2846 fmep->wull = 0; 2847 } 2848 break; 2849 2850 case FME_WAIT: 2851 ASSERT(my_delay > fmep->ull); 2852 (void) fme_set_timer(fmep, my_delay); 2853 print_suspects(SLWAIT, fmep); 2854 itree_prune(fmep->eventtree); 2855 config_free(fmep->cfgdata); 2856 fmep->cfgdata = NULL; 2857 return; 2858 2859 case FME_DISPROVED: 2860 print_suspects(SLDISPROVED, fmep); 2861 Undiag_reason = UD_UNSOLVD; 2862 fme_undiagnosable(fmep); 2863 break; 2864 } 2865 2866 if (fmep->posted_suspects == 1 && Autoclose != NULL) { 2867 int doclose = 0; 2868 2869 if (strcmp(Autoclose, "true") == 0 || 2870 strcmp(Autoclose, "all") == 0) 2871 doclose = 1; 2872 2873 if (strcmp(Autoclose, "upsets") == 0) { 2874 doclose = 1; 2875 for (ep = fmep->suspects; ep; ep = ep->suspects) { 2876 if (ep->t != N_UPSET) { 2877 doclose = 0; 2878 break; 2879 } 2880 } 2881 } 2882 2883 if (doclose) { 2884 out(O_ALTFP, "[closing FME%d, case %s (autoclose)]", 2885 fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase)); 2886 fmd_case_close(fmep->hdl, fmep->fmcase); 2887 } 2888 } 2889 itree_free(fmep->eventtree); 2890 fmep->eventtree = NULL; 2891 config_free(fmep->cfgdata); 2892 fmep->cfgdata = NULL; 2893 destroy_fme_bufs(fmep); 2894 } 2895 2896 static void indent(void); 2897 static int 
triggered(struct fme *fmep, struct event *ep, int mark);
static enum fme_state effects_test(struct fme *fmep,
    struct event *fault_event, unsigned long long at_latest_by,
    unsigned long long *pdelay);
static enum fme_state requirements_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay);
static enum fme_state causes_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay);

/*
 * checkconstraints -- evaluate the constraints attached to an arrow
 *
 * Returns 0 if some constraint is known to be false, 1 otherwise: either
 * every constraint evaluated true, or one could not be evaluated yet
 * (deferred), in which case the walk stops there.  A known-false or
 * all-true outcome is latched in arrowp->forever_false/forever_true so
 * later calls only log the constraints and return the latched answer; a
 * deferred outcome is not latched.
 */
static int
checkconstraints(struct fme *fmep, struct arrow *arrowp)
{
	struct constraintlist *ctp;
	struct evalue value;
	char *sep = "";

	if (arrowp->forever_false) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, " Forever false constraint: ");
		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
			out(O_ALTFP|O_VERB|O_NONL, sep);
			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
			sep = ", ";
		}
		out(O_ALTFP|O_VERB, NULL);
		return (0);
	}
	if (arrowp->forever_true) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, " Forever true constraint: ");
		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
			out(O_ALTFP|O_VERB|O_NONL, sep);
			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
			sep = ", ";
		}
		out(O_ALTFP|O_VERB, NULL);
		return (1);
	}

	for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
		if (eval_expr(ctp->cnode, NULL, NULL,
		    &fmep->globals, fmep->cfgdata->cooked,
		    arrowp, 0, &value)) {
			/* evaluation successful */
			if (value.t == UNDEFINED || value.v == 0) {
				/* known false */
				arrowp->forever_false = 1;
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " False constraint: ");
				ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
				out(O_ALTFP|O_VERB, NULL);
				return (0);
			}
		} else {
			/* evaluation unsuccessful -- unknown value */
			indent();
			out(O_ALTFP|O_VERB|O_NONL,
			    " Deferred constraint: ");
			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
			out(O_ALTFP|O_VERB, NULL);
			return (1);
		}
	}
	/* known true */
	arrowp->forever_true = 1;
	indent();
	out(O_ALTFP|O_VERB|O_NONL, " True constraint: ");
	for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
		out(O_ALTFP|O_VERB|O_NONL, sep);
		ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
		sep = ", ";
	}
	out(O_ALTFP|O_VERB, NULL);
	return (1);
}

/*
 * triggered -- has event ep been reached by enough marked arrows?
 *
 * Walks the arrows of every B_TO bubble on ep, counting those whose mark
 * has any of the bits in "mark" set, and returns 1 as soon as the count
 * reaches the nork value of the bubble being walked; returns 0 if that
 * never happens.  The count is not reset between bubbles.
 */
static int
triggered(struct fme *fmep, struct event *ep, int mark)
{
	struct bubble *bp;
	struct arrowlist *ap;
	int count = 0;

	stats_counter_bump(fmep->Tcallcount);
	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_TO)
			continue;
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			/* check count of marks against K in the bubble */
			if ((ap->arrowp->mark & mark) &&
			    ++count >= bp->nork)
				return (1);
		}
	}
	return (0);
}

/*
 * mark_arrows -- propagate (or clear) effect marks along the arrows
 * leaving event ep, recursing through the events they reach
 *
 * With mark == 0 the marks left by an earlier pass are cleared: each
 * arrow that was marked has arrow_marked and EFFECTS_COUNTER reset, the
 * WAIT_EFFECT/CREDIBLE_EFFECT/PARENT_WAIT bits are removed from the event
 * at its head (which is first flagged keep_in_tree if "keep" is set and
 * it carried any of those bits), and the clearing recurses.  pdelay is
 * not used in this mode and may be NULL.
 *
 * With mark non-zero (callers in view pass CREDIBLE_EFFECT or
 * PARENT_WAIT) each arrow is marked, and an event at its head is
 * examined only if it has not already been classified, its constraints
 * hold and its K-count is met.  requirements_test() then decides whether
 * the event is waiting, disproved or credible, and the walk continues
 * from waiting and credible events.
 *
 * Returns WAIT_EFFECT, with *pdelay set to the earliest delay seen, if
 * any event reached is still waiting; returns 0 otherwise.
 */
static int
mark_arrows(struct fme *fmep, struct event *ep, int mark,
    unsigned long long at_latest_by, unsigned long long *pdelay, int keep)
{
	struct bubble *bp;
	struct arrowlist *ap;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long my_delay;
	enum fme_state result;
	int retval = 0;

	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_FROM)
			continue;
		stats_counter_bump(fmep->Marrowcount);
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			struct event *ep2 = ap->arrowp->head->myevent;
			/*
			 * if we're clearing marks, we can avoid doing
			 * all that work evaluating constraints.
			 */
			if (mark == 0) {
				/* unmarked arrows end the clearing walk */
				if (ap->arrowp->arrow_marked == 0)
					continue;
				ap->arrowp->arrow_marked = 0;
				ap->arrowp->mark &= ~EFFECTS_COUNTER;
				if (keep && (ep2->cached_state &
				    (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT)))
					ep2->keep_in_tree = 1;
				ep2->cached_state &=
				    ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT);
				(void) mark_arrows(fmep, ep2, mark, 0, NULL,
				    keep);
				continue;
			}
			ap->arrowp->arrow_marked = 1;
			/* skip events this pass has already classified */
			if (ep2->cached_state & REQMNTS_DISPROVED) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " ALREADY DISPROVED ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			if (ep2->cached_state & WAIT_EFFECT) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " ALREADY EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			if (ep2->cached_state & CREDIBLE_EFFECT) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " ALREADY EFFECTS CREDIBLE ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			if ((ep2->cached_state & PARENT_WAIT) &&
			    (mark & PARENT_WAIT)) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " ALREADY PARENT EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			/* constraints are evaluated against ep2's payload */
			platform_set_payloadnvp(ep2->nvp);
			if (checkconstraints(fmep, ap->arrowp) == 0) {
				platform_set_payloadnvp(NULL);
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " CONSTRAINTS FAIL ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			platform_set_payloadnvp(NULL);
			ap->arrowp->mark |= EFFECTS_COUNTER;
			if (!triggered(fmep, ep2, EFFECTS_COUNTER)) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " K-COUNT NOT YET MET ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			ep2->cached_state &= ~PARENT_WAIT;
			/*
			 * if we've reached an ereport and no propagation time
			 * is specified, use the Hesitate value
			 */
			if (ep2->t == N_EREPORT && at_latest_by == 0ULL &&
			    ap->arrowp->maxdelay == 0ULL) {
				out(O_ALTFP|O_VERB|O_NONL, " default wait ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				result = requirements_test(fmep, ep2, Hesitate,
				    &my_delay);
			} else {
				result = requirements_test(fmep, ep2,
				    at_latest_by + ap->arrowp->maxdelay,
				    &my_delay);
			}
			if (result == FME_WAIT) {
				retval = WAIT_EFFECT;
				if (overall_delay > my_delay)
					overall_delay = my_delay;
				ep2->cached_state |= WAIT_EFFECT;
				indent();
				out(O_ALTFP|O_VERB|O_NONL, " EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				indent_push(" E");
				/* everything downstream inherits the wait */
				if (mark_arrows(fmep, ep2, PARENT_WAIT,
				    at_latest_by, &my_delay, 0) ==
				    WAIT_EFFECT) {
					retval = WAIT_EFFECT;
					if (overall_delay > my_delay)
						overall_delay = my_delay;
				}
				indent_pop();
			} else if (result == FME_DISPROVED) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " EFFECTS DISPROVED ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
			} else {
				/* any other result: tag ep2 and keep walking */
				ep2->cached_state |= mark;
				indent();
				if (mark == CREDIBLE_EFFECT)
					out(O_ALTFP|O_VERB|O_NONL,
					    " EFFECTS CREDIBLE ");
				else
					out(O_ALTFP|O_VERB|O_NONL,
					    " PARENT EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				indent_push(" E");
				if (mark_arrows(fmep, ep2, mark, at_latest_by,
				    &my_delay, 0) == WAIT_EFFECT) {
					retval = WAIT_EFFECT;
					if (overall_delay > my_delay)
						overall_delay = my_delay;
				}
				indent_pop();
			}
		}
	}
	if (retval == WAIT_EFFECT)
		*pdelay = overall_delay;
	return (retval);
}

/*
 * effects_test -- would fault_event account for every observation?
 *
 * Marks everything reachable from fault_event with CREDIBLE_EFFECT and
 * then checks each event on the fme's observations list.  An observation
 * with neither a credible mark nor a wait mark disproves the fault.
 * Otherwise the result is FME_WAIT if the marking pass had to wait (with
 * *pdelay set to the earliest delay) and FME_CREDIBLE if not.  The marks
 * are cleared again before returning; unless the fault was disproved,
 * fault_event and the events that were marked are flagged keep_in_tree.
 */
static enum fme_state
effects_test(struct fme *fmep, struct event *fault_event,
    unsigned long long at_latest_by, unsigned long long *pdelay)
{
	struct event *error_event;
	enum fme_state return_value = FME_CREDIBLE;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long my_delay;

	stats_counter_bump(fmep->Ecallcount);
	indent_push(" E");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
	out(O_ALTFP|O_VERB, NULL);

	if (mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by,
	    &my_delay, 0) == WAIT_EFFECT) {
		return_value = FME_WAIT;
		if (overall_delay > my_delay)
			overall_delay = my_delay;
	}
	for (error_event = fmep->observations;
	    error_event; error_event = error_event->observations) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, " ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event);
		if (!(error_event->cached_state & CREDIBLE_EFFECT)) {
			if (error_event->cached_state &
			    (PARENT_WAIT|WAIT_EFFECT)) {
				out(O_ALTFP|O_VERB, " NOT YET triggered");
				continue;
			}
			/* one unexplained observation is enough */
			return_value = FME_DISPROVED;
			out(O_ALTFP|O_VERB, " NOT triggered");
			break;
		} else {
			out(O_ALTFP|O_VERB, " triggered");
		}
	}
	/* undo the marking pass; keep the tree nodes unless disproved */
	if (return_value == FME_DISPROVED) {
		(void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0);
	} else {
		fault_event->keep_in_tree = 1;
		(void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1);
	}

	indent();
	out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ",
	    fme_state2str(return_value));
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
	out(O_ALTFP|O_VERB, NULL);
	indent_pop();
	if (return_value == FME_WAIT)
		*pdelay = overall_delay;
	return (return_value);
}
/*
 * requirements_test -- can the things event ep must propagate to still
 * be satisfied by at_latest_by?
 *
 * Returns FME_CREDIBLE, FME_DISPROVED, FME_WAIT (with *pdelay set to the
 * time to wait for) or FME_DEFERRED.  CREDIBLE, DISPROVED and WAIT
 * results are cached in ep->cached_state (the delay in ep->cached_delay)
 * and handed straight back on later calls; DEFERRED is never cached.
 *
 * For an ereport: one that has been seen (count != 0) is credible; one
 * that has not is disproved if we have already waited until at_latest_by
 * (fmep->pull) and otherwise must be waited for until then.
 *
 * For any other event each B_FROM bubble is checked in turn.  A bubble
 * with a false constraint on any arrow is elided.  Otherwise the events
 * at the heads of its arrows are tested recursively (arrows whose head is
 * not yet triggered count as deferred) until N of them are credible,
 * where N is the bubble's nork, or its arrow count when nork is negative.
 */
static enum fme_state
requirements_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay)
{
	int waiting_events;
	int credible_events;
	int deferred_events;
	enum fme_state return_value = FME_CREDIBLE;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long arrow_delay;
	unsigned long long my_delay;
	struct event *ep2;
	struct bubble *bp;
	struct arrowlist *ap;

	/* answer from the cache if this event was already decided */
	if (ep->cached_state & REQMNTS_CREDIBLE) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY CREDIBLE ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, NULL);
		return (FME_CREDIBLE);
	}
	if (ep->cached_state & REQMNTS_DISPROVED) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, NULL);
		return (FME_DISPROVED);
	}
	if (ep->cached_state & REQMNTS_WAIT) {
		indent();
		*pdelay = ep->cached_delay;
		out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY WAIT ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB|O_NONL, ", wait for: ");
		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
		out(O_ALTFP|O_VERB, NULL);
		return (FME_WAIT);
	}
	stats_counter_bump(fmep->Rcallcount);
	indent_push(" R");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
	out(O_ALTFP|O_VERB, NULL);

	if (ep->t == N_EREPORT) {
		if (ep->count == 0) {
			/* not seen: too late for it, or still time to wait? */
			if (fmep->pull >= at_latest_by) {
				return_value = FME_DISPROVED;
			} else {
				ep->cached_delay = *pdelay = at_latest_by;
				return_value = FME_WAIT;
			}
		}

		indent();
		switch (return_value) {
		case FME_CREDIBLE:
			ep->cached_state |= REQMNTS_CREDIBLE;
			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			break;
		case FME_DISPROVED:
			ep->cached_state |= REQMNTS_DISPROVED;
			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			break;
		case FME_WAIT:
			ep->cached_state |= REQMNTS_WAIT;
			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			out(O_ALTFP|O_VERB|O_NONL, " to ");
			ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
			break;
		default:
			out(O_DIE, "requirements_test: unexpected fme_state");
			break;
		}
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();

		return (return_value);
	}

	/* this event is not a report, descend the tree */
	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		int n;

		if (bp->t != B_FROM)
			continue;

		n = bp->nork;

		credible_events = 0;
		waiting_events = 0;
		deferred_events = 0;
		arrow_delay = TIMEVAL_EVENTUALLY;
		/*
		 * n is -1 for 'A' so adjust it.
		 * XXX just count up the arrows for now.
		 */
		if (n < 0) {
			n = 0;
			for (ap = itree_next_arrow(bp, NULL); ap;
			    ap = itree_next_arrow(bp, ap))
				n++;
			indent();
			out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n);
		} else {
			indent();
			out(O_ALTFP|O_VERB, " Bubble N=%d", n);
		}

		if (n == 0)
			continue;
		/* constraints are checked once per bubble, then remembered */
		if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) {
			for (ap = itree_next_arrow(bp, NULL); ap;
			    ap = itree_next_arrow(bp, ap)) {
				ep2 = ap->arrowp->head->myevent;
				platform_set_payloadnvp(ep2->nvp);
				if (checkconstraints(fmep, ap->arrowp) == 0) {
					/*
					 * if any arrow is invalidated by the
					 * constraints, then we should elide the
					 * whole bubble to be consistent with
					 * the tree creation time behaviour
					 */
					bp->mark |= BUBBLE_ELIDED;
					platform_set_payloadnvp(NULL);
					break;
				}
				platform_set_payloadnvp(NULL);
			}
		}
		if (bp->mark & BUBBLE_ELIDED)
			continue;
		bp->mark |= BUBBLE_OK;
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			ep2 = ap->arrowp->head->myevent;
			/* stop as soon as N credible events are in hand */
			if (n <= credible_events)
				break;

			ap->arrowp->mark |= REQMNTS_COUNTER;
			if (triggered(fmep, ep2, REQMNTS_COUNTER))
				/* XXX adding max timevals! */
				switch (requirements_test(fmep, ep2,
				    at_latest_by + ap->arrowp->maxdelay,
				    &my_delay)) {
				case FME_DEFERRED:
					deferred_events++;
					break;
				case FME_CREDIBLE:
					credible_events++;
					break;
				case FME_DISPROVED:
					break;
				case FME_WAIT:
					if (my_delay < arrow_delay)
						arrow_delay = my_delay;
					waiting_events++;
					break;
				default:
					out(O_DIE,
					    "Bug in requirements_test.");
				}
			else
				deferred_events++;
		}
		indent();
		out(O_ALTFP|O_VERB, " Credible: %d Waiting %d",
		    credible_events + deferred_events, waiting_events);
		if (credible_events + deferred_events + waiting_events < n) {
			/* Can never meet requirements */
			ep->cached_state |= REQMNTS_DISPROVED;
			indent();
			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			out(O_ALTFP|O_VERB, NULL);
			indent_pop();
			return (FME_DISPROVED);
		}
		if (credible_events + deferred_events < n) {
			/* will have to wait */
			/* wait time is shortest known */
			if (arrow_delay < overall_delay)
				overall_delay = arrow_delay;
			return_value = FME_WAIT;
		} else if (credible_events < n) {
			/* a wait from an earlier bubble takes precedence */
			if (return_value != FME_WAIT)
				return_value = FME_DEFERRED;
		}
	}

	/*
	 * don't mark as FME_DEFERRED. If this event isn't reached by another
	 * path, then this will be considered FME_CREDIBLE. But if it is
	 * reached by a different path so the K-count is met, then might
	 * get overridden by FME_WAIT or FME_DISPROVED.
	 */
	if (return_value == FME_WAIT) {
		ep->cached_state |= REQMNTS_WAIT;
		ep->cached_delay = *pdelay = overall_delay;
	} else if (return_value == FME_CREDIBLE) {
		ep->cached_state |= REQMNTS_CREDIBLE;
	}
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ",
	    fme_state2str(return_value));
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);
	indent_pop();
	return (return_value);
}

static enum fme_state
causes_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay)
{
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long my_delay;
	int credible_results = 0;
	int waiting_results = 0;
	enum fme_state fstate;
	struct event *tail_event;
	struct bubble *bp;
	struct arrowlist *ap;
	int k = 1;

	stats_counter_bump(fmep->Ccallcount);
	indent_push(" C");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);

	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_TO)
			continue;
		k = bp->nork;	/* remember the K value */
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			int do_not_follow = 0;

			/*
			 * if we get to the same event multiple times
			 * only worry about the first one.
3466 */ 3467 if (ap->arrowp->tail->myevent->cached_state & 3468 CAUSES_TESTED) { 3469 indent(); 3470 out(O_ALTFP|O_VERB|O_NONL, 3471 " causes test already run for "); 3472 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 3473 ap->arrowp->tail->myevent); 3474 out(O_ALTFP|O_VERB, NULL); 3475 continue; 3476 } 3477 3478 /* 3479 * see if false constraint prevents us 3480 * from traversing this arrow 3481 */ 3482 platform_set_payloadnvp(ep->nvp); 3483 if (checkconstraints(fmep, ap->arrowp) == 0) 3484 do_not_follow = 1; 3485 platform_set_payloadnvp(NULL); 3486 if (do_not_follow) { 3487 indent(); 3488 out(O_ALTFP|O_VERB|O_NONL, 3489 " False arrow from "); 3490 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 3491 ap->arrowp->tail->myevent); 3492 out(O_ALTFP|O_VERB, NULL); 3493 continue; 3494 } 3495 3496 ap->arrowp->tail->myevent->cached_state |= 3497 CAUSES_TESTED; 3498 tail_event = ap->arrowp->tail->myevent; 3499 fstate = hypothesise(fmep, tail_event, at_latest_by, 3500 &my_delay); 3501 3502 switch (fstate) { 3503 case FME_WAIT: 3504 if (my_delay < overall_delay) 3505 overall_delay = my_delay; 3506 waiting_results++; 3507 break; 3508 case FME_CREDIBLE: 3509 credible_results++; 3510 break; 3511 case FME_DISPROVED: 3512 break; 3513 default: 3514 out(O_DIE, "Bug in causes_test"); 3515 } 3516 } 3517 } 3518 /* compare against K */ 3519 if (credible_results + waiting_results < k) { 3520 indent(); 3521 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED "); 3522 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3523 out(O_ALTFP|O_VERB, NULL); 3524 indent_pop(); 3525 return (FME_DISPROVED); 3526 } 3527 if (waiting_results != 0) { 3528 *pdelay = overall_delay; 3529 indent(); 3530 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT "); 3531 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3532 out(O_ALTFP|O_VERB|O_NONL, " to "); 3533 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3534 out(O_ALTFP|O_VERB, NULL); 3535 indent_pop(); 3536 return (FME_WAIT); 3537 } 3538 indent(); 3539 out(O_ALTFP|O_VERB|O_NONL, 
"<-CAUSES CREDIBLE "); 3540 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3541 out(O_ALTFP|O_VERB, NULL); 3542 indent_pop(); 3543 return (FME_CREDIBLE); 3544 } 3545 3546 static enum fme_state 3547 hypothesise(struct fme *fmep, struct event *ep, 3548 unsigned long long at_latest_by, unsigned long long *pdelay) 3549 { 3550 enum fme_state rtr, otr; 3551 unsigned long long my_delay; 3552 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3553 3554 stats_counter_bump(fmep->Hcallcount); 3555 indent_push(" H"); 3556 indent(); 3557 out(O_ALTFP|O_VERB|O_NONL, "->"); 3558 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3559 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 3560 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3561 out(O_ALTFP|O_VERB, NULL); 3562 3563 rtr = requirements_test(fmep, ep, at_latest_by, &my_delay); 3564 if ((rtr == FME_WAIT) && (my_delay < overall_delay)) 3565 overall_delay = my_delay; 3566 if (rtr != FME_DISPROVED) { 3567 if (is_problem(ep->t)) { 3568 otr = effects_test(fmep, ep, at_latest_by, &my_delay); 3569 if (otr != FME_DISPROVED) { 3570 if (fmep->peek == 0 && ep->is_suspect == 0) { 3571 ep->suspects = fmep->suspects; 3572 ep->is_suspect = 1; 3573 fmep->suspects = ep; 3574 fmep->nsuspects++; 3575 if (!is_fault(ep->t)) 3576 fmep->nonfault++; 3577 } 3578 } 3579 } else 3580 otr = causes_test(fmep, ep, at_latest_by, &my_delay); 3581 if ((otr == FME_WAIT) && (my_delay < overall_delay)) 3582 overall_delay = my_delay; 3583 if ((otr != FME_DISPROVED) && 3584 ((rtr == FME_WAIT) || (otr == FME_WAIT))) 3585 *pdelay = overall_delay; 3586 } 3587 if (rtr == FME_DISPROVED) { 3588 indent(); 3589 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 3590 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3591 out(O_ALTFP|O_VERB, " (doesn't meet requirements)"); 3592 indent_pop(); 3593 return (FME_DISPROVED); 3594 } 3595 if ((otr == FME_DISPROVED) && is_problem(ep->t)) { 3596 indent(); 3597 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 3598 
itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3599 out(O_ALTFP|O_VERB, " (doesn't explain all reports)"); 3600 indent_pop(); 3601 return (FME_DISPROVED); 3602 } 3603 if (otr == FME_DISPROVED) { 3604 indent(); 3605 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 3606 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3607 out(O_ALTFP|O_VERB, " (causes are not credible)"); 3608 indent_pop(); 3609 return (FME_DISPROVED); 3610 } 3611 if ((rtr == FME_WAIT) || (otr == FME_WAIT)) { 3612 indent(); 3613 out(O_ALTFP|O_VERB|O_NONL, "<-WAIT "); 3614 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3615 out(O_ALTFP|O_VERB|O_NONL, " to "); 3616 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay); 3617 out(O_ALTFP|O_VERB, NULL); 3618 indent_pop(); 3619 return (FME_WAIT); 3620 } 3621 indent(); 3622 out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE "); 3623 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3624 out(O_ALTFP|O_VERB, NULL); 3625 indent_pop(); 3626 return (FME_CREDIBLE); 3627 } 3628 3629 /* 3630 * fme_istat_load -- reconstitute any persistent istats 3631 */ 3632 void 3633 fme_istat_load(fmd_hdl_t *hdl) 3634 { 3635 int sz; 3636 char *sbuf; 3637 char *ptr; 3638 3639 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) { 3640 out(O_ALTFP, "fme_istat_load: No stats"); 3641 return; 3642 } 3643 3644 sbuf = alloca(sz); 3645 3646 fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz); 3647 3648 /* 3649 * pick apart the serialized stats 3650 * 3651 * format is: 3652 * <class-name>, '@', <path>, '\0', <value>, '\0' 3653 * for example: 3654 * "stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0" 3655 * 3656 * since this is parsing our own serialized data, any parsing issues 3657 * are fatal, so we check for them all with ASSERT() below. 
3658 */ 3659 ptr = sbuf; 3660 while (ptr < &sbuf[sz]) { 3661 char *sepptr; 3662 struct node *np; 3663 int val; 3664 3665 sepptr = strchr(ptr, '@'); 3666 ASSERT(sepptr != NULL); 3667 *sepptr = '\0'; 3668 3669 /* construct the event */ 3670 np = newnode(T_EVENT, NULL, 0); 3671 np->u.event.ename = newnode(T_NAME, NULL, 0); 3672 np->u.event.ename->u.name.t = N_STAT; 3673 np->u.event.ename->u.name.s = stable(ptr); 3674 np->u.event.ename->u.name.it = IT_ENAME; 3675 np->u.event.ename->u.name.last = np->u.event.ename; 3676 3677 ptr = sepptr + 1; 3678 ASSERT(ptr < &sbuf[sz]); 3679 ptr += strlen(ptr); 3680 ptr++; /* move past the '\0' separating path from value */ 3681 ASSERT(ptr < &sbuf[sz]); 3682 ASSERT(isdigit(*ptr)); 3683 val = atoi(ptr); 3684 ASSERT(val > 0); 3685 ptr += strlen(ptr); 3686 ptr++; /* move past the final '\0' for this entry */ 3687 3688 np->u.event.epname = pathstring2epnamenp(sepptr + 1); 3689 ASSERT(np->u.event.epname != NULL); 3690 3691 istat_bump(np, val); 3692 tree_free(np); 3693 } 3694 3695 istat_save(); 3696 } 3697