1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * 26 * fme.c -- fault management exercise module 27 * 28 * this module provides the simulated fault management exercise. 29 */ 30 31 #pragma ident "%Z%%M% %I% %E% SMI" 32 33 #include <stdio.h> 34 #include <stdlib.h> 35 #include <string.h> 36 #include <strings.h> 37 #include <ctype.h> 38 #include <alloca.h> 39 #include <libnvpair.h> 40 #include <sys/fm/protocol.h> 41 #include <fm/fmd_api.h> 42 #include "alloc.h" 43 #include "out.h" 44 #include "stats.h" 45 #include "stable.h" 46 #include "literals.h" 47 #include "lut.h" 48 #include "tree.h" 49 #include "ptree.h" 50 #include "itree.h" 51 #include "ipath.h" 52 #include "fme.h" 53 #include "evnv.h" 54 #include "eval.h" 55 #include "config.h" 56 #include "platform.h" 57 #include "esclex.h" 58 59 /* imported from eft.c... 
 */
extern hrtime_t Hesitate;
extern char *Serd_Override;
extern nv_alloc_t Eft_nv_hdl;
extern int Max_fme;
extern fmd_hdl_t *Hdl;

static int Istat_need_save;
static int Serd_need_save;
void istat_save(void);
void serd_save(void);

/* fme under construction is global so we can free it on module abort */
static struct fme *Nfmep;

/* why the most recent case could not be diagnosed (UD_* literal) */
static const char *Undiag_reason;

/* next FME id to hand out; bumped in newfme(), resynced in fme_restart() */
static int Nextid = 0;

static int Open_fme_count = 0;	/* Count of open FMEs */

/* list of fault management exercises underway */
static struct fme {
    struct fme *next;		/* next exercise */
    unsigned long long ull;	/* time when fme was created */
    int id;			/* FME id */
    struct config *config;	/* cooked configuration data */
    struct lut *eventtree;	/* propagation tree for this FME */
    /*
     * The initial error report that created this FME is kept in
     * two forms.  e0 points to the instance tree node and is used
     * by fme_eval() as the starting point for the inference
     * algorithm.  e0r is the event handle FMD passed to us when
     * the ereport first arrived and is used when setting timers,
     * which are always relative to the time of this initial
     * report.
     */
    struct event *e0;
    fmd_event_t *e0r;

    id_t timer;			/* for setting an fmd time-out */

    struct event *ecurrent;	/* ereport under consideration */
    struct event *suspects;	/* current suspect list */
    struct event *psuspects;	/* previous suspect list */
    int nsuspects;		/* count of suspects */
    int nonfault;		/* zero if all suspects T_FAULT */
    int posted_suspects;	/* true if we've posted a diagnosis */
    int uniqobs;		/* number of unique events observed */
    int peek;			/* just peeking, don't track suspects */
    int overflow;		/* true if overflow FME */
    enum fme_state {
        FME_NOTHING = 5000,	/* not evaluated yet */
        FME_WAIT,		/* need to wait for more info */
        FME_CREDIBLE,		/* suspect list is credible */
        FME_DISPROVED,		/* no valid suspects found */
        FME_DEFERRED		/* don't know yet (k-count not met) */
    } state;

    unsigned long long pull;	/* time passed since created */
    unsigned long long wull;	/* wait until this time for re-eval */
    struct event *observations;	/* observation list */
    struct lut *globals;	/* values of global variables */
    /* fmd interfacing */
    fmd_hdl_t *hdl;		/* handle for talking with fmd */
    fmd_case_t *fmcase;		/* what fmd 'case' we associate with */
    /* stats */
    struct stats *Rcount;
    struct stats *Hcallcount;
    struct stats *Rcallcount;
    struct stats *Ccallcount;
    struct stats *Ecallcount;
    struct stats *Tcallcount;
    struct stats *Marrowcount;
    struct stats *diags;
} *FMElist, *EFMElist, *ClosedFMEs;

/* cases we could not restart/diagnose, kept so fme_fini() can free them */
static struct case_list {
    fmd_case_t *fmcase;
    struct case_list *next;
} *Undiagablecaselist;

static void fme_eval(struct fme *fmep, fmd_event_t *ffep);
static enum fme_state hypothesise(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay);
static struct node *eventprop_lookup(struct event *ep, const char *propname);
static struct node *pathstring2epnamenp(char *path);
static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep,
    fmd_case_t *fmcase);
static void restore_suspects(struct fme *fmep);
static void save_suspects(struct fme *fmep);
static void destroy_fme(struct fme *f);
static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
    const char *eventstring, const struct ipath *ipp, nvlist_t *nvl);
static void istat_counter_reset_cb(struct istat_entry *entp,
    struct stats *statp, const struct ipath *ipp);
static void istat_counter_topo_chg_cb(struct istat_entry *entp,
    struct stats *statp, void *unused);
static void serd_reset_cb(struct serd_entry *entp, void *unused,
    const struct ipath *ipp);
static void serd_topo_chg_cb(struct serd_entry *entp, void *unused,
    void *unused2);
static void destroy_fme_bufs(struct fme *fp);

/* allocate and zero-fill a new fme structure; caller owns the result */
static struct fme *
alloc_fme(void)
{
    struct fme *fmep;

    fmep = MALLOC(sizeof (*fmep));
    bzero(fmep, sizeof (*fmep));
    return (fmep);
}

/*
 * fme_ready -- called when all initialization of the FME (except for
 *	stats) has completed successfully.  Adds the fme to global lists
 *	and establishes its stats.  Returns its argument for convenience.
 */
static struct fme *
fme_ready(struct fme *fmep)
{
    char nbuf[100];

    Nfmep = NULL;	/* don't need to free this on module abort now */

    /* append to the FMElist; EFMElist tracks the tail */
    if (EFMElist) {
        EFMElist->next = fmep;
        EFMElist = fmep;
    } else
        FMElist = EFMElist = fmep;

    (void) sprintf(nbuf, "fme%d.Rcount", fmep->id);
    fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
    (void) sprintf(nbuf, "fme%d.Hcall", fmep->id);
    fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1);
    (void) sprintf(nbuf, "fme%d.Rcall", fmep->id);
    fmep->Rcallcount = stats_new_counter(nbuf,
        "calls to requirements_test()", 1);
    (void) sprintf(nbuf, "fme%d.Ccall", fmep->id);
    fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1);
    (void) sprintf(nbuf, "fme%d.Ecall", fmep->id);
    fmep->Ecallcount =
        stats_new_counter(nbuf, "calls to effects_test()", 1);
    (void) sprintf(nbuf, "fme%d.Tcall", fmep->id);
    fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
    (void) sprintf(nbuf, "fme%d.Marrow", fmep->id);
    fmep->Marrowcount = stats_new_counter(nbuf,
        "arrows marked by mark_arrows()", 1);
    (void) sprintf(nbuf, "fme%d.diags", fmep->id);
    fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);

    out(O_ALTFP|O_VERB2, "newfme: config snapshot contains...");
    config_print(O_ALTFP|O_VERB2, fmep->config);

    return (fmep);
}

extern void ipath_dummy_lut(struct arrow *);
extern struct lut *itree_create_dummy(const char *, const struct ipath *);

/* ARGSUSED */
static void
set_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
{
    struct bubble *bp;
    struct arrowlist *ap;

    /* mark every arrow leaving this event's B_FROM bubbles as needed */
    for (bp = itree_next_bubble(ep, NULL); bp;
        bp = itree_next_bubble(ep, bp)) {
        if (bp->t != B_FROM)
            continue;
        for (ap = itree_next_arrow(bp, NULL); ap;
            ap = itree_next_arrow(bp, ap)) {
            ap->arrowp->pnode->u.arrow.needed = 1;
            ipath_dummy_lut(ap->arrowp);
        }
    }
}

/* ARGSUSED */
static void
unset_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
{
    struct bubble *bp;
    struct arrowlist *ap;

    /* clear the "needed" mark on every arrow leaving B_FROM bubbles */
    for (bp = itree_next_bubble(ep, NULL); bp;
        bp = itree_next_bubble(ep, bp)) {
        if (bp->t != B_FROM)
            continue;
        for (ap = itree_next_arrow(bp, NULL); ap;
            ap = itree_next_arrow(bp, ap))
            ap->arrowp->pnode->u.arrow.needed = 0;
    }
}

static void globals_destructor(void *left, void *right, void *arg);
static void clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep);

/*
 * prune_propagations -- build a throw-away dummy FME for event e0 and
 * run hypothesise() over it so that propagations which cannot be
 * involved in diagnosing e0 are pruned from consideration.  The dummy
 * FME (and its stats, which hypothesise() et al. update) is freed
 * before returning.
 */
static void
prune_propagations(const char *e0class, const struct ipath *e0ipp)
{
    char nbuf[100];
    unsigned long long my_delay = TIMEVAL_EVENTUALLY;
    extern struct lut *Usednames;

    Nfmep = alloc_fme();
    Nfmep->id = Nextid;
    Nfmep->state = FME_NOTHING;
    Nfmep->eventtree = itree_create_dummy(e0class, e0ipp);
    if ((Nfmep->e0 =
        itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
        out(O_ALTFP, "prune_propagations: e0 not in instance tree");
        itree_free(Nfmep->eventtree);
        FREE(Nfmep);
        Nfmep = NULL;
        return;
    }
    Nfmep->ecurrent = Nfmep->observations = Nfmep->e0;
    Nfmep->e0->count++;

    /* stats must exist because the pruning run updates them */
    (void) sprintf(nbuf, "fme%d.Rcount", Nfmep->id);
    Nfmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
    (void) sprintf(nbuf, "fme%d.Hcall", Nfmep->id);
    Nfmep->Hcallcount =
        stats_new_counter(nbuf, "calls to hypothesise()", 1);
    (void) sprintf(nbuf, "fme%d.Rcall", Nfmep->id);
    Nfmep->Rcallcount = stats_new_counter(nbuf,
        "calls to requirements_test()", 1);
    (void) sprintf(nbuf, "fme%d.Ccall", Nfmep->id);
    Nfmep->Ccallcount =
        stats_new_counter(nbuf, "calls to causes_test()", 1);
    (void) sprintf(nbuf, "fme%d.Ecall", Nfmep->id);
    Nfmep->Ecallcount =
        stats_new_counter(nbuf, "calls to effects_test()", 1);
    (void) sprintf(nbuf, "fme%d.Tcall", Nfmep->id);
    Nfmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
    (void) sprintf(nbuf, "fme%d.Marrow", Nfmep->id);
    Nfmep->Marrowcount = stats_new_counter(nbuf,
        "arrows marked by mark_arrows()", 1);
    (void) sprintf(nbuf, "fme%d.diags", Nfmep->id);
    Nfmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);

    Nfmep->peek = 1;	/* just peeking, don't track suspects */
    lut_walk(Nfmep->eventtree, (lut_cb)unset_needed_arrows, (void *)Nfmep);
    lut_free(Usednames, NULL, NULL);
    Usednames = NULL;
    lut_walk(Nfmep->eventtree, (lut_cb)clear_arrows, (void *)Nfmep);
    (void) hypothesise(Nfmep, Nfmep->e0, Nfmep->ull, &my_delay);
    itree_prune(Nfmep->eventtree);
    lut_walk(Nfmep->eventtree, (lut_cb)set_needed_arrows, (void *)Nfmep);

    /* tear the dummy FME back down */
    stats_delete(Nfmep->Rcount);
    stats_delete(Nfmep->Hcallcount);
    stats_delete(Nfmep->Rcallcount);
    stats_delete(Nfmep->Ccallcount);
    stats_delete(Nfmep->Ecallcount);
    stats_delete(Nfmep->Tcallcount);
    stats_delete(Nfmep->Marrowcount);
    stats_delete(Nfmep->diags);
    itree_free(Nfmep->eventtree);
    lut_free(Nfmep->globals, globals_destructor, NULL);
    FREE(Nfmep);
}

/*
 * newfme -- create a new FME for initial event e0class@e0ipp on the
 * given fmd case.  Snapshots the platform configuration, builds the
 * instance tree, and looks up e0 in it.  Returns the ready FME, or
 * NULL with Undiag_reason set on failure.
 */
static struct fme *
newfme(const char *e0class, const struct ipath *e0ipp, fmd_hdl_t *hdl,
    fmd_case_t *fmcase)
{
    struct cfgdata *cfgdata;
    int init_size;
    extern int alloc_total();

    init_size = alloc_total();
    out(O_ALTFP|O_STAMP, "start config_snapshot using %d bytes", init_size);
    if ((cfgdata = config_snapshot()) == NULL) {
        out(O_ALTFP, "newfme: NULL configuration");
        Undiag_reason = UD_NOCONF;
        return (NULL);
    }
    platform_save_config(hdl, fmcase);
    out(O_ALTFP|O_STAMP, "config_snapshot added %d bytes",
        alloc_total() - init_size);

    Nfmep = alloc_fme();

    Nfmep->id = Nextid++;
    Nfmep->config = cfgdata->cooked;
    config_free(cfgdata);
    Nfmep->posted_suspects = 0;
    Nfmep->uniqobs = 0;
    Nfmep->state = FME_NOTHING;
    Nfmep->pull = 0ULL;
    Nfmep->overflow = 0;

    Nfmep->fmcase = fmcase;
    Nfmep->hdl = hdl;

    if ((Nfmep->eventtree = itree_create(Nfmep->config)) == NULL) {
        out(O_ALTFP, "newfme: NULL instance tree");
        Undiag_reason = UD_INSTFAIL;
        structconfig_free(Nfmep->config);
        destroy_fme_bufs(Nfmep);
        FREE(Nfmep);
        Nfmep = NULL;
        return (NULL);
    }

    itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree);

    if ((Nfmep->e0 =
        itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
        out(O_ALTFP, "newfme: e0 not in instance tree");
        Undiag_reason = UD_BADEVENTI;
        itree_free(Nfmep->eventtree);
        structconfig_free(Nfmep->config);
        destroy_fme_bufs(Nfmep);
        FREE(Nfmep);
        Nfmep = NULL;
        return (NULL);
    }

    return (fme_ready(Nfmep));
}

/* fme_fini -- free all module-global FME state at module unload */
void
fme_fini(void)
{
    struct fme *sfp, *fp;
    struct case_list *ucasep, *nextcasep;

    ucasep = Undiagablecaselist;
    while (ucasep != NULL) {
        nextcasep = ucasep->next;
        FREE(ucasep);
        ucasep = nextcasep;
    }
    Undiagablecaselist = NULL;

    /* clean up closed fmes */
    fp = ClosedFMEs;
    while (fp != NULL) {
        sfp = fp->next;
        destroy_fme(fp);
        fp = sfp;
    }
    ClosedFMEs = NULL;

    fp = FMElist;
    while (fp != NULL) {
        sfp = fp->next;
        destroy_fme(fp);
        fp = sfp;
    }
    FMElist = EFMElist = NULL;

    /* if we were in the middle of creating an fme, free it now */
    if (Nfmep) {
        destroy_fme(Nfmep);
        Nfmep = NULL;
    }
}

/*
 * Allocated space for a buffer name.  20 bytes allows for
 * a ridiculous 9,999,999 unique observations.
 */
#define	OBBUFNMSZ	20

/*
 * serialize_observation
 *
 * Create a recoverable version of the current observation
 * (f->ecurrent).
 * We keep a serialized version of each unique
 * observation in order that we may resume correctly the fme in the
 * correct state if eft or fmd crashes and we're restarted.
 */
static void
serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp)
{
    size_t pkdlen;
    char tmpbuf[OBBUFNMSZ];
    char *pkd = NULL;
    char *estr;

    /* persist "class@path" under the buffer name "observed<N>" */
    (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs);
    estr = ipath2str(cls, ipp);
    fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1);
    fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr,
        strlen(estr) + 1);
    FREE(estr);

    /* if the observation has an nvlist, persist its XDR packing too */
    if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) {
        (void) snprintf(tmpbuf,
            OBBUFNMSZ, "observed%d.nvp", fp->uniqobs);
        if (nvlist_xpack(fp->ecurrent->nvp,
            &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0)
            out(O_DIE|O_SYS, "pack of observed nvl failed");
        fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen);
        fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen);
        FREE(pkd);
    }

    fp->uniqobs++;
    fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
        sizeof (fp->uniqobs));
}

/*
 * init_fme_bufs -- We keep several bits of state about an fme for
 *	use if eft or fmd crashes and we're restarted.
 */
static void
init_fme_bufs(struct fme *fp)
{
    fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull));
    fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull,
        sizeof (fp->pull));

    fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id));
    fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id,
        sizeof (fp->id));

    fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs));
    fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
        sizeof (fp->uniqobs));

    fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD,
        sizeof (fp->posted_suspects));
    fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD,
        (void *)&fp->posted_suspects, sizeof (fp->posted_suspects));
}

/* destroy_fme_bufs -- remove all persisted per-fme state from the case */
static void
destroy_fme_bufs(struct fme *fp)
{
    char tmpbuf[OBBUFNMSZ];
    int o;

    platform_restore_config(fp->hdl, fp->fmcase);
    fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN);
    fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG);
    fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL);
    fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID);
    fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD);
    fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS);

    for (o = 0; o < fp->uniqobs; o++) {
        (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o);
        fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
        (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o);
        fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
    }
}

/*
 * reconstitute_observations -- convert a case's serialized observations
 *	back into struct events.  Returns zero if all observations are
 *	successfully reconstituted.
 */
static int
reconstitute_observations(struct fme *fmep)
{
    struct event *ep;
    struct node *epnamenp = NULL;
    size_t pkdlen;
    char *pkd = NULL;
    char *tmpbuf = alloca(OBBUFNMSZ);
    char *sepptr;
    char *estr;
    int ocnt;
    int elen;

    for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) {
        (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt);
        elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
        if (elen == 0) {
            out(O_ALTFP,
                "reconstitute_observation: no %s buffer found.",
                tmpbuf);
            Undiag_reason = UD_MISSINGOBS;
            break;
        }

        estr = MALLOC(elen);
        fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
        /* buffer holds "class@path"; split at the '@' */
        sepptr = strchr(estr, '@');
        if (sepptr == NULL) {
            out(O_ALTFP,
                "reconstitute_observation: %s: "
                "missing @ separator in %s.",
                tmpbuf, estr);
            Undiag_reason = UD_MISSINGPATH;
            FREE(estr);
            break;
        }

        *sepptr = '\0';
        if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
            out(O_ALTFP,
                "reconstitute_observation: %s: "
                "trouble converting path string \"%s\" "
                "to internal representation.",
                tmpbuf, sepptr + 1);
            Undiag_reason = UD_MISSINGPATH;
            FREE(estr);
            break;
        }

        /* construct the event */
        ep = itree_lookup(fmep->eventtree,
            stable(estr), ipath(epnamenp));
        if (ep == NULL) {
            out(O_ALTFP,
                "reconstitute_observation: %s: "
                "lookup of \"%s\" in itree failed.",
                tmpbuf, ipath2str(estr, ipath(epnamenp)));
            Undiag_reason = UD_BADOBS;
            tree_free(epnamenp);
            FREE(estr);
            break;
        }
        tree_free(epnamenp);

        /*
         * We may or may not have a saved nvlist for the observation
         */
        (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt);
        pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
        if (pkdlen != 0) {
            pkd = MALLOC(pkdlen);
            fmd_buf_read(fmep->hdl,
                fmep->fmcase, tmpbuf, pkd, pkdlen);
            ASSERT(ep->nvp == NULL);
            if
(nvlist_xunpack(pkd, 595 pkdlen, &ep->nvp, &Eft_nv_hdl) != 0) 596 out(O_DIE|O_SYS, "pack of observed nvl failed"); 597 FREE(pkd); 598 } 599 600 if (ocnt == 0) 601 fmep->e0 = ep; 602 603 FREE(estr); 604 fmep->ecurrent = ep; 605 ep->count++; 606 607 /* link it into list of observations seen */ 608 ep->observations = fmep->observations; 609 fmep->observations = ep; 610 } 611 612 if (ocnt == fmep->uniqobs) { 613 (void) fme_ready(fmep); 614 return (0); 615 } 616 617 return (1); 618 } 619 620 /* 621 * restart_fme -- called during eft initialization. Reconstitutes 622 * an in-progress fme. 623 */ 624 void 625 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress) 626 { 627 nvlist_t *defect; 628 struct case_list *bad; 629 struct fme *fmep; 630 struct cfgdata *cfgdata; 631 size_t rawsz; 632 struct event *ep; 633 char *tmpbuf = alloca(OBBUFNMSZ); 634 char *sepptr; 635 char *estr; 636 int elen; 637 struct node *epnamenp = NULL; 638 int init_size; 639 extern int alloc_total(); 640 641 /* 642 * ignore solved or closed cases 643 */ 644 if (fmd_case_solved(hdl, inprogress) || 645 fmd_case_closed(hdl, inprogress)) 646 return; 647 648 fmep = alloc_fme(); 649 fmep->fmcase = inprogress; 650 fmep->hdl = hdl; 651 652 if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) { 653 out(O_ALTFP, "restart_fme: no saved posted status"); 654 Undiag_reason = UD_MISSINGINFO; 655 goto badcase; 656 } else { 657 fmd_buf_read(hdl, inprogress, WOBUF_POSTD, 658 (void *)&fmep->posted_suspects, 659 sizeof (fmep->posted_suspects)); 660 } 661 662 if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) { 663 out(O_ALTFP, "restart_fme: no saved id"); 664 Undiag_reason = UD_MISSINGINFO; 665 goto badcase; 666 } else { 667 fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id, 668 sizeof (fmep->id)); 669 } 670 if (Nextid <= fmep->id) 671 Nextid = fmep->id + 1; 672 673 out(O_ALTFP, "Replay FME %d", fmep->id); 674 675 if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) { 676 out(O_ALTFP, "restart_fme: No 
config data"); 677 Undiag_reason = UD_MISSINGINFO; 678 goto badcase; 679 } 680 fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz, 681 sizeof (size_t)); 682 683 if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) { 684 out(O_ALTFP, "restart_fme: No event zero"); 685 Undiag_reason = UD_MISSINGZERO; 686 goto badcase; 687 } 688 689 if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) { 690 out(O_ALTFP, "restart_fme: no saved wait time"); 691 Undiag_reason = UD_MISSINGINFO; 692 goto badcase; 693 } else { 694 fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull, 695 sizeof (fmep->pull)); 696 } 697 698 if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) { 699 out(O_ALTFP, "restart_fme: no count of observations"); 700 Undiag_reason = UD_MISSINGINFO; 701 goto badcase; 702 } else { 703 fmd_buf_read(hdl, inprogress, WOBUF_NOBS, 704 (void *)&fmep->uniqobs, sizeof (fmep->uniqobs)); 705 } 706 707 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed0"); 708 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 709 if (elen == 0) { 710 out(O_ALTFP, "reconstitute_observation: no %s buffer found.", 711 tmpbuf); 712 Undiag_reason = UD_MISSINGOBS; 713 goto badcase; 714 } 715 estr = MALLOC(elen); 716 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 717 sepptr = strchr(estr, '@'); 718 if (sepptr == NULL) { 719 out(O_ALTFP, "reconstitute_observation: %s: " 720 "missing @ separator in %s.", 721 tmpbuf, estr); 722 Undiag_reason = UD_MISSINGPATH; 723 FREE(estr); 724 goto badcase; 725 } 726 *sepptr = '\0'; 727 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 728 out(O_ALTFP, "reconstitute_observation: %s: " 729 "trouble converting path string \"%s\" " 730 "to internal representation.", tmpbuf, sepptr + 1); 731 Undiag_reason = UD_MISSINGPATH; 732 FREE(estr); 733 goto badcase; 734 } 735 prune_propagations(stable(estr), ipath(epnamenp)); 736 tree_free(epnamenp); 737 FREE(estr); 738 739 init_size = alloc_total(); 740 out(O_ALTFP|O_STAMP, "start 
config_restore using %d bytes", init_size); 741 cfgdata = MALLOC(sizeof (struct cfgdata)); 742 cfgdata->cooked = NULL; 743 cfgdata->devcache = NULL; 744 cfgdata->devidcache = NULL; 745 cfgdata->cpucache = NULL; 746 cfgdata->raw_refcnt = 1; 747 748 if (rawsz > 0) { 749 if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) { 750 out(O_ALTFP, "restart_fme: Config data size mismatch"); 751 Undiag_reason = UD_CFGMISMATCH; 752 goto badcase; 753 } 754 cfgdata->begin = MALLOC(rawsz); 755 cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz; 756 fmd_buf_read(hdl, 757 inprogress, WOBUF_CFG, cfgdata->begin, rawsz); 758 } else { 759 cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL; 760 } 761 762 config_cook(cfgdata); 763 fmep->config = cfgdata->cooked; 764 config_free(cfgdata); 765 out(O_ALTFP|O_STAMP, "config_restore added %d bytes", 766 alloc_total() - init_size); 767 768 if ((fmep->eventtree = itree_create(fmep->config)) == NULL) { 769 /* case not properly saved or irretrievable */ 770 out(O_ALTFP, "restart_fme: NULL instance tree"); 771 Undiag_reason = UD_INSTFAIL; 772 goto badcase; 773 } 774 775 itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree); 776 777 if (reconstitute_observations(fmep) != 0) 778 goto badcase; 779 780 out(O_ALTFP|O_NONL, "FME %d replay observations: ", fmep->id); 781 for (ep = fmep->observations; ep; ep = ep->observations) { 782 out(O_ALTFP|O_NONL, " "); 783 itree_pevent_brief(O_ALTFP|O_NONL, ep); 784 } 785 out(O_ALTFP, NULL); 786 787 Open_fme_count++; 788 789 /* give the diagnosis algorithm a shot at the new FME state */ 790 fme_eval(fmep, fmep->e0r); 791 return; 792 793 badcase: 794 if (fmep->eventtree != NULL) 795 itree_free(fmep->eventtree); 796 if (fmep->config) 797 structconfig_free(fmep->config); 798 destroy_fme_bufs(fmep); 799 FREE(fmep); 800 801 /* 802 * Since we're unable to restart the case, add it to the undiagable 803 * list and solve and close it as appropriate. 
804 */ 805 bad = MALLOC(sizeof (struct case_list)); 806 bad->next = NULL; 807 808 if (Undiagablecaselist != NULL) 809 bad->next = Undiagablecaselist; 810 Undiagablecaselist = bad; 811 bad->fmcase = inprogress; 812 813 out(O_ALTFP|O_NONL, "[case %s (unable to restart), ", 814 fmd_case_uuid(hdl, bad->fmcase)); 815 816 if (fmd_case_solved(hdl, bad->fmcase)) { 817 out(O_ALTFP|O_NONL, "already solved, "); 818 } else { 819 out(O_ALTFP|O_NONL, "solving, "); 820 defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100, 821 NULL, NULL, NULL); 822 if (Undiag_reason != NULL) 823 (void) nvlist_add_string(defect, 824 UNDIAG_REASON, Undiag_reason); 825 fmd_case_add_suspect(hdl, bad->fmcase, defect); 826 fmd_case_solve(hdl, bad->fmcase); 827 } 828 829 if (fmd_case_closed(hdl, bad->fmcase)) { 830 out(O_ALTFP, "already closed ]"); 831 } else { 832 out(O_ALTFP, "closing ]"); 833 fmd_case_close(hdl, bad->fmcase); 834 } 835 } 836 837 /*ARGSUSED*/ 838 static void 839 globals_destructor(void *left, void *right, void *arg) 840 { 841 struct evalue *evp = (struct evalue *)right; 842 if (evp->t == NODEPTR) 843 tree_free((struct node *)(uintptr_t)evp->v); 844 evp->v = (uintptr_t)NULL; 845 FREE(evp); 846 } 847 848 void 849 destroy_fme(struct fme *f) 850 { 851 stats_delete(f->Rcount); 852 stats_delete(f->Hcallcount); 853 stats_delete(f->Rcallcount); 854 stats_delete(f->Ccallcount); 855 stats_delete(f->Ecallcount); 856 stats_delete(f->Tcallcount); 857 stats_delete(f->Marrowcount); 858 stats_delete(f->diags); 859 860 if (f->eventtree != NULL) 861 itree_free(f->eventtree); 862 if (f->config) 863 structconfig_free(f->config); 864 lut_free(f->globals, globals_destructor, NULL); 865 FREE(f); 866 } 867 868 static const char * 869 fme_state2str(enum fme_state s) 870 { 871 switch (s) { 872 case FME_NOTHING: return ("NOTHING"); 873 case FME_WAIT: return ("WAIT"); 874 case FME_CREDIBLE: return ("CREDIBLE"); 875 case FME_DISPROVED: return ("DISPROVED"); 876 case FME_DEFERRED: return ("DEFERRED"); 877 
default: return ("UNKNOWN"); 878 } 879 } 880 881 static int 882 is_problem(enum nametype t) 883 { 884 return (t == N_FAULT || t == N_DEFECT || t == N_UPSET); 885 } 886 887 static int 888 is_fault(enum nametype t) 889 { 890 return (t == N_FAULT); 891 } 892 893 static int 894 is_defect(enum nametype t) 895 { 896 return (t == N_DEFECT); 897 } 898 899 static int 900 is_upset(enum nametype t) 901 { 902 return (t == N_UPSET); 903 } 904 905 static void 906 fme_print(int flags, struct fme *fmep) 907 { 908 struct event *ep; 909 910 out(flags, "Fault Management Exercise %d", fmep->id); 911 out(flags, "\t State: %s", fme_state2str(fmep->state)); 912 out(flags|O_NONL, "\t Start time: "); 913 ptree_timeval(flags|O_NONL, &fmep->ull); 914 out(flags, NULL); 915 if (fmep->wull) { 916 out(flags|O_NONL, "\t Wait time: "); 917 ptree_timeval(flags|O_NONL, &fmep->wull); 918 out(flags, NULL); 919 } 920 out(flags|O_NONL, "\t E0: "); 921 if (fmep->e0) 922 itree_pevent_brief(flags|O_NONL, fmep->e0); 923 else 924 out(flags|O_NONL, "NULL"); 925 out(flags, NULL); 926 out(flags|O_NONL, "\tObservations:"); 927 for (ep = fmep->observations; ep; ep = ep->observations) { 928 out(flags|O_NONL, " "); 929 itree_pevent_brief(flags|O_NONL, ep); 930 } 931 out(flags, NULL); 932 out(flags|O_NONL, "\tSuspect list:"); 933 for (ep = fmep->suspects; ep; ep = ep->suspects) { 934 out(flags|O_NONL, " "); 935 itree_pevent_brief(flags|O_NONL, ep); 936 } 937 out(flags, NULL); 938 if (fmep->eventtree != NULL) { 939 out(flags|O_VERB2, "\t Tree:"); 940 itree_ptree(flags|O_VERB2, fmep->eventtree); 941 } 942 } 943 944 static struct node * 945 pathstring2epnamenp(char *path) 946 { 947 char *sep = "/"; 948 struct node *ret; 949 char *ptr; 950 951 if ((ptr = strtok(path, sep)) == NULL) 952 out(O_DIE, "pathstring2epnamenp: invalid empty class"); 953 954 ret = tree_iname(stable(ptr), NULL, 0); 955 956 while ((ptr = strtok(NULL, sep)) != NULL) 957 ret = tree_name_append(ret, 958 tree_iname(stable(ptr), NULL, 0)); 959 960 
return (ret); 961 } 962 963 /* 964 * for a given upset sp, increment the corresponding SERD engine. if the 965 * SERD engine trips, return the ename and ipp of the resulting ereport. 966 * returns true if engine tripped and *enamep and *ippp were filled in. 967 */ 968 static int 969 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep, 970 fmd_case_t *fmcase, struct event *sp, const char **enamep, 971 const struct ipath **ippp) 972 { 973 struct node *serdinst; 974 char *serdname; 975 struct node *nid; 976 struct serd_entry *newentp; 977 int i, serdn = -1, serdincrement = 1; 978 char *serdsuffix = NULL, *serdt = NULL; 979 struct evalue *ep; 980 981 ASSERT(sp->t == N_UPSET); 982 ASSERT(ffep != NULL); 983 984 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 985 (void *)"n", (lut_cmp)strcmp)) != NULL) { 986 ASSERT(ep->t == UINT64); 987 serdn = (int)ep->v; 988 } 989 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 990 (void *)"t", (lut_cmp)strcmp)) != NULL) { 991 ASSERT(ep->t == STRING); 992 serdt = (char *)(uintptr_t)ep->v; 993 } 994 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 995 (void *)"suffix", (lut_cmp)strcmp)) != NULL) { 996 ASSERT(ep->t == STRING); 997 serdsuffix = (char *)(uintptr_t)ep->v; 998 } 999 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1000 (void *)"increment", (lut_cmp)strcmp)) != NULL) { 1001 ASSERT(ep->t == UINT64); 1002 serdincrement = (int)ep->v; 1003 } 1004 1005 /* 1006 * obtain instanced SERD engine from the upset sp. from this 1007 * derive serdname, the string used to identify the SERD engine. 
1008 */ 1009 serdinst = eventprop_lookup(sp, L_engine); 1010 1011 if (serdinst == NULL) 1012 return (-1); 1013 1014 serdname = ipath2str(serdinst->u.stmt.np->u.event.ename->u.name.s, 1015 ipath(serdinst->u.stmt.np->u.event.epname)); 1016 1017 if (serdsuffix != NULL) { 1018 int len = strlen(serdname) + strlen(serdsuffix) + 1; 1019 char *ptr = MALLOC(len); 1020 (void) snprintf(ptr, len, "%s%s", serdname, serdsuffix); 1021 FREE(serdname); 1022 serdname = ptr; 1023 } 1024 1025 /* handle serd engine "id" property, if there is one */ 1026 if ((nid = 1027 lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) { 1028 struct evalue *gval; 1029 char suffixbuf[200]; 1030 char *suffix; 1031 char *nserdname; 1032 size_t nname; 1033 1034 out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname); 1035 ptree_name_iter(O_ALTFP|O_NONL, nid); 1036 1037 ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t)); 1038 1039 if ((gval = lut_lookup(fmep->globals, 1040 (void *)nid->u.globid.s, NULL)) == NULL) { 1041 out(O_ALTFP, " undefined"); 1042 } else if (gval->t == UINT64) { 1043 out(O_ALTFP, " %llu", gval->v); 1044 (void) sprintf(suffixbuf, "%llu", gval->v); 1045 suffix = suffixbuf; 1046 } else { 1047 out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v); 1048 suffix = (char *)(uintptr_t)gval->v; 1049 } 1050 1051 nname = strlen(serdname) + strlen(suffix) + 2; 1052 nserdname = MALLOC(nname); 1053 (void) snprintf(nserdname, nname, "%s:%s", serdname, suffix); 1054 FREE(serdname); 1055 serdname = nserdname; 1056 } 1057 1058 /* 1059 * if the engine is empty, and we have an override for n/t then 1060 * destroy and recreate it. 
1061 */ 1062 if ((serdn != -1 || serdt != NULL) && fmd_serd_exists(hdl, serdname) && 1063 fmd_serd_empty(hdl, serdname)) 1064 fmd_serd_destroy(hdl, serdname); 1065 1066 if (!fmd_serd_exists(hdl, serdname)) { 1067 struct node *nN, *nT; 1068 const char *s; 1069 struct node *nodep; 1070 struct config *cp; 1071 char *path; 1072 uint_t nval; 1073 hrtime_t tval; 1074 const char *name; 1075 char *serd_name; 1076 int i; 1077 char *ptr; 1078 int got_n_override = 0, got_t_override = 0; 1079 1080 /* no SERD engine yet, so create it */ 1081 nodep = serdinst->u.stmt.np->u.event.epname; 1082 name = serdinst->u.stmt.np->u.event.ename->u.name.s; 1083 path = ipath2str(NULL, ipath(nodep)); 1084 cp = config_lookup(fmep->config, path, 0); 1085 FREE((void *)path); 1086 1087 /* 1088 * We allow serd paramaters to be overridden, either from 1089 * eft.conf file values (if Serd_Override is set) or from 1090 * driver properties (for "serd.io.device" engines). 1091 */ 1092 if (Serd_Override != NULL) { 1093 char *save_ptr, *ptr1, *ptr2, *ptr3; 1094 ptr3 = save_ptr = STRDUP(Serd_Override); 1095 while (*ptr3 != '\0') { 1096 ptr1 = strchr(ptr3, ','); 1097 *ptr1 = '\0'; 1098 if (strcmp(ptr3, name) == 0) { 1099 ptr2 = strchr(ptr1 + 1, ','); 1100 *ptr2 = '\0'; 1101 nval = atoi(ptr1 + 1); 1102 out(O_ALTFP, "serd override %s_n %d", 1103 name, nval); 1104 ptr3 = strchr(ptr2 + 1, ' '); 1105 if (ptr3) 1106 *ptr3 = '\0'; 1107 ptr = STRDUP(ptr2 + 1); 1108 out(O_ALTFP, "serd override %s_t %s", 1109 name, ptr); 1110 got_n_override = 1; 1111 got_t_override = 1; 1112 break; 1113 } else { 1114 ptr2 = strchr(ptr1 + 1, ','); 1115 ptr3 = strchr(ptr2 + 1, ' '); 1116 if (ptr3 == NULL) 1117 break; 1118 } 1119 ptr3++; 1120 } 1121 FREE(save_ptr); 1122 } 1123 1124 if (cp && got_n_override == 0) { 1125 /* 1126 * convert serd engine name into property name 1127 */ 1128 serd_name = MALLOC(strlen(name) + 3); 1129 for (i = 0; i < strlen(name); i++) { 1130 if (name[i] == '.') 1131 serd_name[i] = '_'; 1132 else 1133 
serd_name[i] = name[i]; 1134 } 1135 serd_name[i++] = '_'; 1136 serd_name[i++] = 'n'; 1137 serd_name[i] = '\0'; 1138 if (s = config_getprop(cp, serd_name)) { 1139 nval = atoi(s); 1140 out(O_ALTFP, "serd override %s_n %s", name, s); 1141 got_n_override = 1; 1142 } 1143 serd_name[i - 1] = 't'; 1144 if (s = config_getprop(cp, serd_name)) { 1145 ptr = STRDUP(s); 1146 out(O_ALTFP, "serd override %s_t %s", name, s); 1147 got_t_override = 1; 1148 } 1149 FREE(serd_name); 1150 } 1151 1152 if (serdn != -1 && got_n_override == 0) { 1153 nval = serdn; 1154 out(O_ALTFP, "serd override %s_n %d", name, serdn); 1155 got_n_override = 1; 1156 } 1157 if (serdt != NULL && got_t_override == 0) { 1158 ptr = STRDUP(serdt); 1159 out(O_ALTFP, "serd override %s_t %s", name, serdt); 1160 got_t_override = 1; 1161 } 1162 1163 if (!got_n_override) { 1164 nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N, 1165 NULL); 1166 ASSERT(nN->t == T_NUM); 1167 nval = (uint_t)nN->u.ull; 1168 } 1169 if (!got_t_override) { 1170 nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T, 1171 NULL); 1172 ASSERT(nT->t == T_TIMEVAL); 1173 tval = (hrtime_t)nT->u.ull; 1174 } else { 1175 const unsigned long long *ullp; 1176 const char *suffix; 1177 int len; 1178 1179 len = strspn(ptr, "0123456789"); 1180 suffix = stable(&ptr[len]); 1181 ullp = (unsigned long long *)lut_lookup(Timesuffixlut, 1182 (void *)suffix, NULL); 1183 ptr[len] = '\0'; 1184 tval = strtoull(ptr, NULL, 0) * (ullp ? 
*ullp : 1ll); 1185 FREE(ptr); 1186 } 1187 fmd_serd_create(hdl, serdname, nval, tval); 1188 } 1189 1190 newentp = MALLOC(sizeof (*newentp)); 1191 newentp->ename = stable(serdinst->u.stmt.np->u.event.ename->u.name.s); 1192 newentp->ipath = ipath(serdinst->u.stmt.np->u.event.epname); 1193 newentp->hdl = hdl; 1194 if (lut_lookup(SerdEngines, newentp, (lut_cmp)serd_cmp) == NULL) { 1195 SerdEngines = lut_add(SerdEngines, (void *)newentp, 1196 (void *)newentp, (lut_cmp)serd_cmp); 1197 Serd_need_save = 1; 1198 serd_save(); 1199 } else { 1200 FREE(newentp); 1201 } 1202 1203 1204 /* 1205 * increment SERD engine. if engine fires, reset serd 1206 * engine and return trip_strcode if required. 1207 */ 1208 for (i = 0; i < serdincrement; i++) { 1209 if (fmd_serd_record(hdl, serdname, ffep)) { 1210 fmd_case_add_serd(hdl, fmcase, serdname); 1211 fmd_serd_reset(hdl, serdname); 1212 1213 if (ippp) { 1214 struct node *tripinst = 1215 lut_lookup(serdinst->u.stmt.lutp, 1216 (void *)L_trip, NULL); 1217 ASSERT(tripinst != NULL); 1218 *enamep = tripinst->u.event.ename->u.name.s; 1219 *ippp = ipath(tripinst->u.event.epname); 1220 out(O_ALTFP|O_NONL, 1221 "[engine fired: %s, sending: ", serdname); 1222 ipath_print(O_ALTFP|O_NONL, *enamep, *ippp); 1223 out(O_ALTFP, "]"); 1224 } else { 1225 out(O_ALTFP, "[engine fired: %s, no trip]", 1226 serdname); 1227 } 1228 FREE(serdname); 1229 return (1); 1230 } 1231 } 1232 1233 FREE(serdname); 1234 return (0); 1235 } 1236 1237 /* 1238 * search a suspect list for upsets. feed each upset to serd_eval() and 1239 * build up tripped[], an array of ereports produced by the firing of 1240 * any SERD engines. then feed each ereport back into 1241 * fme_receive_report(). 1242 * 1243 * returns ntrip, the number of these ereports produced. 
 */
static int
upsets_eval(struct fme *fmep, fmd_event_t *ffep)
{
	/* we build an array of tripped ereports that we send ourselves */
	struct {
		const char *ename;
		const struct ipath *ipp;
	} *tripped;
	struct event *sp;
	int ntrip, nupset, i;

	/*
	 * count the number of upsets to determine the upper limit on
	 * expected trip ereport strings.  remember that one upset can
	 * lead to at most one ereport.
	 */
	nupset = 0;
	for (sp = fmep->suspects; sp; sp = sp->suspects) {
		if (sp->t == N_UPSET)
			nupset++;
	}

	if (nupset == 0)
		return (0);

	/*
	 * get to this point if we have upsets and expect some trip
	 * ereports
	 */
	tripped = alloca(sizeof (*tripped) * nupset);
	bzero((void *)tripped, sizeof (*tripped) * nupset);

	/* feed each upset to its SERD engine; collect any trip ereports */
	ntrip = 0;
	for (sp = fmep->suspects; sp; sp = sp->suspects)
		if (sp->t == N_UPSET &&
		    serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp,
		    &tripped[ntrip].ename, &tripped[ntrip].ipp) == 1)
			ntrip++;

	for (i = 0; i < ntrip; i++) {
		struct event *ep, *nep;
		struct fme *nfmep;
		fmd_case_t *fmcase;
		const struct ipath *ipp;
		const char *eventstring;
		int prev_verbose;
		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
		enum fme_state state;

		/*
		 * First try and evaluate a case with the trip ereport plus
		 * all the other ereports that cause the trip.  If that fails
		 * to evaluate then try again with just this ereport on its
		 * own.
		 */
		out(O_ALTFP|O_NONL, "fme_receive_report_serd: ");
		ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp);
		out(O_ALTFP|O_STAMP, NULL);
		ep = fmep->e0;
		eventstring = ep->enode->u.event.ename->u.name.s;
		ipp = ep->ipp;
		prune_propagations(eventstring, ipp);

		/*
		 * create a duplicate fme and case
		 */
		fmcase = fmd_case_open(fmep->hdl, NULL);
		out(O_ALTFP|O_NONL, "duplicate fme for event [");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " ]");

		if ((nfmep = newfme(eventstring, ipp, fmep->hdl,
		    fmcase)) == NULL) {
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " CANNOT DIAGNOSE]");
			publish_undiagnosable(fmep->hdl, ffep, fmcase);
			continue;
		}
		Open_fme_count++;
		nfmep->pull = fmep->pull;
		init_fme_bufs(nfmep);
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " created FME%d, case %s]", nfmep->id,
		    fmd_case_uuid(nfmep->hdl, nfmep->fmcase));
		if (ffep) {
			fmd_case_setprincipal(nfmep->hdl, nfmep->fmcase, ffep);
			fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, ffep);
			nfmep->e0r = ffep;
		}

		/*
		 * add the original ereports (copied from the parent FME's
		 * observation list into the duplicate's instance tree)
		 */
		for (ep = fmep->observations; ep; ep = ep->observations) {
			eventstring = ep->enode->u.event.ename->u.name.s;
			ipp = ep->ipp;
			out(O_ALTFP|O_NONL, "adding event [");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " ]");
			nep = itree_lookup(nfmep->eventtree, eventstring, ipp);
			if (nep->count++ == 0) {
				nep->observations = nfmep->observations;
				nfmep->observations = nep;
				serialize_observation(nfmep, eventstring, ipp);
				nep->nvp = evnv_dupnvl(ep->nvp);
			}
			if (ep->ffep && ep->ffep != ffep)
				fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase,
				    ep->ffep);
			stats_counter_bump(nfmep->Rcount);
		}

		/*
		 * add the serd trigger ereport
		 */
		if ((ep = itree_lookup(nfmep->eventtree, tripped[i].ename,
		    tripped[i].ipp)) == NULL) {
			/*
			 * The trigger ereport is not in the instance tree.  It
			 * was presumably removed by prune_propagations() as
			 * this combination of events is not present in the
			 * rules.
			 */
			out(O_ALTFP, "upsets_eval: e0 not in instance tree");
			Undiag_reason = UD_BADEVENTI;
			goto retry_lone_ereport;
		}
		out(O_ALTFP|O_NONL, "adding event [");
		ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp);
		out(O_ALTFP, " ]");
		/* the synthesized trigger carries no payload of its own */
		nfmep->ecurrent = ep;
		ep->nvp = NULL;
		ep->count = 1;
		ep->observations = nfmep->observations;
		nfmep->observations = ep;

		/*
		 * just peek first (hypothesise() must not commit a suspect
		 * list while nfmep->peek is set).
		 */
		nfmep->peek = 1;
		prev_verbose = Verbose;
		if (Debug == 0)
			Verbose = 0;
		lut_walk(nfmep->eventtree, (lut_cb)clear_arrows,
		    (void *)nfmep);
		state = hypothesise(nfmep, nfmep->e0, nfmep->ull, &my_delay);
		nfmep->peek = 0;
		Verbose = prev_verbose;
		if (state == FME_DISPROVED) {
			out(O_ALTFP, "upsets_eval: hypothesis disproved");
			Undiag_reason = UD_UNSOLVD;
retry_lone_ereport:
			/*
			 * However the trigger ereport on its own might be
			 * diagnosable, so check for that.  Undo the new fme
			 * and case we just created and call
			 * fme_receive_report.
			 */
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, tripped[i].ename,
			    tripped[i].ipp);
			out(O_ALTFP, " retrying with just trigger ereport]");
			itree_free(nfmep->eventtree);
			nfmep->eventtree = NULL;
			structconfig_free(nfmep->config);
			nfmep->config = NULL;
			destroy_fme_bufs(nfmep);
			fmd_case_close(nfmep->hdl, nfmep->fmcase);
			fme_receive_report(fmep->hdl, ffep,
			    tripped[i].ename, tripped[i].ipp, NULL);
			continue;
		}

		/*
		 * and evaluate
		 */
		serialize_observation(nfmep, tripped[i].ename, tripped[i].ipp);
		fme_eval(nfmep, ffep);
	}

	return (ntrip);
}

/*
 * fme_receive_external_report -- call when an external ereport comes in
 *
 * this routine just converts the relevant information from the ereport
 * into a format used internally and passes it on to fme_receive_report().
 */
void
fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
    const char *class)
{
	struct node *epnamenp;
	fmd_case_t *fmcase;
	const struct ipath *ipp;

	class = stable(class);

	/* Get the component path from the ereport */
	epnamenp = platform_getpath(nvl);

	/* See if we ended up without a path. */
	if (epnamenp == NULL) {
		/* See if class permits silent discard on unknown component. */
		if (lut_lookup(Ereportenames_discard, (void *)class, NULL)) {
			out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport "
			    "to component path, but silent discard allowed.",
			    class);
		} else {
			/*
			 * XFILE: Failure to find a component is bad unless
			 * 'discard_if_config_unknown=1' was specified in the
			 * ereport definition.  Indicate undiagnosable.
			 */
			out(O_ALTFP, "XFILE: Unable to map \"%s\" ereport "
			    "to component path.", class);
			Undiag_reason = UD_NOPATH;
			fmcase = fmd_case_open(hdl, NULL);
			publish_undiagnosable(hdl, ffep, fmcase);
		}
		return;
	}

	ipp = ipath(epnamenp);
	tree_free(epnamenp);
	fme_receive_report(hdl, ffep, class, ipp, nvl);
}

/*
 * fme_receive_repair_list -- handle a list.repaired event.
 *
 * Pull the case uuid and fault list out of the event payload, then for
 * each repaired resource reset the per-path diagnosis state (istat
 * counters and SERD engines -- see the loop body that follows).
 */
/*ARGSUSED*/
void
fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
    const char *eventstring)
{
	char *uuid;
	nvlist_t **nva;
	uint_t nvc;
	const struct ipath *ipp;

	if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 ||
	    nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
	    &nva, &nvc) != 0) {
		out(O_ALTFP, "No uuid or fault list for list.repaired event");
		return;
	}

	out(O_ALTFP, "Processing list.repaired from case %s", uuid);

	while (nvc-- != 0) {
		/*
		 * Reset any istat or serd engine associated with this path.
		 */
		char *path;

		if ((ipp = platform_fault2ipath(*nva++)) == NULL)
			continue;

		path = ipath2str(NULL, ipp);
		out(O_ALTFP, "fme_receive_repair_list: resetting state for %s",
		    path);
		FREE(path);

		lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp);
		istat_save();

		lut_walk(SerdEngines, (lut_cb)serd_reset_cb, (void *)ipp);
		serd_save();
	}
}

/*
 * fme_receive_topology_change -- notify istats and SERD engines that the
 * topology has changed so they can adjust their saved state.
 */
/*ARGSUSED*/
void
fme_receive_topology_change(void)
{
	lut_walk(Istats, (lut_cb)istat_counter_topo_chg_cb, NULL);
	istat_save();

	lut_walk(SerdEngines, (lut_cb)serd_topo_chg_cb, NULL);
	serd_save();
}

static int mark_arrows(struct fme *fmep, struct event *ep, int mark,
    unsigned long long at_latest_by, unsigned long long *pdelay, int keep);

/*
 * clear_arrows -- lut_walk callback that clears the cached state of an
 * event and the mark on every outgoing (B_FROM) bubble and arrow, so a
 * fresh hypothesise() pass starts from a clean slate.
 */
/* ARGSUSED */
static void
clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
{
	struct bubble *bp;
	struct arrowlist *ap;

	ep->cached_state = 0;
	ep->keep_in_tree = 0;
	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_FROM)
			continue;
		bp->mark = 0;
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap))
			ap->arrowp->mark = 0;
	}
}

/*
 * fme_receive_report -- route an incoming ereport to the FME that can
 * explain it, or open a new FME (or an overflow FME once Max_fme open
 * FMEs exist) when no existing one accepts it.
 */
static void
fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
    const char *eventstring, const struct ipath *ipp, nvlist_t *nvl)
{
	struct event *ep;
	struct fme *fmep = NULL;
	struct fme *ofmep = NULL;
	struct fme *cfmep, *svfmep;
	int matched = 0;
	nvlist_t *defect;
	fmd_case_t *fmcase;

	out(O_ALTFP|O_NONL, "fme_receive_report: ");
	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
	out(O_ALTFP|O_STAMP, NULL);

	/* decide which FME it goes to */
	for (fmep = FMElist; fmep; fmep = fmep->next) {
		int prev_verbose;
		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
		enum fme_state state;
		nvlist_t *pre_peek_nvp = NULL;

		/* remember the open overflow FME, if any, for later */
		if (fmep->overflow) {
			if (!(fmd_case_closed(fmep->hdl, fmep->fmcase)))
				ofmep = fmep;

			continue;
		}

		/*
		 * ignore solved or closed cases
		 */
		if (fmep->posted_suspects ||
		    fmd_case_solved(fmep->hdl, fmep->fmcase) ||
		    fmd_case_closed(fmep->hdl, fmep->fmcase))
			continue;

		/* look up event in event tree for this FME */
		if ((ep = itree_lookup(fmep->eventtree,
		    eventstring, ipp)) == NULL)
			continue;

		/* note observation */
		fmep->ecurrent = ep;
		if (ep->count++ == 0) {
			/* link it into list of observations seen */
			ep->observations = fmep->observations;
			fmep->observations = ep;
			ep->nvp = evnv_dupnvl(nvl);
		} else {
			/* use new payload values for peek */
			pre_peek_nvp = ep->nvp;
			ep->nvp = evnv_dupnvl(nvl);
		}

		/* tell hypothesise() not to mess with suspect list */
		fmep->peek = 1;

		/* don't want this to be verbose (unless Debug is set) */
		prev_verbose = Verbose;
		if (Debug == 0)
			Verbose = 0;

		lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
		state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);

		fmep->peek = 0;

		/* put verbose flag back */
		Verbose = prev_verbose;

		if (state != FME_DISPROVED) {
			/* found an FME that explains the ereport */
			matched++;
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " explained by FME%d]", fmep->id);

			if (pre_peek_nvp)
				nvlist_free(pre_peek_nvp);

			if (ep->count == 1)
				serialize_observation(fmep, eventstring, ipp);

			if (ffep) {
				fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
				ep->ffep = ffep;
			}

			stats_counter_bump(fmep->Rcount);

			/* re-eval FME */
			fme_eval(fmep, ffep);
		} else {
			/* not a match, undo noting of observation */
			fmep->ecurrent = NULL;
			if (--ep->count == 0) {
				/* unlink it from observations */
				fmep->observations = ep->observations;
				ep->observations = NULL;
				nvlist_free(ep->nvp);
				ep->nvp = NULL;
			} else {
				/* restore the pre-peek payload */
				nvlist_free(ep->nvp);
				ep->nvp = pre_peek_nvp;
			}
		}
	}

	if (matched)
		return;	/* explained by at least one existing FME */

	/* clean up closed fmes */
	cfmep = ClosedFMEs;
	while (cfmep != NULL) {
		svfmep = cfmep->next;
		destroy_fme(cfmep);
		cfmep = svfmep;
	}
	ClosedFMEs = NULL;
	prune_propagations(eventstring, ipp);

	if (ofmep) {
		/* too many FMEs already; park the ereport on the overflow */
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " ADDING TO OVERFLOW FME]");
		if (ffep)
			fmd_case_add_ereport(hdl, ofmep->fmcase, ffep);

		return;

	} else if (Max_fme && (Open_fme_count >= Max_fme)) {
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " MAX OPEN FME REACHED]");

		fmcase = fmd_case_open(hdl, NULL);

		/* Create overflow fme */
		if ((fmep = newfme(eventstring, ipp, hdl, fmcase)) == NULL) {
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]");
			publish_undiagnosable(hdl, ffep, fmcase);
			return;
		}

		Open_fme_count++;

		init_fme_bufs(fmep);
		fmep->overflow = B_TRUE;

		if (ffep)
			fmd_case_add_ereport(hdl, fmep->fmcase, ffep);

		/* the overflow case is solved immediately as undiagnosable */
		defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100,
		    NULL, NULL, NULL);
		(void) nvlist_add_string(defect, UNDIAG_REASON, UD_MAXFME);
		fmd_case_add_suspect(hdl, fmep->fmcase, defect);
		fmd_case_solve(hdl, fmep->fmcase);
		return;
	}

	/* open a case */
	fmcase = fmd_case_open(hdl, NULL);

	/* start a new FME */
	if ((fmep = newfme(eventstring, ipp, hdl, fmcase)) == NULL) {
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " CANNOT DIAGNOSE]");
		publish_undiagnosable(hdl, ffep, fmcase);
		return;
	}

	Open_fme_count++;

	init_fme_bufs(fmep);

	out(O_ALTFP|O_NONL, "[");
	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
	out(O_ALTFP, " created FME%d, case %s]", fmep->id,
	    fmd_case_uuid(hdl, fmep->fmcase));

	ep = fmep->e0;
	ASSERT(ep != NULL);

	/* note observation */
	fmep->ecurrent = ep;
	if (ep->count++ == 0) {
		/* link it into list of observations seen */
		ep->observations = fmep->observations;
		fmep->observations = ep;
		ep->nvp = evnv_dupnvl(nvl);
		serialize_observation(fmep, eventstring, ipp);
	} else {
		/* new payload overrides any previous */
		nvlist_free(ep->nvp);
		ep->nvp = evnv_dupnvl(nvl);
	}

	stats_counter_bump(fmep->Rcount);

	if (ffep) {
		fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
		fmd_case_setprincipal(hdl, fmep->fmcase, ffep);
		fmep->e0r = ffep;
		ep->ffep = ffep;
	}

	/* give the diagnosis algorithm a shot at the new FME state */
	fme_eval(fmep, ffep);
}

/*
 * fme_status -- print all FMEs currently underway to the given output.
 */
void
fme_status(int flags)
{
	struct fme *fmep;

	if (FMElist == NULL) {
		out(flags, "No fault management exercises underway.");
		return;
	}

	for (fmep = FMElist; fmep; fmep = fmep->next)
		fme_print(flags, fmep);
}

/*
 * "indent" routines used mostly for nicely formatted debug output, but also
 * for sanity checking for infinite recursion bugs.
1780 */ 1781 1782 #define MAX_INDENT 1024 1783 static const char *indent_s[MAX_INDENT]; 1784 static int current_indent; 1785 1786 static void 1787 indent_push(const char *s) 1788 { 1789 if (current_indent < MAX_INDENT) 1790 indent_s[current_indent++] = s; 1791 else 1792 out(O_DIE, "unexpected recursion depth (%d)", current_indent); 1793 } 1794 1795 static void 1796 indent_set(const char *s) 1797 { 1798 current_indent = 0; 1799 indent_push(s); 1800 } 1801 1802 static void 1803 indent_pop(void) 1804 { 1805 if (current_indent > 0) 1806 current_indent--; 1807 else 1808 out(O_DIE, "recursion underflow"); 1809 } 1810 1811 static void 1812 indent(void) 1813 { 1814 int i; 1815 if (!Verbose) 1816 return; 1817 for (i = 0; i < current_indent; i++) 1818 out(O_ALTFP|O_VERB|O_NONL, indent_s[i]); 1819 } 1820 1821 #define SLNEW 1 1822 #define SLCHANGED 2 1823 #define SLWAIT 3 1824 #define SLDISPROVED 4 1825 1826 static void 1827 print_suspects(int circumstance, struct fme *fmep) 1828 { 1829 struct event *ep; 1830 1831 out(O_ALTFP|O_NONL, "["); 1832 if (circumstance == SLCHANGED) { 1833 out(O_ALTFP|O_NONL, "FME%d diagnosis changed. 
state: %s, " 1834 "suspect list:", fmep->id, fme_state2str(fmep->state)); 1835 } else if (circumstance == SLWAIT) { 1836 out(O_ALTFP|O_NONL, "FME%d set wait timer %ld ", fmep->id, 1837 fmep->timer); 1838 ptree_timeval(O_ALTFP|O_NONL, &fmep->wull); 1839 } else if (circumstance == SLDISPROVED) { 1840 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id); 1841 } else { 1842 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id); 1843 } 1844 1845 if (circumstance == SLWAIT || circumstance == SLDISPROVED) { 1846 out(O_ALTFP, "]"); 1847 return; 1848 } 1849 1850 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1851 out(O_ALTFP|O_NONL, " "); 1852 itree_pevent_brief(O_ALTFP|O_NONL, ep); 1853 } 1854 out(O_ALTFP, "]"); 1855 } 1856 1857 static struct node * 1858 eventprop_lookup(struct event *ep, const char *propname) 1859 { 1860 return (lut_lookup(ep->props, (void *)propname, NULL)); 1861 } 1862 1863 #define MAXDIGITIDX 23 1864 static char numbuf[MAXDIGITIDX + 1]; 1865 1866 static int 1867 node2uint(struct node *n, uint_t *valp) 1868 { 1869 struct evalue value; 1870 struct lut *globals = NULL; 1871 1872 if (n == NULL) 1873 return (1); 1874 1875 /* 1876 * check value.v since we are being asked to convert an unsigned 1877 * long long int to an unsigned int 1878 */ 1879 if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) || 1880 value.t != UINT64 || value.v > (1ULL << 32)) 1881 return (1); 1882 1883 *valp = (uint_t)value.v; 1884 1885 return (0); 1886 } 1887 1888 static nvlist_t * 1889 node2fmri(struct node *n) 1890 { 1891 nvlist_t **pa, *f, *p; 1892 struct node *nc; 1893 uint_t depth = 0; 1894 char *numstr, *nullbyte; 1895 char *failure; 1896 int err, i; 1897 1898 /* XXX do we need to be able to handle a non-T_NAME node? 
 */
	if (n == NULL || n->t != T_NAME)
		return (NULL);

	/* count the path components; every one must carry a T_NUM instance */
	for (nc = n; nc != NULL; nc = nc->u.name.next) {
		if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM)
			break;
		depth++;
	}

	if (nc != NULL) {
		/* We bailed early, something went wrong */
		return (NULL);
	}

	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
	pa = alloca(depth * sizeof (nvlist_t *));
	for (i = 0; i < depth; i++)
		pa[i] = NULL;

	/* hc-scheme FMRI header */
	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
	if (err != 0) {
		failure = "basic construction of FMRI failed";
		goto boom;
	}

	/* ulltostr() formats backwards from the terminating NUL in numbuf */
	numbuf[MAXDIGITIDX] = '\0';
	nullbyte = &numbuf[MAXDIGITIDX];
	i = 0;

	/* one hc-pair (name, id) nvlist per path component */
	for (nc = n; nc != NULL; nc = nc->u.name.next) {
		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
		if (err != 0) {
			failure = "alloc of an hc-pair failed";
			goto boom;
		}
		err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s);
		numstr = ulltostr(nc->u.name.child->u.ull, nullbyte);
		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
		if (err != 0) {
			failure = "construction of an hc-pair failed";
			goto boom;
		}
		pa[i++] = p;
	}

	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
	if (err == 0) {
		/* the array add copied the pairs; free our originals */
		for (i = 0; i < depth; i++)
			if (pa[i] != NULL)
				nvlist_free(pa[i]);
		return (f);
	}
	failure = "addition of hc-pair array to FMRI failed";

boom:
	for (i = 0; i < depth; i++)
		if (pa[i] != NULL)
			nvlist_free(pa[i]);
	nvlist_free(f);
	out(O_DIE, "%s", failure);
	/*NOTREACHED*/
	return (NULL);
}

/* an ipath cache entry is an array
of these, with s==NULL at the end */
struct ipath {
	const char *s;		/* component name (in stable) */
	int i;			/* instance number */
};

/*
 * ipath2fmri -- construct an hc-scheme FMRI nvlist from an instanced
 * path.  Dies (module abort) on any allocation or construction failure;
 * on success the caller owns the returned nvlist.
 */
static nvlist_t *
ipath2fmri(struct ipath *ipath)
{
	nvlist_t **pa, *f, *p;
	uint_t depth = 0;
	char *numstr, *nullbyte;
	char *failure;
	int err, i;
	struct ipath *ipp;

	/* the path is terminated by an entry with s == NULL */
	for (ipp = ipath; ipp->s != NULL; ipp++)
		depth++;

	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
	pa = alloca(depth * sizeof (nvlist_t *));
	for (i = 0; i < depth; i++)
		pa[i] = NULL;

	/* hc-scheme FMRI header */
	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
	if (err != 0) {
		failure = "basic construction of FMRI failed";
		goto boom;
	}

	/* ulltostr() formats backwards from the terminating NUL in numbuf */
	numbuf[MAXDIGITIDX] = '\0';
	nullbyte = &numbuf[MAXDIGITIDX];
	i = 0;

	/* one hc-pair (name, id) nvlist per path component */
	for (ipp = ipath; ipp->s != NULL; ipp++) {
		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
		if (err != 0) {
			failure = "alloc of an hc-pair failed";
			goto boom;
		}
		err = nvlist_add_string(p, FM_FMRI_HC_NAME, ipp->s);
		numstr = ulltostr(ipp->i, nullbyte);
		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
		if (err != 0) {
			failure = "construction of an hc-pair failed";
			goto boom;
		}
		pa[i++] = p;
	}

	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
	if (err == 0) {
		/* the array add copied the pairs; free our originals */
		for (i = 0; i < depth; i++)
			if (pa[i] != NULL)
				nvlist_free(pa[i]);
		return (f);
	}
	failure = "addition of hc-pair array to FMRI failed";

boom:
	for (i = 0; i < depth; i++)
		if (pa[i] != NULL)
			nvlist_free(pa[i]);
	nvlist_free(f);
	out(O_DIE, "%s", failure);
/*NOTREACHED*/ 2037 return (NULL); 2038 } 2039 2040 static uint_t 2041 avg(uint_t sum, uint_t cnt) 2042 { 2043 unsigned long long s = sum * 10; 2044 2045 return ((s / cnt / 10) + (((s / cnt % 10) >= 5) ? 1 : 0)); 2046 } 2047 2048 static uint8_t 2049 percentof(uint_t part, uint_t whole) 2050 { 2051 unsigned long long p = part * 1000; 2052 2053 return ((p / whole / 10) + (((p / whole % 10) >= 5) ? 1 : 0)); 2054 } 2055 2056 struct rsl { 2057 struct event *suspect; 2058 nvlist_t *asru; 2059 nvlist_t *fru; 2060 nvlist_t *rsrc; 2061 }; 2062 2063 static void publish_suspects(struct fme *fmep, struct rsl *srl); 2064 2065 /* 2066 * rslfree -- free internal members of struct rsl not expected to be 2067 * freed elsewhere. 2068 */ 2069 static void 2070 rslfree(struct rsl *freeme) 2071 { 2072 if (freeme->asru != NULL) 2073 nvlist_free(freeme->asru); 2074 if (freeme->fru != NULL) 2075 nvlist_free(freeme->fru); 2076 if (freeme->rsrc != NULL && freeme->rsrc != freeme->asru) 2077 nvlist_free(freeme->rsrc); 2078 } 2079 2080 /* 2081 * rslcmp -- compare two rsl structures. Use the following 2082 * comparisons to establish cardinality: 2083 * 2084 * 1. Name of the suspect's class. (simple strcmp) 2085 * 2. Name of the suspect's ASRU. (trickier, since nvlist) 2086 * 2087 */ 2088 static int 2089 rslcmp(const void *a, const void *b) 2090 { 2091 struct rsl *r1 = (struct rsl *)a; 2092 struct rsl *r2 = (struct rsl *)b; 2093 int rv; 2094 2095 rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s, 2096 r2->suspect->enode->u.event.ename->u.name.s); 2097 if (rv != 0) 2098 return (rv); 2099 2100 if (r1->rsrc == NULL && r2->rsrc == NULL) 2101 return (0); 2102 if (r1->rsrc == NULL) 2103 return (-1); 2104 if (r2->rsrc == NULL) 2105 return (1); 2106 return (evnv_cmpnvl(r1->rsrc, r2->rsrc, 0)); 2107 } 2108 2109 /* 2110 * rsluniq -- given an array of rsl structures, seek out and "remove" 2111 * any duplicates. Dups are "remove"d by NULLing the suspect pointer 2112 * of the array element. 
Removal also means updating the number of 2113 * problems and the number of problems which are not faults. User 2114 * provides the first and last element pointers. 2115 */ 2116 static void 2117 rsluniq(struct rsl *first, struct rsl *last, int *nprobs, int *nnonf) 2118 { 2119 struct rsl *cr; 2120 2121 if (*nprobs == 1) 2122 return; 2123 2124 /* 2125 * At this point, we only expect duplicate defects. 2126 * Eversholt's diagnosis algorithm prevents duplicate 2127 * suspects, but we rewrite defects in the platform code after 2128 * the diagnosis is made, and that can introduce new 2129 * duplicates. 2130 */ 2131 while (first <= last) { 2132 if (first->suspect == NULL || !is_defect(first->suspect->t)) { 2133 first++; 2134 continue; 2135 } 2136 cr = first + 1; 2137 while (cr <= last) { 2138 if (is_defect(first->suspect->t)) { 2139 if (rslcmp(first, cr) == 0) { 2140 cr->suspect = NULL; 2141 rslfree(cr); 2142 (*nprobs)--; 2143 (*nnonf)--; 2144 } 2145 } 2146 /* 2147 * assume all defects are in order after our 2148 * sort and short circuit here with "else break" ? 2149 */ 2150 cr++; 2151 } 2152 first++; 2153 } 2154 } 2155 2156 /* 2157 * get_resources -- for a given suspect, determine what ASRU, FRU and 2158 * RSRC nvlists should be advertised in the final suspect list. 2159 */ 2160 void 2161 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot) 2162 { 2163 struct node *asrudef, *frudef; 2164 nvlist_t *asru, *fru; 2165 nvlist_t *rsrc = NULL; 2166 char *pathstr; 2167 2168 /* 2169 * First find any ASRU and/or FRU defined in the 2170 * initial fault tree. 
	 */
	asrudef = eventprop_lookup(sp, L_ASRU);
	frudef = eventprop_lookup(sp, L_FRU);

	/*
	 * Create FMRIs based on those definitions
	 */
	asru = node2fmri(asrudef);
	fru = node2fmri(frudef);
	pathstr = ipath2str(NULL, sp->ipp);

	/*
	 * Allow for platform translations of the FMRIs
	 */
	platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc,
	    pathstr);

	FREE(pathstr);
	rsrcs->suspect = sp;
	rsrcs->asru = asru;
	rsrcs->fru = fru;
	rsrcs->rsrc = rsrc;
}

/*
 * trim_suspects -- prior to publishing, we may need to remove some
 * suspects from the list.  If we're auto-closing upsets, we don't
 * want any of those in the published list.  If the ASRUs for multiple
 * defects resolve to the same ASRU (driver) we only want to publish
 * that as a single suspect.
 *
 * Suspects whose "message" property evaluates to 0 are collected in
 * the second array (begin2) and counted in the return value; all
 * other surviving suspects go into the first array (begin) and bump
 * fmep->nsuspects/nonfault.
 */
static int
trim_suspects(struct fme *fmep, struct rsl *begin, struct rsl *begin2,
    fmd_event_t *ffep, int *mess_zero_nonfaultp)
{
	struct event *ep;
	struct rsl *rp = begin;
	struct rsl *rp2 = begin2;
	int mess_zero_count = 0;
	int serd_rval;
	uint_t messval;

	/* remove any unwanted upsets and populate our array */
	for (ep = fmep->psuspects; ep; ep = ep->psuspects) {
		if (is_upset(ep->t))
			continue;
		/*
		 * NOTE(review): serd_eval() is called here on non-upset
		 * suspects; a return of 0 (engine present but not fired)
		 * drops the suspect from publication -- confirm against
		 * serd_eval()'s contract.
		 */
		serd_rval = serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, ep,
		    NULL, NULL);
		if (serd_rval == 0)
			continue;
		if (node2uint(eventprop_lookup(ep, L_message),
		    &messval) == 0 && messval == 0) {
			/* message=0 suspects are tracked separately */
			get_resources(ep, rp2, fmep->config);
			rp2++;
			mess_zero_count++;
			if (!is_fault(ep->t))
				(*mess_zero_nonfaultp)++;
		} else {
			get_resources(ep, rp, fmep->config);
			rp++;
			fmep->nsuspects++;
			if (!is_fault(ep->t))
				fmep->nonfault++;
		}
	}
	return (mess_zero_count);
}

/*
 * addpayloadprop -- add a payload prop to a problem
 */
2242 static void 2243 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault) 2244 { 2245 nvlist_t *rsrc, *hcs; 2246 2247 ASSERT(fault != NULL); 2248 ASSERT(lhs != NULL); 2249 ASSERT(rhs != NULL); 2250 2251 if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, &rsrc) != 0) 2252 out(O_DIE, "cannot add payloadprop \"%s\" to fault", lhs); 2253 2254 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) { 2255 out(O_ALTFP|O_VERB2, "addpayloadprop: create hc_specific"); 2256 if (nvlist_xalloc(&hcs, NV_UNIQUE_NAME, &Eft_nv_hdl) != 0) 2257 out(O_DIE, 2258 "cannot add payloadprop \"%s\" to fault", lhs); 2259 if (nvlist_add_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, hcs) != 0) 2260 out(O_DIE, 2261 "cannot add payloadprop \"%s\" to fault", lhs); 2262 nvlist_free(hcs); 2263 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) 2264 out(O_DIE, 2265 "cannot add payloadprop \"%s\" to fault", lhs); 2266 } else 2267 out(O_ALTFP|O_VERB2, "addpayloadprop: reuse hc_specific"); 2268 2269 if (rhs->t == UINT64) { 2270 out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v); 2271 2272 if (nvlist_add_uint64(hcs, lhs, rhs->v) != 0) 2273 out(O_DIE, 2274 "cannot add payloadprop \"%s\" to fault", lhs); 2275 } else { 2276 out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"", 2277 lhs, (char *)(uintptr_t)rhs->v); 2278 2279 if (nvlist_add_string(hcs, lhs, (char *)(uintptr_t)rhs->v) != 0) 2280 out(O_DIE, 2281 "cannot add payloadprop \"%s\" to fault", lhs); 2282 } 2283 } 2284 2285 static char *Istatbuf; 2286 static char *Istatbufptr; 2287 static int Istatsz; 2288 2289 /* 2290 * istataddsize -- calculate size of istat and add it to Istatsz 2291 */ 2292 /*ARGSUSED2*/ 2293 static void 2294 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2295 { 2296 int val; 2297 2298 ASSERT(lhs != NULL); 2299 ASSERT(rhs != NULL); 2300 2301 if ((val = stats_counter_value(rhs)) == 0) 2302 return; /* skip zero-valued stats */ 2303 2304 /* count up the size of the stat name 
*/ 2305 Istatsz += ipath2strlen(lhs->ename, lhs->ipath); 2306 Istatsz++; /* for the trailing NULL byte */ 2307 2308 /* count up the size of the stat value */ 2309 Istatsz += snprintf(NULL, 0, "%d", val); 2310 Istatsz++; /* for the trailing NULL byte */ 2311 } 2312 2313 /* 2314 * istat2str -- serialize an istat, writing result to *Istatbufptr 2315 */ 2316 /*ARGSUSED2*/ 2317 static void 2318 istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2319 { 2320 char *str; 2321 int len; 2322 int val; 2323 2324 ASSERT(lhs != NULL); 2325 ASSERT(rhs != NULL); 2326 2327 if ((val = stats_counter_value(rhs)) == 0) 2328 return; /* skip zero-valued stats */ 2329 2330 /* serialize the stat name */ 2331 str = ipath2str(lhs->ename, lhs->ipath); 2332 len = strlen(str); 2333 2334 ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]); 2335 (void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr); 2336 Istatbufptr += len; 2337 FREE(str); 2338 *Istatbufptr++ = '\0'; 2339 2340 /* serialize the stat value */ 2341 Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr, 2342 "%d", val); 2343 *Istatbufptr++ = '\0'; 2344 2345 ASSERT(Istatbufptr <= &Istatbuf[Istatsz]); 2346 } 2347 2348 void 2349 istat_save() 2350 { 2351 if (Istat_need_save == 0) 2352 return; 2353 2354 /* figure out how big the serialzed info is */ 2355 Istatsz = 0; 2356 lut_walk(Istats, (lut_cb)istataddsize, NULL); 2357 2358 if (Istatsz == 0) { 2359 /* no stats to save */ 2360 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2361 return; 2362 } 2363 2364 /* create the serialized buffer */ 2365 Istatbufptr = Istatbuf = MALLOC(Istatsz); 2366 lut_walk(Istats, (lut_cb)istat2str, NULL); 2367 2368 /* clear out current saved stats */ 2369 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2370 2371 /* write out the new version */ 2372 fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz); 2373 FREE(Istatbuf); 2374 2375 Istat_need_save = 0; 2376 } 2377 2378 int 2379 istat_cmp(struct istat_entry *ent1, struct 
istat_entry *ent2) 2380 { 2381 if (ent1->ename != ent2->ename) 2382 return (ent2->ename - ent1->ename); 2383 if (ent1->ipath != ent2->ipath) 2384 return ((char *)ent2->ipath - (char *)ent1->ipath); 2385 2386 return (0); 2387 } 2388 2389 /* 2390 * istat-verify -- verify the component associated with a stat still exists 2391 * 2392 * if the component no longer exists, this routine resets the stat and 2393 * returns 0. if the component still exists, it returns 1. 2394 */ 2395 static int 2396 istat_verify(struct node *snp, struct istat_entry *entp) 2397 { 2398 struct stats *statp; 2399 nvlist_t *fmri; 2400 2401 fmri = node2fmri(snp->u.event.epname); 2402 if (platform_path_exists(fmri)) { 2403 nvlist_free(fmri); 2404 return (1); 2405 } 2406 nvlist_free(fmri); 2407 2408 /* component no longer in system. zero out the associated stats */ 2409 if ((statp = (struct stats *) 2410 lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL || 2411 stats_counter_value(statp) == 0) 2412 return (0); /* stat is already reset */ 2413 2414 Istat_need_save = 1; 2415 stats_counter_reset(statp); 2416 return (0); 2417 } 2418 2419 static void 2420 istat_bump(struct node *snp, int n) 2421 { 2422 struct stats *statp; 2423 struct istat_entry ent; 2424 2425 ASSERT(snp != NULL); 2426 ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t)); 2427 ASSERT(snp->u.event.epname != NULL); 2428 2429 /* class name should be hoisted into a single stable entry */ 2430 ASSERT(snp->u.event.ename->u.name.next == NULL); 2431 ent.ename = snp->u.event.ename->u.name.s; 2432 ent.ipath = ipath(snp->u.event.epname); 2433 2434 if (!istat_verify(snp, &ent)) { 2435 /* component no longer exists in system, nothing to do */ 2436 return; 2437 } 2438 2439 if ((statp = (struct stats *) 2440 lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) { 2441 /* need to create the counter */ 2442 int cnt = 0; 2443 struct node *np; 2444 char *sname; 2445 char *snamep; 2446 struct istat_entry *newentp; 2447 2448 /* count up the size of 
the stat name */ 2449 np = snp->u.event.ename; 2450 while (np != NULL) { 2451 cnt += strlen(np->u.name.s); 2452 cnt++; /* for the '.' or '@' */ 2453 np = np->u.name.next; 2454 } 2455 np = snp->u.event.epname; 2456 while (np != NULL) { 2457 cnt += snprintf(NULL, 0, "%s%llu", 2458 np->u.name.s, np->u.name.child->u.ull); 2459 cnt++; /* for the '/' or trailing NULL byte */ 2460 np = np->u.name.next; 2461 } 2462 2463 /* build the stat name */ 2464 snamep = sname = alloca(cnt); 2465 np = snp->u.event.ename; 2466 while (np != NULL) { 2467 snamep += snprintf(snamep, &sname[cnt] - snamep, 2468 "%s", np->u.name.s); 2469 np = np->u.name.next; 2470 if (np) 2471 *snamep++ = '.'; 2472 } 2473 *snamep++ = '@'; 2474 np = snp->u.event.epname; 2475 while (np != NULL) { 2476 snamep += snprintf(snamep, &sname[cnt] - snamep, 2477 "%s%llu", np->u.name.s, np->u.name.child->u.ull); 2478 np = np->u.name.next; 2479 if (np) 2480 *snamep++ = '/'; 2481 } 2482 *snamep++ = '\0'; 2483 2484 /* create the new stat & add it to our list */ 2485 newentp = MALLOC(sizeof (*newentp)); 2486 *newentp = ent; 2487 statp = stats_new_counter(NULL, sname, 0); 2488 Istats = lut_add(Istats, (void *)newentp, (void *)statp, 2489 (lut_cmp)istat_cmp); 2490 } 2491 2492 /* if n is non-zero, set that value instead of bumping */ 2493 if (n) { 2494 stats_counter_reset(statp); 2495 stats_counter_add(statp, n); 2496 } else 2497 stats_counter_bump(statp); 2498 Istat_need_save = 1; 2499 2500 ipath_print(O_ALTFP|O_VERB2, ent.ename, ent.ipath); 2501 out(O_ALTFP|O_VERB2, " %s to value %d", n ? "set" : "incremented", 2502 stats_counter_value(statp)); 2503 } 2504 2505 /*ARGSUSED*/ 2506 static void 2507 istat_destructor(void *left, void *right, void *arg) 2508 { 2509 struct istat_entry *entp = (struct istat_entry *)left; 2510 struct stats *statp = (struct stats *)right; 2511 FREE(entp); 2512 stats_delete(statp); 2513 } 2514 2515 /* 2516 * Callback used in a walk of the Istats to reset matching stat counters. 
2517 */ 2518 static void 2519 istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp, 2520 const struct ipath *ipp) 2521 { 2522 char *path; 2523 2524 if (entp->ipath == ipp) { 2525 path = ipath2str(entp->ename, ipp); 2526 out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path); 2527 FREE(path); 2528 stats_counter_reset(statp); 2529 Istat_need_save = 1; 2530 } 2531 } 2532 2533 /*ARGSUSED*/ 2534 static void 2535 istat_counter_topo_chg_cb(struct istat_entry *entp, struct stats *statp, 2536 void *unused) 2537 { 2538 char *path; 2539 nvlist_t *fmri; 2540 2541 fmri = ipath2fmri((struct ipath *)(entp->ipath)); 2542 if (!platform_path_exists(fmri)) { 2543 path = ipath2str(entp->ename, entp->ipath); 2544 out(O_ALTFP, "istat_counter_topo_chg_cb: not present %s", path); 2545 FREE(path); 2546 stats_counter_reset(statp); 2547 Istat_need_save = 1; 2548 } 2549 nvlist_free(fmri); 2550 } 2551 2552 void 2553 istat_fini(void) 2554 { 2555 lut_free(Istats, istat_destructor, NULL); 2556 } 2557 2558 static char *Serdbuf; 2559 static char *Serdbufptr; 2560 static int Serdsz; 2561 2562 /* 2563 * serdaddsize -- calculate size of serd and add it to Serdsz 2564 */ 2565 /*ARGSUSED*/ 2566 static void 2567 serdaddsize(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2568 { 2569 ASSERT(lhs != NULL); 2570 2571 /* count up the size of the stat name */ 2572 Serdsz += ipath2strlen(lhs->ename, lhs->ipath); 2573 Serdsz++; /* for the trailing NULL byte */ 2574 } 2575 2576 /* 2577 * serd2str -- serialize a serd engine, writing result to *Serdbufptr 2578 */ 2579 /*ARGSUSED*/ 2580 static void 2581 serd2str(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2582 { 2583 char *str; 2584 int len; 2585 2586 ASSERT(lhs != NULL); 2587 2588 /* serialize the serd engine name */ 2589 str = ipath2str(lhs->ename, lhs->ipath); 2590 len = strlen(str); 2591 2592 ASSERT(Serdbufptr + len + 1 <= &Serdbuf[Serdsz]); 2593 (void) strlcpy(Serdbufptr, str, &Serdbuf[Serdsz] - Serdbufptr); 2594 
Serdbufptr += len; 2595 FREE(str); 2596 *Serdbufptr++ = '\0'; 2597 ASSERT(Serdbufptr <= &Serdbuf[Serdsz]); 2598 } 2599 2600 void 2601 serd_save() 2602 { 2603 if (Serd_need_save == 0) 2604 return; 2605 2606 /* figure out how big the serialzed info is */ 2607 Serdsz = 0; 2608 lut_walk(SerdEngines, (lut_cb)serdaddsize, NULL); 2609 2610 if (Serdsz == 0) { 2611 /* no serd engines to save */ 2612 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2613 return; 2614 } 2615 2616 /* create the serialized buffer */ 2617 Serdbufptr = Serdbuf = MALLOC(Serdsz); 2618 lut_walk(SerdEngines, (lut_cb)serd2str, NULL); 2619 2620 /* clear out current saved stats */ 2621 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2622 2623 /* write out the new version */ 2624 fmd_buf_write(Hdl, NULL, WOBUF_SERDS, Serdbuf, Serdsz); 2625 FREE(Serdbuf); 2626 Serd_need_save = 0; 2627 } 2628 2629 int 2630 serd_cmp(struct serd_entry *ent1, struct serd_entry *ent2) 2631 { 2632 if (ent1->ename != ent2->ename) 2633 return (ent2->ename - ent1->ename); 2634 if (ent1->ipath != ent2->ipath) 2635 return ((char *)ent2->ipath - (char *)ent1->ipath); 2636 2637 return (0); 2638 } 2639 2640 void 2641 fme_serd_load(fmd_hdl_t *hdl) 2642 { 2643 int sz; 2644 char *sbuf; 2645 char *sepptr; 2646 char *ptr; 2647 struct serd_entry *newentp; 2648 struct node *epname; 2649 nvlist_t *fmri; 2650 char *namestring; 2651 2652 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_SERDS)) == 0) 2653 return; 2654 sbuf = alloca(sz); 2655 fmd_buf_read(hdl, NULL, WOBUF_SERDS, sbuf, sz); 2656 ptr = sbuf; 2657 while (ptr < &sbuf[sz]) { 2658 sepptr = strchr(ptr, '@'); 2659 *sepptr = '\0'; 2660 namestring = ptr; 2661 sepptr++; 2662 ptr = sepptr; 2663 ptr += strlen(ptr); 2664 ptr++; /* move past the '\0' separating paths */ 2665 epname = pathstring2epnamenp(sepptr); 2666 fmri = node2fmri(epname); 2667 if (platform_path_exists(fmri)) { 2668 newentp = MALLOC(sizeof (*newentp)); 2669 newentp->hdl = hdl; 2670 newentp->ipath = ipath(epname); 2671 newentp->ename = 
stable(namestring); 2672 SerdEngines = lut_add(SerdEngines, (void *)newentp, 2673 (void *)newentp, (lut_cmp)serd_cmp); 2674 } else 2675 Serd_need_save = 1; 2676 tree_free(epname); 2677 nvlist_free(fmri); 2678 } 2679 /* save it back again in case some of the paths no longer exist */ 2680 serd_save(); 2681 } 2682 2683 /*ARGSUSED*/ 2684 static void 2685 serd_destructor(void *left, void *right, void *arg) 2686 { 2687 struct serd_entry *entp = (struct serd_entry *)left; 2688 FREE(entp); 2689 } 2690 2691 /* 2692 * Callback used in a walk of the SerdEngines to reset matching serd engines. 2693 */ 2694 /*ARGSUSED*/ 2695 static void 2696 serd_reset_cb(struct serd_entry *entp, void *unused, const struct ipath *ipp) 2697 { 2698 char *path; 2699 2700 if (entp->ipath == ipp) { 2701 path = ipath2str(entp->ename, ipp); 2702 out(O_ALTFP, "serd_reset_cb: resetting %s", path); 2703 fmd_serd_reset(entp->hdl, path); 2704 FREE(path); 2705 Serd_need_save = 1; 2706 } 2707 } 2708 2709 /*ARGSUSED*/ 2710 static void 2711 serd_topo_chg_cb(struct serd_entry *entp, void *unused, void *unused2) 2712 { 2713 char *path; 2714 nvlist_t *fmri; 2715 2716 fmri = ipath2fmri((struct ipath *)(entp->ipath)); 2717 if (!platform_path_exists(fmri)) { 2718 path = ipath2str(entp->ename, entp->ipath); 2719 out(O_ALTFP, "serd_topo_chg_cb: not present %s", path); 2720 fmd_serd_reset(entp->hdl, path); 2721 FREE(path); 2722 Serd_need_save = 1; 2723 } 2724 nvlist_free(fmri); 2725 } 2726 2727 void 2728 serd_fini(void) 2729 { 2730 lut_free(SerdEngines, serd_destructor, NULL); 2731 } 2732 2733 static void 2734 publish_suspects(struct fme *fmep, struct rsl *srl) 2735 { 2736 struct rsl *rp; 2737 nvlist_t *fault; 2738 uint8_t cert; 2739 uint_t *frs; 2740 uint_t fravg, frsum, fr; 2741 uint_t messval; 2742 uint_t retireval; 2743 uint_t responseval; 2744 struct node *snp; 2745 int frcnt, fridx; 2746 boolean_t allfaulty = B_TRUE; 2747 struct rsl *erl = srl + fmep->nsuspects - 1; 2748 2749 /* 2750 * sort the array 2751 */ 2752 
qsort(srl, fmep->nsuspects, sizeof (struct rsl), rslcmp); 2753 rsluniq(srl, erl, &fmep->nsuspects, &fmep->nonfault); 2754 2755 /* 2756 * If the suspect list is all faults, then for a given fault, 2757 * say X of N, X's certainty is computed via: 2758 * 2759 * fitrate(X) / (fitrate(1) + ... + fitrate(N)) * 100 2760 * 2761 * If none of the suspects are faults, and there are N suspects, 2762 * the certainty of a given suspect is 100/N. 2763 * 2764 * If there are are a mixture of faults and other problems in 2765 * the suspect list, we take an average of the faults' 2766 * FITrates and treat this average as the FITrate for any 2767 * non-faults. The fitrate of any given suspect is then 2768 * computed per the first formula above. 2769 */ 2770 if (fmep->nonfault == fmep->nsuspects) { 2771 /* NO faults in the suspect list */ 2772 cert = percentof(1, fmep->nsuspects); 2773 } else { 2774 /* sum the fitrates */ 2775 frs = alloca(fmep->nsuspects * sizeof (uint_t)); 2776 fridx = frcnt = frsum = 0; 2777 2778 for (rp = srl; rp <= erl; rp++) { 2779 struct node *n; 2780 2781 if (rp->suspect == NULL) 2782 continue; 2783 if (!is_fault(rp->suspect->t)) { 2784 frs[fridx++] = 0; 2785 continue; 2786 } 2787 n = eventprop_lookup(rp->suspect, L_FITrate); 2788 if (node2uint(n, &fr) != 0) { 2789 out(O_DEBUG|O_NONL, "event "); 2790 ipath_print(O_DEBUG|O_NONL, 2791 rp->suspect->enode->u.event.ename->u.name.s, 2792 rp->suspect->ipp); 2793 out(O_DEBUG, " has no FITrate (using 1)"); 2794 fr = 1; 2795 } else if (fr == 0) { 2796 out(O_DEBUG|O_NONL, "event "); 2797 ipath_print(O_DEBUG|O_NONL, 2798 rp->suspect->enode->u.event.ename->u.name.s, 2799 rp->suspect->ipp); 2800 out(O_DEBUG, " has zero FITrate (using 1)"); 2801 fr = 1; 2802 } 2803 2804 frs[fridx++] = fr; 2805 frsum += fr; 2806 frcnt++; 2807 } 2808 fravg = avg(frsum, frcnt); 2809 for (fridx = 0; fridx < fmep->nsuspects; fridx++) 2810 if (frs[fridx] == 0) { 2811 frs[fridx] = fravg; 2812 frsum += fravg; 2813 } 2814 } 2815 2816 /* Add them in 
reverse order of our sort, as fmd reverses order */ 2817 for (rp = erl; rp >= srl; rp--) { 2818 if (rp->suspect == NULL) 2819 continue; 2820 if (!is_fault(rp->suspect->t)) 2821 allfaulty = B_FALSE; 2822 if (fmep->nonfault != fmep->nsuspects) 2823 cert = percentof(frs[--fridx], frsum); 2824 fault = fmd_nvl_create_fault(fmep->hdl, 2825 rp->suspect->enode->u.event.ename->u.name.s, 2826 cert, 2827 rp->asru, 2828 rp->fru, 2829 rp->rsrc); 2830 if (fault == NULL) 2831 out(O_DIE, "fault creation failed"); 2832 /* if "message" property exists, add it to the fault */ 2833 if (node2uint(eventprop_lookup(rp->suspect, L_message), 2834 &messval) == 0) { 2835 2836 out(O_ALTFP, 2837 "[FME%d, %s adds message=%d to suspect list]", 2838 fmep->id, 2839 rp->suspect->enode->u.event.ename->u.name.s, 2840 messval); 2841 if (nvlist_add_boolean_value(fault, 2842 FM_SUSPECT_MESSAGE, 2843 (messval) ? B_TRUE : B_FALSE) != 0) { 2844 out(O_DIE, "cannot add no-message to fault"); 2845 } 2846 } 2847 2848 /* if "retire" property exists, add it to the fault */ 2849 if (node2uint(eventprop_lookup(rp->suspect, L_retire), 2850 &retireval) == 0) { 2851 2852 out(O_ALTFP, 2853 "[FME%d, %s adds retire=%d to suspect list]", 2854 fmep->id, 2855 rp->suspect->enode->u.event.ename->u.name.s, 2856 retireval); 2857 if (nvlist_add_boolean_value(fault, 2858 FM_SUSPECT_RETIRE, 2859 (retireval) ? B_TRUE : B_FALSE) != 0) { 2860 out(O_DIE, "cannot add no-retire to fault"); 2861 } 2862 } 2863 2864 /* if "response" property exists, add it to the fault */ 2865 if (node2uint(eventprop_lookup(rp->suspect, L_response), 2866 &responseval) == 0) { 2867 2868 out(O_ALTFP, 2869 "[FME%d, %s adds response=%d to suspect list]", 2870 fmep->id, 2871 rp->suspect->enode->u.event.ename->u.name.s, 2872 responseval); 2873 if (nvlist_add_boolean_value(fault, 2874 FM_SUSPECT_RESPONSE, 2875 (responseval) ? 
B_TRUE : B_FALSE) != 0) { 2876 out(O_DIE, "cannot add no-response to fault"); 2877 } 2878 } 2879 2880 /* add any payload properties */ 2881 lut_walk(rp->suspect->payloadprops, 2882 (lut_cb)addpayloadprop, (void *)fault); 2883 rslfree(rp); 2884 2885 /* 2886 * If "action" property exists, evaluate it; this must be done 2887 * before the allfaulty check below since some actions may 2888 * modify the asru to be used in fmd_nvl_fmri_faulty. This 2889 * needs to be restructured if any new actions are introduced 2890 * that have effects that we do not want to be visible if 2891 * we decide not to publish in the dupclose check below. 2892 */ 2893 if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) { 2894 struct evalue evalue; 2895 2896 out(O_ALTFP|O_NONL, 2897 "[FME%d, %s action ", fmep->id, 2898 rp->suspect->enode->u.event.ename->u.name.s); 2899 ptree_name_iter(O_ALTFP|O_NONL, snp); 2900 out(O_ALTFP, "]"); 2901 Action_nvl = fault; 2902 (void) eval_expr(snp, NULL, NULL, NULL, NULL, 2903 NULL, 0, &evalue); 2904 } 2905 2906 fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault); 2907 2908 /* 2909 * check if the asru is already marked as "faulty". 
2910 */ 2911 if (allfaulty) { 2912 nvlist_t *asru; 2913 2914 out(O_ALTFP|O_VERB, "FME%d dup check ", fmep->id); 2915 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect); 2916 out(O_ALTFP|O_VERB|O_NONL, " "); 2917 if (nvlist_lookup_nvlist(fault, 2918 FM_FAULT_ASRU, &asru) != 0) { 2919 out(O_ALTFP|O_VERB, "NULL asru"); 2920 allfaulty = B_FALSE; 2921 } else if (fmd_nvl_fmri_faulty(fmep->hdl, asru)) { 2922 out(O_ALTFP|O_VERB, "faulty"); 2923 } else { 2924 out(O_ALTFP|O_VERB, "not faulty"); 2925 allfaulty = B_FALSE; 2926 } 2927 } 2928 2929 } 2930 2931 if (!allfaulty) { 2932 /* 2933 * don't update the count stat if all asrus are already 2934 * present and unrepaired in the asru cache 2935 */ 2936 for (rp = erl; rp >= srl; rp--) { 2937 struct event *suspect = rp->suspect; 2938 2939 if (suspect == NULL) 2940 continue; 2941 2942 /* if "count" exists, increment the appropriate stat */ 2943 if ((snp = eventprop_lookup(suspect, 2944 L_count)) != NULL) { 2945 out(O_ALTFP|O_NONL, 2946 "[FME%d, %s count ", fmep->id, 2947 suspect->enode->u.event.ename->u.name.s); 2948 ptree_name_iter(O_ALTFP|O_NONL, snp); 2949 out(O_ALTFP, "]"); 2950 istat_bump(snp, 0); 2951 2952 } 2953 } 2954 istat_save(); /* write out any istat changes */ 2955 } 2956 } 2957 2958 static void 2959 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase) 2960 { 2961 struct case_list *newcase; 2962 nvlist_t *defect; 2963 2964 out(O_ALTFP, 2965 "[undiagnosable ereport received, " 2966 "creating and closing a new case (%s)]", 2967 Undiag_reason ? 
Undiag_reason : "reason not provided"); 2968 2969 newcase = MALLOC(sizeof (struct case_list)); 2970 newcase->next = NULL; 2971 newcase->fmcase = fmcase; 2972 if (Undiagablecaselist != NULL) 2973 newcase->next = Undiagablecaselist; 2974 Undiagablecaselist = newcase; 2975 2976 if (ffep != NULL) 2977 fmd_case_add_ereport(hdl, newcase->fmcase, ffep); 2978 2979 defect = fmd_nvl_create_fault(hdl, UNDIAGNOSABLE_DEFECT, 100, 2980 NULL, NULL, NULL); 2981 if (Undiag_reason != NULL) 2982 (void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason); 2983 fmd_case_add_suspect(hdl, newcase->fmcase, defect); 2984 2985 fmd_case_solve(hdl, newcase->fmcase); 2986 fmd_case_close(hdl, newcase->fmcase); 2987 } 2988 2989 static void 2990 fme_undiagnosable(struct fme *f) 2991 { 2992 nvlist_t *defect; 2993 2994 out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]", 2995 f->id, fmd_case_uuid(f->hdl, f->fmcase), 2996 Undiag_reason ? Undiag_reason : "undiagnosable"); 2997 2998 defect = fmd_nvl_create_fault(f->hdl, UNDIAGNOSABLE_DEFECT, 100, 2999 NULL, NULL, NULL); 3000 if (Undiag_reason != NULL) 3001 (void) nvlist_add_string(defect, UNDIAG_REASON, Undiag_reason); 3002 fmd_case_add_suspect(f->hdl, f->fmcase, defect); 3003 fmd_case_solve(f->hdl, f->fmcase); 3004 fmd_case_close(f->hdl, f->fmcase); 3005 } 3006 3007 /* 3008 * fme_close_case 3009 * 3010 * Find the requested case amongst our fmes and close it. Free up 3011 * the related fme. 
3012 */ 3013 void 3014 fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase) 3015 { 3016 struct case_list *ucasep, *prevcasep = NULL; 3017 struct fme *prev = NULL; 3018 struct fme *fmep; 3019 3020 for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) { 3021 if (fmcase != ucasep->fmcase) { 3022 prevcasep = ucasep; 3023 continue; 3024 } 3025 3026 if (prevcasep == NULL) 3027 Undiagablecaselist = Undiagablecaselist->next; 3028 else 3029 prevcasep->next = ucasep->next; 3030 3031 FREE(ucasep); 3032 return; 3033 } 3034 3035 for (fmep = FMElist; fmep; fmep = fmep->next) { 3036 if (fmep->hdl == hdl && fmep->fmcase == fmcase) 3037 break; 3038 prev = fmep; 3039 } 3040 3041 if (fmep == NULL) { 3042 out(O_WARN, "Eft asked to close unrecognized case [%s].", 3043 fmd_case_uuid(hdl, fmcase)); 3044 return; 3045 } 3046 3047 if (EFMElist == fmep) 3048 EFMElist = prev; 3049 3050 if (prev == NULL) 3051 FMElist = FMElist->next; 3052 else 3053 prev->next = fmep->next; 3054 3055 fmep->next = NULL; 3056 3057 /* Get rid of any timer this fme has set */ 3058 if (fmep->wull != 0) 3059 fmd_timer_remove(fmep->hdl, fmep->timer); 3060 3061 if (ClosedFMEs == NULL) { 3062 ClosedFMEs = fmep; 3063 } else { 3064 fmep->next = ClosedFMEs; 3065 ClosedFMEs = fmep; 3066 } 3067 3068 Open_fme_count--; 3069 3070 /* See if we can close the overflow FME */ 3071 if (Open_fme_count <= Max_fme) { 3072 for (fmep = FMElist; fmep; fmep = fmep->next) { 3073 if (fmep->overflow && !(fmd_case_closed(fmep->hdl, 3074 fmep->fmcase))) 3075 break; 3076 } 3077 3078 if (fmep != NULL) 3079 fmd_case_close(fmep->hdl, fmep->fmcase); 3080 } 3081 } 3082 3083 /* 3084 * fme_set_timer() 3085 * If the time we need to wait for the given FME is less than the 3086 * current timer, kick that old timer out and establish a new one. 
3087 */ 3088 static int 3089 fme_set_timer(struct fme *fmep, unsigned long long wull) 3090 { 3091 out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait "); 3092 ptree_timeval(O_ALTFP|O_VERB, &wull); 3093 3094 if (wull <= fmep->pull) { 3095 out(O_ALTFP|O_VERB|O_NONL, "already have waited at least "); 3096 ptree_timeval(O_ALTFP|O_VERB, &fmep->pull); 3097 out(O_ALTFP|O_VERB, NULL); 3098 /* we've waited at least wull already, don't need timer */ 3099 return (0); 3100 } 3101 3102 out(O_ALTFP|O_VERB|O_NONL, " currently "); 3103 if (fmep->wull != 0) { 3104 out(O_ALTFP|O_VERB|O_NONL, "waiting "); 3105 ptree_timeval(O_ALTFP|O_VERB, &fmep->wull); 3106 out(O_ALTFP|O_VERB, NULL); 3107 } else { 3108 out(O_ALTFP|O_VERB|O_NONL, "not waiting"); 3109 out(O_ALTFP|O_VERB, NULL); 3110 } 3111 3112 if (fmep->wull != 0) 3113 if (wull >= fmep->wull) 3114 /* New timer would fire later than established timer */ 3115 return (0); 3116 3117 if (fmep->wull != 0) { 3118 fmd_timer_remove(fmep->hdl, fmep->timer); 3119 } 3120 3121 fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep, 3122 fmep->e0r, wull); 3123 out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer); 3124 fmep->wull = wull; 3125 return (1); 3126 } 3127 3128 void 3129 fme_timer_fired(struct fme *fmep, id_t tid) 3130 { 3131 struct fme *ffmep = NULL; 3132 3133 for (ffmep = FMElist; ffmep; ffmep = ffmep->next) 3134 if (ffmep == fmep) 3135 break; 3136 3137 if (ffmep == NULL) { 3138 out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.", 3139 (void *)fmep); 3140 return; 3141 } 3142 3143 out(O_ALTFP|O_VERB, "Timer fired %lx", tid); 3144 fmep->pull = fmep->wull; 3145 fmep->wull = 0; 3146 fmd_buf_write(fmep->hdl, fmep->fmcase, 3147 WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull)); 3148 3149 fme_eval(fmep, fmep->e0r); 3150 } 3151 3152 /* 3153 * Preserve the fme's suspect list in its psuspects list, NULLing the 3154 * suspects list in the meantime. 
3155 */ 3156 static void 3157 save_suspects(struct fme *fmep) 3158 { 3159 struct event *ep; 3160 struct event *nextep; 3161 3162 /* zero out the previous suspect list */ 3163 for (ep = fmep->psuspects; ep; ep = nextep) { 3164 nextep = ep->psuspects; 3165 ep->psuspects = NULL; 3166 } 3167 fmep->psuspects = NULL; 3168 3169 /* zero out the suspect list, copying it to previous suspect list */ 3170 fmep->psuspects = fmep->suspects; 3171 for (ep = fmep->suspects; ep; ep = nextep) { 3172 nextep = ep->suspects; 3173 ep->psuspects = ep->suspects; 3174 ep->suspects = NULL; 3175 ep->is_suspect = 0; 3176 } 3177 fmep->suspects = NULL; 3178 fmep->nsuspects = 0; 3179 fmep->nonfault = 0; 3180 } 3181 3182 /* 3183 * Retrieve the fme's suspect list from its psuspects list. 3184 */ 3185 static void 3186 restore_suspects(struct fme *fmep) 3187 { 3188 struct event *ep; 3189 struct event *nextep; 3190 3191 fmep->nsuspects = fmep->nonfault = 0; 3192 fmep->suspects = fmep->psuspects; 3193 for (ep = fmep->psuspects; ep; ep = nextep) { 3194 fmep->nsuspects++; 3195 if (!is_fault(ep->t)) 3196 fmep->nonfault++; 3197 nextep = ep->psuspects; 3198 ep->suspects = ep->psuspects; 3199 } 3200 } 3201 3202 /* 3203 * this is what we use to call the Emrys prototype code instead of main() 3204 */ 3205 static void 3206 fme_eval(struct fme *fmep, fmd_event_t *ffep) 3207 { 3208 struct event *ep; 3209 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 3210 struct rsl *srl = NULL; 3211 struct rsl *srl2 = NULL; 3212 int mess_zero_count; 3213 int mess_zero_nonfault = 0; 3214 int rpcnt; 3215 3216 save_suspects(fmep); 3217 3218 out(O_ALTFP, "Evaluate FME %d", fmep->id); 3219 indent_set(" "); 3220 3221 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 3222 fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 3223 3224 out(O_ALTFP|O_NONL, "FME%d state: %s, suspect list:", fmep->id, 3225 fme_state2str(fmep->state)); 3226 for (ep = fmep->suspects; ep; ep = ep->suspects) { 3227 
out(O_ALTFP|O_NONL, " "); 3228 itree_pevent_brief(O_ALTFP|O_NONL, ep); 3229 } 3230 out(O_ALTFP, NULL); 3231 3232 switch (fmep->state) { 3233 case FME_CREDIBLE: 3234 print_suspects(SLNEW, fmep); 3235 (void) upsets_eval(fmep, ffep); 3236 3237 /* 3238 * we may have already posted suspects in upsets_eval() which 3239 * can recurse into fme_eval() again. If so then just return. 3240 */ 3241 if (fmep->posted_suspects) 3242 return; 3243 3244 stats_counter_bump(fmep->diags); 3245 rpcnt = fmep->nsuspects; 3246 save_suspects(fmep); 3247 3248 /* 3249 * create two lists, one for "message=1" faults and one for 3250 * "message=0" faults. If we have a mixture we will generate 3251 * two separate suspect lists. 3252 */ 3253 srl = MALLOC(rpcnt * sizeof (struct rsl)); 3254 bzero(srl, rpcnt * sizeof (struct rsl)); 3255 srl2 = MALLOC(rpcnt * sizeof (struct rsl)); 3256 bzero(srl2, rpcnt * sizeof (struct rsl)); 3257 mess_zero_count = trim_suspects(fmep, srl, srl2, ffep, 3258 &mess_zero_nonfault); 3259 3260 /* 3261 * If the resulting suspect list has no members, we're 3262 * done so simply close the case. Otherwise sort and publish. 
3263 */ 3264 if (fmep->nsuspects == 0 && mess_zero_count == 0) { 3265 out(O_ALTFP, 3266 "[FME%d, case %s (all suspects are upsets)]", 3267 fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3268 fmd_case_close(fmep->hdl, fmep->fmcase); 3269 } else if (fmep->nsuspects != 0 && mess_zero_count == 0) { 3270 publish_suspects(fmep, srl); 3271 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3272 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3273 fmd_case_solve(fmep->hdl, fmep->fmcase); 3274 } else if (fmep->nsuspects == 0 && mess_zero_count != 0) { 3275 fmep->nsuspects = mess_zero_count; 3276 fmep->nonfault = mess_zero_nonfault; 3277 publish_suspects(fmep, srl2); 3278 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3279 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3280 fmd_case_solve(fmep->hdl, fmep->fmcase); 3281 } else { 3282 struct event *obsp; 3283 struct fme *nfmep; 3284 3285 publish_suspects(fmep, srl); 3286 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3287 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3288 fmd_case_solve(fmep->hdl, fmep->fmcase); 3289 3290 /* 3291 * Got both message=0 and message=1 so create a 3292 * duplicate case. Also need a temporary duplicate fme 3293 * structure for use by publish_suspects(). 
3294 */ 3295 nfmep = alloc_fme(); 3296 nfmep->id = Nextid++; 3297 nfmep->hdl = fmep->hdl; 3298 nfmep->nsuspects = mess_zero_count; 3299 nfmep->nonfault = mess_zero_nonfault; 3300 nfmep->fmcase = fmd_case_open(fmep->hdl, NULL); 3301 out(O_ALTFP|O_STAMP, 3302 "[creating parallel FME%d, case %s]", nfmep->id, 3303 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 3304 Open_fme_count++; 3305 if (ffep) { 3306 fmd_case_setprincipal(nfmep->hdl, 3307 nfmep->fmcase, ffep); 3308 fmd_case_add_ereport(nfmep->hdl, 3309 nfmep->fmcase, ffep); 3310 } 3311 for (obsp = fmep->observations; obsp; 3312 obsp = obsp->observations) 3313 if (obsp->ffep && obsp->ffep != ffep) 3314 fmd_case_add_ereport(nfmep->hdl, 3315 nfmep->fmcase, obsp->ffep); 3316 3317 publish_suspects(nfmep, srl2); 3318 out(O_ALTFP, "[solving FME%d, case %s]", nfmep->id, 3319 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 3320 fmd_case_solve(nfmep->hdl, nfmep->fmcase); 3321 FREE(nfmep); 3322 } 3323 FREE(srl); 3324 FREE(srl2); 3325 restore_suspects(fmep); 3326 3327 fmep->posted_suspects = 1; 3328 fmd_buf_write(fmep->hdl, fmep->fmcase, 3329 WOBUF_POSTD, 3330 (void *)&fmep->posted_suspects, 3331 sizeof (fmep->posted_suspects)); 3332 3333 /* 3334 * Now the suspects have been posted, we can clear up 3335 * the instance tree as we won't be looking at it again. 3336 * Also cancel the timer as the case is now solved. 
 */
		if (fmep->wull != 0) {
			fmd_timer_remove(fmep->hdl, fmep->timer);
			fmep->wull = 0;
		}
		break;

	case FME_WAIT:
		/*
		 * Still waiting: arm the timer for the shortest known
		 * delay and leave the FME (and its instance tree) intact.
		 */
		ASSERT(my_delay > fmep->ull);
		(void) fme_set_timer(fmep, my_delay);
		print_suspects(SLWAIT, fmep);
		itree_prune(fmep->eventtree);
		return;

	case FME_DISPROVED:
		print_suspects(SLDISPROVED, fmep);
		Undiag_reason = UD_UNSOLVD;
		fme_undiagnosable(fmep);
		break;
	}

	/* the FME is finished with; release its instance tree and config */
	itree_free(fmep->eventtree);
	fmep->eventtree = NULL;
	structconfig_free(fmep->config);
	fmep->config = NULL;
	destroy_fme_bufs(fmep);
}

static void indent(void);
static int triggered(struct fme *fmep, struct event *ep, int mark);
static enum fme_state effects_test(struct fme *fmep,
    struct event *fault_event, unsigned long long at_latest_by,
    unsigned long long *pdelay);
static enum fme_state requirements_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay);
static enum fme_state causes_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay);

/*
 * checkconstraints -- evaluate the constraints attached to an arrow
 *
 * Returns 1 if the arrow may be traversed: either every constraint
 * evaluated true, or at least one could not yet be evaluated and is
 * treated as deferred.  Returns 0 if some constraint is known false.
 * The outcome is cached on the arrow via forever_true/forever_false so
 * subsequent calls skip the (possibly expensive) eval_expr() work.
 */
static int
checkconstraints(struct fme *fmep, struct arrow *arrowp)
{
	struct constraintlist *ctp;
	struct evalue value;
	char *sep = "";

	/* cached result: a constraint was previously proven false */
	if (arrowp->forever_false) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, " Forever false constraint: ");
		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
			out(O_ALTFP|O_VERB|O_NONL, sep);
			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
			sep = ", ";
		}
		out(O_ALTFP|O_VERB, NULL);
		return (0);
	}
	/* cached result: all constraints were previously proven true */
	if (arrowp->forever_true) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, " Forever true constraint: ");
		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
			out(O_ALTFP|O_VERB|O_NONL, sep);
			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
			sep = ", ";
		}
		out(O_ALTFP|O_VERB, NULL);
		return (1);
	}

	for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
		if (eval_expr(ctp->cnode, NULL, NULL,
		    &fmep->globals, fmep->config,
		    arrowp, 0, &value)) {
			/* evaluation successful */
			if (value.t == UNDEFINED || value.v == 0) {
				/* known false */
				arrowp->forever_false = 1;
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " False constraint: ");
				ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
				out(O_ALTFP|O_VERB, NULL);
				return (0);
			}
		} else {
			/* evaluation unsuccessful -- unknown value */
			indent();
			out(O_ALTFP|O_VERB|O_NONL,
			    " Deferred constraint: ");
			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
			out(O_ALTFP|O_VERB, NULL);
			return (1);
		}
	}
	/* known true */
	arrowp->forever_true = 1;
	indent();
	out(O_ALTFP|O_VERB|O_NONL, " True constraint: ");
	for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
		out(O_ALTFP|O_VERB|O_NONL, sep);
		ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
		sep = ", ";
	}
	out(O_ALTFP|O_VERB, NULL);
	return (1);
}

/*
 * triggered -- count marked arrows into ep's B_TO bubbles against K
 *
 * Walks every inbound (B_TO) bubble of ep and counts the arrows that
 * carry the given mark bit; returns 1 as soon as the running count
 * reaches the bubble's K value (bp->nork), 0 otherwise.
 */
static int
triggered(struct fme *fmep, struct event *ep, int mark)
{
	struct bubble *bp;
	struct arrowlist *ap;
	int count = 0;

	stats_counter_bump(fmep->Tcallcount);
	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_TO)
			continue;
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			/* check count of marks against K in the bubble */
			if ((ap->arrowp->mark & mark) &&
			    ++count >= bp->nork)
				return (1);
		}
	}
	return (0);
}

/*
 * mark_arrows -- recursively mark the propagation arrows below ep
 *
 * With a non-zero mark, descends the outbound (B_FROM) bubbles of ep,
 * checking constraints and K-counts, and records per-event effect state
 * (CREDIBLE_EFFECT, WAIT_EFFECT, PARENT_WAIT) in ep2->cached_state.
 * With mark == 0 it instead clears previously set marks (optionally
 * flagging events to keep in the tree when "keep" is set).  Returns
 * WAIT_EFFECT (with *pdelay set to the shortest wait) if any path is
 * still waiting, otherwise 0.
 */
static int
mark_arrows(struct fme *fmep, struct event *ep, int mark,
    unsigned long long at_latest_by, unsigned long long *pdelay, int keep)
{
	struct bubble *bp;
	struct arrowlist
*ap;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long my_delay;
	enum fme_state result;
	int retval = 0;

	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_FROM)
			continue;
		stats_counter_bump(fmep->Marrowcount);
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			struct event *ep2 = ap->arrowp->head->myevent;
			/*
			 * if we're clearing marks, we can avoid doing
			 * all that work evaluating constraints.
			 */
			if (mark == 0) {
				if (ap->arrowp->arrow_marked == 0)
					continue;
				ap->arrowp->arrow_marked = 0;
				ap->arrowp->mark &= ~EFFECTS_COUNTER;
				if (keep && (ep2->cached_state &
				    (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT)))
					ep2->keep_in_tree = 1;
				ep2->cached_state &=
				    ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT);
				/* recurse to clear the subtree below ep2 */
				(void) mark_arrows(fmep, ep2, mark, 0, NULL,
				    keep);
				continue;
			}
			ap->arrowp->arrow_marked = 1;
			/* cached verdicts on ep2 short-circuit the walk */
			if (ep2->cached_state & REQMNTS_DISPROVED) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " ALREADY DISPROVED ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			if (ep2->cached_state & WAIT_EFFECT) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " ALREADY EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			if (ep2->cached_state & CREDIBLE_EFFECT) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " ALREADY EFFECTS CREDIBLE ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			if ((ep2->cached_state & PARENT_WAIT) &&
			    (mark & PARENT_WAIT)) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " ALREADY PARENT EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			/* constraints are evaluated against ep2's payload */
			platform_set_payloadnvp(ep2->nvp);
			if (checkconstraints(fmep, ap->arrowp) == 0) {
				platform_set_payloadnvp(NULL);
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " CONSTRAINTS FAIL ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			platform_set_payloadnvp(NULL);
			ap->arrowp->mark |= EFFECTS_COUNTER;
			if (!triggered(fmep, ep2, EFFECTS_COUNTER)) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " K-COUNT NOT YET MET ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}
			ep2->cached_state &= ~PARENT_WAIT;
			/*
			 * if we've reached an ereport and no propagation time
			 * is specified, use the Hesitate value
			 */
			if (ep2->t == N_EREPORT && at_latest_by == 0ULL &&
			    ap->arrowp->maxdelay == 0ULL) {
				out(O_ALTFP|O_VERB|O_NONL, " default wait ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				result = requirements_test(fmep, ep2, Hesitate,
				    &my_delay);
			} else {
				result = requirements_test(fmep, ep2,
				    at_latest_by + ap->arrowp->maxdelay,
				    &my_delay);
			}
			if (result == FME_WAIT) {
				/* track the shortest wait seen so far */
				retval = WAIT_EFFECT;
				if (overall_delay > my_delay)
					overall_delay = my_delay;
				ep2->cached_state |= WAIT_EFFECT;
				indent();
				out(O_ALTFP|O_VERB|O_NONL, " EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				indent_push(" E");
				if (mark_arrows(fmep, ep2, PARENT_WAIT,
				    at_latest_by, &my_delay, 0) ==
				    WAIT_EFFECT) {
					retval = WAIT_EFFECT;
					if (overall_delay > my_delay)
						overall_delay = my_delay;
				}
				indent_pop();
			} else if (result == FME_DISPROVED) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " EFFECTS DISPROVED ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
			} else {
				/* credible: mark ep2 and keep descending */
				ep2->cached_state |= mark;
				indent();
				if (mark == CREDIBLE_EFFECT)
					out(O_ALTFP|O_VERB|O_NONL,
					    " EFFECTS CREDIBLE ");
				else
					out(O_ALTFP|O_VERB|O_NONL,
					    " PARENT EFFECTS WAIT ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2);
				out(O_ALTFP|O_VERB, NULL);
				indent_push(" E");
				if (mark_arrows(fmep, ep2, mark, at_latest_by,
				    &my_delay, 0) == WAIT_EFFECT) {
					retval = WAIT_EFFECT;
					if (overall_delay > my_delay)
						overall_delay = my_delay;
				}
				indent_pop();
			}
		}
	}
	if (retval == WAIT_EFFECT)
		*pdelay = overall_delay;
	return (retval);
}

/*
 * effects_test -- check that a hypothesised problem explains what we saw
 *
 * Marks the subtree below fault_event via mark_arrows(CREDIBLE_EFFECT),
 * then walks the FME's observation list: an observation lacking
 * CREDIBLE_EFFECT (and not merely waiting) disproves the hypothesis.
 * Returns FME_CREDIBLE, FME_WAIT (with *pdelay set to the shortest
 * wait), or FME_DISPROVED; the marks are cleared again before return.
 */
static enum fme_state
effects_test(struct fme *fmep, struct event *fault_event,
    unsigned long long at_latest_by, unsigned long long *pdelay)
{
	struct event *error_event;
	enum fme_state return_value = FME_CREDIBLE;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long my_delay;

	stats_counter_bump(fmep->Ecallcount);
	indent_push(" E");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
	out(O_ALTFP|O_VERB, NULL);

	if (mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by,
	    &my_delay, 0) == WAIT_EFFECT) {
		return_value = FME_WAIT;
		if (overall_delay > my_delay)
			overall_delay = my_delay;
	}
	/* every observed ereport must have been reached by the marking */
	for (error_event = fmep->observations;
	    error_event; error_event = error_event->observations) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, " ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event);
		if (!(error_event->cached_state & CREDIBLE_EFFECT)) {
			if (error_event->cached_state &
			    (PARENT_WAIT|WAIT_EFFECT)) {
				out(O_ALTFP|O_VERB, " NOT YET triggered");
				continue;
			}
			return_value = FME_DISPROVED;
			out(O_ALTFP|O_VERB, " NOT triggered");
			break;
		} else {
			out(O_ALTFP|O_VERB, " triggered");
		}
	}
/* effects_test, continued: clear the marks and report the verdict */
	if (return_value == FME_DISPROVED) {
		(void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0);
	} else {
		/* keep=1: remember which events mattered for this result */
		fault_event->keep_in_tree = 1;
		(void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1);
	}

	indent();
	out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ",
	    fme_state2str(return_value));
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event);
	out(O_ALTFP|O_VERB, NULL);
	indent_pop();
	if (return_value == FME_WAIT)
		*pdelay = overall_delay;
	return (return_value);
}

/*
 * requirements_test -- test whether the events ep requires are present
 *
 * For an ereport the test is simply whether it has arrived (or can
 * still arrive before at_latest_by).  For any other event, each
 * outbound (B_FROM) bubble must have at least K (bp->nork) credible
 * downstream events.  Results are cached in ep->cached_state
 * (REQMNTS_CREDIBLE/DISPROVED/WAIT) and, for waits, ep->cached_delay.
 * Returns FME_CREDIBLE, FME_DISPROVED, FME_WAIT (with *pdelay set to
 * the shortest known wait), or FME_DEFERRED -- deferred results are
 * deliberately not cached (see the comment near the end).
 */
static enum fme_state
requirements_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay)
{
	int waiting_events;
	int credible_events;
	int deferred_events;
	enum fme_state return_value = FME_CREDIBLE;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long arrow_delay;
	unsigned long long my_delay;
	struct event *ep2;
	struct bubble *bp;
	struct arrowlist *ap;

	/* cached results short-circuit the whole test */
	if (ep->cached_state & REQMNTS_CREDIBLE) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY CREDIBLE ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, NULL);
		return (FME_CREDIBLE);
	}
	if (ep->cached_state & REQMNTS_DISPROVED) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, NULL);
		return (FME_DISPROVED);
	}
	if (ep->cached_state & REQMNTS_WAIT) {
		indent();
		*pdelay = ep->cached_delay;
		out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY WAIT ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB|O_NONL, ", wait for: ");
		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
		out(O_ALTFP|O_VERB, NULL);
		return (FME_WAIT);
	}
	stats_counter_bump(fmep->Rcallcount);
	indent_push(" R");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
	out(O_ALTFP|O_VERB, NULL);

	if (ep->t == N_EREPORT) {
		/*
		 * an unseen ereport is disproved once the pull time says
		 * it can no longer arrive; otherwise we wait for it
		 */
		if (ep->count == 0) {
			if (fmep->pull >= at_latest_by) {
				return_value = FME_DISPROVED;
			} else {
				ep->cached_delay = *pdelay = at_latest_by;
				return_value = FME_WAIT;
			}
		}

		indent();
		switch (return_value) {
		case FME_CREDIBLE:
			ep->cached_state |= REQMNTS_CREDIBLE;
			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			break;
		case FME_DISPROVED:
			ep->cached_state |= REQMNTS_DISPROVED;
			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			break;
		case FME_WAIT:
			ep->cached_state |= REQMNTS_WAIT;
			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			out(O_ALTFP|O_VERB|O_NONL, " to ");
			ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
			break;
		default:
			out(O_DIE, "requirements_test: unexpected fme_state");
			break;
		}
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();

		return (return_value);
	}

	/* this event is not a report, descend the tree */
	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		int n;

		if (bp->t != B_FROM)
			continue;

		n = bp->nork;

		credible_events = 0;
		waiting_events = 0;
		deferred_events = 0;
		arrow_delay = TIMEVAL_EVENTUALLY;
		/*
		 * n is -1 for 'A' so adjust it.
		 * XXX just count up the arrows for now.
		 */
		if (n < 0) {
			n = 0;
			for (ap = itree_next_arrow(bp, NULL); ap;
			    ap = itree_next_arrow(bp, ap))
				n++;
			indent();
			out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n);
		} else {
			indent();
			out(O_ALTFP|O_VERB, " Bubble N=%d", n);
		}

		if (n == 0)
			continue;
		if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) {
			for (ap = itree_next_arrow(bp, NULL); ap;
			    ap = itree_next_arrow(bp, ap)) {
				ep2 = ap->arrowp->head->myevent;
				platform_set_payloadnvp(ep2->nvp);
				(void) checkconstraints(fmep, ap->arrowp);
				if (ap->arrowp->forever_true) {
					/*
					 * if all arrows are invalidated by the
					 * constraints, then we should elide the
					 * whole bubble to be consistant with
					 * the tree creation time behaviour
					 */
					bp->mark |= BUBBLE_OK;
					platform_set_payloadnvp(NULL);
					break;
				}
				platform_set_payloadnvp(NULL);
			}
		}
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			ep2 = ap->arrowp->head->myevent;
			if (n <= credible_events)
				break;

			ap->arrowp->mark |= REQMNTS_COUNTER;
			if (triggered(fmep, ep2, REQMNTS_COUNTER))
				/* XXX adding max timevals! */
				switch (requirements_test(fmep, ep2,
				    at_latest_by + ap->arrowp->maxdelay,
				    &my_delay)) {
				case FME_DEFERRED:
					deferred_events++;
					break;
				case FME_CREDIBLE:
					credible_events++;
					break;
				case FME_DISPROVED:
					break;
				case FME_WAIT:
					if (my_delay < arrow_delay)
						arrow_delay = my_delay;
					waiting_events++;
					break;
				default:
					out(O_DIE,
					    "Bug in requirements_test.");
				}
			else
				deferred_events++;
		}
		if (!(bp->mark & BUBBLE_OK) && waiting_events == 0) {
			bp->mark |= BUBBLE_ELIDED;
			continue;
		}
		indent();
		out(O_ALTFP|O_VERB, " Credible: %d Waiting %d",
		    credible_events + deferred_events, waiting_events);
		if (credible_events + deferred_events + waiting_events < n) {
			/* Can never meet requirements */
			ep->cached_state |= REQMNTS_DISPROVED;
			indent();
			out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED ");
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
			out(O_ALTFP|O_VERB, NULL);
			indent_pop();
			return (FME_DISPROVED);
		}
		if (credible_events + deferred_events < n) {
			/* will have to wait */
			/* wait time is shortest known */
			if (arrow_delay < overall_delay)
				overall_delay = arrow_delay;
			return_value = FME_WAIT;
		} else if (credible_events < n) {
			if (return_value != FME_WAIT)
				return_value = FME_DEFERRED;
		}
	}

	/*
	 * don't mark as FME_DEFERRED. If this event isn't reached by another
	 * path, then this will be considered FME_CREDIBLE. But if it is
	 * reached by a different path so the K-count is met, then might
	 * get overridden by FME_WAIT or FME_DISPROVED.
	 */
	if (return_value == FME_WAIT) {
		ep->cached_state |= REQMNTS_WAIT;
		ep->cached_delay = *pdelay = overall_delay;
	} else if (return_value == FME_CREDIBLE) {
		ep->cached_state |= REQMNTS_CREDIBLE;
	}
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ",
	    fme_state2str(return_value));
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);
	indent_pop();
	return (return_value);
}

/*
 * causes_test -- test whether enough of ep's possible causes hold up
 *
 * For each inbound (B_TO) bubble, hypothesise() is run on every
 * not-yet-tested tail event whose arrow constraints allow traversal.
 * The credible and waiting results are compared against the bubble's
 * K value: fewer than K means FME_DISPROVED; any waiting result means
 * FME_WAIT (with *pdelay set); otherwise FME_CREDIBLE.
 */
static enum fme_state
causes_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay)
{
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;
	unsigned long long my_delay;
	int credible_results = 0;
	int waiting_results = 0;
	enum fme_state fstate;
	struct event *tail_event;
	struct bubble *bp;
	struct arrowlist *ap;
	int k = 1;

	stats_counter_bump(fmep->Ccallcount);
	indent_push(" C");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);

	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_TO)
			continue;
		k = bp->nork;	/* remember the K value */
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			int do_not_follow = 0;

			/*
			 * if we get to the same event multiple times
			 * only worry about the first one.
 */
			if (ap->arrowp->tail->myevent->cached_state &
			    CAUSES_TESTED) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " causes test already run for ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
				    ap->arrowp->tail->myevent);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}

			/*
			 * see if false constraint prevents us
			 * from traversing this arrow
			 */
			platform_set_payloadnvp(ep->nvp);
			if (checkconstraints(fmep, ap->arrowp) == 0)
				do_not_follow = 1;
			platform_set_payloadnvp(NULL);
			if (do_not_follow) {
				indent();
				out(O_ALTFP|O_VERB|O_NONL,
				    " False arrow from ");
				itree_pevent_brief(O_ALTFP|O_VERB|O_NONL,
				    ap->arrowp->tail->myevent);
				out(O_ALTFP|O_VERB, NULL);
				continue;
			}

			ap->arrowp->tail->myevent->cached_state |=
			    CAUSES_TESTED;
			tail_event = ap->arrowp->tail->myevent;
			fstate = hypothesise(fmep, tail_event, at_latest_by,
			    &my_delay);

			switch (fstate) {
			case FME_WAIT:
				/* remember the shortest wait seen */
				if (my_delay < overall_delay)
					overall_delay = my_delay;
				waiting_results++;
				break;
			case FME_CREDIBLE:
				credible_results++;
				break;
			case FME_DISPROVED:
				break;
			default:
				out(O_DIE, "Bug in causes_test");
			}
		}
	}
	/* compare against K */
	if (credible_results + waiting_results < k) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();
		return (FME_DISPROVED);
	}
	if (waiting_results != 0) {
		*pdelay = overall_delay;
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB|O_NONL, " to ");
		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();
		return (FME_WAIT);
	}
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES CREDIBLE ");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);
	indent_pop();
	return (FME_CREDIBLE);
}

/*
 * hypothesise -- test whether event ep could be a culprit
 *
 * Runs requirements_test() on ep, then -- if that does not disprove it
 * -- either effects_test() (for problem events, is_problem()) or
 * causes_test() (for everything else).  A problem that survives both
 * tests is linked onto fmep->suspects (unless we are only peeking).
 * Returns FME_CREDIBLE, FME_WAIT (with *pdelay set to the shortest
 * wait), or FME_DISPROVED.
 */
static enum fme_state
hypothesise(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay)
{
	enum fme_state rtr, otr;
	unsigned long long my_delay;
	unsigned long long overall_delay = TIMEVAL_EVENTUALLY;

	stats_counter_bump(fmep->Hcallcount);
	indent_push(" H");
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "->");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB|O_NONL, ", at latest by: ");
	ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by);
	out(O_ALTFP|O_VERB, NULL);

	rtr = requirements_test(fmep, ep, at_latest_by, &my_delay);
	if ((rtr == FME_WAIT) && (my_delay < overall_delay))
		overall_delay = my_delay;
	if (rtr != FME_DISPROVED) {
		if (is_problem(ep->t)) {
			otr = effects_test(fmep, ep, at_latest_by, &my_delay);
			if (otr != FME_DISPROVED) {
				/* record a new suspect, once only */
				if (fmep->peek == 0 && ep->is_suspect == 0) {
					ep->suspects = fmep->suspects;
					ep->is_suspect = 1;
					fmep->suspects = ep;
					fmep->nsuspects++;
					if (!is_fault(ep->t))
						fmep->nonfault++;
				}
			}
		} else
			otr = causes_test(fmep, ep, at_latest_by, &my_delay);
		if ((otr == FME_WAIT) && (my_delay < overall_delay))
			overall_delay = my_delay;
		if ((otr != FME_DISPROVED) &&
		    ((rtr == FME_WAIT) || (otr == FME_WAIT)))
			*pdelay = overall_delay;
	}
	if (rtr == FME_DISPROVED) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, " (doesn't meet requirements)");
		indent_pop();
		return (FME_DISPROVED);
	}
	if ((otr == FME_DISPROVED) && is_problem(ep->t)) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, " (doesn't explain all reports)");
		indent_pop();
		return (FME_DISPROVED);
	}
	if (otr == FME_DISPROVED) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB, " (causes are not credible)");
		indent_pop();
		return (FME_DISPROVED);
	}
	if ((rtr == FME_WAIT) || (otr == FME_WAIT)) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, "<-WAIT ");
		itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
		out(O_ALTFP|O_VERB|O_NONL, " to ");
		ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay);
		out(O_ALTFP|O_VERB, NULL);
		indent_pop();
		return (FME_WAIT);
	}
	indent();
	out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE ");
	itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep);
	out(O_ALTFP|O_VERB, NULL);
	indent_pop();
	return (FME_CREDIBLE);
}

/*
 * fme_istat_load -- reconstitute any persistent istats
 */
void
fme_istat_load(fmd_hdl_t *hdl)
{
	int sz;
	char *sbuf;
	char *ptr;

	if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) {
		out(O_ALTFP, "fme_istat_load: No stats");
		return;
	}

	/* sz is bounded by our own previously persisted buffer */
	sbuf = alloca(sz);

	fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz);

	/*
	 * pick apart the serialized stats
	 *
	 * format is:
	 *	<class-name>, '@', <path>, '\0', <value>, '\0'
	 * for example:
	 *	"stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0"
	 *
	 * since this is parsing our own serialized data, any parsing issues
	 * are fatal, so we check for them all with ASSERT() below.
	 */
	ptr = sbuf;
	while (ptr < &sbuf[sz]) {
		char *sepptr;
		struct node *np;
		int val;

		sepptr = strchr(ptr, '@');
		ASSERT(sepptr != NULL);
		*sepptr = '\0';

		/* construct the event */
		np = newnode(T_EVENT, NULL, 0);
		np->u.event.ename = newnode(T_NAME, NULL, 0);
		np->u.event.ename->u.name.t = N_STAT;
		np->u.event.ename->u.name.s = stable(ptr);
		np->u.event.ename->u.name.it = IT_ENAME;
		np->u.event.ename->u.name.last = np->u.event.ename;

		ptr = sepptr + 1;
		ASSERT(ptr < &sbuf[sz]);
		ptr += strlen(ptr);
		ptr++;	/* move past the '\0' separating path from value */
		ASSERT(ptr < &sbuf[sz]);
		ASSERT(isdigit(*ptr));
		val = atoi(ptr);
		ASSERT(val > 0);
		ptr += strlen(ptr);
		ptr++;	/* move past the final '\0' for this entry */

		np->u.event.epname = pathstring2epnamenp(sepptr + 1);
		ASSERT(np->u.event.epname != NULL);

		/* apply the saved count, then discard the parse tree */
		istat_bump(np, val);
		tree_free(np);
	}

	istat_save();
}