1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * 26 * fme.c -- fault management exercise module 27 * 28 * this module provides the simulated fault management exercise. 29 */ 30 31 #include <stdio.h> 32 #include <stdlib.h> 33 #include <string.h> 34 #include <strings.h> 35 #include <ctype.h> 36 #include <alloca.h> 37 #include <libnvpair.h> 38 #include <sys/fm/protocol.h> 39 #include <fm/fmd_api.h> 40 #include "alloc.h" 41 #include "out.h" 42 #include "stats.h" 43 #include "stable.h" 44 #include "literals.h" 45 #include "lut.h" 46 #include "tree.h" 47 #include "ptree.h" 48 #include "itree.h" 49 #include "ipath.h" 50 #include "fme.h" 51 #include "evnv.h" 52 #include "eval.h" 53 #include "config.h" 54 #include "platform.h" 55 #include "esclex.h" 56 57 /* imported from eft.c... 
*/ 58 extern hrtime_t Hesitate; 59 extern char *Serd_Override; 60 extern nv_alloc_t Eft_nv_hdl; 61 extern int Max_fme; 62 extern fmd_hdl_t *Hdl; 63 64 static int Istat_need_save; 65 static int Serd_need_save; 66 void istat_save(void); 67 void serd_save(void); 68 69 /* fme under construction is global so we can free it on module abort */ 70 static struct fme *Nfmep; 71 72 static int Undiag_reason = UD_VAL_UNKNOWN; 73 74 static int Nextid = 0; 75 76 static int Open_fme_count = 0; /* Count of open FMEs */ 77 78 /* list of fault management exercises underway */ 79 static struct fme { 80 struct fme *next; /* next exercise */ 81 unsigned long long ull; /* time when fme was created */ 82 int id; /* FME id */ 83 struct config *config; /* cooked configuration data */ 84 struct lut *eventtree; /* propagation tree for this FME */ 85 /* 86 * The initial error report that created this FME is kept in 87 * two forms. e0 points to the instance tree node and is used 88 * by fme_eval() as the starting point for the inference 89 * algorithm. e0r is the event handle FMD passed to us when 90 * the ereport first arrived and is used when setting timers, 91 * which are always relative to the time of this initial 92 * report. 
93 */ 94 struct event *e0; 95 fmd_event_t *e0r; 96 97 id_t timer; /* for setting an fmd time-out */ 98 99 struct event *ecurrent; /* ereport under consideration */ 100 struct event *suspects; /* current suspect list */ 101 struct event *psuspects; /* previous suspect list */ 102 int nsuspects; /* count of suspects */ 103 int nonfault; /* zero if all suspects T_FAULT */ 104 int posted_suspects; /* true if we've posted a diagnosis */ 105 int uniqobs; /* number of unique events observed */ 106 int peek; /* just peeking, don't track suspects */ 107 int overflow; /* true if overflow FME */ 108 enum fme_state { 109 FME_NOTHING = 5000, /* not evaluated yet */ 110 FME_WAIT, /* need to wait for more info */ 111 FME_CREDIBLE, /* suspect list is credible */ 112 FME_DISPROVED, /* no valid suspects found */ 113 FME_DEFERRED /* don't know yet (k-count not met) */ 114 } state; 115 116 unsigned long long pull; /* time passed since created */ 117 unsigned long long wull; /* wait until this time for re-eval */ 118 struct event *observations; /* observation list */ 119 struct lut *globals; /* values of global variables */ 120 /* fmd interfacing */ 121 fmd_hdl_t *hdl; /* handle for talking with fmd */ 122 fmd_case_t *fmcase; /* what fmd 'case' we associate with */ 123 /* stats */ 124 struct stats *Rcount; 125 struct stats *Hcallcount; 126 struct stats *Rcallcount; 127 struct stats *Ccallcount; 128 struct stats *Ecallcount; 129 struct stats *Tcallcount; 130 struct stats *Marrowcount; 131 struct stats *diags; 132 } *FMElist, *EFMElist, *ClosedFMEs; 133 134 static struct case_list { 135 fmd_case_t *fmcase; 136 struct case_list *next; 137 } *Undiagablecaselist; 138 139 static void fme_eval(struct fme *fmep, fmd_event_t *ffep); 140 static enum fme_state hypothesise(struct fme *fmep, struct event *ep, 141 unsigned long long at_latest_by, unsigned long long *pdelay); 142 static struct node *eventprop_lookup(struct event *ep, const char *propname); 143 static struct node 
*pathstring2epnamenp(char *path); 144 static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, 145 fmd_case_t *fmcase); 146 static const char *undiag_2reason_str(int ud); 147 static const char *undiag_2defect_str(int ud); 148 static void restore_suspects(struct fme *fmep); 149 static void save_suspects(struct fme *fmep); 150 static void destroy_fme(struct fme *f); 151 static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 152 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl); 153 static void istat_counter_reset_cb(struct istat_entry *entp, 154 struct stats *statp, const struct ipath *ipp); 155 static void istat_counter_topo_chg_cb(struct istat_entry *entp, 156 struct stats *statp, void *unused); 157 static void serd_reset_cb(struct serd_entry *entp, void *unused, 158 const struct ipath *ipp); 159 static void serd_topo_chg_cb(struct serd_entry *entp, void *unused, 160 void *unused2); 161 static void destroy_fme_bufs(struct fme *fp); 162 163 static struct fme * 164 alloc_fme(void) 165 { 166 struct fme *fmep; 167 168 fmep = MALLOC(sizeof (*fmep)); 169 bzero(fmep, sizeof (*fmep)); 170 return (fmep); 171 } 172 173 /* 174 * fme_ready -- called when all initialization of the FME (except for 175 * stats) has completed successfully. Adds the fme to global lists 176 * and establishes its stats. 
177 */ 178 static struct fme * 179 fme_ready(struct fme *fmep) 180 { 181 char nbuf[100]; 182 183 Nfmep = NULL; /* don't need to free this on module abort now */ 184 185 if (EFMElist) { 186 EFMElist->next = fmep; 187 EFMElist = fmep; 188 } else 189 FMElist = EFMElist = fmep; 190 191 (void) sprintf(nbuf, "fme%d.Rcount", fmep->id); 192 fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0); 193 (void) sprintf(nbuf, "fme%d.Hcall", fmep->id); 194 fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1); 195 (void) sprintf(nbuf, "fme%d.Rcall", fmep->id); 196 fmep->Rcallcount = stats_new_counter(nbuf, 197 "calls to requirements_test()", 1); 198 (void) sprintf(nbuf, "fme%d.Ccall", fmep->id); 199 fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1); 200 (void) sprintf(nbuf, "fme%d.Ecall", fmep->id); 201 fmep->Ecallcount = 202 stats_new_counter(nbuf, "calls to effects_test()", 1); 203 (void) sprintf(nbuf, "fme%d.Tcall", fmep->id); 204 fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1); 205 (void) sprintf(nbuf, "fme%d.Marrow", fmep->id); 206 fmep->Marrowcount = stats_new_counter(nbuf, 207 "arrows marked by mark_arrows()", 1); 208 (void) sprintf(nbuf, "fme%d.diags", fmep->id); 209 fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0); 210 211 out(O_ALTFP|O_VERB2, "newfme: config snapshot contains..."); 212 config_print(O_ALTFP|O_VERB2, fmep->config); 213 214 return (fmep); 215 } 216 217 extern void ipath_dummy_lut(struct arrow *); 218 extern struct lut *itree_create_dummy(const char *, const struct ipath *); 219 220 /* ARGSUSED */ 221 static void 222 set_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 223 { 224 struct bubble *bp; 225 struct arrowlist *ap; 226 227 for (bp = itree_next_bubble(ep, NULL); bp; 228 bp = itree_next_bubble(ep, bp)) { 229 if (bp->t != B_FROM) 230 continue; 231 for (ap = itree_next_arrow(bp, NULL); ap; 232 ap = itree_next_arrow(bp, ap)) { 233 
ap->arrowp->pnode->u.arrow.needed = 1; 234 ipath_dummy_lut(ap->arrowp); 235 } 236 } 237 } 238 239 /* ARGSUSED */ 240 static void 241 unset_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 242 { 243 struct bubble *bp; 244 struct arrowlist *ap; 245 246 for (bp = itree_next_bubble(ep, NULL); bp; 247 bp = itree_next_bubble(ep, bp)) { 248 if (bp->t != B_FROM) 249 continue; 250 for (ap = itree_next_arrow(bp, NULL); ap; 251 ap = itree_next_arrow(bp, ap)) 252 ap->arrowp->pnode->u.arrow.needed = 0; 253 } 254 } 255 256 static void globals_destructor(void *left, void *right, void *arg); 257 static void clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep); 258 259 static void 260 prune_propagations(const char *e0class, const struct ipath *e0ipp) 261 { 262 char nbuf[100]; 263 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 264 extern struct lut *Usednames; 265 266 Nfmep = alloc_fme(); 267 Nfmep->id = Nextid; 268 Nfmep->state = FME_NOTHING; 269 Nfmep->eventtree = itree_create_dummy(e0class, e0ipp); 270 if ((Nfmep->e0 = 271 itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) { 272 out(O_ALTFP, "prune_propagations: e0 not in instance tree"); 273 itree_free(Nfmep->eventtree); 274 FREE(Nfmep); 275 Nfmep = NULL; 276 return; 277 } 278 Nfmep->ecurrent = Nfmep->observations = Nfmep->e0; 279 Nfmep->e0->count++; 280 281 (void) sprintf(nbuf, "fme%d.Rcount", Nfmep->id); 282 Nfmep->Rcount = stats_new_counter(nbuf, "ereports received", 0); 283 (void) sprintf(nbuf, "fme%d.Hcall", Nfmep->id); 284 Nfmep->Hcallcount = 285 stats_new_counter(nbuf, "calls to hypothesise()", 1); 286 (void) sprintf(nbuf, "fme%d.Rcall", Nfmep->id); 287 Nfmep->Rcallcount = stats_new_counter(nbuf, 288 "calls to requirements_test()", 1); 289 (void) sprintf(nbuf, "fme%d.Ccall", Nfmep->id); 290 Nfmep->Ccallcount = 291 stats_new_counter(nbuf, "calls to causes_test()", 1); 292 (void) sprintf(nbuf, "fme%d.Ecall", Nfmep->id); 293 Nfmep->Ecallcount = 294 stats_new_counter(nbuf, "calls to 
effects_test()", 1); 295 (void) sprintf(nbuf, "fme%d.Tcall", Nfmep->id); 296 Nfmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1); 297 (void) sprintf(nbuf, "fme%d.Marrow", Nfmep->id); 298 Nfmep->Marrowcount = stats_new_counter(nbuf, 299 "arrows marked by mark_arrows()", 1); 300 (void) sprintf(nbuf, "fme%d.diags", Nfmep->id); 301 Nfmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0); 302 303 Nfmep->peek = 1; 304 lut_walk(Nfmep->eventtree, (lut_cb)unset_needed_arrows, (void *)Nfmep); 305 lut_free(Usednames, NULL, NULL); 306 Usednames = NULL; 307 lut_walk(Nfmep->eventtree, (lut_cb)clear_arrows, (void *)Nfmep); 308 (void) hypothesise(Nfmep, Nfmep->e0, Nfmep->ull, &my_delay); 309 itree_prune(Nfmep->eventtree); 310 lut_walk(Nfmep->eventtree, (lut_cb)set_needed_arrows, (void *)Nfmep); 311 312 stats_delete(Nfmep->Rcount); 313 stats_delete(Nfmep->Hcallcount); 314 stats_delete(Nfmep->Rcallcount); 315 stats_delete(Nfmep->Ccallcount); 316 stats_delete(Nfmep->Ecallcount); 317 stats_delete(Nfmep->Tcallcount); 318 stats_delete(Nfmep->Marrowcount); 319 stats_delete(Nfmep->diags); 320 itree_free(Nfmep->eventtree); 321 lut_free(Nfmep->globals, globals_destructor, NULL); 322 FREE(Nfmep); 323 } 324 325 static struct fme * 326 newfme(const char *e0class, const struct ipath *e0ipp, fmd_hdl_t *hdl, 327 fmd_case_t *fmcase) 328 { 329 struct cfgdata *cfgdata; 330 int init_size; 331 extern int alloc_total(); 332 333 init_size = alloc_total(); 334 out(O_ALTFP|O_STAMP, "start config_snapshot using %d bytes", init_size); 335 cfgdata = config_snapshot(); 336 platform_save_config(hdl, fmcase); 337 out(O_ALTFP|O_STAMP, "config_snapshot added %d bytes", 338 alloc_total() - init_size); 339 340 Nfmep = alloc_fme(); 341 342 Nfmep->id = Nextid++; 343 Nfmep->config = cfgdata->cooked; 344 config_free(cfgdata); 345 Nfmep->posted_suspects = 0; 346 Nfmep->uniqobs = 0; 347 Nfmep->state = FME_NOTHING; 348 Nfmep->pull = 0ULL; 349 Nfmep->overflow = 0; 350 351 Nfmep->fmcase = 
fmcase; 352 Nfmep->hdl = hdl; 353 354 if ((Nfmep->eventtree = itree_create(Nfmep->config)) == NULL) { 355 out(O_ALTFP, "newfme: NULL instance tree"); 356 Undiag_reason = UD_VAL_INSTFAIL; 357 structconfig_free(Nfmep->config); 358 destroy_fme_bufs(Nfmep); 359 FREE(Nfmep); 360 Nfmep = NULL; 361 return (NULL); 362 } 363 364 itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree); 365 366 if ((Nfmep->e0 = 367 itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) { 368 out(O_ALTFP, "newfme: e0 not in instance tree"); 369 Undiag_reason = UD_VAL_BADEVENTI; 370 itree_free(Nfmep->eventtree); 371 structconfig_free(Nfmep->config); 372 destroy_fme_bufs(Nfmep); 373 FREE(Nfmep); 374 Nfmep = NULL; 375 return (NULL); 376 } 377 378 return (fme_ready(Nfmep)); 379 } 380 381 void 382 fme_fini(void) 383 { 384 struct fme *sfp, *fp; 385 struct case_list *ucasep, *nextcasep; 386 387 ucasep = Undiagablecaselist; 388 while (ucasep != NULL) { 389 nextcasep = ucasep->next; 390 FREE(ucasep); 391 ucasep = nextcasep; 392 } 393 Undiagablecaselist = NULL; 394 395 /* clean up closed fmes */ 396 fp = ClosedFMEs; 397 while (fp != NULL) { 398 sfp = fp->next; 399 destroy_fme(fp); 400 fp = sfp; 401 } 402 ClosedFMEs = NULL; 403 404 fp = FMElist; 405 while (fp != NULL) { 406 sfp = fp->next; 407 destroy_fme(fp); 408 fp = sfp; 409 } 410 FMElist = EFMElist = NULL; 411 412 /* if we were in the middle of creating an fme, free it now */ 413 if (Nfmep) { 414 destroy_fme(Nfmep); 415 Nfmep = NULL; 416 } 417 } 418 419 /* 420 * Allocated space for a buffer name. 20 bytes allows for 421 * a ridiculous 9,999,999 unique observations. 422 */ 423 #define OBBUFNMSZ 20 424 425 /* 426 * serialize_observation 427 * 428 * Create a recoverable version of the current observation 429 * (f->ecurrent). We keep a serialized version of each unique 430 * observation in order that we may resume correctly the fme in the 431 * correct state if eft or fmd crashes and we're restarted. 
432 */ 433 static void 434 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp) 435 { 436 size_t pkdlen; 437 char tmpbuf[OBBUFNMSZ]; 438 char *pkd = NULL; 439 char *estr; 440 441 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs); 442 estr = ipath2str(cls, ipp); 443 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1); 444 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr, 445 strlen(estr) + 1); 446 FREE(estr); 447 448 if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) { 449 (void) snprintf(tmpbuf, 450 OBBUFNMSZ, "observed%d.nvp", fp->uniqobs); 451 if (nvlist_xpack(fp->ecurrent->nvp, 452 &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0) 453 out(O_DIE|O_SYS, "pack of observed nvl failed"); 454 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen); 455 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen); 456 FREE(pkd); 457 } 458 459 fp->uniqobs++; 460 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 461 sizeof (fp->uniqobs)); 462 } 463 464 /* 465 * init_fme_bufs -- We keep several bits of state about an fme for 466 * use if eft or fmd crashes and we're restarted. 
467 */ 468 static void 469 init_fme_bufs(struct fme *fp) 470 { 471 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull)); 472 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull, 473 sizeof (fp->pull)); 474 475 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id)); 476 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id, 477 sizeof (fp->id)); 478 479 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs)); 480 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 481 sizeof (fp->uniqobs)); 482 483 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD, 484 sizeof (fp->posted_suspects)); 485 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD, 486 (void *)&fp->posted_suspects, sizeof (fp->posted_suspects)); 487 } 488 489 static void 490 destroy_fme_bufs(struct fme *fp) 491 { 492 char tmpbuf[OBBUFNMSZ]; 493 int o; 494 495 platform_restore_config(fp->hdl, fp->fmcase); 496 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN); 497 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG); 498 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL); 499 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID); 500 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD); 501 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS); 502 503 for (o = 0; o < fp->uniqobs; o++) { 504 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o); 505 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 506 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o); 507 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 508 } 509 } 510 511 /* 512 * reconstitute_observations -- convert a case's serialized observations 513 * back into struct events. Returns zero if all observations are 514 * successfully reconstituted. 
515 */ 516 static int 517 reconstitute_observations(struct fme *fmep) 518 { 519 struct event *ep; 520 struct node *epnamenp = NULL; 521 size_t pkdlen; 522 char *pkd = NULL; 523 char *tmpbuf = alloca(OBBUFNMSZ); 524 char *sepptr; 525 char *estr; 526 int ocnt; 527 int elen; 528 529 for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) { 530 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt); 531 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 532 if (elen == 0) { 533 out(O_ALTFP, 534 "reconstitute_observation: no %s buffer found.", 535 tmpbuf); 536 Undiag_reason = UD_VAL_MISSINGOBS; 537 break; 538 } 539 540 estr = MALLOC(elen); 541 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 542 sepptr = strchr(estr, '@'); 543 if (sepptr == NULL) { 544 out(O_ALTFP, 545 "reconstitute_observation: %s: " 546 "missing @ separator in %s.", 547 tmpbuf, estr); 548 Undiag_reason = UD_VAL_MISSINGPATH; 549 FREE(estr); 550 break; 551 } 552 553 *sepptr = '\0'; 554 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 555 out(O_ALTFP, 556 "reconstitute_observation: %s: " 557 "trouble converting path string \"%s\" " 558 "to internal representation.", 559 tmpbuf, sepptr + 1); 560 Undiag_reason = UD_VAL_MISSINGPATH; 561 FREE(estr); 562 break; 563 } 564 565 /* construct the event */ 566 ep = itree_lookup(fmep->eventtree, 567 stable(estr), ipath(epnamenp)); 568 if (ep == NULL) { 569 out(O_ALTFP, 570 "reconstitute_observation: %s: " 571 "lookup of \"%s\" in itree failed.", 572 tmpbuf, ipath2str(estr, ipath(epnamenp))); 573 Undiag_reason = UD_VAL_BADOBS; 574 tree_free(epnamenp); 575 FREE(estr); 576 break; 577 } 578 tree_free(epnamenp); 579 580 /* 581 * We may or may not have a saved nvlist for the observation 582 */ 583 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt); 584 pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 585 if (pkdlen != 0) { 586 pkd = MALLOC(pkdlen); 587 fmd_buf_read(fmep->hdl, 588 fmep->fmcase, tmpbuf, pkd, pkdlen); 589 ASSERT(ep->nvp == NULL); 
590 if (nvlist_xunpack(pkd, 591 pkdlen, &ep->nvp, &Eft_nv_hdl) != 0) 592 out(O_DIE|O_SYS, "pack of observed nvl failed"); 593 FREE(pkd); 594 } 595 596 if (ocnt == 0) 597 fmep->e0 = ep; 598 599 FREE(estr); 600 fmep->ecurrent = ep; 601 ep->count++; 602 603 /* link it into list of observations seen */ 604 ep->observations = fmep->observations; 605 fmep->observations = ep; 606 } 607 608 if (ocnt == fmep->uniqobs) { 609 (void) fme_ready(fmep); 610 return (0); 611 } 612 613 return (1); 614 } 615 616 /* 617 * restart_fme -- called during eft initialization. Reconstitutes 618 * an in-progress fme. 619 */ 620 void 621 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress) 622 { 623 nvlist_t *defect; 624 struct case_list *bad; 625 struct fme *fmep; 626 struct cfgdata *cfgdata; 627 size_t rawsz; 628 struct event *ep; 629 char *tmpbuf = alloca(OBBUFNMSZ); 630 char *sepptr; 631 char *estr; 632 int elen; 633 struct node *epnamenp = NULL; 634 int init_size; 635 extern int alloc_total(); 636 637 /* 638 * ignore solved or closed cases 639 */ 640 if (fmd_case_solved(hdl, inprogress) || 641 fmd_case_closed(hdl, inprogress)) 642 return; 643 644 fmep = alloc_fme(); 645 fmep->fmcase = inprogress; 646 fmep->hdl = hdl; 647 648 if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) { 649 out(O_ALTFP, "restart_fme: no saved posted status"); 650 Undiag_reason = UD_VAL_MISSINGINFO; 651 goto badcase; 652 } else { 653 fmd_buf_read(hdl, inprogress, WOBUF_POSTD, 654 (void *)&fmep->posted_suspects, 655 sizeof (fmep->posted_suspects)); 656 } 657 658 if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) { 659 out(O_ALTFP, "restart_fme: no saved id"); 660 Undiag_reason = UD_VAL_MISSINGINFO; 661 goto badcase; 662 } else { 663 fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id, 664 sizeof (fmep->id)); 665 } 666 if (Nextid <= fmep->id) 667 Nextid = fmep->id + 1; 668 669 out(O_ALTFP, "Replay FME %d", fmep->id); 670 671 if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) { 672 out(O_ALTFP, 
"restart_fme: No config data"); 673 Undiag_reason = UD_VAL_MISSINGINFO; 674 goto badcase; 675 } 676 fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz, 677 sizeof (size_t)); 678 679 if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) { 680 out(O_ALTFP, "restart_fme: No event zero"); 681 Undiag_reason = UD_VAL_MISSINGZERO; 682 goto badcase; 683 } 684 685 if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) { 686 out(O_ALTFP, "restart_fme: no saved wait time"); 687 Undiag_reason = UD_VAL_MISSINGINFO; 688 goto badcase; 689 } else { 690 fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull, 691 sizeof (fmep->pull)); 692 } 693 694 if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) { 695 out(O_ALTFP, "restart_fme: no count of observations"); 696 Undiag_reason = UD_VAL_MISSINGINFO; 697 goto badcase; 698 } else { 699 fmd_buf_read(hdl, inprogress, WOBUF_NOBS, 700 (void *)&fmep->uniqobs, sizeof (fmep->uniqobs)); 701 } 702 703 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed0"); 704 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 705 if (elen == 0) { 706 out(O_ALTFP, "reconstitute_observation: no %s buffer found.", 707 tmpbuf); 708 Undiag_reason = UD_VAL_MISSINGOBS; 709 goto badcase; 710 } 711 estr = MALLOC(elen); 712 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 713 sepptr = strchr(estr, '@'); 714 if (sepptr == NULL) { 715 out(O_ALTFP, "reconstitute_observation: %s: " 716 "missing @ separator in %s.", 717 tmpbuf, estr); 718 Undiag_reason = UD_VAL_MISSINGPATH; 719 FREE(estr); 720 goto badcase; 721 } 722 *sepptr = '\0'; 723 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 724 out(O_ALTFP, "reconstitute_observation: %s: " 725 "trouble converting path string \"%s\" " 726 "to internal representation.", tmpbuf, sepptr + 1); 727 Undiag_reason = UD_VAL_MISSINGPATH; 728 FREE(estr); 729 goto badcase; 730 } 731 prune_propagations(stable(estr), ipath(epnamenp)); 732 tree_free(epnamenp); 733 FREE(estr); 734 735 init_size = 
alloc_total(); 736 out(O_ALTFP|O_STAMP, "start config_restore using %d bytes", init_size); 737 cfgdata = MALLOC(sizeof (struct cfgdata)); 738 cfgdata->cooked = NULL; 739 cfgdata->devcache = NULL; 740 cfgdata->devidcache = NULL; 741 cfgdata->cpucache = NULL; 742 cfgdata->raw_refcnt = 1; 743 744 if (rawsz > 0) { 745 if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) { 746 out(O_ALTFP, "restart_fme: Config data size mismatch"); 747 Undiag_reason = UD_VAL_CFGMISMATCH; 748 goto badcase; 749 } 750 cfgdata->begin = MALLOC(rawsz); 751 cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz; 752 fmd_buf_read(hdl, 753 inprogress, WOBUF_CFG, cfgdata->begin, rawsz); 754 } else { 755 cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL; 756 } 757 758 config_cook(cfgdata); 759 fmep->config = cfgdata->cooked; 760 config_free(cfgdata); 761 out(O_ALTFP|O_STAMP, "config_restore added %d bytes", 762 alloc_total() - init_size); 763 764 if ((fmep->eventtree = itree_create(fmep->config)) == NULL) { 765 /* case not properly saved or irretrievable */ 766 out(O_ALTFP, "restart_fme: NULL instance tree"); 767 Undiag_reason = UD_VAL_INSTFAIL; 768 goto badcase; 769 } 770 771 itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree); 772 773 if (reconstitute_observations(fmep) != 0) 774 goto badcase; 775 776 out(O_ALTFP|O_NONL, "FME %d replay observations: ", fmep->id); 777 for (ep = fmep->observations; ep; ep = ep->observations) { 778 out(O_ALTFP|O_NONL, " "); 779 itree_pevent_brief(O_ALTFP|O_NONL, ep); 780 } 781 out(O_ALTFP, NULL); 782 783 Open_fme_count++; 784 785 /* give the diagnosis algorithm a shot at the new FME state */ 786 fme_eval(fmep, fmep->e0r); 787 return; 788 789 badcase: 790 if (fmep->eventtree != NULL) 791 itree_free(fmep->eventtree); 792 if (fmep->config) 793 structconfig_free(fmep->config); 794 destroy_fme_bufs(fmep); 795 FREE(fmep); 796 797 /* 798 * Since we're unable to restart the case, add it to the undiagable 799 * list and solve and close it as appropriate. 
800 */ 801 bad = MALLOC(sizeof (struct case_list)); 802 bad->next = NULL; 803 804 if (Undiagablecaselist != NULL) 805 bad->next = Undiagablecaselist; 806 Undiagablecaselist = bad; 807 bad->fmcase = inprogress; 808 809 out(O_ALTFP|O_NONL, "[case %s (unable to restart), ", 810 fmd_case_uuid(hdl, bad->fmcase)); 811 812 if (fmd_case_solved(hdl, bad->fmcase)) { 813 out(O_ALTFP|O_NONL, "already solved, "); 814 } else { 815 out(O_ALTFP|O_NONL, "solving, "); 816 defect = fmd_nvl_create_fault(hdl, 817 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 818 (void) nvlist_add_string(defect, UNDIAG_REASON, 819 undiag_2reason_str(Undiag_reason)); 820 fmd_case_add_suspect(hdl, bad->fmcase, defect); 821 fmd_case_solve(hdl, bad->fmcase); 822 Undiag_reason = UD_VAL_UNKNOWN; 823 } 824 825 if (fmd_case_closed(hdl, bad->fmcase)) { 826 out(O_ALTFP, "already closed ]"); 827 } else { 828 out(O_ALTFP, "closing ]"); 829 fmd_case_close(hdl, bad->fmcase); 830 } 831 } 832 833 /*ARGSUSED*/ 834 static void 835 globals_destructor(void *left, void *right, void *arg) 836 { 837 struct evalue *evp = (struct evalue *)right; 838 if (evp->t == NODEPTR) 839 tree_free((struct node *)(uintptr_t)evp->v); 840 evp->v = (uintptr_t)NULL; 841 FREE(evp); 842 } 843 844 void 845 destroy_fme(struct fme *f) 846 { 847 stats_delete(f->Rcount); 848 stats_delete(f->Hcallcount); 849 stats_delete(f->Rcallcount); 850 stats_delete(f->Ccallcount); 851 stats_delete(f->Ecallcount); 852 stats_delete(f->Tcallcount); 853 stats_delete(f->Marrowcount); 854 stats_delete(f->diags); 855 856 if (f->eventtree != NULL) 857 itree_free(f->eventtree); 858 if (f->config) 859 structconfig_free(f->config); 860 lut_free(f->globals, globals_destructor, NULL); 861 FREE(f); 862 } 863 864 static const char * 865 fme_state2str(enum fme_state s) 866 { 867 switch (s) { 868 case FME_NOTHING: return ("NOTHING"); 869 case FME_WAIT: return ("WAIT"); 870 case FME_CREDIBLE: return ("CREDIBLE"); 871 case FME_DISPROVED: return ("DISPROVED"); 872 case 
FME_DEFERRED: return ("DEFERRED"); 873 default: return ("UNKNOWN"); 874 } 875 } 876 877 static int 878 is_problem(enum nametype t) 879 { 880 return (t == N_FAULT || t == N_DEFECT || t == N_UPSET); 881 } 882 883 static int 884 is_fault(enum nametype t) 885 { 886 return (t == N_FAULT); 887 } 888 889 static int 890 is_defect(enum nametype t) 891 { 892 return (t == N_DEFECT); 893 } 894 895 static int 896 is_upset(enum nametype t) 897 { 898 return (t == N_UPSET); 899 } 900 901 static void 902 fme_print(int flags, struct fme *fmep) 903 { 904 struct event *ep; 905 906 out(flags, "Fault Management Exercise %d", fmep->id); 907 out(flags, "\t State: %s", fme_state2str(fmep->state)); 908 out(flags|O_NONL, "\t Start time: "); 909 ptree_timeval(flags|O_NONL, &fmep->ull); 910 out(flags, NULL); 911 if (fmep->wull) { 912 out(flags|O_NONL, "\t Wait time: "); 913 ptree_timeval(flags|O_NONL, &fmep->wull); 914 out(flags, NULL); 915 } 916 out(flags|O_NONL, "\t E0: "); 917 if (fmep->e0) 918 itree_pevent_brief(flags|O_NONL, fmep->e0); 919 else 920 out(flags|O_NONL, "NULL"); 921 out(flags, NULL); 922 out(flags|O_NONL, "\tObservations:"); 923 for (ep = fmep->observations; ep; ep = ep->observations) { 924 out(flags|O_NONL, " "); 925 itree_pevent_brief(flags|O_NONL, ep); 926 } 927 out(flags, NULL); 928 out(flags|O_NONL, "\tSuspect list:"); 929 for (ep = fmep->suspects; ep; ep = ep->suspects) { 930 out(flags|O_NONL, " "); 931 itree_pevent_brief(flags|O_NONL, ep); 932 } 933 out(flags, NULL); 934 if (fmep->eventtree != NULL) { 935 out(flags|O_VERB2, "\t Tree:"); 936 itree_ptree(flags|O_VERB2, fmep->eventtree); 937 } 938 } 939 940 static struct node * 941 pathstring2epnamenp(char *path) 942 { 943 char *sep = "/"; 944 struct node *ret; 945 char *ptr; 946 947 if ((ptr = strtok(path, sep)) == NULL) 948 out(O_DIE, "pathstring2epnamenp: invalid empty class"); 949 950 ret = tree_iname(stable(ptr), NULL, 0); 951 952 while ((ptr = strtok(NULL, sep)) != NULL) 953 ret = tree_name_append(ret, 954 
tree_iname(stable(ptr), NULL, 0)); 955 956 return (ret); 957 } 958 959 /* 960 * for a given upset sp, increment the corresponding SERD engine. if the 961 * SERD engine trips, return the ename and ipp of the resulting ereport. 962 * returns true if engine tripped and *enamep and *ippp were filled in. 963 */ 964 static int 965 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep, 966 fmd_case_t *fmcase, struct event *sp, const char **enamep, 967 const struct ipath **ippp) 968 { 969 struct node *serdinst; 970 char *serdname; 971 char *serdresource; 972 struct node *nid; 973 struct serd_entry *newentp; 974 int i, serdn = -1, serdincrement = 1, len = 0; 975 char *serdsuffix = NULL, *serdt = NULL, *ptr; 976 struct evalue *ep; 977 978 ASSERT(sp->t == N_UPSET); 979 ASSERT(ffep != NULL); 980 981 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 982 (void *)"n", (lut_cmp)strcmp)) != NULL) { 983 ASSERT(ep->t == UINT64); 984 serdn = (int)ep->v; 985 } 986 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 987 (void *)"t", (lut_cmp)strcmp)) != NULL) { 988 ASSERT(ep->t == STRING); 989 serdt = (char *)(uintptr_t)ep->v; 990 } 991 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 992 (void *)"suffix", (lut_cmp)strcmp)) != NULL) { 993 ASSERT(ep->t == STRING); 994 serdsuffix = (char *)(uintptr_t)ep->v; 995 } 996 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 997 (void *)"increment", (lut_cmp)strcmp)) != NULL) { 998 ASSERT(ep->t == UINT64); 999 serdincrement = (int)ep->v; 1000 } 1001 1002 /* 1003 * obtain instanced SERD engine from the upset sp. from this 1004 * derive serdname, the string used to identify the SERD engine. 
1005 */ 1006 serdinst = eventprop_lookup(sp, L_engine); 1007 1008 if (serdinst == NULL) 1009 return (-1); 1010 1011 serdname = ipath2str(serdinst->u.stmt.np->u.event.ename->u.name.s, 1012 NULL); 1013 serdresource = ipath2str(NULL, 1014 ipath(serdinst->u.stmt.np->u.event.epname)); 1015 1016 len = strlen(serdname) + strlen(serdresource) + 2; 1017 if (serdsuffix != NULL) 1018 len += strlen(serdsuffix); 1019 1020 ptr = MALLOC(len); 1021 if (serdsuffix != NULL) { 1022 (void) snprintf(ptr, len, "%s%s@%s", serdname, serdsuffix, 1023 serdresource); 1024 } else { 1025 (void) snprintf(ptr, len, "%s@%s", serdname, serdresource); 1026 } 1027 FREE(serdname); 1028 FREE(serdresource); 1029 serdname = ptr; 1030 1031 /* handle serd engine "id" property, if there is one */ 1032 if ((nid = 1033 lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) { 1034 struct evalue *gval; 1035 char suffixbuf[200]; 1036 char *suffix; 1037 char *nserdname; 1038 size_t nname; 1039 1040 out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname); 1041 ptree_name_iter(O_ALTFP|O_NONL, nid); 1042 1043 ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t)); 1044 1045 if ((gval = lut_lookup(fmep->globals, 1046 (void *)nid->u.globid.s, NULL)) == NULL) { 1047 out(O_ALTFP, " undefined"); 1048 } else if (gval->t == UINT64) { 1049 out(O_ALTFP, " %llu", gval->v); 1050 (void) sprintf(suffixbuf, "%llu", gval->v); 1051 suffix = suffixbuf; 1052 } else { 1053 out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v); 1054 suffix = (char *)(uintptr_t)gval->v; 1055 } 1056 1057 nname = strlen(serdname) + strlen(suffix) + 2; 1058 nserdname = MALLOC(nname); 1059 (void) snprintf(nserdname, nname, "%s:%s", serdname, suffix); 1060 FREE(serdname); 1061 serdname = nserdname; 1062 } 1063 1064 /* 1065 * if the engine is empty, and we have an override for n/t then 1066 * destroy and recreate it. 
1067 */ 1068 if ((serdn != -1 || serdt != NULL) && fmd_serd_exists(hdl, serdname) && 1069 fmd_serd_empty(hdl, serdname)) 1070 fmd_serd_destroy(hdl, serdname); 1071 1072 if (!fmd_serd_exists(hdl, serdname)) { 1073 struct node *nN, *nT; 1074 const char *s; 1075 struct node *nodep; 1076 struct config *cp; 1077 char *path; 1078 uint_t nval; 1079 hrtime_t tval; 1080 const char *name; 1081 char *tptr; 1082 char *serd_name; 1083 int i; 1084 int tmplen; 1085 char *ptr; 1086 int got_n_override = 0, got_t_override = 0; 1087 1088 /* no SERD engine yet, so create it */ 1089 nodep = serdinst->u.stmt.np->u.event.epname; 1090 tmplen = strlen(serdinst->u.stmt.np->u.event.ename->u.name.s) 1091 + 2; 1092 if (serdsuffix != NULL) 1093 tmplen += strlen(serdsuffix); 1094 tptr = MALLOC(tmplen); 1095 if (serdsuffix != NULL) { 1096 (void) snprintf(tptr, len, "%s%s", 1097 serdinst->u.stmt.np->u.event.ename->u.name.s, 1098 serdsuffix); 1099 } else { 1100 (void) snprintf(tptr, len, "%s", 1101 serdinst->u.stmt.np->u.event.ename->u.name.s); 1102 } 1103 name = (const char *)tptr; 1104 path = ipath2str(NULL, ipath(nodep)); 1105 cp = config_lookup(fmep->config, path, 0); 1106 FREE((void *)path); 1107 1108 /* 1109 * We allow serd paramaters to be overridden, either from 1110 * eft.conf file values (if Serd_Override is set) or from 1111 * driver properties (for "serd.io.device" engines). 
1112 */ 1113 if (Serd_Override != NULL) { 1114 char *save_ptr, *ptr1, *ptr2, *ptr3; 1115 ptr3 = save_ptr = STRDUP(Serd_Override); 1116 while (*ptr3 != '\0') { 1117 ptr1 = strchr(ptr3, ','); 1118 *ptr1 = '\0'; 1119 if (strcmp(ptr3, name) == 0) { 1120 ptr2 = strchr(ptr1 + 1, ','); 1121 *ptr2 = '\0'; 1122 nval = atoi(ptr1 + 1); 1123 out(O_ALTFP, "serd override %s_n %d", 1124 name, nval); 1125 ptr3 = strchr(ptr2 + 1, ' '); 1126 if (ptr3) 1127 *ptr3 = '\0'; 1128 ptr = STRDUP(ptr2 + 1); 1129 out(O_ALTFP, "serd override %s_t %s", 1130 name, ptr); 1131 got_n_override = 1; 1132 got_t_override = 1; 1133 break; 1134 } else { 1135 ptr2 = strchr(ptr1 + 1, ','); 1136 ptr3 = strchr(ptr2 + 1, ' '); 1137 if (ptr3 == NULL) 1138 break; 1139 } 1140 ptr3++; 1141 } 1142 FREE(save_ptr); 1143 } 1144 1145 if (cp && got_n_override == 0) { 1146 /* 1147 * convert serd engine name into property name 1148 */ 1149 serd_name = MALLOC(strlen(name) + 3); 1150 for (i = 0; i < strlen(name); i++) { 1151 if (name[i] == '.') 1152 serd_name[i] = '_'; 1153 else 1154 serd_name[i] = name[i]; 1155 } 1156 serd_name[i++] = '_'; 1157 serd_name[i++] = 'n'; 1158 serd_name[i] = '\0'; 1159 if (s = config_getprop(cp, serd_name)) { 1160 nval = atoi(s); 1161 out(O_ALTFP, "serd override %s_n %s", name, s); 1162 got_n_override = 1; 1163 } 1164 serd_name[i - 1] = 't'; 1165 if (s = config_getprop(cp, serd_name)) { 1166 ptr = STRDUP(s); 1167 out(O_ALTFP, "serd override %s_t %s", name, s); 1168 got_t_override = 1; 1169 } 1170 FREE(serd_name); 1171 } 1172 1173 if (serdn != -1 && got_n_override == 0) { 1174 nval = serdn; 1175 out(O_ALTFP, "serd override %s_n %d", name, serdn); 1176 got_n_override = 1; 1177 } 1178 if (serdt != NULL && got_t_override == 0) { 1179 ptr = STRDUP(serdt); 1180 out(O_ALTFP, "serd override %s_t %s", name, serdt); 1181 got_t_override = 1; 1182 } 1183 1184 if (!got_n_override) { 1185 nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N, 1186 NULL); 1187 ASSERT(nN->t == T_NUM); 1188 nval = 
(uint_t)nN->u.ull; 1189 } 1190 if (!got_t_override) { 1191 nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T, 1192 NULL); 1193 ASSERT(nT->t == T_TIMEVAL); 1194 tval = (hrtime_t)nT->u.ull; 1195 } else { 1196 const unsigned long long *ullp; 1197 const char *suffix; 1198 int len; 1199 1200 len = strspn(ptr, "0123456789"); 1201 suffix = stable(&ptr[len]); 1202 ullp = (unsigned long long *)lut_lookup(Timesuffixlut, 1203 (void *)suffix, NULL); 1204 ptr[len] = '\0'; 1205 tval = strtoull(ptr, NULL, 0) * (ullp ? *ullp : 1ll); 1206 FREE(ptr); 1207 } 1208 fmd_serd_create(hdl, serdname, nval, tval); 1209 FREE(tptr); 1210 } 1211 1212 newentp = MALLOC(sizeof (*newentp)); 1213 newentp->ename = stable(serdinst->u.stmt.np->u.event.ename->u.name.s); 1214 newentp->ipath = ipath(serdinst->u.stmt.np->u.event.epname); 1215 newentp->hdl = hdl; 1216 if (lut_lookup(SerdEngines, newentp, (lut_cmp)serd_cmp) == NULL) { 1217 SerdEngines = lut_add(SerdEngines, (void *)newentp, 1218 (void *)newentp, (lut_cmp)serd_cmp); 1219 Serd_need_save = 1; 1220 serd_save(); 1221 } else { 1222 FREE(newentp); 1223 } 1224 1225 1226 /* 1227 * increment SERD engine. if engine fires, reset serd 1228 * engine and return trip_strcode if required. 
1229 */ 1230 for (i = 0; i < serdincrement; i++) { 1231 if (fmd_serd_record(hdl, serdname, ffep)) { 1232 fmd_case_add_serd(hdl, fmcase, serdname); 1233 fmd_serd_reset(hdl, serdname); 1234 1235 if (ippp) { 1236 struct node *tripinst = 1237 lut_lookup(serdinst->u.stmt.lutp, 1238 (void *)L_trip, NULL); 1239 ASSERT(tripinst != NULL); 1240 *enamep = tripinst->u.event.ename->u.name.s; 1241 *ippp = ipath(tripinst->u.event.epname); 1242 out(O_ALTFP|O_NONL, 1243 "[engine fired: %s, sending: ", serdname); 1244 ipath_print(O_ALTFP|O_NONL, *enamep, *ippp); 1245 out(O_ALTFP, "]"); 1246 } else { 1247 out(O_ALTFP, "[engine fired: %s, no trip]", 1248 serdname); 1249 } 1250 FREE(serdname); 1251 return (1); 1252 } 1253 } 1254 1255 FREE(serdname); 1256 return (0); 1257 } 1258 1259 /* 1260 * search a suspect list for upsets. feed each upset to serd_eval() and 1261 * build up tripped[], an array of ereports produced by the firing of 1262 * any SERD engines. then feed each ereport back into 1263 * fme_receive_report(). 1264 * 1265 * returns ntrip, the number of these ereports produced. 1266 */ 1267 static int 1268 upsets_eval(struct fme *fmep, fmd_event_t *ffep) 1269 { 1270 /* we build an array of tripped ereports that we send ourselves */ 1271 struct { 1272 const char *ename; 1273 const struct ipath *ipp; 1274 } *tripped; 1275 struct event *sp; 1276 int ntrip, nupset, i; 1277 1278 /* 1279 * count the number of upsets to determine the upper limit on 1280 * expected trip ereport strings. remember that one upset can 1281 * lead to at most one ereport. 
1282 */ 1283 nupset = 0; 1284 for (sp = fmep->suspects; sp; sp = sp->suspects) { 1285 if (sp->t == N_UPSET) 1286 nupset++; 1287 } 1288 1289 if (nupset == 0) 1290 return (0); 1291 1292 /* 1293 * get to this point if we have upsets and expect some trip 1294 * ereports 1295 */ 1296 tripped = alloca(sizeof (*tripped) * nupset); 1297 bzero((void *)tripped, sizeof (*tripped) * nupset); 1298 1299 ntrip = 0; 1300 for (sp = fmep->suspects; sp; sp = sp->suspects) 1301 if (sp->t == N_UPSET && 1302 serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp, 1303 &tripped[ntrip].ename, &tripped[ntrip].ipp) == 1) 1304 ntrip++; 1305 1306 for (i = 0; i < ntrip; i++) { 1307 struct event *ep, *nep; 1308 struct fme *nfmep; 1309 fmd_case_t *fmcase; 1310 const struct ipath *ipp; 1311 const char *eventstring; 1312 int prev_verbose; 1313 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 1314 enum fme_state state; 1315 1316 /* 1317 * First try and evaluate a case with the trip ereport plus 1318 * all the other ereports that cause the trip. If that fails 1319 * to evaluate then try again with just this ereport on its own. 
1320 */ 1321 out(O_ALTFP|O_NONL, "fme_receive_report_serd: "); 1322 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp); 1323 out(O_ALTFP|O_STAMP, NULL); 1324 ep = fmep->e0; 1325 eventstring = ep->enode->u.event.ename->u.name.s; 1326 ipp = ep->ipp; 1327 prune_propagations(eventstring, ipp); 1328 1329 /* 1330 * create a duplicate fme and case 1331 */ 1332 fmcase = fmd_case_open(fmep->hdl, NULL); 1333 out(O_ALTFP|O_NONL, "duplicate fme for event ["); 1334 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1335 out(O_ALTFP, " ]"); 1336 if ((nfmep = newfme(eventstring, ipp, fmep->hdl, 1337 fmcase)) == NULL) { 1338 out(O_ALTFP|O_NONL, "["); 1339 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1340 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1341 publish_undiagnosable(fmep->hdl, ffep, fmcase); 1342 continue; 1343 } 1344 Open_fme_count++; 1345 nfmep->pull = fmep->pull; 1346 init_fme_bufs(nfmep); 1347 out(O_ALTFP|O_NONL, "["); 1348 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1349 out(O_ALTFP, " created FME%d, case %s]", nfmep->id, 1350 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 1351 if (ffep) { 1352 fmd_case_setprincipal(nfmep->hdl, nfmep->fmcase, ffep); 1353 fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, ffep); 1354 nfmep->e0r = ffep; 1355 } 1356 1357 /* 1358 * add the original ereports 1359 */ 1360 for (ep = fmep->observations; ep; ep = ep->observations) { 1361 eventstring = ep->enode->u.event.ename->u.name.s; 1362 ipp = ep->ipp; 1363 out(O_ALTFP|O_NONL, "adding event ["); 1364 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1365 out(O_ALTFP, " ]"); 1366 nep = itree_lookup(nfmep->eventtree, eventstring, ipp); 1367 if (nep->count++ == 0) { 1368 nep->observations = nfmep->observations; 1369 nfmep->observations = nep; 1370 serialize_observation(nfmep, eventstring, ipp); 1371 nep->nvp = evnv_dupnvl(ep->nvp); 1372 } 1373 if (ep->ffep && ep->ffep != ffep) 1374 fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, 1375 ep->ffep); 1376 stats_counter_bump(nfmep->Rcount); 1377 } 1378 
1379 /* 1380 * add the serd trigger ereport 1381 */ 1382 if ((ep = itree_lookup(nfmep->eventtree, tripped[i].ename, 1383 tripped[i].ipp)) == NULL) { 1384 /* 1385 * The trigger ereport is not in the instance tree. It 1386 * was presumably removed by prune_propagations() as 1387 * this combination of events is not present in the 1388 * rules. 1389 */ 1390 out(O_ALTFP, "upsets_eval: e0 not in instance tree"); 1391 Undiag_reason = UD_VAL_BADEVENTI; 1392 goto retry_lone_ereport; 1393 } 1394 out(O_ALTFP|O_NONL, "adding event ["); 1395 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp); 1396 out(O_ALTFP, " ]"); 1397 nfmep->ecurrent = ep; 1398 ep->nvp = NULL; 1399 ep->count = 1; 1400 ep->observations = nfmep->observations; 1401 nfmep->observations = ep; 1402 1403 /* 1404 * just peek first. 1405 */ 1406 nfmep->peek = 1; 1407 prev_verbose = Verbose; 1408 if (Debug == 0) 1409 Verbose = 0; 1410 lut_walk(nfmep->eventtree, (lut_cb)clear_arrows, (void *)nfmep); 1411 state = hypothesise(nfmep, nfmep->e0, nfmep->ull, &my_delay); 1412 nfmep->peek = 0; 1413 Verbose = prev_verbose; 1414 if (state == FME_DISPROVED) { 1415 out(O_ALTFP, "upsets_eval: hypothesis disproved"); 1416 Undiag_reason = UD_VAL_UNSOLVD; 1417 retry_lone_ereport: 1418 /* 1419 * However the trigger ereport on its own might be 1420 * diagnosable, so check for that. Undo the new fme 1421 * and case we just created and call fme_receive_report. 
1422 */ 1423 out(O_ALTFP|O_NONL, "["); 1424 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, 1425 tripped[i].ipp); 1426 out(O_ALTFP, " retrying with just trigger ereport]"); 1427 itree_free(nfmep->eventtree); 1428 nfmep->eventtree = NULL; 1429 structconfig_free(nfmep->config); 1430 nfmep->config = NULL; 1431 destroy_fme_bufs(nfmep); 1432 fmd_case_close(nfmep->hdl, nfmep->fmcase); 1433 fme_receive_report(fmep->hdl, ffep, 1434 tripped[i].ename, tripped[i].ipp, NULL); 1435 continue; 1436 } 1437 1438 /* 1439 * and evaluate 1440 */ 1441 serialize_observation(nfmep, tripped[i].ename, tripped[i].ipp); 1442 fme_eval(nfmep, ffep); 1443 } 1444 1445 return (ntrip); 1446 } 1447 1448 /* 1449 * fme_receive_external_report -- call when an external ereport comes in 1450 * 1451 * this routine just converts the relevant information from the ereport 1452 * into a format used internally and passes it on to fme_receive_report(). 1453 */ 1454 void 1455 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 1456 const char *class) 1457 { 1458 struct node *epnamenp; 1459 fmd_case_t *fmcase; 1460 const struct ipath *ipp; 1461 1462 class = stable(class); 1463 1464 /* Get the component path from the ereport */ 1465 epnamenp = platform_getpath(nvl); 1466 1467 /* See if we ended up without a path. */ 1468 if (epnamenp == NULL) { 1469 /* See if class permits silent discard on unknown component. */ 1470 if (lut_lookup(Ereportenames_discard, (void *)class, NULL)) { 1471 out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport " 1472 "to component path, but silent discard allowed.", 1473 class); 1474 } else { 1475 /* 1476 * XFILE: Failure to find a component is bad unless 1477 * 'discard_if_config_unknown=1' was specified in the 1478 * ereport definition. Indicate undiagnosable. 
1479 */ 1480 out(O_ALTFP, "XFILE: Unable to map \"%s\" ereport " 1481 "to component path.", class); 1482 Undiag_reason = UD_VAL_NOPATH; 1483 fmcase = fmd_case_open(hdl, NULL); 1484 publish_undiagnosable(hdl, ffep, fmcase); 1485 } 1486 return; 1487 } 1488 1489 ipp = ipath(epnamenp); 1490 tree_free(epnamenp); 1491 fme_receive_report(hdl, ffep, class, ipp, nvl); 1492 } 1493 1494 /*ARGSUSED*/ 1495 void 1496 fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 1497 const char *eventstring) 1498 { 1499 char *uuid; 1500 nvlist_t **nva; 1501 uint_t nvc; 1502 const struct ipath *ipp; 1503 1504 if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 || 1505 nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 1506 &nva, &nvc) != 0) { 1507 out(O_ALTFP, "No uuid or fault list for list.repaired event"); 1508 return; 1509 } 1510 1511 out(O_ALTFP, "Processing list.repaired from case %s", uuid); 1512 1513 while (nvc-- != 0) { 1514 /* 1515 * Reset any istat or serd engine associated with this path. 
1516 */ 1517 char *path; 1518 1519 if ((ipp = platform_fault2ipath(*nva++)) == NULL) 1520 continue; 1521 1522 path = ipath2str(NULL, ipp); 1523 out(O_ALTFP, "fme_receive_repair_list: resetting state for %s", 1524 path); 1525 FREE(path); 1526 1527 lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp); 1528 istat_save(); 1529 1530 lut_walk(SerdEngines, (lut_cb)serd_reset_cb, (void *)ipp); 1531 serd_save(); 1532 } 1533 } 1534 1535 /*ARGSUSED*/ 1536 void 1537 fme_receive_topology_change(void) 1538 { 1539 lut_walk(Istats, (lut_cb)istat_counter_topo_chg_cb, NULL); 1540 istat_save(); 1541 1542 lut_walk(SerdEngines, (lut_cb)serd_topo_chg_cb, NULL); 1543 serd_save(); 1544 } 1545 1546 static int mark_arrows(struct fme *fmep, struct event *ep, int mark, 1547 unsigned long long at_latest_by, unsigned long long *pdelay, int keep); 1548 1549 /* ARGSUSED */ 1550 static void 1551 clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 1552 { 1553 struct bubble *bp; 1554 struct arrowlist *ap; 1555 1556 ep->cached_state = 0; 1557 ep->keep_in_tree = 0; 1558 for (bp = itree_next_bubble(ep, NULL); bp; 1559 bp = itree_next_bubble(ep, bp)) { 1560 if (bp->t != B_FROM) 1561 continue; 1562 bp->mark = 0; 1563 for (ap = itree_next_arrow(bp, NULL); ap; 1564 ap = itree_next_arrow(bp, ap)) 1565 ap->arrowp->mark = 0; 1566 } 1567 } 1568 1569 static void 1570 fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 1571 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl) 1572 { 1573 struct event *ep; 1574 struct fme *fmep = NULL; 1575 struct fme *ofmep = NULL; 1576 struct fme *cfmep, *svfmep; 1577 int matched = 0; 1578 nvlist_t *defect; 1579 fmd_case_t *fmcase; 1580 1581 out(O_ALTFP|O_NONL, "fme_receive_report: "); 1582 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1583 out(O_ALTFP|O_STAMP, NULL); 1584 1585 /* decide which FME it goes to */ 1586 for (fmep = FMElist; fmep; fmep = fmep->next) { 1587 int prev_verbose; 1588 unsigned long long my_delay = 
TIMEVAL_EVENTUALLY; 1589 enum fme_state state; 1590 nvlist_t *pre_peek_nvp = NULL; 1591 1592 if (fmep->overflow) { 1593 if (!(fmd_case_closed(fmep->hdl, fmep->fmcase))) 1594 ofmep = fmep; 1595 1596 continue; 1597 } 1598 1599 /* 1600 * ignore solved or closed cases 1601 */ 1602 if (fmep->posted_suspects || 1603 fmd_case_solved(fmep->hdl, fmep->fmcase) || 1604 fmd_case_closed(fmep->hdl, fmep->fmcase)) 1605 continue; 1606 1607 /* look up event in event tree for this FME */ 1608 if ((ep = itree_lookup(fmep->eventtree, 1609 eventstring, ipp)) == NULL) 1610 continue; 1611 1612 /* note observation */ 1613 fmep->ecurrent = ep; 1614 if (ep->count++ == 0) { 1615 /* link it into list of observations seen */ 1616 ep->observations = fmep->observations; 1617 fmep->observations = ep; 1618 ep->nvp = evnv_dupnvl(nvl); 1619 } else { 1620 /* use new payload values for peek */ 1621 pre_peek_nvp = ep->nvp; 1622 ep->nvp = evnv_dupnvl(nvl); 1623 } 1624 1625 /* tell hypothesise() not to mess with suspect list */ 1626 fmep->peek = 1; 1627 1628 /* don't want this to be verbose (unless Debug is set) */ 1629 prev_verbose = Verbose; 1630 if (Debug == 0) 1631 Verbose = 0; 1632 1633 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 1634 state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 1635 1636 fmep->peek = 0; 1637 1638 /* put verbose flag back */ 1639 Verbose = prev_verbose; 1640 1641 if (state != FME_DISPROVED) { 1642 /* found an FME that explains the ereport */ 1643 matched++; 1644 out(O_ALTFP|O_NONL, "["); 1645 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1646 out(O_ALTFP, " explained by FME%d]", fmep->id); 1647 1648 if (pre_peek_nvp) 1649 nvlist_free(pre_peek_nvp); 1650 1651 if (ep->count == 1) 1652 serialize_observation(fmep, eventstring, ipp); 1653 1654 if (ffep) { 1655 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1656 ep->ffep = ffep; 1657 } 1658 1659 stats_counter_bump(fmep->Rcount); 1660 1661 /* re-eval FME */ 1662 fme_eval(fmep, ffep); 1663 } else { 1664 
1665 /* not a match, undo noting of observation */ 1666 fmep->ecurrent = NULL; 1667 if (--ep->count == 0) { 1668 /* unlink it from observations */ 1669 fmep->observations = ep->observations; 1670 ep->observations = NULL; 1671 nvlist_free(ep->nvp); 1672 ep->nvp = NULL; 1673 } else { 1674 nvlist_free(ep->nvp); 1675 ep->nvp = pre_peek_nvp; 1676 } 1677 } 1678 } 1679 1680 if (matched) 1681 return; /* explained by at least one existing FME */ 1682 1683 /* clean up closed fmes */ 1684 cfmep = ClosedFMEs; 1685 while (cfmep != NULL) { 1686 svfmep = cfmep->next; 1687 destroy_fme(cfmep); 1688 cfmep = svfmep; 1689 } 1690 ClosedFMEs = NULL; 1691 prune_propagations(eventstring, ipp); 1692 1693 if (ofmep) { 1694 out(O_ALTFP|O_NONL, "["); 1695 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1696 out(O_ALTFP, " ADDING TO OVERFLOW FME]"); 1697 if (ffep) 1698 fmd_case_add_ereport(hdl, ofmep->fmcase, ffep); 1699 1700 return; 1701 1702 } else if (Max_fme && (Open_fme_count >= Max_fme)) { 1703 out(O_ALTFP|O_NONL, "["); 1704 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1705 out(O_ALTFP, " MAX OPEN FME REACHED]"); 1706 1707 fmcase = fmd_case_open(hdl, NULL); 1708 1709 /* Create overflow fme */ 1710 if ((fmep = newfme(eventstring, ipp, hdl, fmcase)) == NULL) { 1711 out(O_ALTFP|O_NONL, "["); 1712 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1713 out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]"); 1714 publish_undiagnosable(hdl, ffep, fmcase); 1715 return; 1716 } 1717 1718 Open_fme_count++; 1719 1720 init_fme_bufs(fmep); 1721 fmep->overflow = B_TRUE; 1722 1723 if (ffep) 1724 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1725 1726 Undiag_reason = UD_VAL_MAXFME; 1727 defect = fmd_nvl_create_fault(hdl, 1728 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 1729 (void) nvlist_add_string(defect, UNDIAG_REASON, 1730 undiag_2reason_str(Undiag_reason)); 1731 fmd_case_add_suspect(hdl, fmep->fmcase, defect); 1732 fmd_case_solve(hdl, fmep->fmcase); 1733 Undiag_reason = UD_VAL_UNKNOWN; 1734 return; 
1735 } 1736 1737 /* open a case */ 1738 fmcase = fmd_case_open(hdl, NULL); 1739 1740 /* start a new FME */ 1741 if ((fmep = newfme(eventstring, ipp, hdl, fmcase)) == NULL) { 1742 out(O_ALTFP|O_NONL, "["); 1743 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1744 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1745 publish_undiagnosable(hdl, ffep, fmcase); 1746 return; 1747 } 1748 1749 Open_fme_count++; 1750 1751 init_fme_bufs(fmep); 1752 1753 out(O_ALTFP|O_NONL, "["); 1754 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1755 out(O_ALTFP, " created FME%d, case %s]", fmep->id, 1756 fmd_case_uuid(hdl, fmep->fmcase)); 1757 1758 ep = fmep->e0; 1759 ASSERT(ep != NULL); 1760 1761 /* note observation */ 1762 fmep->ecurrent = ep; 1763 if (ep->count++ == 0) { 1764 /* link it into list of observations seen */ 1765 ep->observations = fmep->observations; 1766 fmep->observations = ep; 1767 ep->nvp = evnv_dupnvl(nvl); 1768 serialize_observation(fmep, eventstring, ipp); 1769 } else { 1770 /* new payload overrides any previous */ 1771 nvlist_free(ep->nvp); 1772 ep->nvp = evnv_dupnvl(nvl); 1773 } 1774 1775 stats_counter_bump(fmep->Rcount); 1776 1777 if (ffep) { 1778 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1779 fmd_case_setprincipal(hdl, fmep->fmcase, ffep); 1780 fmep->e0r = ffep; 1781 ep->ffep = ffep; 1782 } 1783 1784 /* give the diagnosis algorithm a shot at the new FME state */ 1785 fme_eval(fmep, ffep); 1786 } 1787 1788 void 1789 fme_status(int flags) 1790 { 1791 struct fme *fmep; 1792 1793 if (FMElist == NULL) { 1794 out(flags, "No fault management exercises underway."); 1795 return; 1796 } 1797 1798 for (fmep = FMElist; fmep; fmep = fmep->next) 1799 fme_print(flags, fmep); 1800 } 1801 1802 /* 1803 * "indent" routines used mostly for nicely formatted debug output, but also 1804 * for sanity checking for infinite recursion bugs. 
1805 */ 1806 1807 #define MAX_INDENT 1024 1808 static const char *indent_s[MAX_INDENT]; 1809 static int current_indent; 1810 1811 static void 1812 indent_push(const char *s) 1813 { 1814 if (current_indent < MAX_INDENT) 1815 indent_s[current_indent++] = s; 1816 else 1817 out(O_DIE, "unexpected recursion depth (%d)", current_indent); 1818 } 1819 1820 static void 1821 indent_set(const char *s) 1822 { 1823 current_indent = 0; 1824 indent_push(s); 1825 } 1826 1827 static void 1828 indent_pop(void) 1829 { 1830 if (current_indent > 0) 1831 current_indent--; 1832 else 1833 out(O_DIE, "recursion underflow"); 1834 } 1835 1836 static void 1837 indent(void) 1838 { 1839 int i; 1840 if (!Verbose) 1841 return; 1842 for (i = 0; i < current_indent; i++) 1843 out(O_ALTFP|O_VERB|O_NONL, indent_s[i]); 1844 } 1845 1846 #define SLNEW 1 1847 #define SLCHANGED 2 1848 #define SLWAIT 3 1849 #define SLDISPROVED 4 1850 1851 static void 1852 print_suspects(int circumstance, struct fme *fmep) 1853 { 1854 struct event *ep; 1855 1856 out(O_ALTFP|O_NONL, "["); 1857 if (circumstance == SLCHANGED) { 1858 out(O_ALTFP|O_NONL, "FME%d diagnosis changed. 
state: %s, " 1859 "suspect list:", fmep->id, fme_state2str(fmep->state)); 1860 } else if (circumstance == SLWAIT) { 1861 out(O_ALTFP|O_NONL, "FME%d set wait timer %ld ", fmep->id, 1862 fmep->timer); 1863 ptree_timeval(O_ALTFP|O_NONL, &fmep->wull); 1864 } else if (circumstance == SLDISPROVED) { 1865 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id); 1866 } else { 1867 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id); 1868 } 1869 1870 if (circumstance == SLWAIT || circumstance == SLDISPROVED) { 1871 out(O_ALTFP, "]"); 1872 return; 1873 } 1874 1875 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1876 out(O_ALTFP|O_NONL, " "); 1877 itree_pevent_brief(O_ALTFP|O_NONL, ep); 1878 } 1879 out(O_ALTFP, "]"); 1880 } 1881 1882 static struct node * 1883 eventprop_lookup(struct event *ep, const char *propname) 1884 { 1885 return (lut_lookup(ep->props, (void *)propname, NULL)); 1886 } 1887 1888 #define MAXDIGITIDX 23 1889 static char numbuf[MAXDIGITIDX + 1]; 1890 1891 static int 1892 node2uint(struct node *n, uint_t *valp) 1893 { 1894 struct evalue value; 1895 struct lut *globals = NULL; 1896 1897 if (n == NULL) 1898 return (1); 1899 1900 /* 1901 * check value.v since we are being asked to convert an unsigned 1902 * long long int to an unsigned int 1903 */ 1904 if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) || 1905 value.t != UINT64 || value.v > (1ULL << 32)) 1906 return (1); 1907 1908 *valp = (uint_t)value.v; 1909 1910 return (0); 1911 } 1912 1913 static nvlist_t * 1914 node2fmri(struct node *n) 1915 { 1916 nvlist_t **pa, *f, *p; 1917 struct node *nc; 1918 uint_t depth = 0; 1919 char *numstr, *nullbyte; 1920 char *failure; 1921 int err, i; 1922 1923 /* XXX do we need to be able to handle a non-T_NAME node? 
*/ 1924 if (n == NULL || n->t != T_NAME) 1925 return (NULL); 1926 1927 for (nc = n; nc != NULL; nc = nc->u.name.next) { 1928 if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM) 1929 break; 1930 depth++; 1931 } 1932 1933 if (nc != NULL) { 1934 /* We bailed early, something went wrong */ 1935 return (NULL); 1936 } 1937 1938 if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0) 1939 out(O_DIE|O_SYS, "alloc of fmri nvl failed"); 1940 pa = alloca(depth * sizeof (nvlist_t *)); 1941 for (i = 0; i < depth; i++) 1942 pa[i] = NULL; 1943 1944 err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC); 1945 err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION); 1946 err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, ""); 1947 err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth); 1948 if (err != 0) { 1949 failure = "basic construction of FMRI failed"; 1950 goto boom; 1951 } 1952 1953 numbuf[MAXDIGITIDX] = '\0'; 1954 nullbyte = &numbuf[MAXDIGITIDX]; 1955 i = 0; 1956 1957 for (nc = n; nc != NULL; nc = nc->u.name.next) { 1958 err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl); 1959 if (err != 0) { 1960 failure = "alloc of an hc-pair failed"; 1961 goto boom; 1962 } 1963 err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s); 1964 numstr = ulltostr(nc->u.name.child->u.ull, nullbyte); 1965 err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr); 1966 if (err != 0) { 1967 failure = "construction of an hc-pair failed"; 1968 goto boom; 1969 } 1970 pa[i++] = p; 1971 } 1972 1973 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 1974 if (err == 0) { 1975 for (i = 0; i < depth; i++) 1976 if (pa[i] != NULL) 1977 nvlist_free(pa[i]); 1978 return (f); 1979 } 1980 failure = "addition of hc-pair array to FMRI failed"; 1981 1982 boom: 1983 for (i = 0; i < depth; i++) 1984 if (pa[i] != NULL) 1985 nvlist_free(pa[i]); 1986 nvlist_free(f); 1987 out(O_DIE, "%s", failure); 1988 /*NOTREACHED*/ 1989 return (NULL); 1990 } 1991 1992 /* an ipath cache entry is an array 
of these, with s==NULL at the end */ 1993 struct ipath { 1994 const char *s; /* component name (in stable) */ 1995 int i; /* instance number */ 1996 }; 1997 1998 static nvlist_t * 1999 ipath2fmri(struct ipath *ipath) 2000 { 2001 nvlist_t **pa, *f, *p; 2002 uint_t depth = 0; 2003 char *numstr, *nullbyte; 2004 char *failure; 2005 int err, i; 2006 struct ipath *ipp; 2007 2008 for (ipp = ipath; ipp->s != NULL; ipp++) 2009 depth++; 2010 2011 if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0) 2012 out(O_DIE|O_SYS, "alloc of fmri nvl failed"); 2013 pa = alloca(depth * sizeof (nvlist_t *)); 2014 for (i = 0; i < depth; i++) 2015 pa[i] = NULL; 2016 2017 err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC); 2018 err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION); 2019 err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, ""); 2020 err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth); 2021 if (err != 0) { 2022 failure = "basic construction of FMRI failed"; 2023 goto boom; 2024 } 2025 2026 numbuf[MAXDIGITIDX] = '\0'; 2027 nullbyte = &numbuf[MAXDIGITIDX]; 2028 i = 0; 2029 2030 for (ipp = ipath; ipp->s != NULL; ipp++) { 2031 err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl); 2032 if (err != 0) { 2033 failure = "alloc of an hc-pair failed"; 2034 goto boom; 2035 } 2036 err = nvlist_add_string(p, FM_FMRI_HC_NAME, ipp->s); 2037 numstr = ulltostr(ipp->i, nullbyte); 2038 err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr); 2039 if (err != 0) { 2040 failure = "construction of an hc-pair failed"; 2041 goto boom; 2042 } 2043 pa[i++] = p; 2044 } 2045 2046 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 2047 if (err == 0) { 2048 for (i = 0; i < depth; i++) 2049 if (pa[i] != NULL) 2050 nvlist_free(pa[i]); 2051 return (f); 2052 } 2053 failure = "addition of hc-pair array to FMRI failed"; 2054 2055 boom: 2056 for (i = 0; i < depth; i++) 2057 if (pa[i] != NULL) 2058 nvlist_free(pa[i]); 2059 nvlist_free(f); 2060 out(O_DIE, "%s", failure); 2061 
/*
 * avg -- return sum / cnt rounded to the nearest integer, with halves
 * rounded up.  Returns 0 when cnt is 0.
 */
static uint_t
avg(uint_t sum, uint_t cnt)
{
	unsigned long long s;

	if (cnt == 0)
		return (0);

	/* widen before scaling so the multiply cannot wrap at 32 bits */
	s = (unsigned long long)sum * 10;

	return ((s / cnt / 10) + (((s / cnt % 10) >= 5) ? 1 : 0));
}

/*
 * percentof -- return part as a percentage of whole, rounded to the
 * nearest integer with halves rounded up.  Returns 0 when whole is 0.
 * The result is narrowed to uint8_t.
 */
static uint8_t
percentof(uint_t part, uint_t whole)
{
	unsigned long long p;

	if (whole == 0)
		return (0);

	/* widen before scaling so the multiply cannot wrap at 32 bits */
	p = (unsigned long long)part * 1000;

	return ((p / whole / 10) + (((p / whole % 10) >= 5) ? 1 : 0));
}
Removal also means updating the number of 2138 * problems and the number of problems which are not faults. User 2139 * provides the first and last element pointers. 2140 */ 2141 static void 2142 rsluniq(struct rsl *first, struct rsl *last, int *nprobs, int *nnonf) 2143 { 2144 struct rsl *cr; 2145 2146 if (*nprobs == 1) 2147 return; 2148 2149 /* 2150 * At this point, we only expect duplicate defects. 2151 * Eversholt's diagnosis algorithm prevents duplicate 2152 * suspects, but we rewrite defects in the platform code after 2153 * the diagnosis is made, and that can introduce new 2154 * duplicates. 2155 */ 2156 while (first <= last) { 2157 if (first->suspect == NULL || !is_defect(first->suspect->t)) { 2158 first++; 2159 continue; 2160 } 2161 cr = first + 1; 2162 while (cr <= last) { 2163 if (is_defect(first->suspect->t)) { 2164 if (rslcmp(first, cr) == 0) { 2165 cr->suspect = NULL; 2166 rslfree(cr); 2167 (*nprobs)--; 2168 (*nnonf)--; 2169 } 2170 } 2171 /* 2172 * assume all defects are in order after our 2173 * sort and short circuit here with "else break" ? 2174 */ 2175 cr++; 2176 } 2177 first++; 2178 } 2179 } 2180 2181 /* 2182 * get_resources -- for a given suspect, determine what ASRU, FRU and 2183 * RSRC nvlists should be advertised in the final suspect list. 2184 */ 2185 void 2186 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot) 2187 { 2188 struct node *asrudef, *frudef; 2189 nvlist_t *asru, *fru; 2190 nvlist_t *rsrc = NULL; 2191 char *pathstr; 2192 2193 /* 2194 * First find any ASRU and/or FRU defined in the 2195 * initial fault tree. 
2196 */ 2197 asrudef = eventprop_lookup(sp, L_ASRU); 2198 frudef = eventprop_lookup(sp, L_FRU); 2199 2200 /* 2201 * Create FMRIs based on those definitions 2202 */ 2203 asru = node2fmri(asrudef); 2204 fru = node2fmri(frudef); 2205 pathstr = ipath2str(NULL, sp->ipp); 2206 2207 /* 2208 * Allow for platform translations of the FMRIs 2209 */ 2210 platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc, 2211 pathstr); 2212 2213 FREE(pathstr); 2214 rsrcs->suspect = sp; 2215 rsrcs->asru = asru; 2216 rsrcs->fru = fru; 2217 rsrcs->rsrc = rsrc; 2218 } 2219 2220 /* 2221 * trim_suspects -- prior to publishing, we may need to remove some 2222 * suspects from the list. If we're auto-closing upsets, we don't 2223 * want any of those in the published list. If the ASRUs for multiple 2224 * defects resolve to the same ASRU (driver) we only want to publish 2225 * that as a single suspect. 2226 */ 2227 static int 2228 trim_suspects(struct fme *fmep, struct rsl *begin, struct rsl *begin2, 2229 fmd_event_t *ffep, int *mess_zero_nonfaultp) 2230 { 2231 struct event *ep; 2232 struct rsl *rp = begin; 2233 struct rsl *rp2 = begin2; 2234 int mess_zero_count = 0; 2235 int serd_rval; 2236 uint_t messval; 2237 2238 /* remove any unwanted upsets and populate our array */ 2239 for (ep = fmep->psuspects; ep; ep = ep->psuspects) { 2240 if (is_upset(ep->t)) 2241 continue; 2242 serd_rval = serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, ep, 2243 NULL, NULL); 2244 if (serd_rval == 0) 2245 continue; 2246 if (node2uint(eventprop_lookup(ep, L_message), 2247 &messval) == 0 && messval == 0) { 2248 get_resources(ep, rp2, fmep->config); 2249 rp2++; 2250 mess_zero_count++; 2251 if (!is_fault(ep->t)) 2252 (*mess_zero_nonfaultp)++; 2253 } else { 2254 get_resources(ep, rp, fmep->config); 2255 rp++; 2256 fmep->nsuspects++; 2257 if (!is_fault(ep->t)) 2258 fmep->nonfault++; 2259 } 2260 } 2261 return (mess_zero_count); 2262 } 2263 2264 /* 2265 * addpayloadprop -- add a payload prop to a problem 2266 */ 
2267 static void 2268 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault) 2269 { 2270 nvlist_t *rsrc, *hcs; 2271 2272 ASSERT(fault != NULL); 2273 ASSERT(lhs != NULL); 2274 ASSERT(rhs != NULL); 2275 2276 if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, &rsrc) != 0) 2277 out(O_DIE, "cannot add payloadprop \"%s\" to fault", lhs); 2278 2279 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) { 2280 out(O_ALTFP|O_VERB2, "addpayloadprop: create hc_specific"); 2281 if (nvlist_xalloc(&hcs, NV_UNIQUE_NAME, &Eft_nv_hdl) != 0) 2282 out(O_DIE, 2283 "cannot add payloadprop \"%s\" to fault", lhs); 2284 if (nvlist_add_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, hcs) != 0) 2285 out(O_DIE, 2286 "cannot add payloadprop \"%s\" to fault", lhs); 2287 nvlist_free(hcs); 2288 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) 2289 out(O_DIE, 2290 "cannot add payloadprop \"%s\" to fault", lhs); 2291 } else 2292 out(O_ALTFP|O_VERB2, "addpayloadprop: reuse hc_specific"); 2293 2294 if (rhs->t == UINT64) { 2295 out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v); 2296 2297 if (nvlist_add_uint64(hcs, lhs, rhs->v) != 0) 2298 out(O_DIE, 2299 "cannot add payloadprop \"%s\" to fault", lhs); 2300 } else { 2301 out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"", 2302 lhs, (char *)(uintptr_t)rhs->v); 2303 2304 if (nvlist_add_string(hcs, lhs, (char *)(uintptr_t)rhs->v) != 0) 2305 out(O_DIE, 2306 "cannot add payloadprop \"%s\" to fault", lhs); 2307 } 2308 } 2309 2310 static char *Istatbuf; 2311 static char *Istatbufptr; 2312 static int Istatsz; 2313 2314 /* 2315 * istataddsize -- calculate size of istat and add it to Istatsz 2316 */ 2317 /*ARGSUSED2*/ 2318 static void 2319 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2320 { 2321 int val; 2322 2323 ASSERT(lhs != NULL); 2324 ASSERT(rhs != NULL); 2325 2326 if ((val = stats_counter_value(rhs)) == 0) 2327 return; /* skip zero-valued stats */ 2328 2329 /* count up the size of the stat name 
*/ 2330 Istatsz += ipath2strlen(lhs->ename, lhs->ipath); 2331 Istatsz++; /* for the trailing NULL byte */ 2332 2333 /* count up the size of the stat value */ 2334 Istatsz += snprintf(NULL, 0, "%d", val); 2335 Istatsz++; /* for the trailing NULL byte */ 2336 } 2337 2338 /* 2339 * istat2str -- serialize an istat, writing result to *Istatbufptr 2340 */ 2341 /*ARGSUSED2*/ 2342 static void 2343 istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2344 { 2345 char *str; 2346 int len; 2347 int val; 2348 2349 ASSERT(lhs != NULL); 2350 ASSERT(rhs != NULL); 2351 2352 if ((val = stats_counter_value(rhs)) == 0) 2353 return; /* skip zero-valued stats */ 2354 2355 /* serialize the stat name */ 2356 str = ipath2str(lhs->ename, lhs->ipath); 2357 len = strlen(str); 2358 2359 ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]); 2360 (void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr); 2361 Istatbufptr += len; 2362 FREE(str); 2363 *Istatbufptr++ = '\0'; 2364 2365 /* serialize the stat value */ 2366 Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr, 2367 "%d", val); 2368 *Istatbufptr++ = '\0'; 2369 2370 ASSERT(Istatbufptr <= &Istatbuf[Istatsz]); 2371 } 2372 2373 void 2374 istat_save() 2375 { 2376 if (Istat_need_save == 0) 2377 return; 2378 2379 /* figure out how big the serialzed info is */ 2380 Istatsz = 0; 2381 lut_walk(Istats, (lut_cb)istataddsize, NULL); 2382 2383 if (Istatsz == 0) { 2384 /* no stats to save */ 2385 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2386 return; 2387 } 2388 2389 /* create the serialized buffer */ 2390 Istatbufptr = Istatbuf = MALLOC(Istatsz); 2391 lut_walk(Istats, (lut_cb)istat2str, NULL); 2392 2393 /* clear out current saved stats */ 2394 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2395 2396 /* write out the new version */ 2397 fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz); 2398 FREE(Istatbuf); 2399 2400 Istat_need_save = 0; 2401 } 2402 2403 int 2404 istat_cmp(struct istat_entry *ent1, struct 
istat_entry *ent2) 2405 { 2406 if (ent1->ename != ent2->ename) 2407 return (ent2->ename - ent1->ename); 2408 if (ent1->ipath != ent2->ipath) 2409 return ((char *)ent2->ipath - (char *)ent1->ipath); 2410 2411 return (0); 2412 } 2413 2414 /* 2415 * istat-verify -- verify the component associated with a stat still exists 2416 * 2417 * if the component no longer exists, this routine resets the stat and 2418 * returns 0. if the component still exists, it returns 1. 2419 */ 2420 static int 2421 istat_verify(struct node *snp, struct istat_entry *entp) 2422 { 2423 struct stats *statp; 2424 nvlist_t *fmri; 2425 2426 fmri = node2fmri(snp->u.event.epname); 2427 if (platform_path_exists(fmri)) { 2428 nvlist_free(fmri); 2429 return (1); 2430 } 2431 nvlist_free(fmri); 2432 2433 /* component no longer in system. zero out the associated stats */ 2434 if ((statp = (struct stats *) 2435 lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL || 2436 stats_counter_value(statp) == 0) 2437 return (0); /* stat is already reset */ 2438 2439 Istat_need_save = 1; 2440 stats_counter_reset(statp); 2441 return (0); 2442 } 2443 2444 static void 2445 istat_bump(struct node *snp, int n) 2446 { 2447 struct stats *statp; 2448 struct istat_entry ent; 2449 2450 ASSERT(snp != NULL); 2451 ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t)); 2452 ASSERT(snp->u.event.epname != NULL); 2453 2454 /* class name should be hoisted into a single stable entry */ 2455 ASSERT(snp->u.event.ename->u.name.next == NULL); 2456 ent.ename = snp->u.event.ename->u.name.s; 2457 ent.ipath = ipath(snp->u.event.epname); 2458 2459 if (!istat_verify(snp, &ent)) { 2460 /* component no longer exists in system, nothing to do */ 2461 return; 2462 } 2463 2464 if ((statp = (struct stats *) 2465 lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) { 2466 /* need to create the counter */ 2467 int cnt = 0; 2468 struct node *np; 2469 char *sname; 2470 char *snamep; 2471 struct istat_entry *newentp; 2472 2473 /* count up the size of 
the stat name */ 2474 np = snp->u.event.ename; 2475 while (np != NULL) { 2476 cnt += strlen(np->u.name.s); 2477 cnt++; /* for the '.' or '@' */ 2478 np = np->u.name.next; 2479 } 2480 np = snp->u.event.epname; 2481 while (np != NULL) { 2482 cnt += snprintf(NULL, 0, "%s%llu", 2483 np->u.name.s, np->u.name.child->u.ull); 2484 cnt++; /* for the '/' or trailing NULL byte */ 2485 np = np->u.name.next; 2486 } 2487 2488 /* build the stat name */ 2489 snamep = sname = alloca(cnt); 2490 np = snp->u.event.ename; 2491 while (np != NULL) { 2492 snamep += snprintf(snamep, &sname[cnt] - snamep, 2493 "%s", np->u.name.s); 2494 np = np->u.name.next; 2495 if (np) 2496 *snamep++ = '.'; 2497 } 2498 *snamep++ = '@'; 2499 np = snp->u.event.epname; 2500 while (np != NULL) { 2501 snamep += snprintf(snamep, &sname[cnt] - snamep, 2502 "%s%llu", np->u.name.s, np->u.name.child->u.ull); 2503 np = np->u.name.next; 2504 if (np) 2505 *snamep++ = '/'; 2506 } 2507 *snamep++ = '\0'; 2508 2509 /* create the new stat & add it to our list */ 2510 newentp = MALLOC(sizeof (*newentp)); 2511 *newentp = ent; 2512 statp = stats_new_counter(NULL, sname, 0); 2513 Istats = lut_add(Istats, (void *)newentp, (void *)statp, 2514 (lut_cmp)istat_cmp); 2515 } 2516 2517 /* if n is non-zero, set that value instead of bumping */ 2518 if (n) { 2519 stats_counter_reset(statp); 2520 stats_counter_add(statp, n); 2521 } else 2522 stats_counter_bump(statp); 2523 Istat_need_save = 1; 2524 2525 ipath_print(O_ALTFP|O_VERB2, ent.ename, ent.ipath); 2526 out(O_ALTFP|O_VERB2, " %s to value %d", n ? "set" : "incremented", 2527 stats_counter_value(statp)); 2528 } 2529 2530 /*ARGSUSED*/ 2531 static void 2532 istat_destructor(void *left, void *right, void *arg) 2533 { 2534 struct istat_entry *entp = (struct istat_entry *)left; 2535 struct stats *statp = (struct stats *)right; 2536 FREE(entp); 2537 stats_delete(statp); 2538 } 2539 2540 /* 2541 * Callback used in a walk of the Istats to reset matching stat counters. 
2542 */ 2543 static void 2544 istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp, 2545 const struct ipath *ipp) 2546 { 2547 char *path; 2548 2549 if (entp->ipath == ipp) { 2550 path = ipath2str(entp->ename, ipp); 2551 out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path); 2552 FREE(path); 2553 stats_counter_reset(statp); 2554 Istat_need_save = 1; 2555 } 2556 } 2557 2558 /*ARGSUSED*/ 2559 static void 2560 istat_counter_topo_chg_cb(struct istat_entry *entp, struct stats *statp, 2561 void *unused) 2562 { 2563 char *path; 2564 nvlist_t *fmri; 2565 2566 fmri = ipath2fmri((struct ipath *)(entp->ipath)); 2567 if (!platform_path_exists(fmri)) { 2568 path = ipath2str(entp->ename, entp->ipath); 2569 out(O_ALTFP, "istat_counter_topo_chg_cb: not present %s", path); 2570 FREE(path); 2571 stats_counter_reset(statp); 2572 Istat_need_save = 1; 2573 } 2574 nvlist_free(fmri); 2575 } 2576 2577 void 2578 istat_fini(void) 2579 { 2580 lut_free(Istats, istat_destructor, NULL); 2581 } 2582 2583 static char *Serdbuf; 2584 static char *Serdbufptr; 2585 static int Serdsz; 2586 2587 /* 2588 * serdaddsize -- calculate size of serd and add it to Serdsz 2589 */ 2590 /*ARGSUSED*/ 2591 static void 2592 serdaddsize(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2593 { 2594 ASSERT(lhs != NULL); 2595 2596 /* count up the size of the stat name */ 2597 Serdsz += ipath2strlen(lhs->ename, lhs->ipath); 2598 Serdsz++; /* for the trailing NULL byte */ 2599 } 2600 2601 /* 2602 * serd2str -- serialize a serd engine, writing result to *Serdbufptr 2603 */ 2604 /*ARGSUSED*/ 2605 static void 2606 serd2str(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2607 { 2608 char *str; 2609 int len; 2610 2611 ASSERT(lhs != NULL); 2612 2613 /* serialize the serd engine name */ 2614 str = ipath2str(lhs->ename, lhs->ipath); 2615 len = strlen(str); 2616 2617 ASSERT(Serdbufptr + len + 1 <= &Serdbuf[Serdsz]); 2618 (void) strlcpy(Serdbufptr, str, &Serdbuf[Serdsz] - Serdbufptr); 2619 
Serdbufptr += len; 2620 FREE(str); 2621 *Serdbufptr++ = '\0'; 2622 ASSERT(Serdbufptr <= &Serdbuf[Serdsz]); 2623 } 2624 2625 void 2626 serd_save() 2627 { 2628 if (Serd_need_save == 0) 2629 return; 2630 2631 /* figure out how big the serialzed info is */ 2632 Serdsz = 0; 2633 lut_walk(SerdEngines, (lut_cb)serdaddsize, NULL); 2634 2635 if (Serdsz == 0) { 2636 /* no serd engines to save */ 2637 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2638 return; 2639 } 2640 2641 /* create the serialized buffer */ 2642 Serdbufptr = Serdbuf = MALLOC(Serdsz); 2643 lut_walk(SerdEngines, (lut_cb)serd2str, NULL); 2644 2645 /* clear out current saved stats */ 2646 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2647 2648 /* write out the new version */ 2649 fmd_buf_write(Hdl, NULL, WOBUF_SERDS, Serdbuf, Serdsz); 2650 FREE(Serdbuf); 2651 Serd_need_save = 0; 2652 } 2653 2654 int 2655 serd_cmp(struct serd_entry *ent1, struct serd_entry *ent2) 2656 { 2657 if (ent1->ename != ent2->ename) 2658 return (ent2->ename - ent1->ename); 2659 if (ent1->ipath != ent2->ipath) 2660 return ((char *)ent2->ipath - (char *)ent1->ipath); 2661 2662 return (0); 2663 } 2664 2665 void 2666 fme_serd_load(fmd_hdl_t *hdl) 2667 { 2668 int sz; 2669 char *sbuf; 2670 char *sepptr; 2671 char *ptr; 2672 struct serd_entry *newentp; 2673 struct node *epname; 2674 nvlist_t *fmri; 2675 char *namestring; 2676 2677 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_SERDS)) == 0) 2678 return; 2679 sbuf = alloca(sz); 2680 fmd_buf_read(hdl, NULL, WOBUF_SERDS, sbuf, sz); 2681 ptr = sbuf; 2682 while (ptr < &sbuf[sz]) { 2683 sepptr = strchr(ptr, '@'); 2684 *sepptr = '\0'; 2685 namestring = ptr; 2686 sepptr++; 2687 ptr = sepptr; 2688 ptr += strlen(ptr); 2689 ptr++; /* move past the '\0' separating paths */ 2690 epname = pathstring2epnamenp(sepptr); 2691 fmri = node2fmri(epname); 2692 if (platform_path_exists(fmri)) { 2693 newentp = MALLOC(sizeof (*newentp)); 2694 newentp->hdl = hdl; 2695 newentp->ipath = ipath(epname); 2696 newentp->ename = 
stable(namestring); 2697 SerdEngines = lut_add(SerdEngines, (void *)newentp, 2698 (void *)newentp, (lut_cmp)serd_cmp); 2699 } else 2700 Serd_need_save = 1; 2701 tree_free(epname); 2702 nvlist_free(fmri); 2703 } 2704 /* save it back again in case some of the paths no longer exist */ 2705 serd_save(); 2706 } 2707 2708 /*ARGSUSED*/ 2709 static void 2710 serd_destructor(void *left, void *right, void *arg) 2711 { 2712 struct serd_entry *entp = (struct serd_entry *)left; 2713 FREE(entp); 2714 } 2715 2716 /* 2717 * Callback used in a walk of the SerdEngines to reset matching serd engines. 2718 */ 2719 /*ARGSUSED*/ 2720 static void 2721 serd_reset_cb(struct serd_entry *entp, void *unused, const struct ipath *ipp) 2722 { 2723 char *path; 2724 2725 if (entp->ipath == ipp) { 2726 path = ipath2str(entp->ename, ipp); 2727 out(O_ALTFP, "serd_reset_cb: resetting %s", path); 2728 fmd_serd_reset(entp->hdl, path); 2729 FREE(path); 2730 Serd_need_save = 1; 2731 } 2732 } 2733 2734 /*ARGSUSED*/ 2735 static void 2736 serd_topo_chg_cb(struct serd_entry *entp, void *unused, void *unused2) 2737 { 2738 char *path; 2739 nvlist_t *fmri; 2740 2741 fmri = ipath2fmri((struct ipath *)(entp->ipath)); 2742 if (!platform_path_exists(fmri)) { 2743 path = ipath2str(entp->ename, entp->ipath); 2744 out(O_ALTFP, "serd_topo_chg_cb: not present %s", path); 2745 fmd_serd_reset(entp->hdl, path); 2746 FREE(path); 2747 Serd_need_save = 1; 2748 } 2749 nvlist_free(fmri); 2750 } 2751 2752 void 2753 serd_fini(void) 2754 { 2755 lut_free(SerdEngines, serd_destructor, NULL); 2756 } 2757 2758 static void 2759 publish_suspects(struct fme *fmep, struct rsl *srl) 2760 { 2761 struct rsl *rp; 2762 nvlist_t *fault; 2763 uint8_t cert; 2764 uint_t *frs; 2765 uint_t fravg, frsum, fr; 2766 uint_t messval; 2767 uint_t retireval; 2768 uint_t responseval; 2769 struct node *snp; 2770 int frcnt, fridx; 2771 boolean_t allfaulty = B_TRUE; 2772 struct rsl *erl = srl + fmep->nsuspects - 1; 2773 2774 /* 2775 * sort the array 2776 */ 2777 
qsort(srl, fmep->nsuspects, sizeof (struct rsl), rslcmp); 2778 rsluniq(srl, erl, &fmep->nsuspects, &fmep->nonfault); 2779 2780 /* 2781 * If the suspect list is all faults, then for a given fault, 2782 * say X of N, X's certainty is computed via: 2783 * 2784 * fitrate(X) / (fitrate(1) + ... + fitrate(N)) * 100 2785 * 2786 * If none of the suspects are faults, and there are N suspects, 2787 * the certainty of a given suspect is 100/N. 2788 * 2789 * If there are are a mixture of faults and other problems in 2790 * the suspect list, we take an average of the faults' 2791 * FITrates and treat this average as the FITrate for any 2792 * non-faults. The fitrate of any given suspect is then 2793 * computed per the first formula above. 2794 */ 2795 if (fmep->nonfault == fmep->nsuspects) { 2796 /* NO faults in the suspect list */ 2797 cert = percentof(1, fmep->nsuspects); 2798 } else { 2799 /* sum the fitrates */ 2800 frs = alloca(fmep->nsuspects * sizeof (uint_t)); 2801 fridx = frcnt = frsum = 0; 2802 2803 for (rp = srl; rp <= erl; rp++) { 2804 struct node *n; 2805 2806 if (rp->suspect == NULL) 2807 continue; 2808 if (!is_fault(rp->suspect->t)) { 2809 frs[fridx++] = 0; 2810 continue; 2811 } 2812 n = eventprop_lookup(rp->suspect, L_FITrate); 2813 if (node2uint(n, &fr) != 0) { 2814 out(O_DEBUG|O_NONL, "event "); 2815 ipath_print(O_DEBUG|O_NONL, 2816 rp->suspect->enode->u.event.ename->u.name.s, 2817 rp->suspect->ipp); 2818 out(O_DEBUG, " has no FITrate (using 1)"); 2819 fr = 1; 2820 } else if (fr == 0) { 2821 out(O_DEBUG|O_NONL, "event "); 2822 ipath_print(O_DEBUG|O_NONL, 2823 rp->suspect->enode->u.event.ename->u.name.s, 2824 rp->suspect->ipp); 2825 out(O_DEBUG, " has zero FITrate (using 1)"); 2826 fr = 1; 2827 } 2828 2829 frs[fridx++] = fr; 2830 frsum += fr; 2831 frcnt++; 2832 } 2833 fravg = avg(frsum, frcnt); 2834 for (fridx = 0; fridx < fmep->nsuspects; fridx++) 2835 if (frs[fridx] == 0) { 2836 frs[fridx] = fravg; 2837 frsum += fravg; 2838 } 2839 } 2840 2841 /* Add them in 
reverse order of our sort, as fmd reverses order */ 2842 for (rp = erl; rp >= srl; rp--) { 2843 if (rp->suspect == NULL) 2844 continue; 2845 if (!is_fault(rp->suspect->t)) 2846 allfaulty = B_FALSE; 2847 if (fmep->nonfault != fmep->nsuspects) 2848 cert = percentof(frs[--fridx], frsum); 2849 fault = fmd_nvl_create_fault(fmep->hdl, 2850 rp->suspect->enode->u.event.ename->u.name.s, 2851 cert, 2852 rp->asru, 2853 rp->fru, 2854 rp->rsrc); 2855 if (fault == NULL) 2856 out(O_DIE, "fault creation failed"); 2857 /* if "message" property exists, add it to the fault */ 2858 if (node2uint(eventprop_lookup(rp->suspect, L_message), 2859 &messval) == 0) { 2860 2861 out(O_ALTFP, 2862 "[FME%d, %s adds message=%d to suspect list]", 2863 fmep->id, 2864 rp->suspect->enode->u.event.ename->u.name.s, 2865 messval); 2866 if (nvlist_add_boolean_value(fault, 2867 FM_SUSPECT_MESSAGE, 2868 (messval) ? B_TRUE : B_FALSE) != 0) { 2869 out(O_DIE, "cannot add no-message to fault"); 2870 } 2871 } 2872 2873 /* if "retire" property exists, add it to the fault */ 2874 if (node2uint(eventprop_lookup(rp->suspect, L_retire), 2875 &retireval) == 0) { 2876 2877 out(O_ALTFP, 2878 "[FME%d, %s adds retire=%d to suspect list]", 2879 fmep->id, 2880 rp->suspect->enode->u.event.ename->u.name.s, 2881 retireval); 2882 if (nvlist_add_boolean_value(fault, 2883 FM_SUSPECT_RETIRE, 2884 (retireval) ? B_TRUE : B_FALSE) != 0) { 2885 out(O_DIE, "cannot add no-retire to fault"); 2886 } 2887 } 2888 2889 /* if "response" property exists, add it to the fault */ 2890 if (node2uint(eventprop_lookup(rp->suspect, L_response), 2891 &responseval) == 0) { 2892 2893 out(O_ALTFP, 2894 "[FME%d, %s adds response=%d to suspect list]", 2895 fmep->id, 2896 rp->suspect->enode->u.event.ename->u.name.s, 2897 responseval); 2898 if (nvlist_add_boolean_value(fault, 2899 FM_SUSPECT_RESPONSE, 2900 (responseval) ? 
B_TRUE : B_FALSE) != 0) { 2901 out(O_DIE, "cannot add no-response to fault"); 2902 } 2903 } 2904 2905 /* add any payload properties */ 2906 lut_walk(rp->suspect->payloadprops, 2907 (lut_cb)addpayloadprop, (void *)fault); 2908 rslfree(rp); 2909 2910 /* 2911 * If "action" property exists, evaluate it; this must be done 2912 * before the allfaulty check below since some actions may 2913 * modify the asru to be used in fmd_nvl_fmri_has_fault. This 2914 * needs to be restructured if any new actions are introduced 2915 * that have effects that we do not want to be visible if 2916 * we decide not to publish in the dupclose check below. 2917 */ 2918 if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) { 2919 struct evalue evalue; 2920 2921 out(O_ALTFP|O_NONL, 2922 "[FME%d, %s action ", fmep->id, 2923 rp->suspect->enode->u.event.ename->u.name.s); 2924 ptree_name_iter(O_ALTFP|O_NONL, snp); 2925 out(O_ALTFP, "]"); 2926 Action_nvl = fault; 2927 (void) eval_expr(snp, NULL, NULL, NULL, NULL, 2928 NULL, 0, &evalue); 2929 } 2930 2931 fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault); 2932 2933 /* 2934 * check if the asru is already marked as "faulty". 
2935 */ 2936 if (allfaulty) { 2937 nvlist_t *asru; 2938 2939 out(O_ALTFP|O_VERB, "FME%d dup check ", fmep->id); 2940 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect); 2941 out(O_ALTFP|O_VERB|O_NONL, " "); 2942 if (nvlist_lookup_nvlist(fault, 2943 FM_FAULT_ASRU, &asru) != 0) { 2944 out(O_ALTFP|O_VERB, "NULL asru"); 2945 allfaulty = B_FALSE; 2946 } else if (fmd_nvl_fmri_has_fault(fmep->hdl, asru, 2947 FMD_HAS_FAULT_ASRU, NULL)) { 2948 out(O_ALTFP|O_VERB, "faulty"); 2949 } else { 2950 out(O_ALTFP|O_VERB, "not faulty"); 2951 allfaulty = B_FALSE; 2952 } 2953 } 2954 2955 } 2956 2957 if (!allfaulty) { 2958 /* 2959 * don't update the count stat if all asrus are already 2960 * present and unrepaired in the asru cache 2961 */ 2962 for (rp = erl; rp >= srl; rp--) { 2963 struct event *suspect = rp->suspect; 2964 2965 if (suspect == NULL) 2966 continue; 2967 2968 /* if "count" exists, increment the appropriate stat */ 2969 if ((snp = eventprop_lookup(suspect, 2970 L_count)) != NULL) { 2971 out(O_ALTFP|O_NONL, 2972 "[FME%d, %s count ", fmep->id, 2973 suspect->enode->u.event.ename->u.name.s); 2974 ptree_name_iter(O_ALTFP|O_NONL, snp); 2975 out(O_ALTFP, "]"); 2976 istat_bump(snp, 0); 2977 2978 } 2979 } 2980 istat_save(); /* write out any istat changes */ 2981 } 2982 } 2983 2984 static const char * 2985 undiag_2defect_str(int ud) 2986 { 2987 switch (ud) { 2988 case UD_VAL_MISSINGINFO: 2989 case UD_VAL_MISSINGOBS: 2990 case UD_VAL_MISSINGPATH: 2991 case UD_VAL_MISSINGZERO: 2992 case UD_VAL_BADOBS: 2993 case UD_VAL_CFGMISMATCH: 2994 return (UNDIAG_DEFECT_CHKPT); 2995 break; 2996 2997 case UD_VAL_BADEVENTI: 2998 case UD_VAL_INSTFAIL: 2999 case UD_VAL_NOPATH: 3000 case UD_VAL_UNSOLVD: 3001 return (UNDIAG_DEFECT_FME); 3002 break; 3003 3004 case UD_VAL_MAXFME: 3005 return (UNDIAG_DEFECT_LIMIT); 3006 break; 3007 3008 case UD_VAL_UNKNOWN: 3009 default: 3010 return (UNDIAG_DEFECT_UNKNOWN); 3011 break; 3012 } 3013 } 3014 3015 const char * 3016 undiag_2reason_str(int ud) 3017 { 3018 
switch (ud) { 3019 case UD_VAL_BADEVENTI: 3020 return (UD_STR_BADEVENTI); 3021 case UD_VAL_BADOBS: 3022 return (UD_STR_BADOBS); 3023 case UD_VAL_CFGMISMATCH: 3024 return (UD_STR_CFGMISMATCH); 3025 case UD_VAL_INSTFAIL: 3026 return (UD_STR_INSTFAIL); 3027 case UD_VAL_MAXFME: 3028 return (UD_STR_MAXFME); 3029 case UD_VAL_MISSINGINFO: 3030 return (UD_STR_MISSINGINFO); 3031 case UD_VAL_MISSINGOBS: 3032 return (UD_STR_MISSINGOBS); 3033 case UD_VAL_MISSINGPATH: 3034 return (UD_STR_MISSINGPATH); 3035 case UD_VAL_MISSINGZERO: 3036 return (UD_STR_MISSINGZERO); 3037 case UD_VAL_NOPATH: 3038 return (UD_STR_NOPATH); 3039 case UD_VAL_UNSOLVD: 3040 return (UD_STR_UNSOLVD); 3041 case UD_VAL_UNKNOWN: 3042 default: 3043 return (UD_STR_UNKNOWN); 3044 } 3045 } 3046 3047 static void 3048 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase) 3049 { 3050 struct case_list *newcase; 3051 nvlist_t *defect; 3052 3053 out(O_ALTFP, 3054 "[undiagnosable ereport received, " 3055 "creating and closing a new case (%s)]", 3056 undiag_2reason_str(Undiag_reason)); 3057 3058 newcase = MALLOC(sizeof (struct case_list)); 3059 newcase->next = NULL; 3060 newcase->fmcase = fmcase; 3061 if (Undiagablecaselist != NULL) 3062 newcase->next = Undiagablecaselist; 3063 Undiagablecaselist = newcase; 3064 3065 if (ffep != NULL) 3066 fmd_case_add_ereport(hdl, newcase->fmcase, ffep); 3067 3068 defect = fmd_nvl_create_fault(hdl, 3069 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 3070 (void) nvlist_add_string(defect, UNDIAG_REASON, 3071 undiag_2reason_str(Undiag_reason)); 3072 fmd_case_add_suspect(hdl, newcase->fmcase, defect); 3073 3074 fmd_case_solve(hdl, newcase->fmcase); 3075 fmd_case_close(hdl, newcase->fmcase); 3076 Undiag_reason = UD_VAL_UNKNOWN; 3077 } 3078 3079 static void 3080 fme_undiagnosable(struct fme *f) 3081 { 3082 nvlist_t *defect; 3083 3084 out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]", 3085 f->id, fmd_case_uuid(f->hdl, f->fmcase), 3086 
undiag_2reason_str(Undiag_reason)); 3087 3088 defect = fmd_nvl_create_fault(f->hdl, 3089 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 3090 (void) nvlist_add_string(defect, UNDIAG_REASON, 3091 undiag_2reason_str(Undiag_reason)); 3092 fmd_case_add_suspect(f->hdl, f->fmcase, defect); 3093 fmd_case_solve(f->hdl, f->fmcase); 3094 fmd_case_close(f->hdl, f->fmcase); 3095 Undiag_reason = UD_VAL_UNKNOWN; 3096 } 3097 3098 /* 3099 * fme_close_case 3100 * 3101 * Find the requested case amongst our fmes and close it. Free up 3102 * the related fme. 3103 */ 3104 void 3105 fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase) 3106 { 3107 struct case_list *ucasep, *prevcasep = NULL; 3108 struct fme *prev = NULL; 3109 struct fme *fmep; 3110 3111 for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) { 3112 if (fmcase != ucasep->fmcase) { 3113 prevcasep = ucasep; 3114 continue; 3115 } 3116 3117 if (prevcasep == NULL) 3118 Undiagablecaselist = Undiagablecaselist->next; 3119 else 3120 prevcasep->next = ucasep->next; 3121 3122 FREE(ucasep); 3123 return; 3124 } 3125 3126 for (fmep = FMElist; fmep; fmep = fmep->next) { 3127 if (fmep->hdl == hdl && fmep->fmcase == fmcase) 3128 break; 3129 prev = fmep; 3130 } 3131 3132 if (fmep == NULL) { 3133 out(O_WARN, "Eft asked to close unrecognized case [%s].", 3134 fmd_case_uuid(hdl, fmcase)); 3135 return; 3136 } 3137 3138 if (EFMElist == fmep) 3139 EFMElist = prev; 3140 3141 if (prev == NULL) 3142 FMElist = FMElist->next; 3143 else 3144 prev->next = fmep->next; 3145 3146 fmep->next = NULL; 3147 3148 /* Get rid of any timer this fme has set */ 3149 if (fmep->wull != 0) 3150 fmd_timer_remove(fmep->hdl, fmep->timer); 3151 3152 if (ClosedFMEs == NULL) { 3153 ClosedFMEs = fmep; 3154 } else { 3155 fmep->next = ClosedFMEs; 3156 ClosedFMEs = fmep; 3157 } 3158 3159 Open_fme_count--; 3160 3161 /* See if we can close the overflow FME */ 3162 if (Open_fme_count <= Max_fme) { 3163 for (fmep = FMElist; fmep; fmep = fmep->next) { 3164 if 
(fmep->overflow && !(fmd_case_closed(fmep->hdl, 3165 fmep->fmcase))) 3166 break; 3167 } 3168 3169 if (fmep != NULL) 3170 fmd_case_close(fmep->hdl, fmep->fmcase); 3171 } 3172 } 3173 3174 /* 3175 * fme_set_timer() 3176 * If the time we need to wait for the given FME is less than the 3177 * current timer, kick that old timer out and establish a new one. 3178 */ 3179 static int 3180 fme_set_timer(struct fme *fmep, unsigned long long wull) 3181 { 3182 out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait "); 3183 ptree_timeval(O_ALTFP|O_VERB, &wull); 3184 3185 if (wull <= fmep->pull) { 3186 out(O_ALTFP|O_VERB|O_NONL, "already have waited at least "); 3187 ptree_timeval(O_ALTFP|O_VERB, &fmep->pull); 3188 out(O_ALTFP|O_VERB, NULL); 3189 /* we've waited at least wull already, don't need timer */ 3190 return (0); 3191 } 3192 3193 out(O_ALTFP|O_VERB|O_NONL, " currently "); 3194 if (fmep->wull != 0) { 3195 out(O_ALTFP|O_VERB|O_NONL, "waiting "); 3196 ptree_timeval(O_ALTFP|O_VERB, &fmep->wull); 3197 out(O_ALTFP|O_VERB, NULL); 3198 } else { 3199 out(O_ALTFP|O_VERB|O_NONL, "not waiting"); 3200 out(O_ALTFP|O_VERB, NULL); 3201 } 3202 3203 if (fmep->wull != 0) 3204 if (wull >= fmep->wull) 3205 /* New timer would fire later than established timer */ 3206 return (0); 3207 3208 if (fmep->wull != 0) { 3209 fmd_timer_remove(fmep->hdl, fmep->timer); 3210 } 3211 3212 fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep, 3213 fmep->e0r, wull); 3214 out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer); 3215 fmep->wull = wull; 3216 return (1); 3217 } 3218 3219 void 3220 fme_timer_fired(struct fme *fmep, id_t tid) 3221 { 3222 struct fme *ffmep = NULL; 3223 3224 for (ffmep = FMElist; ffmep; ffmep = ffmep->next) 3225 if (ffmep == fmep) 3226 break; 3227 3228 if (ffmep == NULL) { 3229 out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.", 3230 (void *)fmep); 3231 return; 3232 } 3233 3234 out(O_ALTFP|O_VERB, "Timer fired %lx", tid); 3235 fmep->pull = fmep->wull; 3236 fmep->wull 
= 0; 3237 fmd_buf_write(fmep->hdl, fmep->fmcase, 3238 WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull)); 3239 3240 fme_eval(fmep, fmep->e0r); 3241 } 3242 3243 /* 3244 * Preserve the fme's suspect list in its psuspects list, NULLing the 3245 * suspects list in the meantime. 3246 */ 3247 static void 3248 save_suspects(struct fme *fmep) 3249 { 3250 struct event *ep; 3251 struct event *nextep; 3252 3253 /* zero out the previous suspect list */ 3254 for (ep = fmep->psuspects; ep; ep = nextep) { 3255 nextep = ep->psuspects; 3256 ep->psuspects = NULL; 3257 } 3258 fmep->psuspects = NULL; 3259 3260 /* zero out the suspect list, copying it to previous suspect list */ 3261 fmep->psuspects = fmep->suspects; 3262 for (ep = fmep->suspects; ep; ep = nextep) { 3263 nextep = ep->suspects; 3264 ep->psuspects = ep->suspects; 3265 ep->suspects = NULL; 3266 ep->is_suspect = 0; 3267 } 3268 fmep->suspects = NULL; 3269 fmep->nsuspects = 0; 3270 fmep->nonfault = 0; 3271 } 3272 3273 /* 3274 * Retrieve the fme's suspect list from its psuspects list. 
3275 */ 3276 static void 3277 restore_suspects(struct fme *fmep) 3278 { 3279 struct event *ep; 3280 struct event *nextep; 3281 3282 fmep->nsuspects = fmep->nonfault = 0; 3283 fmep->suspects = fmep->psuspects; 3284 for (ep = fmep->psuspects; ep; ep = nextep) { 3285 fmep->nsuspects++; 3286 if (!is_fault(ep->t)) 3287 fmep->nonfault++; 3288 nextep = ep->psuspects; 3289 ep->suspects = ep->psuspects; 3290 } 3291 } 3292 3293 /* 3294 * this is what we use to call the Emrys prototype code instead of main() 3295 */ 3296 static void 3297 fme_eval(struct fme *fmep, fmd_event_t *ffep) 3298 { 3299 struct event *ep; 3300 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 3301 struct rsl *srl = NULL; 3302 struct rsl *srl2 = NULL; 3303 int mess_zero_count; 3304 int mess_zero_nonfault = 0; 3305 int rpcnt; 3306 3307 save_suspects(fmep); 3308 3309 out(O_ALTFP, "Evaluate FME %d", fmep->id); 3310 indent_set(" "); 3311 3312 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 3313 fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 3314 3315 out(O_ALTFP|O_NONL, "FME%d state: %s, suspect list:", fmep->id, 3316 fme_state2str(fmep->state)); 3317 for (ep = fmep->suspects; ep; ep = ep->suspects) { 3318 out(O_ALTFP|O_NONL, " "); 3319 itree_pevent_brief(O_ALTFP|O_NONL, ep); 3320 } 3321 out(O_ALTFP, NULL); 3322 3323 switch (fmep->state) { 3324 case FME_CREDIBLE: 3325 print_suspects(SLNEW, fmep); 3326 (void) upsets_eval(fmep, ffep); 3327 3328 /* 3329 * we may have already posted suspects in upsets_eval() which 3330 * can recurse into fme_eval() again. If so then just return. 3331 */ 3332 if (fmep->posted_suspects) 3333 return; 3334 3335 stats_counter_bump(fmep->diags); 3336 rpcnt = fmep->nsuspects; 3337 save_suspects(fmep); 3338 3339 /* 3340 * create two lists, one for "message=1" faults and one for 3341 * "message=0" faults. If we have a mixture we will generate 3342 * two separate suspect lists. 
3343 */ 3344 srl = MALLOC(rpcnt * sizeof (struct rsl)); 3345 bzero(srl, rpcnt * sizeof (struct rsl)); 3346 srl2 = MALLOC(rpcnt * sizeof (struct rsl)); 3347 bzero(srl2, rpcnt * sizeof (struct rsl)); 3348 mess_zero_count = trim_suspects(fmep, srl, srl2, ffep, 3349 &mess_zero_nonfault); 3350 3351 /* 3352 * If the resulting suspect list has no members, we're 3353 * done so simply close the case. Otherwise sort and publish. 3354 */ 3355 if (fmep->nsuspects == 0 && mess_zero_count == 0) { 3356 out(O_ALTFP, 3357 "[FME%d, case %s (all suspects are upsets)]", 3358 fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3359 fmd_case_close(fmep->hdl, fmep->fmcase); 3360 } else if (fmep->nsuspects != 0 && mess_zero_count == 0) { 3361 publish_suspects(fmep, srl); 3362 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3363 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3364 fmd_case_solve(fmep->hdl, fmep->fmcase); 3365 } else if (fmep->nsuspects == 0 && mess_zero_count != 0) { 3366 fmep->nsuspects = mess_zero_count; 3367 fmep->nonfault = mess_zero_nonfault; 3368 publish_suspects(fmep, srl2); 3369 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3370 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3371 fmd_case_solve(fmep->hdl, fmep->fmcase); 3372 } else { 3373 struct event *obsp; 3374 struct fme *nfmep; 3375 3376 publish_suspects(fmep, srl); 3377 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3378 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3379 fmd_case_solve(fmep->hdl, fmep->fmcase); 3380 3381 /* 3382 * Got both message=0 and message=1 so create a 3383 * duplicate case. Also need a temporary duplicate fme 3384 * structure for use by publish_suspects(). 
3385 */ 3386 nfmep = alloc_fme(); 3387 nfmep->id = Nextid++; 3388 nfmep->hdl = fmep->hdl; 3389 nfmep->nsuspects = mess_zero_count; 3390 nfmep->nonfault = mess_zero_nonfault; 3391 nfmep->fmcase = fmd_case_open(fmep->hdl, NULL); 3392 out(O_ALTFP|O_STAMP, 3393 "[creating parallel FME%d, case %s]", nfmep->id, 3394 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 3395 Open_fme_count++; 3396 if (ffep) { 3397 fmd_case_setprincipal(nfmep->hdl, 3398 nfmep->fmcase, ffep); 3399 fmd_case_add_ereport(nfmep->hdl, 3400 nfmep->fmcase, ffep); 3401 } 3402 for (obsp = fmep->observations; obsp; 3403 obsp = obsp->observations) 3404 if (obsp->ffep && obsp->ffep != ffep) 3405 fmd_case_add_ereport(nfmep->hdl, 3406 nfmep->fmcase, obsp->ffep); 3407 3408 publish_suspects(nfmep, srl2); 3409 out(O_ALTFP, "[solving FME%d, case %s]", nfmep->id, 3410 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 3411 fmd_case_solve(nfmep->hdl, nfmep->fmcase); 3412 FREE(nfmep); 3413 } 3414 FREE(srl); 3415 FREE(srl2); 3416 restore_suspects(fmep); 3417 3418 fmep->posted_suspects = 1; 3419 fmd_buf_write(fmep->hdl, fmep->fmcase, 3420 WOBUF_POSTD, 3421 (void *)&fmep->posted_suspects, 3422 sizeof (fmep->posted_suspects)); 3423 3424 /* 3425 * Now the suspects have been posted, we can clear up 3426 * the instance tree as we won't be looking at it again. 3427 * Also cancel the timer as the case is now solved. 
3428 */ 3429 if (fmep->wull != 0) { 3430 fmd_timer_remove(fmep->hdl, fmep->timer); 3431 fmep->wull = 0; 3432 } 3433 break; 3434 3435 case FME_WAIT: 3436 ASSERT(my_delay > fmep->ull); 3437 (void) fme_set_timer(fmep, my_delay); 3438 print_suspects(SLWAIT, fmep); 3439 itree_prune(fmep->eventtree); 3440 return; 3441 3442 case FME_DISPROVED: 3443 print_suspects(SLDISPROVED, fmep); 3444 Undiag_reason = UD_VAL_UNSOLVD; 3445 fme_undiagnosable(fmep); 3446 break; 3447 } 3448 3449 itree_free(fmep->eventtree); 3450 fmep->eventtree = NULL; 3451 structconfig_free(fmep->config); 3452 fmep->config = NULL; 3453 destroy_fme_bufs(fmep); 3454 } 3455 3456 static void indent(void); 3457 static int triggered(struct fme *fmep, struct event *ep, int mark); 3458 static enum fme_state effects_test(struct fme *fmep, 3459 struct event *fault_event, unsigned long long at_latest_by, 3460 unsigned long long *pdelay); 3461 static enum fme_state requirements_test(struct fme *fmep, struct event *ep, 3462 unsigned long long at_latest_by, unsigned long long *pdelay); 3463 static enum fme_state causes_test(struct fme *fmep, struct event *ep, 3464 unsigned long long at_latest_by, unsigned long long *pdelay); 3465 3466 static int 3467 checkconstraints(struct fme *fmep, struct arrow *arrowp) 3468 { 3469 struct constraintlist *ctp; 3470 struct evalue value; 3471 char *sep = ""; 3472 3473 if (arrowp->forever_false) { 3474 indent(); 3475 out(O_ALTFP|O_VERB|O_NONL, " Forever false constraint: "); 3476 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3477 out(O_ALTFP|O_VERB|O_NONL, sep); 3478 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3479 sep = ", "; 3480 } 3481 out(O_ALTFP|O_VERB, NULL); 3482 return (0); 3483 } 3484 if (arrowp->forever_true) { 3485 indent(); 3486 out(O_ALTFP|O_VERB|O_NONL, " Forever true constraint: "); 3487 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3488 out(O_ALTFP|O_VERB|O_NONL, sep); 3489 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3490 sep = 
", "; 3491 } 3492 out(O_ALTFP|O_VERB, NULL); 3493 return (1); 3494 } 3495 3496 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3497 if (eval_expr(ctp->cnode, NULL, NULL, 3498 &fmep->globals, fmep->config, 3499 arrowp, 0, &value)) { 3500 /* evaluation successful */ 3501 if (value.t == UNDEFINED || value.v == 0) { 3502 /* known false */ 3503 arrowp->forever_false = 1; 3504 indent(); 3505 out(O_ALTFP|O_VERB|O_NONL, 3506 " False constraint: "); 3507 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3508 out(O_ALTFP|O_VERB, NULL); 3509 return (0); 3510 } 3511 } else { 3512 /* evaluation unsuccessful -- unknown value */ 3513 indent(); 3514 out(O_ALTFP|O_VERB|O_NONL, 3515 " Deferred constraint: "); 3516 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3517 out(O_ALTFP|O_VERB, NULL); 3518 return (1); 3519 } 3520 } 3521 /* known true */ 3522 arrowp->forever_true = 1; 3523 indent(); 3524 out(O_ALTFP|O_VERB|O_NONL, " True constraint: "); 3525 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3526 out(O_ALTFP|O_VERB|O_NONL, sep); 3527 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3528 sep = ", "; 3529 } 3530 out(O_ALTFP|O_VERB, NULL); 3531 return (1); 3532 } 3533 3534 static int 3535 triggered(struct fme *fmep, struct event *ep, int mark) 3536 { 3537 struct bubble *bp; 3538 struct arrowlist *ap; 3539 int count = 0; 3540 3541 stats_counter_bump(fmep->Tcallcount); 3542 for (bp = itree_next_bubble(ep, NULL); bp; 3543 bp = itree_next_bubble(ep, bp)) { 3544 if (bp->t != B_TO) 3545 continue; 3546 for (ap = itree_next_arrow(bp, NULL); ap; 3547 ap = itree_next_arrow(bp, ap)) { 3548 /* check count of marks against K in the bubble */ 3549 if ((ap->arrowp->mark & mark) && 3550 ++count >= bp->nork) 3551 return (1); 3552 } 3553 } 3554 return (0); 3555 } 3556 3557 static int 3558 mark_arrows(struct fme *fmep, struct event *ep, int mark, 3559 unsigned long long at_latest_by, unsigned long long *pdelay, int keep) 3560 { 3561 struct bubble *bp; 3562 struct 
arrowlist *ap; 3563 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3564 unsigned long long my_delay; 3565 enum fme_state result; 3566 int retval = 0; 3567 3568 for (bp = itree_next_bubble(ep, NULL); bp; 3569 bp = itree_next_bubble(ep, bp)) { 3570 if (bp->t != B_FROM) 3571 continue; 3572 stats_counter_bump(fmep->Marrowcount); 3573 for (ap = itree_next_arrow(bp, NULL); ap; 3574 ap = itree_next_arrow(bp, ap)) { 3575 struct event *ep2 = ap->arrowp->head->myevent; 3576 /* 3577 * if we're clearing marks, we can avoid doing 3578 * all that work evaluating constraints. 3579 */ 3580 if (mark == 0) { 3581 if (ap->arrowp->arrow_marked == 0) 3582 continue; 3583 ap->arrowp->arrow_marked = 0; 3584 ap->arrowp->mark &= ~EFFECTS_COUNTER; 3585 if (keep && (ep2->cached_state & 3586 (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT))) 3587 ep2->keep_in_tree = 1; 3588 ep2->cached_state &= 3589 ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT); 3590 (void) mark_arrows(fmep, ep2, mark, 0, NULL, 3591 keep); 3592 continue; 3593 } 3594 ap->arrowp->arrow_marked = 1; 3595 if (ep2->cached_state & REQMNTS_DISPROVED) { 3596 indent(); 3597 out(O_ALTFP|O_VERB|O_NONL, 3598 " ALREADY DISPROVED "); 3599 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3600 out(O_ALTFP|O_VERB, NULL); 3601 continue; 3602 } 3603 if (ep2->cached_state & WAIT_EFFECT) { 3604 indent(); 3605 out(O_ALTFP|O_VERB|O_NONL, 3606 " ALREADY EFFECTS WAIT "); 3607 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3608 out(O_ALTFP|O_VERB, NULL); 3609 continue; 3610 } 3611 if (ep2->cached_state & CREDIBLE_EFFECT) { 3612 indent(); 3613 out(O_ALTFP|O_VERB|O_NONL, 3614 " ALREADY EFFECTS CREDIBLE "); 3615 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3616 out(O_ALTFP|O_VERB, NULL); 3617 continue; 3618 } 3619 if ((ep2->cached_state & PARENT_WAIT) && 3620 (mark & PARENT_WAIT)) { 3621 indent(); 3622 out(O_ALTFP|O_VERB|O_NONL, 3623 " ALREADY PARENT EFFECTS WAIT "); 3624 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3625 out(O_ALTFP|O_VERB, NULL); 3626 
continue; 3627 } 3628 platform_set_payloadnvp(ep2->nvp); 3629 if (checkconstraints(fmep, ap->arrowp) == 0) { 3630 platform_set_payloadnvp(NULL); 3631 indent(); 3632 out(O_ALTFP|O_VERB|O_NONL, 3633 " CONSTRAINTS FAIL "); 3634 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3635 out(O_ALTFP|O_VERB, NULL); 3636 continue; 3637 } 3638 platform_set_payloadnvp(NULL); 3639 ap->arrowp->mark |= EFFECTS_COUNTER; 3640 if (!triggered(fmep, ep2, EFFECTS_COUNTER)) { 3641 indent(); 3642 out(O_ALTFP|O_VERB|O_NONL, 3643 " K-COUNT NOT YET MET "); 3644 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3645 out(O_ALTFP|O_VERB, NULL); 3646 continue; 3647 } 3648 ep2->cached_state &= ~PARENT_WAIT; 3649 /* 3650 * if we've reached an ereport and no propagation time 3651 * is specified, use the Hesitate value 3652 */ 3653 if (ep2->t == N_EREPORT && at_latest_by == 0ULL && 3654 ap->arrowp->maxdelay == 0ULL) { 3655 out(O_ALTFP|O_VERB|O_NONL, " default wait "); 3656 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3657 out(O_ALTFP|O_VERB, NULL); 3658 result = requirements_test(fmep, ep2, Hesitate, 3659 &my_delay); 3660 } else { 3661 result = requirements_test(fmep, ep2, 3662 at_latest_by + ap->arrowp->maxdelay, 3663 &my_delay); 3664 } 3665 if (result == FME_WAIT) { 3666 retval = WAIT_EFFECT; 3667 if (overall_delay > my_delay) 3668 overall_delay = my_delay; 3669 ep2->cached_state |= WAIT_EFFECT; 3670 indent(); 3671 out(O_ALTFP|O_VERB|O_NONL, " EFFECTS WAIT "); 3672 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3673 out(O_ALTFP|O_VERB, NULL); 3674 indent_push(" E"); 3675 if (mark_arrows(fmep, ep2, PARENT_WAIT, 3676 at_latest_by, &my_delay, 0) == 3677 WAIT_EFFECT) { 3678 retval = WAIT_EFFECT; 3679 if (overall_delay > my_delay) 3680 overall_delay = my_delay; 3681 } 3682 indent_pop(); 3683 } else if (result == FME_DISPROVED) { 3684 indent(); 3685 out(O_ALTFP|O_VERB|O_NONL, 3686 " EFFECTS DISPROVED "); 3687 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3688 out(O_ALTFP|O_VERB, NULL); 3689 } else { 
3690 ep2->cached_state |= mark; 3691 indent(); 3692 if (mark == CREDIBLE_EFFECT) 3693 out(O_ALTFP|O_VERB|O_NONL, 3694 " EFFECTS CREDIBLE "); 3695 else 3696 out(O_ALTFP|O_VERB|O_NONL, 3697 " PARENT EFFECTS WAIT "); 3698 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3699 out(O_ALTFP|O_VERB, NULL); 3700 indent_push(" E"); 3701 if (mark_arrows(fmep, ep2, mark, at_latest_by, 3702 &my_delay, 0) == WAIT_EFFECT) { 3703 retval = WAIT_EFFECT; 3704 if (overall_delay > my_delay) 3705 overall_delay = my_delay; 3706 } 3707 indent_pop(); 3708 } 3709 } 3710 } 3711 if (retval == WAIT_EFFECT) 3712 *pdelay = overall_delay; 3713 return (retval); 3714 } 3715 3716 static enum fme_state 3717 effects_test(struct fme *fmep, struct event *fault_event, 3718 unsigned long long at_latest_by, unsigned long long *pdelay) 3719 { 3720 struct event *error_event; 3721 enum fme_state return_value = FME_CREDIBLE; 3722 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3723 unsigned long long my_delay; 3724 3725 stats_counter_bump(fmep->Ecallcount); 3726 indent_push(" E"); 3727 indent(); 3728 out(O_ALTFP|O_VERB|O_NONL, "->"); 3729 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3730 out(O_ALTFP|O_VERB, NULL); 3731 3732 if (mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by, 3733 &my_delay, 0) == WAIT_EFFECT) { 3734 return_value = FME_WAIT; 3735 if (overall_delay > my_delay) 3736 overall_delay = my_delay; 3737 } 3738 for (error_event = fmep->observations; 3739 error_event; error_event = error_event->observations) { 3740 indent(); 3741 out(O_ALTFP|O_VERB|O_NONL, " "); 3742 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event); 3743 if (!(error_event->cached_state & CREDIBLE_EFFECT)) { 3744 if (error_event->cached_state & 3745 (PARENT_WAIT|WAIT_EFFECT)) { 3746 out(O_ALTFP|O_VERB, " NOT YET triggered"); 3747 continue; 3748 } 3749 return_value = FME_DISPROVED; 3750 out(O_ALTFP|O_VERB, " NOT triggered"); 3751 break; 3752 } else { 3753 out(O_ALTFP|O_VERB, " triggered"); 3754 } 3755 } 
3756 if (return_value == FME_DISPROVED) { 3757 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0); 3758 } else { 3759 fault_event->keep_in_tree = 1; 3760 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1); 3761 } 3762 3763 indent(); 3764 out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ", 3765 fme_state2str(return_value)); 3766 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3767 out(O_ALTFP|O_VERB, NULL); 3768 indent_pop(); 3769 if (return_value == FME_WAIT) 3770 *pdelay = overall_delay; 3771 return (return_value); 3772 } 3773 3774 static enum fme_state 3775 requirements_test(struct fme *fmep, struct event *ep, 3776 unsigned long long at_latest_by, unsigned long long *pdelay) 3777 { 3778 int waiting_events; 3779 int credible_events; 3780 int deferred_events; 3781 enum fme_state return_value = FME_CREDIBLE; 3782 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3783 unsigned long long arrow_delay; 3784 unsigned long long my_delay; 3785 struct event *ep2; 3786 struct bubble *bp; 3787 struct arrowlist *ap; 3788 3789 if (ep->cached_state & REQMNTS_CREDIBLE) { 3790 indent(); 3791 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY CREDIBLE "); 3792 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3793 out(O_ALTFP|O_VERB, NULL); 3794 return (FME_CREDIBLE); 3795 } 3796 if (ep->cached_state & REQMNTS_DISPROVED) { 3797 indent(); 3798 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY DISPROVED "); 3799 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3800 out(O_ALTFP|O_VERB, NULL); 3801 return (FME_DISPROVED); 3802 } 3803 if (ep->cached_state & REQMNTS_WAIT) { 3804 indent(); 3805 *pdelay = ep->cached_delay; 3806 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY WAIT "); 3807 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3808 out(O_ALTFP|O_VERB|O_NONL, ", wait for: "); 3809 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3810 out(O_ALTFP|O_VERB, NULL); 3811 return (FME_WAIT); 3812 } 3813 stats_counter_bump(fmep->Rcallcount); 3814 indent_push(" R"); 3815 indent(); 3816 
out(O_ALTFP|O_VERB|O_NONL, "->"); 3817 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3818 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 3819 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3820 out(O_ALTFP|O_VERB, NULL); 3821 3822 if (ep->t == N_EREPORT) { 3823 if (ep->count == 0) { 3824 if (fmep->pull >= at_latest_by) { 3825 return_value = FME_DISPROVED; 3826 } else { 3827 ep->cached_delay = *pdelay = at_latest_by; 3828 return_value = FME_WAIT; 3829 } 3830 } 3831 3832 indent(); 3833 switch (return_value) { 3834 case FME_CREDIBLE: 3835 ep->cached_state |= REQMNTS_CREDIBLE; 3836 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE "); 3837 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3838 break; 3839 case FME_DISPROVED: 3840 ep->cached_state |= REQMNTS_DISPROVED; 3841 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 3842 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3843 break; 3844 case FME_WAIT: 3845 ep->cached_state |= REQMNTS_WAIT; 3846 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT "); 3847 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3848 out(O_ALTFP|O_VERB|O_NONL, " to "); 3849 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3850 break; 3851 default: 3852 out(O_DIE, "requirements_test: unexpected fme_state"); 3853 break; 3854 } 3855 out(O_ALTFP|O_VERB, NULL); 3856 indent_pop(); 3857 3858 return (return_value); 3859 } 3860 3861 /* this event is not a report, descend the tree */ 3862 for (bp = itree_next_bubble(ep, NULL); bp; 3863 bp = itree_next_bubble(ep, bp)) { 3864 int n; 3865 3866 if (bp->t != B_FROM) 3867 continue; 3868 3869 n = bp->nork; 3870 3871 credible_events = 0; 3872 waiting_events = 0; 3873 deferred_events = 0; 3874 arrow_delay = TIMEVAL_EVENTUALLY; 3875 /* 3876 * n is -1 for 'A' so adjust it. 3877 * XXX just count up the arrows for now. 
3878 */ 3879 if (n < 0) { 3880 n = 0; 3881 for (ap = itree_next_arrow(bp, NULL); ap; 3882 ap = itree_next_arrow(bp, ap)) 3883 n++; 3884 indent(); 3885 out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n); 3886 } else { 3887 indent(); 3888 out(O_ALTFP|O_VERB, " Bubble N=%d", n); 3889 } 3890 3891 if (n == 0) 3892 continue; 3893 if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) { 3894 for (ap = itree_next_arrow(bp, NULL); ap; 3895 ap = itree_next_arrow(bp, ap)) { 3896 ep2 = ap->arrowp->head->myevent; 3897 platform_set_payloadnvp(ep2->nvp); 3898 (void) checkconstraints(fmep, ap->arrowp); 3899 if (ap->arrowp->forever_true) { 3900 /* 3901 * if all arrows are invalidated by the 3902 * constraints, then we should elide the 3903 * whole bubble to be consistant with 3904 * the tree creation time behaviour 3905 */ 3906 bp->mark |= BUBBLE_OK; 3907 platform_set_payloadnvp(NULL); 3908 break; 3909 } 3910 platform_set_payloadnvp(NULL); 3911 } 3912 } 3913 for (ap = itree_next_arrow(bp, NULL); ap; 3914 ap = itree_next_arrow(bp, ap)) { 3915 ep2 = ap->arrowp->head->myevent; 3916 if (n <= credible_events) 3917 break; 3918 3919 ap->arrowp->mark |= REQMNTS_COUNTER; 3920 if (triggered(fmep, ep2, REQMNTS_COUNTER)) 3921 /* XXX adding max timevals! 
*/ 3922 switch (requirements_test(fmep, ep2, 3923 at_latest_by + ap->arrowp->maxdelay, 3924 &my_delay)) { 3925 case FME_DEFERRED: 3926 deferred_events++; 3927 break; 3928 case FME_CREDIBLE: 3929 credible_events++; 3930 break; 3931 case FME_DISPROVED: 3932 break; 3933 case FME_WAIT: 3934 if (my_delay < arrow_delay) 3935 arrow_delay = my_delay; 3936 waiting_events++; 3937 break; 3938 default: 3939 out(O_DIE, 3940 "Bug in requirements_test."); 3941 } 3942 else 3943 deferred_events++; 3944 } 3945 if (!(bp->mark & BUBBLE_OK) && waiting_events == 0) { 3946 bp->mark |= BUBBLE_ELIDED; 3947 continue; 3948 } 3949 indent(); 3950 out(O_ALTFP|O_VERB, " Credible: %d Waiting %d", 3951 credible_events + deferred_events, waiting_events); 3952 if (credible_events + deferred_events + waiting_events < n) { 3953 /* Can never meet requirements */ 3954 ep->cached_state |= REQMNTS_DISPROVED; 3955 indent(); 3956 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 3957 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3958 out(O_ALTFP|O_VERB, NULL); 3959 indent_pop(); 3960 return (FME_DISPROVED); 3961 } 3962 if (credible_events + deferred_events < n) { 3963 /* will have to wait */ 3964 /* wait time is shortest known */ 3965 if (arrow_delay < overall_delay) 3966 overall_delay = arrow_delay; 3967 return_value = FME_WAIT; 3968 } else if (credible_events < n) { 3969 if (return_value != FME_WAIT) 3970 return_value = FME_DEFERRED; 3971 } 3972 } 3973 3974 /* 3975 * don't mark as FME_DEFERRED. If this event isn't reached by another 3976 * path, then this will be considered FME_CREDIBLE. But if it is 3977 * reached by a different path so the K-count is met, then might 3978 * get overridden by FME_WAIT or FME_DISPROVED. 
3979 */ 3980 if (return_value == FME_WAIT) { 3981 ep->cached_state |= REQMNTS_WAIT; 3982 ep->cached_delay = *pdelay = overall_delay; 3983 } else if (return_value == FME_CREDIBLE) { 3984 ep->cached_state |= REQMNTS_CREDIBLE; 3985 } 3986 indent(); 3987 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ", 3988 fme_state2str(return_value)); 3989 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3990 out(O_ALTFP|O_VERB, NULL); 3991 indent_pop(); 3992 return (return_value); 3993 } 3994 3995 static enum fme_state 3996 causes_test(struct fme *fmep, struct event *ep, 3997 unsigned long long at_latest_by, unsigned long long *pdelay) 3998 { 3999 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 4000 unsigned long long my_delay; 4001 int credible_results = 0; 4002 int waiting_results = 0; 4003 enum fme_state fstate; 4004 struct event *tail_event; 4005 struct bubble *bp; 4006 struct arrowlist *ap; 4007 int k = 1; 4008 4009 stats_counter_bump(fmep->Ccallcount); 4010 indent_push(" C"); 4011 indent(); 4012 out(O_ALTFP|O_VERB|O_NONL, "->"); 4013 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4014 out(O_ALTFP|O_VERB, NULL); 4015 4016 for (bp = itree_next_bubble(ep, NULL); bp; 4017 bp = itree_next_bubble(ep, bp)) { 4018 if (bp->t != B_TO) 4019 continue; 4020 k = bp->nork; /* remember the K value */ 4021 for (ap = itree_next_arrow(bp, NULL); ap; 4022 ap = itree_next_arrow(bp, ap)) { 4023 int do_not_follow = 0; 4024 4025 /* 4026 * if we get to the same event multiple times 4027 * only worry about the first one. 
4028 */ 4029 if (ap->arrowp->tail->myevent->cached_state & 4030 CAUSES_TESTED) { 4031 indent(); 4032 out(O_ALTFP|O_VERB|O_NONL, 4033 " causes test already run for "); 4034 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 4035 ap->arrowp->tail->myevent); 4036 out(O_ALTFP|O_VERB, NULL); 4037 continue; 4038 } 4039 4040 /* 4041 * see if false constraint prevents us 4042 * from traversing this arrow 4043 */ 4044 platform_set_payloadnvp(ep->nvp); 4045 if (checkconstraints(fmep, ap->arrowp) == 0) 4046 do_not_follow = 1; 4047 platform_set_payloadnvp(NULL); 4048 if (do_not_follow) { 4049 indent(); 4050 out(O_ALTFP|O_VERB|O_NONL, 4051 " False arrow from "); 4052 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 4053 ap->arrowp->tail->myevent); 4054 out(O_ALTFP|O_VERB, NULL); 4055 continue; 4056 } 4057 4058 ap->arrowp->tail->myevent->cached_state |= 4059 CAUSES_TESTED; 4060 tail_event = ap->arrowp->tail->myevent; 4061 fstate = hypothesise(fmep, tail_event, at_latest_by, 4062 &my_delay); 4063 4064 switch (fstate) { 4065 case FME_WAIT: 4066 if (my_delay < overall_delay) 4067 overall_delay = my_delay; 4068 waiting_results++; 4069 break; 4070 case FME_CREDIBLE: 4071 credible_results++; 4072 break; 4073 case FME_DISPROVED: 4074 break; 4075 default: 4076 out(O_DIE, "Bug in causes_test"); 4077 } 4078 } 4079 } 4080 /* compare against K */ 4081 if (credible_results + waiting_results < k) { 4082 indent(); 4083 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED "); 4084 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4085 out(O_ALTFP|O_VERB, NULL); 4086 indent_pop(); 4087 return (FME_DISPROVED); 4088 } 4089 if (waiting_results != 0) { 4090 *pdelay = overall_delay; 4091 indent(); 4092 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT "); 4093 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4094 out(O_ALTFP|O_VERB|O_NONL, " to "); 4095 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 4096 out(O_ALTFP|O_VERB, NULL); 4097 indent_pop(); 4098 return (FME_WAIT); 4099 } 4100 indent(); 4101 out(O_ALTFP|O_VERB|O_NONL, 
"<-CAUSES CREDIBLE "); 4102 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4103 out(O_ALTFP|O_VERB, NULL); 4104 indent_pop(); 4105 return (FME_CREDIBLE); 4106 } 4107 4108 static enum fme_state 4109 hypothesise(struct fme *fmep, struct event *ep, 4110 unsigned long long at_latest_by, unsigned long long *pdelay) 4111 { 4112 enum fme_state rtr, otr; 4113 unsigned long long my_delay; 4114 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 4115 4116 stats_counter_bump(fmep->Hcallcount); 4117 indent_push(" H"); 4118 indent(); 4119 out(O_ALTFP|O_VERB|O_NONL, "->"); 4120 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4121 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 4122 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 4123 out(O_ALTFP|O_VERB, NULL); 4124 4125 rtr = requirements_test(fmep, ep, at_latest_by, &my_delay); 4126 if ((rtr == FME_WAIT) && (my_delay < overall_delay)) 4127 overall_delay = my_delay; 4128 if (rtr != FME_DISPROVED) { 4129 if (is_problem(ep->t)) { 4130 otr = effects_test(fmep, ep, at_latest_by, &my_delay); 4131 if (otr != FME_DISPROVED) { 4132 if (fmep->peek == 0 && ep->is_suspect == 0) { 4133 ep->suspects = fmep->suspects; 4134 ep->is_suspect = 1; 4135 fmep->suspects = ep; 4136 fmep->nsuspects++; 4137 if (!is_fault(ep->t)) 4138 fmep->nonfault++; 4139 } 4140 } 4141 } else 4142 otr = causes_test(fmep, ep, at_latest_by, &my_delay); 4143 if ((otr == FME_WAIT) && (my_delay < overall_delay)) 4144 overall_delay = my_delay; 4145 if ((otr != FME_DISPROVED) && 4146 ((rtr == FME_WAIT) || (otr == FME_WAIT))) 4147 *pdelay = overall_delay; 4148 } 4149 if (rtr == FME_DISPROVED) { 4150 indent(); 4151 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4152 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4153 out(O_ALTFP|O_VERB, " (doesn't meet requirements)"); 4154 indent_pop(); 4155 return (FME_DISPROVED); 4156 } 4157 if ((otr == FME_DISPROVED) && is_problem(ep->t)) { 4158 indent(); 4159 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4160 
itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4161 out(O_ALTFP|O_VERB, " (doesn't explain all reports)"); 4162 indent_pop(); 4163 return (FME_DISPROVED); 4164 } 4165 if (otr == FME_DISPROVED) { 4166 indent(); 4167 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4168 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4169 out(O_ALTFP|O_VERB, " (causes are not credible)"); 4170 indent_pop(); 4171 return (FME_DISPROVED); 4172 } 4173 if ((rtr == FME_WAIT) || (otr == FME_WAIT)) { 4174 indent(); 4175 out(O_ALTFP|O_VERB|O_NONL, "<-WAIT "); 4176 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4177 out(O_ALTFP|O_VERB|O_NONL, " to "); 4178 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay); 4179 out(O_ALTFP|O_VERB, NULL); 4180 indent_pop(); 4181 return (FME_WAIT); 4182 } 4183 indent(); 4184 out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE "); 4185 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4186 out(O_ALTFP|O_VERB, NULL); 4187 indent_pop(); 4188 return (FME_CREDIBLE); 4189 } 4190 4191 /* 4192 * fme_istat_load -- reconstitute any persistent istats 4193 */ 4194 void 4195 fme_istat_load(fmd_hdl_t *hdl) 4196 { 4197 int sz; 4198 char *sbuf; 4199 char *ptr; 4200 4201 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) { 4202 out(O_ALTFP, "fme_istat_load: No stats"); 4203 return; 4204 } 4205 4206 sbuf = alloca(sz); 4207 4208 fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz); 4209 4210 /* 4211 * pick apart the serialized stats 4212 * 4213 * format is: 4214 * <class-name>, '@', <path>, '\0', <value>, '\0' 4215 * for example: 4216 * "stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0" 4217 * 4218 * since this is parsing our own serialized data, any parsing issues 4219 * are fatal, so we check for them all with ASSERT() below. 
4220 */ 4221 ptr = sbuf; 4222 while (ptr < &sbuf[sz]) { 4223 char *sepptr; 4224 struct node *np; 4225 int val; 4226 4227 sepptr = strchr(ptr, '@'); 4228 ASSERT(sepptr != NULL); 4229 *sepptr = '\0'; 4230 4231 /* construct the event */ 4232 np = newnode(T_EVENT, NULL, 0); 4233 np->u.event.ename = newnode(T_NAME, NULL, 0); 4234 np->u.event.ename->u.name.t = N_STAT; 4235 np->u.event.ename->u.name.s = stable(ptr); 4236 np->u.event.ename->u.name.it = IT_ENAME; 4237 np->u.event.ename->u.name.last = np->u.event.ename; 4238 4239 ptr = sepptr + 1; 4240 ASSERT(ptr < &sbuf[sz]); 4241 ptr += strlen(ptr); 4242 ptr++; /* move past the '\0' separating path from value */ 4243 ASSERT(ptr < &sbuf[sz]); 4244 ASSERT(isdigit(*ptr)); 4245 val = atoi(ptr); 4246 ASSERT(val > 0); 4247 ptr += strlen(ptr); 4248 ptr++; /* move past the final '\0' for this entry */ 4249 4250 np->u.event.epname = pathstring2epnamenp(sepptr + 1); 4251 ASSERT(np->u.event.epname != NULL); 4252 4253 istat_bump(np, val); 4254 tree_free(np); 4255 } 4256 4257 istat_save(); 4258 } 4259