1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 24 * 25 * fme.c -- fault management exercise module 26 * 27 * this module provides the simulated fault management exercise. 28 */ 29 30 #include <stdio.h> 31 #include <stdlib.h> 32 #include <string.h> 33 #include <strings.h> 34 #include <ctype.h> 35 #include <alloca.h> 36 #include <libnvpair.h> 37 #include <sys/fm/protocol.h> 38 #include <fm/fmd_api.h> 39 #include "alloc.h" 40 #include "out.h" 41 #include "stats.h" 42 #include "stable.h" 43 #include "literals.h" 44 #include "lut.h" 45 #include "tree.h" 46 #include "ptree.h" 47 #include "itree.h" 48 #include "ipath.h" 49 #include "fme.h" 50 #include "evnv.h" 51 #include "eval.h" 52 #include "config.h" 53 #include "platform.h" 54 #include "esclex.h" 55 56 /* imported from eft.c... 
 */
extern hrtime_t Hesitate;
extern char *Serd_Override;
extern nv_alloc_t Eft_nv_hdl;
extern int Max_fme;
extern fmd_hdl_t *Hdl;

/* set when istat/serd state has changed and needs checkpointing */
static int Istat_need_save;
static int Serd_need_save;
void istat_save(void);
void serd_save(void);

/* fme under construction is global so we can free it on module abort */
static struct fme *Nfmep;

/* why the current case could not be diagnosed (UD_VAL_* code) */
static int Undiag_reason = UD_VAL_UNKNOWN;

/* next FME id to hand out; bumped on creation and on replay */
static int Nextid = 0;

static int Open_fme_count = 0;	/* Count of open FMEs */

/* list of fault management exercises underway */
static struct fme {
	struct fme *next;		/* next exercise */
	unsigned long long ull;		/* time when fme was created */
	int id;				/* FME id */
	struct config *config;		/* cooked configuration data */
	struct lut *eventtree;		/* propagation tree for this FME */
	/*
	 * The initial error report that created this FME is kept in
	 * two forms.  e0 points to the instance tree node and is used
	 * by fme_eval() as the starting point for the inference
	 * algorithm.  e0r is the event handle FMD passed to us when
	 * the ereport first arrived and is used when setting timers,
	 * which are always relative to the time of this initial
	 * report.
	 */
	struct event *e0;
	fmd_event_t *e0r;

	id_t timer;			/* for setting an fmd time-out */

	struct event *ecurrent;		/* ereport under consideration */
	struct event *suspects;		/* current suspect list */
	struct event *psuspects;	/* previous suspect list */
	int nsuspects;			/* count of suspects */
	int posted_suspects;		/* true if we've posted a diagnosis */
	int uniqobs;			/* number of unique events observed */
	int peek;			/* just peeking, don't track suspects */
	int overflow;			/* true if overflow FME */
	enum fme_state {
		FME_NOTHING = 5000,	/* not evaluated yet */
		FME_WAIT,		/* need to wait for more info */
		FME_CREDIBLE,		/* suspect list is credible */
		FME_DISPROVED,		/* no valid suspects found */
		FME_DEFERRED		/* don't know yet (k-count not met) */
	} state;

	unsigned long long pull;	/* time passed since created */
	unsigned long long wull;	/* wait until this time for re-eval */
	struct event *observations;	/* observation list */
	struct lut *globals;		/* values of global variables */
	/* fmd interfacing */
	fmd_hdl_t *hdl;			/* handle for talking with fmd */
	fmd_case_t *fmcase;		/* what fmd 'case' we associate with */
	/* stats -- per-FME counters created in fme_ready() */
	struct stats *Rcount;		/* ereports received */
	struct stats *Hcallcount;	/* calls to hypothesise() */
	struct stats *Rcallcount;	/* calls to requirements_test() */
	struct stats *Ccallcount;	/* calls to causes_test() */
	struct stats *Ecallcount;	/* calls to effects_test() */
	struct stats *Tcallcount;	/* calls to triggered() */
	struct stats *Marrowcount;	/* arrows marked by mark_arrows() */
	struct stats *diags;		/* suspect lists diagnosed */
} *FMElist, *EFMElist, *ClosedFMEs;

/* cases we could not diagnose or restart, tracked so fme_fini() can free */
static struct case_list {
	fmd_case_t *fmcase;
	struct case_list *next;
} *Undiagablecaselist;

static void fme_eval(struct fme *fmep, fmd_event_t *ffep);
static enum fme_state hypothesise(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay);
static struct node *eventprop_lookup(struct event *ep, const char *propname);
static struct node *pathstring2epnamenp(char *path);
static void
publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep,
    fmd_case_t *fmcase, nvlist_t *detector, char *arg);
static char *undiag_2reason_str(int ud, char *arg);
static const char *undiag_2defect_str(int ud);
static void restore_suspects(struct fme *fmep);
static void save_suspects(struct fme *fmep);
static void destroy_fme(struct fme *f);
static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
    const char *eventstring, const struct ipath *ipp, nvlist_t *nvl);
static void istat_counter_reset_cb(struct istat_entry *entp,
    struct stats *statp, const struct ipath *ipp);
static void istat_counter_topo_chg_cb(struct istat_entry *entp,
    struct stats *statp, void *unused);
static void serd_reset_cb(struct serd_entry *entp, void *unused,
    const struct ipath *ipp);
static void serd_topo_chg_cb(struct serd_entry *entp, void *unused,
    void *unused2);
static void destroy_fme_bufs(struct fme *fp);

/*
 * alloc_fme -- allocate a zeroed fme structure.  Caller owns the
 * result and eventually frees it via destroy_fme() or FREE().
 */
static struct fme *
alloc_fme(void)
{
	struct fme *fmep;

	fmep = MALLOC(sizeof (*fmep));
	bzero(fmep, sizeof (*fmep));
	return (fmep);
}

/*
 * fme_ready -- called when all initialization of the FME (except for
 * stats) has completed successfully.  Adds the fme to global lists
 * and establishes its stats.
 */
static struct fme *
fme_ready(struct fme *fmep)
{
	char nbuf[100];	/* "fme%d.<name>" -- comfortably large for an int id */

	Nfmep = NULL;	/* don't need to free this on module abort now */

	/* append to the global list; FMElist is head, EFMElist is tail */
	if (EFMElist) {
		EFMElist->next = fmep;
		EFMElist = fmep;
	} else
		FMElist = EFMElist = fmep;

	(void) sprintf(nbuf, "fme%d.Rcount", fmep->id);
	fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
	(void) sprintf(nbuf, "fme%d.Hcall", fmep->id);
	fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1);
	(void) sprintf(nbuf, "fme%d.Rcall", fmep->id);
	fmep->Rcallcount = stats_new_counter(nbuf,
	    "calls to requirements_test()", 1);
	(void) sprintf(nbuf, "fme%d.Ccall", fmep->id);
	fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1);
	(void) sprintf(nbuf, "fme%d.Ecall", fmep->id);
	fmep->Ecallcount =
	    stats_new_counter(nbuf, "calls to effects_test()", 1);
	(void) sprintf(nbuf, "fme%d.Tcall", fmep->id);
	fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
	(void) sprintf(nbuf, "fme%d.Marrow", fmep->id);
	fmep->Marrowcount = stats_new_counter(nbuf,
	    "arrows marked by mark_arrows()", 1);
	(void) sprintf(nbuf, "fme%d.diags", fmep->id);
	fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);

	out(O_ALTFP|O_VERB2, "newfme: config snapshot contains...");
	config_print(O_ALTFP|O_VERB2, fmep->config);

	return (fmep);
}

extern void ipath_dummy_lut(struct arrow *);
extern struct lut *itree_create_dummy(const char *, const struct ipath *);

/*
 * set_needed_arrows -- lut_walk callback: mark every outgoing (B_FROM)
 * arrow of this event as needed and register it in the dummy-ipath lut.
 */
/* ARGSUSED */
static void
set_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
{
	struct bubble *bp;
	struct arrowlist *ap;

	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_FROM)
			continue;
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			ap->arrowp->pnode->u.arrow.needed = 1;
			ipath_dummy_lut(ap->arrowp);
		}
	}
}

/*
 * unset_needed_arrows -- lut_walk callback: clear the "needed" flag on
 * every outgoing (B_FROM) arrow of this event.
 */
/* ARGSUSED */
static void
unset_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
{
	struct bubble *bp;
	struct arrowlist *ap;

	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_FROM)
			continue;
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap))
			ap->arrowp->pnode->u.arrow.needed = 0;
	}
}

static void globals_destructor(void *left, void *right, void *arg);
static void clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep);

/*
 * prune_propagations -- run a quick pass of the inference algorithm
 * against a dummy instance tree for (e0class, e0ipp) so that rules
 * which cannot possibly be involved in this ereport get pruned before
 * the real (expensive) instance tree is built.  Uses the global Nfmep
 * as a throwaway FME and frees it before returning.  Returns B_FALSE
 * if the class/path combination isn't in the rules at all.
 */
static boolean_t
prune_propagations(const char *e0class, const struct ipath *e0ipp)
{
	char nbuf[100];
	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
	extern struct lut *Usednames;

	Nfmep = alloc_fme();
	Nfmep->id = Nextid;	/* not consumed -- real FME takes this id */
	Nfmep->state = FME_NOTHING;
	Nfmep->eventtree = itree_create_dummy(e0class, e0ipp);
	if ((Nfmep->e0 =
	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
		itree_free(Nfmep->eventtree);
		FREE(Nfmep);
		Nfmep = NULL;
		return (B_FALSE);
	}
	Nfmep->ecurrent = Nfmep->observations = Nfmep->e0;
	Nfmep->e0->count++;

	/* transient stats: hypothesise() below updates these via Nfmep */
	(void) sprintf(nbuf, "fme%d.Rcount", Nfmep->id);
	Nfmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
	(void) sprintf(nbuf, "fme%d.Hcall", Nfmep->id);
	Nfmep->Hcallcount =
	    stats_new_counter(nbuf, "calls to hypothesise()", 1);
	(void) sprintf(nbuf, "fme%d.Rcall", Nfmep->id);
	Nfmep->Rcallcount = stats_new_counter(nbuf,
	    "calls to requirements_test()", 1);
	(void) sprintf(nbuf, "fme%d.Ccall", Nfmep->id);
	Nfmep->Ccallcount =
	    stats_new_counter(nbuf, "calls to causes_test()", 1);
	(void) sprintf(nbuf, "fme%d.Ecall", Nfmep->id);
	Nfmep->Ecallcount =
	    stats_new_counter(nbuf, "calls to effects_test()", 1);
	(void) sprintf(nbuf, "fme%d.Tcall", Nfmep->id);
	Nfmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
	(void) sprintf(nbuf, "fme%d.Marrow", Nfmep->id);
	Nfmep->Marrowcount = stats_new_counter(nbuf,
	    "arrows marked by mark_arrows()", 1);
	(void) sprintf(nbuf, "fme%d.diags", Nfmep->id);
	Nfmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);

	Nfmep->peek = 1;	/* just peeking -- don't track suspects */
	lut_walk(Nfmep->eventtree, (lut_cb)unset_needed_arrows, (void *)Nfmep);
	lut_free(Usednames, NULL, NULL);
	Usednames = NULL;
	lut_walk(Nfmep->eventtree, (lut_cb)clear_arrows, (void *)Nfmep);
	/* Nfmep->ull is zero here (alloc_fme bzeros the struct) */
	(void) hypothesise(Nfmep, Nfmep->e0, Nfmep->ull, &my_delay);
	itree_prune(Nfmep->eventtree);
	lut_walk(Nfmep->eventtree, (lut_cb)set_needed_arrows, (void *)Nfmep);

	/* tear down the throwaway FME */
	stats_delete(Nfmep->Rcount);
	stats_delete(Nfmep->Hcallcount);
	stats_delete(Nfmep->Rcallcount);
	stats_delete(Nfmep->Ccallcount);
	stats_delete(Nfmep->Ecallcount);
	stats_delete(Nfmep->Tcallcount);
	stats_delete(Nfmep->Marrowcount);
	stats_delete(Nfmep->diags);
	itree_free(Nfmep->eventtree);
	lut_free(Nfmep->globals, globals_destructor, NULL);
	FREE(Nfmep);
	return (B_TRUE);
}

/*
 * newfme -- create a new FME for the initial ereport (e0class@e0ipp).
 * Returns the fme on success, or NULL after publishing an undiagnosable
 * case (or silently discarding, if the class allows it).  On success the
 * returned fme has been added to the global lists by fme_ready().
 */
static struct fme *
newfme(const char *e0class, const struct ipath *e0ipp, fmd_hdl_t *hdl,
    fmd_case_t *fmcase, fmd_event_t *ffep, nvlist_t *nvl)
{
	struct cfgdata *cfgdata;
	int init_size;
	extern int alloc_total();
	nvlist_t *detector = NULL;
	char *pathstr;
	char *arg;

	/*
	 * First check if e0ipp is actually in the topology so we can give a
	 * more useful error message.
	 */
	ipathlastcomp(e0ipp);
	pathstr = ipath2str(NULL, e0ipp);
	cfgdata = config_snapshot();
	platform_units_translate(0, cfgdata->cooked, NULL, NULL,
	    &detector, pathstr);
	FREE(pathstr);
	structconfig_free(cfgdata->cooked);
	config_free(cfgdata);
	if (detector == NULL) {
		/* See if class permits silent discard on unknown component. */
		if (lut_lookup(Ereportenames_discard, (void *)e0class, NULL)) {
			out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport "
			    "to component path, but silent discard allowed.",
			    e0class);
		} else {
			Undiag_reason = UD_VAL_BADEVENTPATH;
			(void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR,
			    &detector);
			arg = ipath2str(e0class, e0ipp);
			publish_undiagnosable(hdl, ffep, fmcase, detector, arg);
			FREE(arg);
		}
		return (NULL);
	}

	/*
	 * Next run a quick first pass of the rules with a dummy config. This
	 * allows us to prune those rules which can't possibly cause this
	 * ereport.
	 */
	if (!prune_propagations(e0class, e0ipp)) {
		/*
		 * The fault class must have been in the rules or we would
		 * not have registered for it (and got a "nosub"), and the
		 * pathname must be in the topology or we would have failed the
		 * previous test. So to get here means the combination of
		 * class and pathname in the ereport must be invalid.
		 */
		Undiag_reason = UD_VAL_BADEVENTCLASS;
		arg = ipath2str(e0class, e0ipp);
		publish_undiagnosable(hdl, ffep, fmcase, detector, arg);
		nvlist_free(detector);
		FREE(arg);
		return (NULL);
	}

	/*
	 * Now go ahead and create the real fme using the pruned rules.
	 */
	init_size = alloc_total();
	out(O_ALTFP|O_STAMP, "start config_snapshot using %d bytes", init_size);
	cfgdata = config_snapshot();
	platform_save_config(hdl, fmcase);
	out(O_ALTFP|O_STAMP, "config_snapshot added %d bytes",
	    alloc_total() - init_size);

	Nfmep = alloc_fme();

	Nfmep->id = Nextid++;
	Nfmep->config = cfgdata->cooked;	/* fme takes over cooked data */
	config_free(cfgdata);
	Nfmep->posted_suspects = 0;
	Nfmep->uniqobs = 0;
	Nfmep->state = FME_NOTHING;
	Nfmep->pull = 0ULL;
	Nfmep->overflow = 0;

	Nfmep->fmcase = fmcase;
	Nfmep->hdl = hdl;

	if ((Nfmep->eventtree = itree_create(Nfmep->config)) == NULL) {
		Undiag_reason = UD_VAL_INSTFAIL;
		arg = ipath2str(e0class, e0ipp);
		publish_undiagnosable(hdl, ffep, fmcase, detector, arg);
		nvlist_free(detector);
		FREE(arg);
		structconfig_free(Nfmep->config);
		destroy_fme_bufs(Nfmep);
		FREE(Nfmep);
		Nfmep = NULL;
		return (NULL);
	}

	itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree);

	if ((Nfmep->e0 =
	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
		Undiag_reason = UD_VAL_BADEVENTI;
		arg = ipath2str(e0class, e0ipp);
		publish_undiagnosable(hdl, ffep, fmcase, detector, arg);
		nvlist_free(detector);
		FREE(arg);
		itree_free(Nfmep->eventtree);
		structconfig_free(Nfmep->config);
		destroy_fme_bufs(Nfmep);
		FREE(Nfmep);
		Nfmep = NULL;
		return (NULL);
	}

	nvlist_free(detector);
	return (fme_ready(Nfmep));
}

/*
 * fme_fini -- module teardown: free the undiagnosable-case list, all
 * closed FMEs, all open FMEs, and any FME still under construction.
 */
void
fme_fini(void)
{
	struct fme *sfp, *fp;
	struct case_list *ucasep, *nextcasep;

	ucasep = Undiagablecaselist;
	while (ucasep != NULL) {
		nextcasep = ucasep->next;
		FREE(ucasep);
		ucasep = nextcasep;
	}
	Undiagablecaselist = NULL;

	/* clean up closed fmes */
	fp = ClosedFMEs;
	while (fp != NULL) {
		sfp = fp->next;
		destroy_fme(fp);
		fp = sfp;
	}
	ClosedFMEs = NULL;

	fp = FMElist;
	while (fp
!= NULL) { 467 sfp = fp->next; 468 destroy_fme(fp); 469 fp = sfp; 470 } 471 FMElist = EFMElist = NULL; 472 473 /* if we were in the middle of creating an fme, free it now */ 474 if (Nfmep) { 475 destroy_fme(Nfmep); 476 Nfmep = NULL; 477 } 478 } 479 480 /* 481 * Allocated space for a buffer name. 20 bytes allows for 482 * a ridiculous 9,999,999 unique observations. 483 */ 484 #define OBBUFNMSZ 20 485 486 /* 487 * serialize_observation 488 * 489 * Create a recoverable version of the current observation 490 * (f->ecurrent). We keep a serialized version of each unique 491 * observation in order that we may resume correctly the fme in the 492 * correct state if eft or fmd crashes and we're restarted. 493 */ 494 static void 495 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp) 496 { 497 size_t pkdlen; 498 char tmpbuf[OBBUFNMSZ]; 499 char *pkd = NULL; 500 char *estr; 501 502 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs); 503 estr = ipath2str(cls, ipp); 504 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1); 505 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr, 506 strlen(estr) + 1); 507 FREE(estr); 508 509 if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) { 510 (void) snprintf(tmpbuf, 511 OBBUFNMSZ, "observed%d.nvp", fp->uniqobs); 512 if (nvlist_xpack(fp->ecurrent->nvp, 513 &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0) 514 out(O_DIE|O_SYS, "pack of observed nvl failed"); 515 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen); 516 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen); 517 FREE(pkd); 518 } 519 520 fp->uniqobs++; 521 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 522 sizeof (fp->uniqobs)); 523 } 524 525 /* 526 * init_fme_bufs -- We keep several bits of state about an fme for 527 * use if eft or fmd crashes and we're restarted. 
528 */ 529 static void 530 init_fme_bufs(struct fme *fp) 531 { 532 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull)); 533 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull, 534 sizeof (fp->pull)); 535 536 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id)); 537 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id, 538 sizeof (fp->id)); 539 540 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs)); 541 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 542 sizeof (fp->uniqobs)); 543 544 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD, 545 sizeof (fp->posted_suspects)); 546 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD, 547 (void *)&fp->posted_suspects, sizeof (fp->posted_suspects)); 548 } 549 550 static void 551 destroy_fme_bufs(struct fme *fp) 552 { 553 char tmpbuf[OBBUFNMSZ]; 554 int o; 555 556 platform_restore_config(fp->hdl, fp->fmcase); 557 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN); 558 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG); 559 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL); 560 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID); 561 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD); 562 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS); 563 564 for (o = 0; o < fp->uniqobs; o++) { 565 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o); 566 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 567 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o); 568 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 569 } 570 } 571 572 /* 573 * reconstitute_observations -- convert a case's serialized observations 574 * back into struct events. Returns zero if all observations are 575 * successfully reconstituted. 
576 */ 577 static int 578 reconstitute_observations(struct fme *fmep) 579 { 580 struct event *ep; 581 struct node *epnamenp = NULL; 582 size_t pkdlen; 583 char *pkd = NULL; 584 char *tmpbuf = alloca(OBBUFNMSZ); 585 char *sepptr; 586 char *estr; 587 int ocnt; 588 int elen; 589 590 for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) { 591 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt); 592 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 593 if (elen == 0) { 594 out(O_ALTFP, 595 "reconstitute_observation: no %s buffer found.", 596 tmpbuf); 597 Undiag_reason = UD_VAL_MISSINGOBS; 598 break; 599 } 600 601 estr = MALLOC(elen); 602 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 603 sepptr = strchr(estr, '@'); 604 if (sepptr == NULL) { 605 out(O_ALTFP, 606 "reconstitute_observation: %s: " 607 "missing @ separator in %s.", 608 tmpbuf, estr); 609 Undiag_reason = UD_VAL_MISSINGPATH; 610 FREE(estr); 611 break; 612 } 613 614 *sepptr = '\0'; 615 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 616 out(O_ALTFP, 617 "reconstitute_observation: %s: " 618 "trouble converting path string \"%s\" " 619 "to internal representation.", 620 tmpbuf, sepptr + 1); 621 Undiag_reason = UD_VAL_MISSINGPATH; 622 FREE(estr); 623 break; 624 } 625 626 /* construct the event */ 627 ep = itree_lookup(fmep->eventtree, 628 stable(estr), ipath(epnamenp)); 629 if (ep == NULL) { 630 out(O_ALTFP, 631 "reconstitute_observation: %s: " 632 "lookup of \"%s\" in itree failed.", 633 tmpbuf, ipath2str(estr, ipath(epnamenp))); 634 Undiag_reason = UD_VAL_BADOBS; 635 tree_free(epnamenp); 636 FREE(estr); 637 break; 638 } 639 tree_free(epnamenp); 640 641 /* 642 * We may or may not have a saved nvlist for the observation 643 */ 644 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt); 645 pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 646 if (pkdlen != 0) { 647 pkd = MALLOC(pkdlen); 648 fmd_buf_read(fmep->hdl, 649 fmep->fmcase, tmpbuf, pkd, pkdlen); 650 ASSERT(ep->nvp == NULL); 
651 if (nvlist_xunpack(pkd, 652 pkdlen, &ep->nvp, &Eft_nv_hdl) != 0) 653 out(O_DIE|O_SYS, "pack of observed nvl failed"); 654 FREE(pkd); 655 } 656 657 if (ocnt == 0) 658 fmep->e0 = ep; 659 660 FREE(estr); 661 fmep->ecurrent = ep; 662 ep->count++; 663 664 /* link it into list of observations seen */ 665 ep->observations = fmep->observations; 666 fmep->observations = ep; 667 } 668 669 if (ocnt == fmep->uniqobs) { 670 (void) fme_ready(fmep); 671 return (0); 672 } 673 674 return (1); 675 } 676 677 /* 678 * restart_fme -- called during eft initialization. Reconstitutes 679 * an in-progress fme. 680 */ 681 void 682 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress) 683 { 684 nvlist_t *defect; 685 struct case_list *bad; 686 struct fme *fmep; 687 struct cfgdata *cfgdata; 688 size_t rawsz; 689 struct event *ep; 690 char *tmpbuf = alloca(OBBUFNMSZ); 691 char *sepptr; 692 char *estr; 693 int elen; 694 struct node *epnamenp = NULL; 695 int init_size; 696 extern int alloc_total(); 697 char *reason; 698 699 /* 700 * ignore solved or closed cases 701 */ 702 if (fmd_case_solved(hdl, inprogress) || 703 fmd_case_closed(hdl, inprogress)) 704 return; 705 706 fmep = alloc_fme(); 707 fmep->fmcase = inprogress; 708 fmep->hdl = hdl; 709 710 if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) { 711 out(O_ALTFP, "restart_fme: no saved posted status"); 712 Undiag_reason = UD_VAL_MISSINGINFO; 713 goto badcase; 714 } else { 715 fmd_buf_read(hdl, inprogress, WOBUF_POSTD, 716 (void *)&fmep->posted_suspects, 717 sizeof (fmep->posted_suspects)); 718 } 719 720 if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) { 721 out(O_ALTFP, "restart_fme: no saved id"); 722 Undiag_reason = UD_VAL_MISSINGINFO; 723 goto badcase; 724 } else { 725 fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id, 726 sizeof (fmep->id)); 727 } 728 if (Nextid <= fmep->id) 729 Nextid = fmep->id + 1; 730 731 out(O_ALTFP, "Replay FME %d", fmep->id); 732 733 if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) { 
734 out(O_ALTFP, "restart_fme: No config data"); 735 Undiag_reason = UD_VAL_MISSINGINFO; 736 goto badcase; 737 } 738 fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz, 739 sizeof (size_t)); 740 741 if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) { 742 out(O_ALTFP, "restart_fme: No event zero"); 743 Undiag_reason = UD_VAL_MISSINGZERO; 744 goto badcase; 745 } 746 747 if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) { 748 out(O_ALTFP, "restart_fme: no saved wait time"); 749 Undiag_reason = UD_VAL_MISSINGINFO; 750 goto badcase; 751 } else { 752 fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull, 753 sizeof (fmep->pull)); 754 } 755 756 if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) { 757 out(O_ALTFP, "restart_fme: no count of observations"); 758 Undiag_reason = UD_VAL_MISSINGINFO; 759 goto badcase; 760 } else { 761 fmd_buf_read(hdl, inprogress, WOBUF_NOBS, 762 (void *)&fmep->uniqobs, sizeof (fmep->uniqobs)); 763 } 764 765 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed0"); 766 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 767 if (elen == 0) { 768 out(O_ALTFP, "reconstitute_observation: no %s buffer found.", 769 tmpbuf); 770 Undiag_reason = UD_VAL_MISSINGOBS; 771 goto badcase; 772 } 773 estr = MALLOC(elen); 774 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 775 sepptr = strchr(estr, '@'); 776 if (sepptr == NULL) { 777 out(O_ALTFP, "reconstitute_observation: %s: " 778 "missing @ separator in %s.", 779 tmpbuf, estr); 780 Undiag_reason = UD_VAL_MISSINGPATH; 781 FREE(estr); 782 goto badcase; 783 } 784 *sepptr = '\0'; 785 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 786 out(O_ALTFP, "reconstitute_observation: %s: " 787 "trouble converting path string \"%s\" " 788 "to internal representation.", tmpbuf, sepptr + 1); 789 Undiag_reason = UD_VAL_MISSINGPATH; 790 FREE(estr); 791 goto badcase; 792 } 793 (void) prune_propagations(stable(estr), ipath(epnamenp)); 794 tree_free(epnamenp); 795 FREE(estr); 796 
797 init_size = alloc_total(); 798 out(O_ALTFP|O_STAMP, "start config_restore using %d bytes", init_size); 799 cfgdata = MALLOC(sizeof (struct cfgdata)); 800 cfgdata->cooked = NULL; 801 cfgdata->devcache = NULL; 802 cfgdata->devidcache = NULL; 803 cfgdata->tpcache = NULL; 804 cfgdata->cpucache = NULL; 805 cfgdata->raw_refcnt = 1; 806 807 if (rawsz > 0) { 808 if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) { 809 out(O_ALTFP, "restart_fme: Config data size mismatch"); 810 Undiag_reason = UD_VAL_CFGMISMATCH; 811 goto badcase; 812 } 813 cfgdata->begin = MALLOC(rawsz); 814 cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz; 815 fmd_buf_read(hdl, 816 inprogress, WOBUF_CFG, cfgdata->begin, rawsz); 817 } else { 818 cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL; 819 } 820 821 config_cook(cfgdata); 822 fmep->config = cfgdata->cooked; 823 config_free(cfgdata); 824 out(O_ALTFP|O_STAMP, "config_restore added %d bytes", 825 alloc_total() - init_size); 826 827 if ((fmep->eventtree = itree_create(fmep->config)) == NULL) { 828 /* case not properly saved or irretrievable */ 829 out(O_ALTFP, "restart_fme: NULL instance tree"); 830 Undiag_reason = UD_VAL_INSTFAIL; 831 goto badcase; 832 } 833 834 itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree); 835 836 if (reconstitute_observations(fmep) != 0) 837 goto badcase; 838 839 out(O_ALTFP|O_NONL, "FME %d replay observations: ", fmep->id); 840 for (ep = fmep->observations; ep; ep = ep->observations) { 841 out(O_ALTFP|O_NONL, " "); 842 itree_pevent_brief(O_ALTFP|O_NONL, ep); 843 } 844 out(O_ALTFP, NULL); 845 846 Open_fme_count++; 847 848 /* give the diagnosis algorithm a shot at the new FME state */ 849 fme_eval(fmep, fmep->e0r); 850 return; 851 852 badcase: 853 if (fmep->eventtree != NULL) 854 itree_free(fmep->eventtree); 855 if (fmep->config) 856 structconfig_free(fmep->config); 857 destroy_fme_bufs(fmep); 858 FREE(fmep); 859 860 /* 861 * Since we're unable to restart the case, add it to the undiagable 862 * list and 
solve and close it as appropriate. 863 */ 864 bad = MALLOC(sizeof (struct case_list)); 865 bad->next = NULL; 866 867 if (Undiagablecaselist != NULL) 868 bad->next = Undiagablecaselist; 869 Undiagablecaselist = bad; 870 bad->fmcase = inprogress; 871 872 out(O_ALTFP|O_NONL, "[case %s (unable to restart), ", 873 fmd_case_uuid(hdl, bad->fmcase)); 874 875 if (fmd_case_solved(hdl, bad->fmcase)) { 876 out(O_ALTFP|O_NONL, "already solved, "); 877 } else { 878 out(O_ALTFP|O_NONL, "solving, "); 879 defect = fmd_nvl_create_fault(hdl, 880 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 881 reason = undiag_2reason_str(Undiag_reason, NULL); 882 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 883 FREE(reason); 884 fmd_case_add_suspect(hdl, bad->fmcase, defect); 885 fmd_case_solve(hdl, bad->fmcase); 886 Undiag_reason = UD_VAL_UNKNOWN; 887 } 888 889 if (fmd_case_closed(hdl, bad->fmcase)) { 890 out(O_ALTFP, "already closed ]"); 891 } else { 892 out(O_ALTFP, "closing ]"); 893 fmd_case_close(hdl, bad->fmcase); 894 } 895 } 896 897 /*ARGSUSED*/ 898 static void 899 globals_destructor(void *left, void *right, void *arg) 900 { 901 struct evalue *evp = (struct evalue *)right; 902 if (evp->t == NODEPTR) 903 tree_free((struct node *)(uintptr_t)evp->v); 904 evp->v = (uintptr_t)NULL; 905 FREE(evp); 906 } 907 908 void 909 destroy_fme(struct fme *f) 910 { 911 stats_delete(f->Rcount); 912 stats_delete(f->Hcallcount); 913 stats_delete(f->Rcallcount); 914 stats_delete(f->Ccallcount); 915 stats_delete(f->Ecallcount); 916 stats_delete(f->Tcallcount); 917 stats_delete(f->Marrowcount); 918 stats_delete(f->diags); 919 920 if (f->eventtree != NULL) 921 itree_free(f->eventtree); 922 if (f->config) 923 structconfig_free(f->config); 924 lut_free(f->globals, globals_destructor, NULL); 925 FREE(f); 926 } 927 928 static const char * 929 fme_state2str(enum fme_state s) 930 { 931 switch (s) { 932 case FME_NOTHING: return ("NOTHING"); 933 case FME_WAIT: return ("WAIT"); 934 case FME_CREDIBLE: 
return ("CREDIBLE");
	case FME_DISPROVED:	return ("DISPROVED");
	case FME_DEFERRED:	return ("DEFERRED");
	default:		return ("UNKNOWN");
	}
}

/* true if the event type can appear on a suspect list */
static int
is_problem(enum nametype t)
{
	return (t == N_FAULT || t == N_DEFECT || t == N_UPSET);
}

/* true if the event type is a defect */
static int
is_defect(enum nametype t)
{
	return (t == N_DEFECT);
}

/* true if the event type is an upset */
static int
is_upset(enum nametype t)
{
	return (t == N_UPSET);
}

/*
 * fme_print -- emit a human-readable summary of the FME (state, start
 * and wait times, e0, observations, suspects, and -- at O_VERB2 -- the
 * full instance tree) using the given output flags.
 */
static void
fme_print(int flags, struct fme *fmep)
{
	struct event *ep;

	out(flags, "Fault Management Exercise %d", fmep->id);
	out(flags, "\t State: %s", fme_state2str(fmep->state));
	out(flags|O_NONL, "\t Start time: ");
	ptree_timeval(flags|O_NONL, &fmep->ull);
	out(flags, NULL);
	if (fmep->wull) {
		out(flags|O_NONL, "\t Wait time: ");
		ptree_timeval(flags|O_NONL, &fmep->wull);
		out(flags, NULL);
	}
	out(flags|O_NONL, "\t E0: ");
	if (fmep->e0)
		itree_pevent_brief(flags|O_NONL, fmep->e0);
	else
		out(flags|O_NONL, "NULL");
	out(flags, NULL);
	out(flags|O_NONL, "\tObservations:");
	for (ep = fmep->observations; ep; ep = ep->observations) {
		out(flags|O_NONL, " ");
		itree_pevent_brief(flags|O_NONL, ep);
	}
	out(flags, NULL);
	out(flags|O_NONL, "\tSuspect list:");
	for (ep = fmep->suspects; ep; ep = ep->suspects) {
		out(flags|O_NONL, " ");
		itree_pevent_brief(flags|O_NONL, ep);
	}
	out(flags, NULL);
	if (fmep->eventtree != NULL) {
		out(flags|O_VERB2, "\t Tree:");
		itree_ptree(flags|O_VERB2, fmep->eventtree);
	}
}

/*
 * pathstring2epnamenp -- convert a slash-separated component path
 * string into a name-node list.  NOTE(review): uses strtok(), so the
 * input string is modified in place and this is not reentrant --
 * callers appear to pass freshly-allocated copies; confirm before
 * reusing elsewhere.
 */
static struct node *
pathstring2epnamenp(char *path)
{
	char *sep = "/";
	struct node *ret;
	char *ptr;

	if ((ptr = strtok(path, sep)) == NULL)
		out(O_DIE, "pathstring2epnamenp: invalid empty class");

	ret = tree_iname(stable(ptr), NULL, 0);

	while ((ptr = strtok(NULL, sep)) != NULL)
		ret = tree_name_append(ret,
tree_iname(stable(ptr), NULL, 0)); 1013 1014 return (ret); 1015 } 1016 1017 /* 1018 * for a given upset sp, increment the corresponding SERD engine. if the 1019 * SERD engine trips, return the ename and ipp of the resulting ereport. 1020 * returns true if engine tripped and *enamep and *ippp were filled in. 1021 */ 1022 static int 1023 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep, 1024 fmd_case_t *fmcase, struct event *sp, const char **enamep, 1025 const struct ipath **ippp) 1026 { 1027 struct node *serdinst; 1028 char *serdname; 1029 char *serdresource; 1030 char *serdclass; 1031 struct node *nid; 1032 struct serd_entry *newentp; 1033 int i, serdn = -1, serdincrement = 1, len = 0; 1034 char *serdsuffix = NULL, *serdt = NULL; 1035 struct evalue *ep; 1036 1037 ASSERT(sp->t == N_UPSET); 1038 ASSERT(ffep != NULL); 1039 1040 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1041 (void *)"n", (lut_cmp)strcmp)) != NULL) { 1042 ASSERT(ep->t == UINT64); 1043 serdn = (int)ep->v; 1044 } 1045 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1046 (void *)"t", (lut_cmp)strcmp)) != NULL) { 1047 ASSERT(ep->t == STRING); 1048 serdt = (char *)(uintptr_t)ep->v; 1049 } 1050 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1051 (void *)"suffix", (lut_cmp)strcmp)) != NULL) { 1052 ASSERT(ep->t == STRING); 1053 serdsuffix = (char *)(uintptr_t)ep->v; 1054 } 1055 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1056 (void *)"increment", (lut_cmp)strcmp)) != NULL) { 1057 ASSERT(ep->t == UINT64); 1058 serdincrement = (int)ep->v; 1059 } 1060 1061 /* 1062 * obtain instanced SERD engine from the upset sp. from this 1063 * derive serdname, the string used to identify the SERD engine. 
1064 */ 1065 serdinst = eventprop_lookup(sp, L_engine); 1066 1067 if (serdinst == NULL) 1068 return (-1); 1069 1070 len = strlen(serdinst->u.stmt.np->u.event.ename->u.name.s) + 1; 1071 if (serdsuffix != NULL) 1072 len += strlen(serdsuffix); 1073 serdclass = MALLOC(len); 1074 if (serdsuffix != NULL) 1075 (void) snprintf(serdclass, len, "%s%s", 1076 serdinst->u.stmt.np->u.event.ename->u.name.s, serdsuffix); 1077 else 1078 (void) snprintf(serdclass, len, "%s", 1079 serdinst->u.stmt.np->u.event.ename->u.name.s); 1080 serdresource = ipath2str(NULL, 1081 ipath(serdinst->u.stmt.np->u.event.epname)); 1082 len += strlen(serdresource) + 1; 1083 serdname = MALLOC(len); 1084 (void) snprintf(serdname, len, "%s@%s", serdclass, serdresource); 1085 FREE(serdresource); 1086 1087 /* handle serd engine "id" property, if there is one */ 1088 if ((nid = 1089 lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) { 1090 struct evalue *gval; 1091 char suffixbuf[200]; 1092 char *suffix; 1093 char *nserdname; 1094 size_t nname; 1095 1096 out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname); 1097 ptree_name_iter(O_ALTFP|O_NONL, nid); 1098 1099 ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t)); 1100 1101 if ((gval = lut_lookup(fmep->globals, 1102 (void *)nid->u.globid.s, NULL)) == NULL) { 1103 out(O_ALTFP, " undefined"); 1104 } else if (gval->t == UINT64) { 1105 out(O_ALTFP, " %llu", gval->v); 1106 (void) sprintf(suffixbuf, "%llu", gval->v); 1107 suffix = suffixbuf; 1108 } else { 1109 out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v); 1110 suffix = (char *)(uintptr_t)gval->v; 1111 } 1112 1113 nname = strlen(serdname) + strlen(suffix) + 2; 1114 nserdname = MALLOC(nname); 1115 (void) snprintf(nserdname, nname, "%s:%s", serdname, suffix); 1116 FREE(serdname); 1117 serdname = nserdname; 1118 } 1119 1120 /* 1121 * if the engine is empty, and we have an override for n/t then 1122 * destroy and recreate it. 
1123 */ 1124 if ((serdn != -1 || serdt != NULL) && fmd_serd_exists(hdl, serdname) && 1125 fmd_serd_empty(hdl, serdname)) 1126 fmd_serd_destroy(hdl, serdname); 1127 1128 if (!fmd_serd_exists(hdl, serdname)) { 1129 struct node *nN, *nT; 1130 const char *s; 1131 struct node *nodep; 1132 struct config *cp; 1133 char *path; 1134 uint_t nval; 1135 hrtime_t tval; 1136 int i; 1137 char *ptr; 1138 int got_n_override = 0, got_t_override = 0; 1139 1140 /* no SERD engine yet, so create it */ 1141 nodep = serdinst->u.stmt.np->u.event.epname; 1142 path = ipath2str(NULL, ipath(nodep)); 1143 cp = config_lookup(fmep->config, path, 0); 1144 FREE((void *)path); 1145 1146 /* 1147 * We allow serd paramaters to be overridden, either from 1148 * eft.conf file values (if Serd_Override is set) or from 1149 * driver properties (for "serd.io.device" engines). 1150 */ 1151 if (Serd_Override != NULL) { 1152 char *save_ptr, *ptr1, *ptr2, *ptr3; 1153 ptr3 = save_ptr = STRDUP(Serd_Override); 1154 while (*ptr3 != '\0') { 1155 ptr1 = strchr(ptr3, ','); 1156 *ptr1 = '\0'; 1157 if (strcmp(ptr3, serdclass) == 0) { 1158 ptr2 = strchr(ptr1 + 1, ','); 1159 *ptr2 = '\0'; 1160 nval = atoi(ptr1 + 1); 1161 out(O_ALTFP, "serd override %s_n %d", 1162 serdclass, nval); 1163 ptr3 = strchr(ptr2 + 1, ' '); 1164 if (ptr3) 1165 *ptr3 = '\0'; 1166 ptr = STRDUP(ptr2 + 1); 1167 out(O_ALTFP, "serd override %s_t %s", 1168 serdclass, ptr); 1169 got_n_override = 1; 1170 got_t_override = 1; 1171 break; 1172 } else { 1173 ptr2 = strchr(ptr1 + 1, ','); 1174 ptr3 = strchr(ptr2 + 1, ' '); 1175 if (ptr3 == NULL) 1176 break; 1177 } 1178 ptr3++; 1179 } 1180 FREE(save_ptr); 1181 } 1182 1183 if (cp && got_n_override == 0) { 1184 /* 1185 * convert serd engine class into property name 1186 */ 1187 char *prop_name = MALLOC(strlen(serdclass) + 3); 1188 for (i = 0; i < strlen(serdclass); i++) { 1189 if (serdclass[i] == '.') 1190 prop_name[i] = '_'; 1191 else 1192 prop_name[i] = serdclass[i]; 1193 } 1194 prop_name[i++] = '_'; 1195 
prop_name[i++] = 'n'; 1196 prop_name[i] = '\0'; 1197 if (s = config_getprop(cp, prop_name)) { 1198 nval = atoi(s); 1199 out(O_ALTFP, "serd override %s_n %s", 1200 serdclass, s); 1201 got_n_override = 1; 1202 } 1203 prop_name[i - 1] = 't'; 1204 if (s = config_getprop(cp, prop_name)) { 1205 ptr = STRDUP(s); 1206 out(O_ALTFP, "serd override %s_t %s", 1207 serdclass, s); 1208 got_t_override = 1; 1209 } 1210 FREE(prop_name); 1211 } 1212 1213 if (serdn != -1 && got_n_override == 0) { 1214 nval = serdn; 1215 out(O_ALTFP, "serd override %s_n %d", serdclass, serdn); 1216 got_n_override = 1; 1217 } 1218 if (serdt != NULL && got_t_override == 0) { 1219 ptr = STRDUP(serdt); 1220 out(O_ALTFP, "serd override %s_t %s", serdclass, serdt); 1221 got_t_override = 1; 1222 } 1223 1224 if (!got_n_override) { 1225 nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N, 1226 NULL); 1227 ASSERT(nN->t == T_NUM); 1228 nval = (uint_t)nN->u.ull; 1229 } 1230 if (!got_t_override) { 1231 nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T, 1232 NULL); 1233 ASSERT(nT->t == T_TIMEVAL); 1234 tval = (hrtime_t)nT->u.ull; 1235 } else { 1236 const unsigned long long *ullp; 1237 const char *suffix; 1238 int len; 1239 1240 len = strspn(ptr, "0123456789"); 1241 suffix = stable(&ptr[len]); 1242 ullp = (unsigned long long *)lut_lookup(Timesuffixlut, 1243 (void *)suffix, NULL); 1244 ptr[len] = '\0'; 1245 tval = strtoull(ptr, NULL, 0) * (ullp ? 
*ullp : 1ll); 1246 FREE(ptr); 1247 } 1248 fmd_serd_create(hdl, serdname, nval, tval); 1249 } 1250 1251 newentp = MALLOC(sizeof (*newentp)); 1252 newentp->ename = stable(serdclass); 1253 FREE(serdclass); 1254 newentp->ipath = ipath(serdinst->u.stmt.np->u.event.epname); 1255 newentp->hdl = hdl; 1256 if (lut_lookup(SerdEngines, newentp, (lut_cmp)serd_cmp) == NULL) { 1257 SerdEngines = lut_add(SerdEngines, (void *)newentp, 1258 (void *)newentp, (lut_cmp)serd_cmp); 1259 Serd_need_save = 1; 1260 serd_save(); 1261 } else { 1262 FREE(newentp); 1263 } 1264 1265 1266 /* 1267 * increment SERD engine. if engine fires, reset serd 1268 * engine and return trip_strcode if required. 1269 */ 1270 for (i = 0; i < serdincrement; i++) { 1271 if (fmd_serd_record(hdl, serdname, ffep)) { 1272 fmd_case_add_serd(hdl, fmcase, serdname); 1273 fmd_serd_reset(hdl, serdname); 1274 1275 if (ippp) { 1276 struct node *tripinst = 1277 lut_lookup(serdinst->u.stmt.lutp, 1278 (void *)L_trip, NULL); 1279 ASSERT(tripinst != NULL); 1280 *enamep = tripinst->u.event.ename->u.name.s; 1281 *ippp = ipath(tripinst->u.event.epname); 1282 out(O_ALTFP|O_NONL, 1283 "[engine fired: %s, sending: ", serdname); 1284 ipath_print(O_ALTFP|O_NONL, *enamep, *ippp); 1285 out(O_ALTFP, "]"); 1286 } else { 1287 out(O_ALTFP, "[engine fired: %s, no trip]", 1288 serdname); 1289 } 1290 FREE(serdname); 1291 return (1); 1292 } 1293 } 1294 1295 FREE(serdname); 1296 return (0); 1297 } 1298 1299 /* 1300 * search a suspect list for upsets. feed each upset to serd_eval() and 1301 * build up tripped[], an array of ereports produced by the firing of 1302 * any SERD engines. then feed each ereport back into 1303 * fme_receive_report(). 1304 * 1305 * returns ntrip, the number of these ereports produced. 
1306 */ 1307 static int 1308 upsets_eval(struct fme *fmep, fmd_event_t *ffep) 1309 { 1310 /* we build an array of tripped ereports that we send ourselves */ 1311 struct { 1312 const char *ename; 1313 const struct ipath *ipp; 1314 } *tripped; 1315 struct event *sp; 1316 int ntrip, nupset, i; 1317 1318 /* 1319 * count the number of upsets to determine the upper limit on 1320 * expected trip ereport strings. remember that one upset can 1321 * lead to at most one ereport. 1322 */ 1323 nupset = 0; 1324 for (sp = fmep->suspects; sp; sp = sp->suspects) { 1325 if (sp->t == N_UPSET) 1326 nupset++; 1327 } 1328 1329 if (nupset == 0) 1330 return (0); 1331 1332 /* 1333 * get to this point if we have upsets and expect some trip 1334 * ereports 1335 */ 1336 tripped = alloca(sizeof (*tripped) * nupset); 1337 bzero((void *)tripped, sizeof (*tripped) * nupset); 1338 1339 ntrip = 0; 1340 for (sp = fmep->suspects; sp; sp = sp->suspects) 1341 if (sp->t == N_UPSET && 1342 serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp, 1343 &tripped[ntrip].ename, &tripped[ntrip].ipp) == 1) 1344 ntrip++; 1345 1346 for (i = 0; i < ntrip; i++) { 1347 struct event *ep, *nep; 1348 struct fme *nfmep; 1349 fmd_case_t *fmcase; 1350 const struct ipath *ipp; 1351 const char *eventstring; 1352 int prev_verbose; 1353 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 1354 enum fme_state state; 1355 1356 /* 1357 * First try and evaluate a case with the trip ereport plus 1358 * all the other ereports that cause the trip. If that fails 1359 * to evaluate then try again with just this ereport on its own. 
1360 */ 1361 out(O_ALTFP|O_NONL, "fme_receive_report_serd: "); 1362 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp); 1363 out(O_ALTFP|O_STAMP, NULL); 1364 ep = fmep->e0; 1365 eventstring = ep->enode->u.event.ename->u.name.s; 1366 ipp = ep->ipp; 1367 1368 /* 1369 * create a duplicate fme and case 1370 */ 1371 fmcase = fmd_case_open(fmep->hdl, NULL); 1372 out(O_ALTFP|O_NONL, "duplicate fme for event ["); 1373 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1374 out(O_ALTFP, " ]"); 1375 1376 if ((nfmep = newfme(eventstring, ipp, fmep->hdl, 1377 fmcase, ffep, ep->nvp)) == NULL) { 1378 out(O_ALTFP|O_NONL, "["); 1379 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1380 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1381 continue; 1382 } 1383 1384 Open_fme_count++; 1385 nfmep->pull = fmep->pull; 1386 init_fme_bufs(nfmep); 1387 out(O_ALTFP|O_NONL, "["); 1388 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1389 out(O_ALTFP, " created FME%d, case %s]", nfmep->id, 1390 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 1391 if (ffep) { 1392 fmd_case_setprincipal(nfmep->hdl, nfmep->fmcase, ffep); 1393 fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, ffep); 1394 nfmep->e0r = ffep; 1395 } 1396 1397 /* 1398 * add the original ereports 1399 */ 1400 for (ep = fmep->observations; ep; ep = ep->observations) { 1401 eventstring = ep->enode->u.event.ename->u.name.s; 1402 ipp = ep->ipp; 1403 out(O_ALTFP|O_NONL, "adding event ["); 1404 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1405 out(O_ALTFP, " ]"); 1406 nep = itree_lookup(nfmep->eventtree, eventstring, ipp); 1407 if (nep->count++ == 0) { 1408 nep->observations = nfmep->observations; 1409 nfmep->observations = nep; 1410 serialize_observation(nfmep, eventstring, ipp); 1411 nep->nvp = evnv_dupnvl(ep->nvp); 1412 } 1413 if (ep->ffep && ep->ffep != ffep) 1414 fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, 1415 ep->ffep); 1416 stats_counter_bump(nfmep->Rcount); 1417 } 1418 1419 /* 1420 * add the serd trigger ereport 1421 */ 1422 if ((ep = 
itree_lookup(nfmep->eventtree, tripped[i].ename, 1423 tripped[i].ipp)) == NULL) { 1424 /* 1425 * The trigger ereport is not in the instance tree. It 1426 * was presumably removed by prune_propagations() as 1427 * this combination of events is not present in the 1428 * rules. 1429 */ 1430 out(O_ALTFP, "upsets_eval: e0 not in instance tree"); 1431 Undiag_reason = UD_VAL_BADEVENTI; 1432 goto retry_lone_ereport; 1433 } 1434 out(O_ALTFP|O_NONL, "adding event ["); 1435 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp); 1436 out(O_ALTFP, " ]"); 1437 nfmep->ecurrent = ep; 1438 ep->nvp = NULL; 1439 ep->count = 1; 1440 ep->observations = nfmep->observations; 1441 nfmep->observations = ep; 1442 1443 /* 1444 * just peek first. 1445 */ 1446 nfmep->peek = 1; 1447 prev_verbose = Verbose; 1448 if (Debug == 0) 1449 Verbose = 0; 1450 lut_walk(nfmep->eventtree, (lut_cb)clear_arrows, (void *)nfmep); 1451 state = hypothesise(nfmep, nfmep->e0, nfmep->ull, &my_delay); 1452 nfmep->peek = 0; 1453 Verbose = prev_verbose; 1454 if (state == FME_DISPROVED) { 1455 out(O_ALTFP, "upsets_eval: hypothesis disproved"); 1456 Undiag_reason = UD_VAL_UNSOLVD; 1457 retry_lone_ereport: 1458 /* 1459 * However the trigger ereport on its own might be 1460 * diagnosable, so check for that. Undo the new fme 1461 * and case we just created and call fme_receive_report. 
1462 */ 1463 out(O_ALTFP|O_NONL, "["); 1464 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, 1465 tripped[i].ipp); 1466 out(O_ALTFP, " retrying with just trigger ereport]"); 1467 itree_free(nfmep->eventtree); 1468 nfmep->eventtree = NULL; 1469 structconfig_free(nfmep->config); 1470 nfmep->config = NULL; 1471 destroy_fme_bufs(nfmep); 1472 fmd_case_close(nfmep->hdl, nfmep->fmcase); 1473 fme_receive_report(fmep->hdl, ffep, 1474 tripped[i].ename, tripped[i].ipp, NULL); 1475 continue; 1476 } 1477 1478 /* 1479 * and evaluate 1480 */ 1481 serialize_observation(nfmep, tripped[i].ename, tripped[i].ipp); 1482 fme_eval(nfmep, ffep); 1483 } 1484 1485 return (ntrip); 1486 } 1487 1488 /* 1489 * fme_receive_external_report -- call when an external ereport comes in 1490 * 1491 * this routine just converts the relevant information from the ereport 1492 * into a format used internally and passes it on to fme_receive_report(). 1493 */ 1494 void 1495 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 1496 const char *class) 1497 { 1498 struct node *epnamenp; 1499 fmd_case_t *fmcase; 1500 const struct ipath *ipp; 1501 nvlist_t *detector = NULL; 1502 1503 class = stable(class); 1504 1505 /* Get the component path from the ereport */ 1506 epnamenp = platform_getpath(nvl); 1507 1508 /* See if we ended up without a path. */ 1509 if (epnamenp == NULL) { 1510 /* See if class permits silent discard on unknown component. */ 1511 if (lut_lookup(Ereportenames_discard, (void *)class, NULL)) { 1512 out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport " 1513 "to component path, but silent discard allowed.", 1514 class); 1515 } else { 1516 /* 1517 * XFILE: Failure to find a component is bad unless 1518 * 'discard_if_config_unknown=1' was specified in the 1519 * ereport definition. Indicate undiagnosable. 
1520 */ 1521 Undiag_reason = UD_VAL_NOPATH; 1522 fmcase = fmd_case_open(hdl, NULL); 1523 1524 /* 1525 * We don't have a component path here (which means that 1526 * the detector was not in hc-scheme and couldn't be 1527 * converted to hc-scheme. Report the raw detector as 1528 * the suspect resource if there is one. 1529 */ 1530 (void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, 1531 &detector); 1532 publish_undiagnosable(hdl, ffep, fmcase, detector, 1533 (char *)class); 1534 } 1535 return; 1536 } 1537 1538 ipp = ipath(epnamenp); 1539 tree_free(epnamenp); 1540 fme_receive_report(hdl, ffep, class, ipp, nvl); 1541 } 1542 1543 /*ARGSUSED*/ 1544 void 1545 fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 1546 const char *eventstring) 1547 { 1548 char *uuid; 1549 nvlist_t **nva; 1550 uint_t nvc; 1551 const struct ipath *ipp; 1552 1553 if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 || 1554 nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 1555 &nva, &nvc) != 0) { 1556 out(O_ALTFP, "No uuid or fault list for list.repaired event"); 1557 return; 1558 } 1559 1560 out(O_ALTFP, "Processing list.repaired from case %s", uuid); 1561 1562 while (nvc-- != 0) { 1563 /* 1564 * Reset any istat or serd engine associated with this path. 
1565 */ 1566 char *path; 1567 1568 if ((ipp = platform_fault2ipath(*nva++)) == NULL) 1569 continue; 1570 1571 path = ipath2str(NULL, ipp); 1572 out(O_ALTFP, "fme_receive_repair_list: resetting state for %s", 1573 path); 1574 FREE(path); 1575 1576 lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp); 1577 istat_save(); 1578 1579 lut_walk(SerdEngines, (lut_cb)serd_reset_cb, (void *)ipp); 1580 serd_save(); 1581 } 1582 } 1583 1584 /*ARGSUSED*/ 1585 void 1586 fme_receive_topology_change(void) 1587 { 1588 lut_walk(Istats, (lut_cb)istat_counter_topo_chg_cb, NULL); 1589 istat_save(); 1590 1591 lut_walk(SerdEngines, (lut_cb)serd_topo_chg_cb, NULL); 1592 serd_save(); 1593 } 1594 1595 static int mark_arrows(struct fme *fmep, struct event *ep, int mark, 1596 unsigned long long at_latest_by, unsigned long long *pdelay, int keep); 1597 1598 /* ARGSUSED */ 1599 static void 1600 clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 1601 { 1602 struct bubble *bp; 1603 struct arrowlist *ap; 1604 1605 ep->cached_state = 0; 1606 ep->keep_in_tree = 0; 1607 for (bp = itree_next_bubble(ep, NULL); bp; 1608 bp = itree_next_bubble(ep, bp)) { 1609 if (bp->t != B_FROM) 1610 continue; 1611 bp->mark = 0; 1612 for (ap = itree_next_arrow(bp, NULL); ap; 1613 ap = itree_next_arrow(bp, ap)) 1614 ap->arrowp->mark = 0; 1615 } 1616 } 1617 1618 static void 1619 fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 1620 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl) 1621 { 1622 struct event *ep; 1623 struct fme *fmep = NULL; 1624 struct fme *ofmep = NULL; 1625 struct fme *cfmep, *svfmep; 1626 int matched = 0; 1627 nvlist_t *defect; 1628 fmd_case_t *fmcase; 1629 char *reason; 1630 1631 out(O_ALTFP|O_NONL, "fme_receive_report: "); 1632 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1633 out(O_ALTFP|O_STAMP, NULL); 1634 1635 /* decide which FME it goes to */ 1636 for (fmep = FMElist; fmep; fmep = fmep->next) { 1637 int prev_verbose; 1638 unsigned long long my_delay = 
TIMEVAL_EVENTUALLY; 1639 enum fme_state state; 1640 nvlist_t *pre_peek_nvp = NULL; 1641 1642 if (fmep->overflow) { 1643 if (!(fmd_case_closed(fmep->hdl, fmep->fmcase))) 1644 ofmep = fmep; 1645 1646 continue; 1647 } 1648 1649 /* 1650 * ignore solved or closed cases 1651 */ 1652 if (fmep->posted_suspects || 1653 fmd_case_solved(fmep->hdl, fmep->fmcase) || 1654 fmd_case_closed(fmep->hdl, fmep->fmcase)) 1655 continue; 1656 1657 /* look up event in event tree for this FME */ 1658 if ((ep = itree_lookup(fmep->eventtree, 1659 eventstring, ipp)) == NULL) 1660 continue; 1661 1662 /* note observation */ 1663 fmep->ecurrent = ep; 1664 if (ep->count++ == 0) { 1665 /* link it into list of observations seen */ 1666 ep->observations = fmep->observations; 1667 fmep->observations = ep; 1668 ep->nvp = evnv_dupnvl(nvl); 1669 } else { 1670 /* use new payload values for peek */ 1671 pre_peek_nvp = ep->nvp; 1672 ep->nvp = evnv_dupnvl(nvl); 1673 } 1674 1675 /* tell hypothesise() not to mess with suspect list */ 1676 fmep->peek = 1; 1677 1678 /* don't want this to be verbose (unless Debug is set) */ 1679 prev_verbose = Verbose; 1680 if (Debug == 0) 1681 Verbose = 0; 1682 1683 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 1684 state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 1685 1686 fmep->peek = 0; 1687 1688 /* put verbose flag back */ 1689 Verbose = prev_verbose; 1690 1691 if (state != FME_DISPROVED) { 1692 /* found an FME that explains the ereport */ 1693 matched++; 1694 out(O_ALTFP|O_NONL, "["); 1695 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1696 out(O_ALTFP, " explained by FME%d]", fmep->id); 1697 1698 if (pre_peek_nvp) 1699 nvlist_free(pre_peek_nvp); 1700 1701 if (ep->count == 1) 1702 serialize_observation(fmep, eventstring, ipp); 1703 1704 if (ffep) { 1705 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1706 ep->ffep = ffep; 1707 } 1708 1709 stats_counter_bump(fmep->Rcount); 1710 1711 /* re-eval FME */ 1712 fme_eval(fmep, ffep); 1713 } else { 1714 
1715 /* not a match, undo noting of observation */ 1716 fmep->ecurrent = NULL; 1717 if (--ep->count == 0) { 1718 /* unlink it from observations */ 1719 fmep->observations = ep->observations; 1720 ep->observations = NULL; 1721 nvlist_free(ep->nvp); 1722 ep->nvp = NULL; 1723 } else { 1724 nvlist_free(ep->nvp); 1725 ep->nvp = pre_peek_nvp; 1726 } 1727 } 1728 } 1729 1730 if (matched) 1731 return; /* explained by at least one existing FME */ 1732 1733 /* clean up closed fmes */ 1734 cfmep = ClosedFMEs; 1735 while (cfmep != NULL) { 1736 svfmep = cfmep->next; 1737 destroy_fme(cfmep); 1738 cfmep = svfmep; 1739 } 1740 ClosedFMEs = NULL; 1741 1742 if (ofmep) { 1743 out(O_ALTFP|O_NONL, "["); 1744 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1745 out(O_ALTFP, " ADDING TO OVERFLOW FME]"); 1746 if (ffep) 1747 fmd_case_add_ereport(hdl, ofmep->fmcase, ffep); 1748 1749 return; 1750 1751 } else if (Max_fme && (Open_fme_count >= Max_fme)) { 1752 out(O_ALTFP|O_NONL, "["); 1753 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1754 out(O_ALTFP, " MAX OPEN FME REACHED]"); 1755 1756 fmcase = fmd_case_open(hdl, NULL); 1757 1758 /* Create overflow fme */ 1759 if ((fmep = newfme(eventstring, ipp, hdl, fmcase, ffep, 1760 nvl)) == NULL) { 1761 out(O_ALTFP|O_NONL, "["); 1762 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1763 out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]"); 1764 return; 1765 } 1766 1767 Open_fme_count++; 1768 1769 init_fme_bufs(fmep); 1770 fmep->overflow = B_TRUE; 1771 1772 if (ffep) 1773 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1774 1775 Undiag_reason = UD_VAL_MAXFME; 1776 defect = fmd_nvl_create_fault(hdl, 1777 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 1778 reason = undiag_2reason_str(Undiag_reason, NULL); 1779 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 1780 FREE(reason); 1781 fmd_case_add_suspect(hdl, fmep->fmcase, defect); 1782 fmd_case_solve(hdl, fmep->fmcase); 1783 Undiag_reason = UD_VAL_UNKNOWN; 1784 return; 1785 } 1786 1787 /* open a case 
*/ 1788 fmcase = fmd_case_open(hdl, NULL); 1789 1790 /* start a new FME */ 1791 if ((fmep = newfme(eventstring, ipp, hdl, fmcase, ffep, nvl)) == NULL) { 1792 out(O_ALTFP|O_NONL, "["); 1793 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1794 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1795 return; 1796 } 1797 1798 Open_fme_count++; 1799 1800 init_fme_bufs(fmep); 1801 1802 out(O_ALTFP|O_NONL, "["); 1803 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1804 out(O_ALTFP, " created FME%d, case %s]", fmep->id, 1805 fmd_case_uuid(hdl, fmep->fmcase)); 1806 1807 ep = fmep->e0; 1808 ASSERT(ep != NULL); 1809 1810 /* note observation */ 1811 fmep->ecurrent = ep; 1812 if (ep->count++ == 0) { 1813 /* link it into list of observations seen */ 1814 ep->observations = fmep->observations; 1815 fmep->observations = ep; 1816 ep->nvp = evnv_dupnvl(nvl); 1817 serialize_observation(fmep, eventstring, ipp); 1818 } else { 1819 /* new payload overrides any previous */ 1820 nvlist_free(ep->nvp); 1821 ep->nvp = evnv_dupnvl(nvl); 1822 } 1823 1824 stats_counter_bump(fmep->Rcount); 1825 1826 if (ffep) { 1827 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1828 fmd_case_setprincipal(hdl, fmep->fmcase, ffep); 1829 fmep->e0r = ffep; 1830 ep->ffep = ffep; 1831 } 1832 1833 /* give the diagnosis algorithm a shot at the new FME state */ 1834 fme_eval(fmep, ffep); 1835 } 1836 1837 void 1838 fme_status(int flags) 1839 { 1840 struct fme *fmep; 1841 1842 if (FMElist == NULL) { 1843 out(flags, "No fault management exercises underway."); 1844 return; 1845 } 1846 1847 for (fmep = FMElist; fmep; fmep = fmep->next) 1848 fme_print(flags, fmep); 1849 } 1850 1851 /* 1852 * "indent" routines used mostly for nicely formatted debug output, but also 1853 * for sanity checking for infinite recursion bugs. 
1854 */ 1855 1856 #define MAX_INDENT 1024 1857 static const char *indent_s[MAX_INDENT]; 1858 static int current_indent; 1859 1860 static void 1861 indent_push(const char *s) 1862 { 1863 if (current_indent < MAX_INDENT) 1864 indent_s[current_indent++] = s; 1865 else 1866 out(O_DIE, "unexpected recursion depth (%d)", current_indent); 1867 } 1868 1869 static void 1870 indent_set(const char *s) 1871 { 1872 current_indent = 0; 1873 indent_push(s); 1874 } 1875 1876 static void 1877 indent_pop(void) 1878 { 1879 if (current_indent > 0) 1880 current_indent--; 1881 else 1882 out(O_DIE, "recursion underflow"); 1883 } 1884 1885 static void 1886 indent(void) 1887 { 1888 int i; 1889 if (!Verbose) 1890 return; 1891 for (i = 0; i < current_indent; i++) 1892 out(O_ALTFP|O_VERB|O_NONL, indent_s[i]); 1893 } 1894 1895 #define SLNEW 1 1896 #define SLCHANGED 2 1897 #define SLWAIT 3 1898 #define SLDISPROVED 4 1899 1900 static void 1901 print_suspects(int circumstance, struct fme *fmep) 1902 { 1903 struct event *ep; 1904 1905 out(O_ALTFP|O_NONL, "["); 1906 if (circumstance == SLCHANGED) { 1907 out(O_ALTFP|O_NONL, "FME%d diagnosis changed. 
state: %s, " 1908 "suspect list:", fmep->id, fme_state2str(fmep->state)); 1909 } else if (circumstance == SLWAIT) { 1910 out(O_ALTFP|O_NONL, "FME%d set wait timer %ld ", fmep->id, 1911 fmep->timer); 1912 ptree_timeval(O_ALTFP|O_NONL, &fmep->wull); 1913 } else if (circumstance == SLDISPROVED) { 1914 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id); 1915 } else { 1916 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id); 1917 } 1918 1919 if (circumstance == SLWAIT || circumstance == SLDISPROVED) { 1920 out(O_ALTFP, "]"); 1921 return; 1922 } 1923 1924 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1925 out(O_ALTFP|O_NONL, " "); 1926 itree_pevent_brief(O_ALTFP|O_NONL, ep); 1927 } 1928 out(O_ALTFP, "]"); 1929 } 1930 1931 static struct node * 1932 eventprop_lookup(struct event *ep, const char *propname) 1933 { 1934 return (lut_lookup(ep->props, (void *)propname, NULL)); 1935 } 1936 1937 #define MAXDIGITIDX 23 1938 static char numbuf[MAXDIGITIDX + 1]; 1939 1940 static int 1941 node2uint(struct node *n, uint_t *valp) 1942 { 1943 struct evalue value; 1944 struct lut *globals = NULL; 1945 1946 if (n == NULL) 1947 return (1); 1948 1949 /* 1950 * check value.v since we are being asked to convert an unsigned 1951 * long long int to an unsigned int 1952 */ 1953 if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) || 1954 value.t != UINT64 || value.v > (1ULL << 32)) 1955 return (1); 1956 1957 *valp = (uint_t)value.v; 1958 1959 return (0); 1960 } 1961 1962 static nvlist_t * 1963 node2fmri(struct node *n) 1964 { 1965 nvlist_t **pa, *f, *p; 1966 struct node *nc; 1967 uint_t depth = 0; 1968 char *numstr, *nullbyte; 1969 char *failure; 1970 int err, i; 1971 1972 /* XXX do we need to be able to handle a non-T_NAME node? 
*/ 1973 if (n == NULL || n->t != T_NAME) 1974 return (NULL); 1975 1976 for (nc = n; nc != NULL; nc = nc->u.name.next) { 1977 if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM) 1978 break; 1979 depth++; 1980 } 1981 1982 if (nc != NULL) { 1983 /* We bailed early, something went wrong */ 1984 return (NULL); 1985 } 1986 1987 if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0) 1988 out(O_DIE|O_SYS, "alloc of fmri nvl failed"); 1989 pa = alloca(depth * sizeof (nvlist_t *)); 1990 for (i = 0; i < depth; i++) 1991 pa[i] = NULL; 1992 1993 err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC); 1994 err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION); 1995 err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, ""); 1996 err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth); 1997 if (err != 0) { 1998 failure = "basic construction of FMRI failed"; 1999 goto boom; 2000 } 2001 2002 numbuf[MAXDIGITIDX] = '\0'; 2003 nullbyte = &numbuf[MAXDIGITIDX]; 2004 i = 0; 2005 2006 for (nc = n; nc != NULL; nc = nc->u.name.next) { 2007 err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl); 2008 if (err != 0) { 2009 failure = "alloc of an hc-pair failed"; 2010 goto boom; 2011 } 2012 err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s); 2013 numstr = ulltostr(nc->u.name.child->u.ull, nullbyte); 2014 err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr); 2015 if (err != 0) { 2016 failure = "construction of an hc-pair failed"; 2017 goto boom; 2018 } 2019 pa[i++] = p; 2020 } 2021 2022 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 2023 if (err == 0) { 2024 for (i = 0; i < depth; i++) 2025 if (pa[i] != NULL) 2026 nvlist_free(pa[i]); 2027 return (f); 2028 } 2029 failure = "addition of hc-pair array to FMRI failed"; 2030 2031 boom: 2032 for (i = 0; i < depth; i++) 2033 if (pa[i] != NULL) 2034 nvlist_free(pa[i]); 2035 nvlist_free(f); 2036 out(O_DIE, "%s", failure); 2037 /*NOTREACHED*/ 2038 return (NULL); 2039 } 2040 2041 /* an ipath cache entry is an array 
of these, with s==NULL at the end */ 2042 struct ipath { 2043 const char *s; /* component name (in stable) */ 2044 int i; /* instance number */ 2045 }; 2046 2047 static nvlist_t * 2048 ipath2fmri(struct ipath *ipath) 2049 { 2050 nvlist_t **pa, *f, *p; 2051 uint_t depth = 0; 2052 char *numstr, *nullbyte; 2053 char *failure; 2054 int err, i; 2055 struct ipath *ipp; 2056 2057 for (ipp = ipath; ipp->s != NULL; ipp++) 2058 depth++; 2059 2060 if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0) 2061 out(O_DIE|O_SYS, "alloc of fmri nvl failed"); 2062 pa = alloca(depth * sizeof (nvlist_t *)); 2063 for (i = 0; i < depth; i++) 2064 pa[i] = NULL; 2065 2066 err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC); 2067 err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION); 2068 err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, ""); 2069 err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth); 2070 if (err != 0) { 2071 failure = "basic construction of FMRI failed"; 2072 goto boom; 2073 } 2074 2075 numbuf[MAXDIGITIDX] = '\0'; 2076 nullbyte = &numbuf[MAXDIGITIDX]; 2077 i = 0; 2078 2079 for (ipp = ipath; ipp->s != NULL; ipp++) { 2080 err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl); 2081 if (err != 0) { 2082 failure = "alloc of an hc-pair failed"; 2083 goto boom; 2084 } 2085 err = nvlist_add_string(p, FM_FMRI_HC_NAME, ipp->s); 2086 numstr = ulltostr(ipp->i, nullbyte); 2087 err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr); 2088 if (err != 0) { 2089 failure = "construction of an hc-pair failed"; 2090 goto boom; 2091 } 2092 pa[i++] = p; 2093 } 2094 2095 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 2096 if (err == 0) { 2097 for (i = 0; i < depth; i++) 2098 if (pa[i] != NULL) 2099 nvlist_free(pa[i]); 2100 return (f); 2101 } 2102 failure = "addition of hc-pair array to FMRI failed"; 2103 2104 boom: 2105 for (i = 0; i < depth; i++) 2106 if (pa[i] != NULL) 2107 nvlist_free(pa[i]); 2108 nvlist_free(f); 2109 out(O_DIE, "%s", failure); 2110 
/*NOTREACHED*/ 2111 return (NULL); 2112 } 2113 2114 static uint8_t 2115 percentof(uint_t part, uint_t whole) 2116 { 2117 unsigned long long p = part * 1000; 2118 2119 return ((p / whole / 10) + (((p / whole % 10) >= 5) ? 1 : 0)); 2120 } 2121 2122 struct rsl { 2123 struct event *suspect; 2124 nvlist_t *asru; 2125 nvlist_t *fru; 2126 nvlist_t *rsrc; 2127 }; 2128 2129 static void publish_suspects(struct fme *fmep, struct rsl *srl); 2130 2131 /* 2132 * rslfree -- free internal members of struct rsl not expected to be 2133 * freed elsewhere. 2134 */ 2135 static void 2136 rslfree(struct rsl *freeme) 2137 { 2138 if (freeme->asru != NULL) 2139 nvlist_free(freeme->asru); 2140 if (freeme->fru != NULL) 2141 nvlist_free(freeme->fru); 2142 if (freeme->rsrc != NULL && freeme->rsrc != freeme->asru) 2143 nvlist_free(freeme->rsrc); 2144 } 2145 2146 /* 2147 * rslcmp -- compare two rsl structures. Use the following 2148 * comparisons to establish cardinality: 2149 * 2150 * 1. Name of the suspect's class. (simple strcmp) 2151 * 2. Name of the suspect's ASRU. (trickier, since nvlist) 2152 * 2153 */ 2154 static int 2155 rslcmp(const void *a, const void *b) 2156 { 2157 struct rsl *r1 = (struct rsl *)a; 2158 struct rsl *r2 = (struct rsl *)b; 2159 int rv; 2160 2161 rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s, 2162 r2->suspect->enode->u.event.ename->u.name.s); 2163 if (rv != 0) 2164 return (rv); 2165 2166 if (r1->rsrc == NULL && r2->rsrc == NULL) 2167 return (0); 2168 if (r1->rsrc == NULL) 2169 return (-1); 2170 if (r2->rsrc == NULL) 2171 return (1); 2172 return (evnv_cmpnvl(r1->rsrc, r2->rsrc, 0)); 2173 } 2174 2175 /* 2176 * get_resources -- for a given suspect, determine what ASRU, FRU and 2177 * RSRC nvlists should be advertised in the final suspect list. 
 */
void
get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot)
{
	struct node *asrudef, *frudef;
	nvlist_t *asru, *fru;
	nvlist_t *rsrc = NULL;
	char *pathstr;

	/*
	 * First find any ASRU and/or FRU defined in the
	 * initial fault tree.
	 */
	asrudef = eventprop_lookup(sp, L_ASRU);
	frudef = eventprop_lookup(sp, L_FRU);

	/*
	 * Create FMRIs based on those definitions
	 */
	asru = node2fmri(asrudef);
	fru = node2fmri(frudef);
	pathstr = ipath2str(NULL, sp->ipp);

	/*
	 * Allow for platform translations of the FMRIs
	 */
	platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc,
	    pathstr);

	/* ownership of the (possibly translated) nvlists passes to rsrcs */
	FREE(pathstr);
	rsrcs->suspect = sp;
	rsrcs->asru = asru;
	rsrcs->fru = fru;
	rsrcs->rsrc = rsrc;
}

/*
 * trim_suspects -- prior to publishing, we may need to remove some
 * suspects from the list.  If we're auto-closing upsets, we don't
 * want any of those in the published list.  If the ASRUs for multiple
 * defects resolve to the same ASRU (driver) we only want to publish
 * that as a single suspect.
 *
 * Splits the fme's psuspects into two caller-supplied arrays: "begin"
 * receives ordinary suspects (and bumps fmep->nsuspects), "begin2"
 * receives message=0 suspects.  Returns the count of message=0
 * suspects placed in begin2.  Upsets and suspects whose SERD engine
 * has not fired are dropped entirely.
 */
static int
trim_suspects(struct fme *fmep, struct rsl *begin, struct rsl *begin2,
    fmd_event_t *ffep)
{
	struct event *ep;
	struct rsl *rp = begin;
	struct rsl *rp2 = begin2;
	int mess_zero_count = 0;
	int serd_rval;
	uint_t messval;

	/* remove any unwanted upsets and populate our array */
	for (ep = fmep->psuspects; ep; ep = ep->psuspects) {
		if (is_upset(ep->t))
			continue;
		serd_rval = serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, ep,
		    NULL, NULL);
		if (serd_rval == 0)
			continue;
		/* message=0 suspects are kept on the second list */
		if (node2uint(eventprop_lookup(ep, L_message),
		    &messval) == 0 && messval == 0) {
			get_resources(ep, rp2, fmep->config);
			rp2++;
			mess_zero_count++;
		} else {
			get_resources(ep, rp, fmep->config);
			rp++;
			fmep->nsuspects++;
		}
	}
	return (mess_zero_count);
}

/*
 * addpayloadprop -- add a payload prop to a problem
 *
 * The property is stored under the hc-specific member of the fault's
 * resource FMRI, creating that member if it does not yet exist.
 */
static void
addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault)
{
	nvlist_t *rsrc, *hcs;

	ASSERT(fault != NULL);
	ASSERT(lhs != NULL);
	ASSERT(rhs != NULL);

	if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, &rsrc) != 0)
		out(O_DIE, "cannot add payloadprop \"%s\" to fault", lhs);

	if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) {
		out(O_ALTFP|O_VERB2, "addpayloadprop: create hc_specific");
		if (nvlist_xalloc(&hcs, NV_UNIQUE_NAME, &Eft_nv_hdl) != 0)
			out(O_DIE,
			    "cannot add payloadprop \"%s\" to fault", lhs);
		/* nvlist_add_nvlist copies hcs, so free our copy and re-look */
		if (nvlist_add_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, hcs) != 0)
			out(O_DIE,
			    "cannot add payloadprop \"%s\" to fault", lhs);
		nvlist_free(hcs);
		if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0)
			out(O_DIE,
			    "cannot add payloadprop \"%s\" to fault", lhs);
	} else
		out(O_ALTFP|O_VERB2, "addpayloadprop: reuse hc_specific");

	if (rhs->t == UINT64) {
		out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v);

		if (nvlist_add_uint64(hcs, lhs, rhs->v) != 0)
			out(O_DIE,
			    "cannot add payloadprop \"%s\" to fault", lhs);
	} else {
		/* non-UINT64 values carry a string pointer in rhs->v */
		out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"",
		    lhs, (char *)(uintptr_t)rhs->v);

		if (nvlist_add_string(hcs, lhs, (char *)(uintptr_t)rhs->v) != 0)
			out(O_DIE,
			    "cannot add payloadprop \"%s\" to fault", lhs);
	}
}

/* scratch state shared by the istat serialization walk callbacks below */
static char *Istatbuf;
static char *Istatbufptr;
static int Istatsz;

/*
 * istataddsize -- calculate size of istat and add it to Istatsz
 */
/*ARGSUSED2*/
static void
istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg)
{
	int val;

	ASSERT(lhs != NULL);
	ASSERT(rhs != NULL);

	if ((val = stats_counter_value(rhs)) == 0)
		return;	/* skip zero-valued stats */

	/* count up the size of the stat name */
	Istatsz += ipath2strlen(lhs->ename, lhs->ipath);
	Istatsz++;	/* for the trailing NULL byte */

	/* count up the size of the stat value */
	Istatsz += snprintf(NULL, 0, "%d", val);
	Istatsz++;	/* for the trailing NULL byte */
}

/*
 * istat2str -- serialize an istat, writing result to *Istatbufptr
 *
 * Writes "<name>\0<value>\0" and advances Istatbufptr; must skip
 * exactly the same entries istataddsize() skipped for the sizes
 * to agree.
 */
/*ARGSUSED2*/
static void
istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg)
{
	char *str;
	int len;
	int val;

	ASSERT(lhs != NULL);
	ASSERT(rhs != NULL);

	if ((val = stats_counter_value(rhs)) == 0)
		return;	/* skip zero-valued stats */

	/* serialize the stat name */
	str = ipath2str(lhs->ename, lhs->ipath);
	len = strlen(str);

	ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]);
	(void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr);
	Istatbufptr += len;
	FREE(str);
	*Istatbufptr++ = '\0';

	/* serialize the stat value */
	Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr,
	    "%d", val);
	*Istatbufptr++ = '\0';

	ASSERT(Istatbufptr <= &Istatbuf[Istatsz]);
}

/*
 * istat_save -- serialize all non-zero istats into the WOBUF_ISTATS
 * fmd buffer, replacing any previously saved copy.  No-op unless an
 * istat changed since the last save.
 */
void
istat_save()
{
	if (Istat_need_save == 0)
		return;

	/* figure out how big the serialized info is */
	Istatsz = 0;
	lut_walk(Istats, (lut_cb)istataddsize, NULL);

	if (Istatsz == 0) {
		/* no stats to save */
		fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS);
		return;
	}

	/* create the serialized buffer */
	Istatbufptr = Istatbuf = MALLOC(Istatsz);
	lut_walk(Istats, (lut_cb)istat2str, NULL);

	/* clear out current saved stats */
	fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS);

	/* write out the new version */
	fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz);
	FREE(Istatbuf);

	Istat_need_save = 0;
}

/*
 * istat_cmp -- lut comparator for istat entries; ename and ipath are
 * interned, so pointer identity implies equality.
 *
 * NOTE(review): the pointer subtractions below are truncated to int
 * and compare unrelated pointers; this only provides a stable (not
 * meaningful) ordering and could in principle truncate on LP64 --
 * confirm that only ==/!= 0 semantics are relied upon.
 */
int
istat_cmp(struct istat_entry *ent1, struct istat_entry *ent2)
{
	if (ent1->ename != ent2->ename)
		return (ent2->ename - ent1->ename);
	if (ent1->ipath != ent2->ipath)
		return ((char *)ent2->ipath - (char *)ent1->ipath);

	return (0);
}

/*
 * istat-verify -- verify the component associated with a stat still exists
 *
 * if the component no longer exists, this routine resets the stat and
 * returns 0.  if the component still exists, it returns 1.
 */
static int
istat_verify(struct node *snp, struct istat_entry *entp)
{
	struct stats *statp;
	nvlist_t *fmri;

	fmri = node2fmri(snp->u.event.epname);
	if (platform_path_exists(fmri)) {
		nvlist_free(fmri);
		return (1);
	}
	nvlist_free(fmri);

	/* component no longer in system.  zero out the associated stats */
	if ((statp = (struct stats *)
	    lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL ||
	    stats_counter_value(statp) == 0)
		return (0);	/* stat is already reset */

	Istat_need_save = 1;
	stats_counter_reset(statp);
	return (0);
}

/*
 * istat_bump -- increment (or, if n is non-zero, set to n) the istat
 * counter named by event node snp, creating the counter on first use.
 * The counter name is "ename@component/path".
 */
static void
istat_bump(struct node *snp, int n)
{
	struct stats *statp;
	struct istat_entry ent;

	ASSERT(snp != NULL);
	ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t));
	ASSERT(snp->u.event.epname != NULL);

	/* class name should be hoisted into a single stable entry */
	ASSERT(snp->u.event.ename->u.name.next == NULL);
	ent.ename = snp->u.event.ename->u.name.s;
	ent.ipath = ipath(snp->u.event.epname);

	if (!istat_verify(snp, &ent)) {
		/* component no longer exists in system, nothing to do */
		return;
	}

	if ((statp = (struct stats *)
	    lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) {
		/* need to create the counter */
		int cnt = 0;
		struct node *np;
		char *sname;
		char *snamep;
		struct istat_entry *newentp;

		/* count up the size of the stat name */
		np = snp->u.event.ename;
		while (np != NULL) {
			cnt += strlen(np->u.name.s);
			cnt++;	/* for the '.' or '@' */
			np = np->u.name.next;
		}
		np = snp->u.event.epname;
		while (np != NULL) {
			cnt += snprintf(NULL, 0, "%s%llu",
			    np->u.name.s, np->u.name.child->u.ull);
			cnt++;	/* for the '/' or trailing NULL byte */
			np = np->u.name.next;
		}

		/* build the stat name */
		snamep = sname = alloca(cnt);
		np = snp->u.event.ename;
		while (np != NULL) {
			snamep += snprintf(snamep, &sname[cnt] - snamep,
			    "%s", np->u.name.s);
			np = np->u.name.next;
			if (np)
				*snamep++ = '.';
		}
		*snamep++ = '@';
		np = snp->u.event.epname;
		while (np != NULL) {
			snamep += snprintf(snamep, &sname[cnt] - snamep,
			    "%s%llu", np->u.name.s, np->u.name.child->u.ull);
			np = np->u.name.next;
			if (np)
				*snamep++ = '/';
		}
		*snamep++ = '\0';

		/* create the new stat & add it to our list */
		newentp = MALLOC(sizeof (*newentp));
		*newentp = ent;
		statp = stats_new_counter(NULL, sname, 0);
		Istats = lut_add(Istats, (void *)newentp, (void *)statp,
		    (lut_cmp)istat_cmp);
	}

	/* if n is non-zero, set that value instead of bumping */
	if (n) {
		stats_counter_reset(statp);
		stats_counter_add(statp, n);
	} else
		stats_counter_bump(statp);
	Istat_need_save = 1;

	ipath_print(O_ALTFP|O_VERB2, ent.ename, ent.ipath);
	out(O_ALTFP|O_VERB2, " %s to value %d", n ? "set" : "incremented",
	    stats_counter_value(statp));
}

/*
 * istat_destructor -- lut_free callback; frees one istat entry and
 * deletes its counter.
 */
/*ARGSUSED*/
static void
istat_destructor(void *left, void *right, void *arg)
{
	struct istat_entry *entp = (struct istat_entry *)left;
	struct stats *statp = (struct stats *)right;
	FREE(entp);
	stats_delete(statp);
}

/*
 * Callback used in a walk of the Istats to reset matching stat counters.
 */
static void
istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp,
    const struct ipath *ipp)
{
	char *path;

	/* interned ipaths compare by pointer identity */
	if (entp->ipath == ipp) {
		path = ipath2str(entp->ename, ipp);
		out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path);
		FREE(path);
		stats_counter_reset(statp);
		Istat_need_save = 1;
	}
}

/*
 * istat_counter_topo_chg_cb -- walk callback run on topology change;
 * resets any istat whose component path is no longer present.
 */
/*ARGSUSED*/
static void
istat_counter_topo_chg_cb(struct istat_entry *entp, struct stats *statp,
    void *unused)
{
	char *path;
	nvlist_t *fmri;

	fmri = ipath2fmri((struct ipath *)(entp->ipath));
	if (!platform_path_exists(fmri)) {
		path = ipath2str(entp->ename, entp->ipath);
		out(O_ALTFP, "istat_counter_topo_chg_cb: not present %s", path);
		FREE(path);
		stats_counter_reset(statp);
		Istat_need_save = 1;
	}
	nvlist_free(fmri);
}

void
istat_fini(void)
{
	lut_free(Istats, istat_destructor, NULL);
}

/* scratch state shared by the serd serialization walk callbacks below */
static char *Serdbuf;
static char *Serdbufptr;
static int Serdsz;

/*
 * serdaddsize -- calculate size of serd and add it to Serdsz
 */
/*ARGSUSED*/
static void
serdaddsize(const struct serd_entry *lhs, struct stats *rhs, void *arg)
{
	ASSERT(lhs != NULL);

	/* count up the size of the stat name */
	Serdsz += ipath2strlen(lhs->ename, lhs->ipath);
	Serdsz++;	/* for the trailing NULL byte */
}

/*
 * serd2str -- serialize a serd engine, writing result to *Serdbufptr
 *
 * Writes "<ename>@<path>\0" and advances Serdbufptr.
 */
/*ARGSUSED*/
static void
serd2str(const struct serd_entry *lhs, struct stats *rhs, void *arg)
{
	char *str;
	int len;

	ASSERT(lhs != NULL);

	/* serialize the serd engine name */
	str = ipath2str(lhs->ename, lhs->ipath);
	len = strlen(str);

	ASSERT(Serdbufptr + len + 1 <= &Serdbuf[Serdsz]);
	(void) strlcpy(Serdbufptr, str, &Serdbuf[Serdsz] - Serdbufptr);
	Serdbufptr += len;
	FREE(str);
	*Serdbufptr++ = '\0';
	ASSERT(Serdbufptr <= &Serdbuf[Serdsz]);
}

/*
 * serd_save -- serialize all known serd engines into the WOBUF_SERDS
 * fmd buffer, replacing any previously saved copy.  No-op unless the
 * engine list changed since the last save.
 */
void
serd_save()
{
	if (Serd_need_save == 0)
		return;

	/* figure out how big the serialized info is */
	Serdsz = 0;
	lut_walk(SerdEngines, (lut_cb)serdaddsize, NULL);

	if (Serdsz == 0) {
		/* no serd engines to save */
		fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS);
		return;
	}

	/* create the serialized buffer */
	Serdbufptr = Serdbuf = MALLOC(Serdsz);
	lut_walk(SerdEngines, (lut_cb)serd2str, NULL);

	/* clear out current saved stats */
	fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS);

	/* write out the new version */
	fmd_buf_write(Hdl, NULL, WOBUF_SERDS, Serdbuf, Serdsz);
	FREE(Serdbuf);
	Serd_need_save = 0;
}

/*
 * serd_cmp -- lut comparator for serd entries; ename and ipath are
 * interned, so pointer identity implies equality (see istat_cmp note
 * about the pointer-difference return values).
 */
int
serd_cmp(struct serd_entry *ent1, struct serd_entry *ent2)
{
	if (ent1->ename != ent2->ename)
		return (ent2->ename - ent1->ename);
	if (ent1->ipath != ent2->ipath)
		return ((char *)ent2->ipath - (char *)ent1->ipath);

	return (0);
}

/*
 * fme_serd_load -- rebuild the SerdEngines lut from the WOBUF_SERDS
 * buffer saved by serd_save().  Entries whose component path no longer
 * exists are dropped, and the buffer is re-saved if any were dropped.
 *
 * NOTE(review): strchr(ptr, '@') is not checked for NULL; this relies
 * on the buffer having been written by serd2str and every record
 * containing '@' -- confirm the buffer cannot be truncated/corrupt.
 */
void
fme_serd_load(fmd_hdl_t *hdl)
{
	int sz;
	char *sbuf;
	char *sepptr;
	char *ptr;
	struct serd_entry *newentp;
	struct node *epname;
	nvlist_t *fmri;
	char *namestring;

	if ((sz = fmd_buf_size(hdl, NULL, WOBUF_SERDS)) == 0)
		return;
	sbuf = alloca(sz);
	fmd_buf_read(hdl, NULL, WOBUF_SERDS, sbuf, sz);
	ptr = sbuf;
	while (ptr < &sbuf[sz]) {
		/* each record is "<ename>@<path>\0"; split at the '@' */
		sepptr = strchr(ptr, '@');
		*sepptr = '\0';
		namestring = ptr;
		sepptr++;
		ptr = sepptr;
		ptr += strlen(ptr);
		ptr++;	/* move past the '\0' separating paths */
		epname = pathstring2epnamenp(sepptr);
		fmri = node2fmri(epname);
		if (platform_path_exists(fmri)) {
			newentp = MALLOC(sizeof (*newentp));
			newentp->hdl = hdl;
			newentp->ipath = ipath(epname);
			newentp->ename = stable(namestring);
			SerdEngines = lut_add(SerdEngines, (void *)newentp,
			    (void *)newentp, (lut_cmp)serd_cmp);
		} else
			Serd_need_save = 1;
		tree_free(epname);
		nvlist_free(fmri);
	}
	/* save it back again in case some of the paths no longer exist */
	serd_save();
}

/*ARGSUSED*/
static void
serd_destructor(void *left, void *right, void *arg)
{
	struct serd_entry *entp = (struct serd_entry *)left;
	FREE(entp);
}

/*
 * Callback used in a walk of the SerdEngines to reset matching serd engines.
 */
/*ARGSUSED*/
static void
serd_reset_cb(struct serd_entry *entp, void *unused, const struct ipath *ipp)
{
	char *path;

	if (entp->ipath == ipp) {
		path = ipath2str(entp->ename, ipp);
		out(O_ALTFP, "serd_reset_cb: resetting %s", path);
		fmd_serd_reset(entp->hdl, path);
		FREE(path);
		Serd_need_save = 1;
	}
}

/*
 * serd_topo_chg_cb -- walk callback run on topology change; resets any
 * serd engine whose component path is no longer present.
 */
/*ARGSUSED*/
static void
serd_topo_chg_cb(struct serd_entry *entp, void *unused, void *unused2)
{
	char *path;
	nvlist_t *fmri;

	fmri = ipath2fmri((struct ipath *)(entp->ipath));
	if (!platform_path_exists(fmri)) {
		path = ipath2str(entp->ename, entp->ipath);
		out(O_ALTFP, "serd_topo_chg_cb: not present %s", path);
		fmd_serd_reset(entp->hdl, path);
		FREE(path);
		Serd_need_save = 1;
	}
	nvlist_free(fmri);
}

void
serd_fini(void)
{
	lut_free(SerdEngines, serd_destructor, NULL);
}

/*
 * publish_suspects -- publish the fmep->nsuspects entries of srl as
 * the suspect list for fmep's case, computing each suspect's certainty
 * from its share of the summed FITrates.
 */
static void
publish_suspects(struct fme *fmep, struct rsl *srl)
{
	struct rsl *rp;
	nvlist_t *fault;
	uint8_t cert;
	uint_t *frs;
	uint_t frsum, fr;
	uint_t messval;
	uint_t retireval;
	uint_t responseval;
	struct node *snp;
	int frcnt, fridx;
	boolean_t allfaulty = B_TRUE;
	struct rsl *erl = srl + fmep->nsuspects - 1;

	/*
	 * sort the array
	 */
	qsort(srl, fmep->nsuspects, sizeof (struct rsl), rslcmp);

	/* sum the fitrates */
	frs = alloca(fmep->nsuspects * sizeof (uint_t));
	fridx = frcnt = frsum = 0;

	for (rp = srl; rp <= erl; rp++) {
		struct node *n;

		/* a missing or zero FITrate is treated as 1 */
		n = eventprop_lookup(rp->suspect, L_FITrate);
		if (node2uint(n, &fr) != 0) {
			out(O_DEBUG|O_NONL, "event ");
			ipath_print(O_DEBUG|O_NONL,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    rp->suspect->ipp);
			out(O_DEBUG, " has no FITrate (using 1)");
			fr = 1;
		} else if (fr == 0) {
			out(O_DEBUG|O_NONL, "event ");
			ipath_print(O_DEBUG|O_NONL,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    rp->suspect->ipp);
			out(O_DEBUG, " has zero FITrate (using 1)");
			fr = 1;
		}

		frs[fridx++] = fr;
		frsum += fr;
		frcnt++;
	}

	/* Add them in reverse order of our sort, as fmd reverses order */
	/*
	 * NOTE(review): when this loop terminates, rp has been decremented
	 * to one before srl (the array start) -- strictly UB per the C
	 * standard, though the value is never dereferenced; confirm this
	 * matches project convention.
	 */
	for (rp = erl; rp >= srl; rp--) {
		cert = percentof(frs[--fridx], frsum);
		fault = fmd_nvl_create_fault(fmep->hdl,
		    rp->suspect->enode->u.event.ename->u.name.s,
		    cert,
		    rp->asru,
		    rp->fru,
		    rp->rsrc);
		if (fault == NULL)
			out(O_DIE, "fault creation failed");
		/* if "message" property exists, add it to the fault */
		if (node2uint(eventprop_lookup(rp->suspect, L_message),
		    &messval) == 0) {

			out(O_ALTFP,
			    "[FME%d, %s adds message=%d to suspect list]",
			    fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    messval);
			if (nvlist_add_boolean_value(fault,
			    FM_SUSPECT_MESSAGE,
			    (messval) ? B_TRUE : B_FALSE) != 0) {
				out(O_DIE, "cannot add no-message to fault");
			}
		}

		/* if "retire" property exists, add it to the fault */
		if (node2uint(eventprop_lookup(rp->suspect, L_retire),
		    &retireval) == 0) {

			out(O_ALTFP,
			    "[FME%d, %s adds retire=%d to suspect list]",
			    fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    retireval);
			if (nvlist_add_boolean_value(fault,
			    FM_SUSPECT_RETIRE,
			    (retireval) ? B_TRUE : B_FALSE) != 0) {
				out(O_DIE, "cannot add no-retire to fault");
			}
		}

		/* if "response" property exists, add it to the fault */
		if (node2uint(eventprop_lookup(rp->suspect, L_response),
		    &responseval) == 0) {

			out(O_ALTFP,
			    "[FME%d, %s adds response=%d to suspect list]",
			    fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    responseval);
			if (nvlist_add_boolean_value(fault,
			    FM_SUSPECT_RESPONSE,
			    (responseval) ? B_TRUE : B_FALSE) != 0) {
				out(O_DIE, "cannot add no-response to fault");
			}
		}

		/* add any payload properties */
		lut_walk(rp->suspect->payloadprops,
		    (lut_cb)addpayloadprop, (void *)fault);
		rslfree(rp);

		/*
		 * If "action" property exists, evaluate it; this must be done
		 * before the allfaulty check below since some actions may
		 * modify the asru to be used in fmd_nvl_fmri_has_fault.  This
		 * needs to be restructured if any new actions are introduced
		 * that have effects that we do not want to be visible if
		 * we decide not to publish in the dupclose check below.
		 */
		if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) {
			struct evalue evalue;

			out(O_ALTFP|O_NONL,
			    "[FME%d, %s action ", fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s);
			ptree_name_iter(O_ALTFP|O_NONL, snp);
			out(O_ALTFP, "]");
			Action_nvl = fault;
			(void) eval_expr(snp, NULL, NULL, NULL, NULL,
			    NULL, 0, &evalue);
		}

		fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault);

		/*
		 * check if the asru is already marked as "faulty".
		 */
		if (allfaulty) {
			nvlist_t *asru;

			out(O_ALTFP|O_VERB, "FME%d dup check ", fmep->id);
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect);
			out(O_ALTFP|O_VERB|O_NONL, " ");
			if (nvlist_lookup_nvlist(fault,
			    FM_FAULT_ASRU, &asru) != 0) {
				out(O_ALTFP|O_VERB, "NULL asru");
				allfaulty = B_FALSE;
			} else if (fmd_nvl_fmri_has_fault(fmep->hdl, asru,
			    FMD_HAS_FAULT_ASRU, NULL)) {
				out(O_ALTFP|O_VERB, "faulty");
			} else {
				out(O_ALTFP|O_VERB, "not faulty");
				allfaulty = B_FALSE;
			}
		}

	}

	if (!allfaulty) {
		/*
		 * don't update the count stat if all asrus are already
		 * present and unrepaired in the asru cache
		 */
		for (rp = erl; rp >= srl; rp--) {
			struct event *suspect = rp->suspect;

			if (suspect == NULL)
				continue;

			/* if "count" exists, increment the appropriate stat */
			if ((snp = eventprop_lookup(suspect,
			    L_count)) != NULL) {
				out(O_ALTFP|O_NONL,
				    "[FME%d, %s count ", fmep->id,
				    suspect->enode->u.event.ename->u.name.s);
				ptree_name_iter(O_ALTFP|O_NONL, snp);
				out(O_ALTFP, "]");
				istat_bump(snp, 0);

			}
		}
		istat_save();	/* write out any istat changes */
	}
}

/*
 * undiag_2defect_str -- map an undiagnosable-reason code to the defect
 * class published for it.
 */
static const char *
undiag_2defect_str(int ud)
{
	switch (ud) {
	case UD_VAL_MISSINGINFO:
	case UD_VAL_MISSINGOBS:
	case UD_VAL_MISSINGPATH:
	case UD_VAL_MISSINGZERO:
	case UD_VAL_BADOBS:
	case UD_VAL_CFGMISMATCH:
		return (UNDIAG_DEFECT_CHKPT);
		break;

	case UD_VAL_BADEVENTI:
	case UD_VAL_BADEVENTPATH:
	case UD_VAL_BADEVENTCLASS:
	case UD_VAL_INSTFAIL:
	case UD_VAL_NOPATH:
	case UD_VAL_UNSOLVD:
		return (UNDIAG_DEFECT_FME);
		break;

	case UD_VAL_MAXFME:
		return (UNDIAG_DEFECT_LIMIT);
		break;

	case UD_VAL_UNKNOWN:
	default:
		return (UNDIAG_DEFECT_UNKNOWN);
		break;
	}
}

/*
 * undiag_2fault_str -- map an undiagnosable-reason code to an
 * accompanying fault class, or NULL if no fault should be published.
 */
static const char *
undiag_2fault_str(int ud)
{
	switch (ud) {
	case UD_VAL_BADEVENTI:
	case UD_VAL_BADEVENTPATH:
	case UD_VAL_BADEVENTCLASS:
	case UD_VAL_INSTFAIL:
	case UD_VAL_NOPATH:
	case UD_VAL_UNSOLVD:
		return (UNDIAG_FAULT_FME);
	default:
		return (NULL);
	}
}

/*
 * undiag_2reason_str -- return a freshly MALLOCed human-readable
 * reason string for an undiagnosable-reason code; some reasons embed
 * the caller-supplied arg via a %s in the template.  Caller FREEs.
 */
static char *
undiag_2reason_str(int ud, char *arg)
{
	const char *ptr;
	char *buf;
	int with_arg = 0;

	switch (ud) {
	case UD_VAL_BADEVENTPATH:
		ptr = UD_STR_BADEVENTPATH;
		with_arg = 1;
		break;
	case UD_VAL_BADEVENTCLASS:
		ptr = UD_STR_BADEVENTCLASS;
		with_arg = 1;
		break;
	case UD_VAL_BADEVENTI:
		ptr = UD_STR_BADEVENTI;
		with_arg = 1;
		break;
	case UD_VAL_BADOBS:
		ptr = UD_STR_BADOBS;
		break;
	case UD_VAL_CFGMISMATCH:
		ptr = UD_STR_CFGMISMATCH;
		break;
	case UD_VAL_INSTFAIL:
		ptr = UD_STR_INSTFAIL;
		with_arg = 1;
		break;
	case UD_VAL_MAXFME:
		ptr = UD_STR_MAXFME;
		break;
	case UD_VAL_MISSINGINFO:
		ptr = UD_STR_MISSINGINFO;
		break;
	case UD_VAL_MISSINGOBS:
		ptr = UD_STR_MISSINGOBS;
		break;
	case UD_VAL_MISSINGPATH:
		ptr = UD_STR_MISSINGPATH;
		break;
	case UD_VAL_MISSINGZERO:
		ptr = UD_STR_MISSINGZERO;
		break;
	case UD_VAL_NOPATH:
		ptr = UD_STR_NOPATH;
		with_arg = 1;
		break;
	case UD_VAL_UNSOLVD:
		ptr = UD_STR_UNSOLVD;
		break;
	case UD_VAL_UNKNOWN:
	default:
		ptr = UD_STR_UNKNOWN;
		break;
	}
	if (with_arg) {
		/* -1: the "%s" in the template is replaced, +1 for NUL */
		buf = MALLOC(strlen(ptr) + strlen(arg) - 1);
		(void) sprintf(buf, ptr, arg);
	} else {
		buf = MALLOC(strlen(ptr) + 1);
		(void) sprintf(buf, ptr);
	}
	return (buf);
}

/*
 * publish_undiagnosable -- create, solve and close a fresh case for an
 * ereport we cannot diagnose, publishing a defect (and possibly a
 * fault) that records the current Undiag_reason.  The case is tracked
 * on Undiagablecaselist.
 */
static void
publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase,
    nvlist_t *detector, char *arg)
{
	struct case_list *newcase;
	nvlist_t *defect, *fault;
	const char *faultstr;
	char *reason = undiag_2reason_str(Undiag_reason, arg);

	out(O_ALTFP,
	    "[undiagnosable ereport received, "
	    "creating and closing a new case (%s)]", reason);

	newcase = MALLOC(sizeof (struct case_list));
	newcase->next = NULL;
	newcase->fmcase = fmcase;
	if (Undiagablecaselist != NULL)
		newcase->next = Undiagablecaselist;
	Undiagablecaselist = newcase;

	if (ffep != NULL)
		fmd_case_add_ereport(hdl, newcase->fmcase, ffep);

	/* add defect */
	defect = fmd_nvl_create_fault(hdl,
	    undiag_2defect_str(Undiag_reason), 50, NULL, NULL, detector);
	(void) nvlist_add_string(defect, UNDIAG_REASON, reason);
	(void) nvlist_add_boolean_value(defect, FM_SUSPECT_RETIRE, B_FALSE);
	(void) nvlist_add_boolean_value(defect, FM_SUSPECT_RESPONSE, B_FALSE);
	fmd_case_add_suspect(hdl, newcase->fmcase, defect);

	/* add fault if appropriate */
	faultstr = undiag_2fault_str(Undiag_reason);
	if (faultstr != NULL) {
		fault = fmd_nvl_create_fault(hdl, faultstr, 50, NULL, NULL,
		    detector);
		(void) nvlist_add_string(fault, UNDIAG_REASON, reason);
		(void) nvlist_add_boolean_value(fault, FM_SUSPECT_RETIRE,
		    B_FALSE);
		(void) nvlist_add_boolean_value(fault, FM_SUSPECT_RESPONSE,
		    B_FALSE);
		fmd_case_add_suspect(hdl, newcase->fmcase, fault);
	}
	FREE(reason);

	/* solve and close case
	 */
	fmd_case_solve(hdl, newcase->fmcase);
	fmd_case_close(hdl, newcase->fmcase);
	Undiag_reason = UD_VAL_UNKNOWN;
}

/*
 * fme_undiagnosable -- solve and close an existing FME's case as
 * undiagnosable, publishing one defect (and possibly one fault) per
 * observation, each with certainty split evenly across observations.
 */
static void
fme_undiagnosable(struct fme *f)
{
	nvlist_t *defect, *fault, *detector = NULL;
	struct event *ep;
	char *pathstr;
	const char *faultstr;
	char *reason = undiag_2reason_str(Undiag_reason, NULL);

	out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]",
	    f->id, fmd_case_uuid(f->hdl, f->fmcase), reason);

	for (ep = f->observations; ep; ep = ep->observations) {

		/* e0r was already attached to the case when it arrived */
		if (ep->ffep != f->e0r)
			fmd_case_add_ereport(f->hdl, f->fmcase, ep->ffep);

		pathstr = ipath2str(NULL, ipath(platform_getpath(ep->nvp)));
		platform_units_translate(0, f->config, NULL, NULL, &detector,
		    pathstr);
		FREE(pathstr);

		/* add defect */
		defect = fmd_nvl_create_fault(f->hdl,
		    undiag_2defect_str(Undiag_reason), 50 / f->uniqobs,
		    NULL, NULL, detector);
		(void) nvlist_add_string(defect, UNDIAG_REASON, reason);
		(void) nvlist_add_boolean_value(defect, FM_SUSPECT_RETIRE,
		    B_FALSE);
		(void) nvlist_add_boolean_value(defect, FM_SUSPECT_RESPONSE,
		    B_FALSE);
		fmd_case_add_suspect(f->hdl, f->fmcase, defect);

		/* add fault if appropriate */
		faultstr = undiag_2fault_str(Undiag_reason);
		if (faultstr == NULL)
			continue;
		fault = fmd_nvl_create_fault(f->hdl, faultstr, 50 / f->uniqobs,
		    NULL, NULL, detector);
		(void) nvlist_add_string(fault, UNDIAG_REASON, reason);
		(void) nvlist_add_boolean_value(fault, FM_SUSPECT_RETIRE,
		    B_FALSE);
		(void) nvlist_add_boolean_value(fault, FM_SUSPECT_RESPONSE,
		    B_FALSE);
		fmd_case_add_suspect(f->hdl, f->fmcase, fault);
		nvlist_free(detector);
	}
	FREE(reason);
	fmd_case_solve(f->hdl, f->fmcase);
	fmd_case_close(f->hdl, f->fmcase);
	Undiag_reason = UD_VAL_UNKNOWN;
}

/*
 * fme_close_case
 *
 * Find the requested case amongst our fmes and close it.  Free up
 * the related fme.
 */
void
fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase)
{
	struct case_list *ucasep, *prevcasep = NULL;
	struct fme *prev = NULL;
	struct fme *fmep;

	/* first check the list of undiagnosable cases */
	for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) {
		if (fmcase != ucasep->fmcase) {
			prevcasep = ucasep;
			continue;
		}

		if (prevcasep == NULL)
			Undiagablecaselist = Undiagablecaselist->next;
		else
			prevcasep->next = ucasep->next;

		FREE(ucasep);
		return;
	}

	for (fmep = FMElist; fmep; fmep = fmep->next) {
		if (fmep->hdl == hdl && fmep->fmcase == fmcase)
			break;
		prev = fmep;
	}

	if (fmep == NULL) {
		out(O_WARN, "Eft asked to close unrecognized case [%s].",
		    fmd_case_uuid(hdl, fmcase));
		return;
	}

	if (EFMElist == fmep)
		EFMElist = prev;

	if (prev == NULL)
		FMElist = FMElist->next;
	else
		prev->next = fmep->next;

	fmep->next = NULL;

	/* Get rid of any timer this fme has set */
	if (fmep->wull != 0)
		fmd_timer_remove(fmep->hdl, fmep->timer);

	/* move the fme to the closed list; memory is reclaimed later */
	if (ClosedFMEs == NULL) {
		ClosedFMEs = fmep;
	} else {
		fmep->next = ClosedFMEs;
		ClosedFMEs = fmep;
	}

	Open_fme_count--;

	/* See if we can close the overflow FME */
	if (Open_fme_count <= Max_fme) {
		for (fmep = FMElist; fmep; fmep = fmep->next) {
			if (fmep->overflow && !(fmd_case_closed(fmep->hdl,
			    fmep->fmcase)))
				break;
		}

		if (fmep != NULL)
			fmd_case_close(fmep->hdl, fmep->fmcase);
	}
}

/*
 * fme_set_timer()
 *	If the time we need to wait for the given FME is less than the
 *	current timer, kick that old timer out and establish a new one.
 *
 *	Returns 1 if a timer was installed, 0 if no timer was needed.
 */
static int
fme_set_timer(struct fme *fmep, unsigned long long wull)
{
	out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait ");
	ptree_timeval(O_ALTFP|O_VERB, &wull);

	if (wull <= fmep->pull) {
		out(O_ALTFP|O_VERB|O_NONL, "already have waited at least ");
		ptree_timeval(O_ALTFP|O_VERB, &fmep->pull);
		out(O_ALTFP|O_VERB, NULL);
		/* we've waited at least wull already, don't need timer */
		return (0);
	}

	out(O_ALTFP|O_VERB|O_NONL, " currently ");
	if (fmep->wull != 0) {
		out(O_ALTFP|O_VERB|O_NONL, "waiting ");
		ptree_timeval(O_ALTFP|O_VERB, &fmep->wull);
		out(O_ALTFP|O_VERB, NULL);
	} else {
		out(O_ALTFP|O_VERB|O_NONL, "not waiting");
		out(O_ALTFP|O_VERB, NULL);
	}

	if (fmep->wull != 0)
		if (wull >= fmep->wull)
			/* New timer would fire later than established timer */
			return (0);

	if (fmep->wull != 0) {
		fmd_timer_remove(fmep->hdl, fmep->timer);
	}

	/* timers fire relative to e0r, the initial ereport */
	fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep,
	    fmep->e0r, wull);
	out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer);
	fmep->wull = wull;
	return (1);
}

/*
 * fme_timer_fired -- fmd timer callback; record that we have now
 * waited out the fme's current timer, checkpoint the new "previous
 * wait" value, and re-run the inference algorithm.
 */
void
fme_timer_fired(struct fme *fmep, id_t tid)
{
	struct fme *ffmep = NULL;

	/* ignore timers for FMEs that are no longer on our list */
	for (ffmep = FMElist; ffmep; ffmep = ffmep->next)
		if (ffmep == fmep)
			break;

	if (ffmep == NULL) {
		out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.",
		    (void *)fmep);
		return;
	}

	out(O_ALTFP|O_VERB, "Timer fired %lx", tid);
	fmep->pull = fmep->wull;
	fmep->wull = 0;
	fmd_buf_write(fmep->hdl, fmep->fmcase,
	    WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull));

	fme_eval(fmep, fmep->e0r);
}

/*
 * Preserve the fme's suspect list in its psuspects list, NULLing the
 * suspects list in the meantime.
3304 */ 3305 static void 3306 save_suspects(struct fme *fmep) 3307 { 3308 struct event *ep; 3309 struct event *nextep; 3310 3311 /* zero out the previous suspect list */ 3312 for (ep = fmep->psuspects; ep; ep = nextep) { 3313 nextep = ep->psuspects; 3314 ep->psuspects = NULL; 3315 } 3316 fmep->psuspects = NULL; 3317 3318 /* zero out the suspect list, copying it to previous suspect list */ 3319 fmep->psuspects = fmep->suspects; 3320 for (ep = fmep->suspects; ep; ep = nextep) { 3321 nextep = ep->suspects; 3322 ep->psuspects = ep->suspects; 3323 ep->suspects = NULL; 3324 ep->is_suspect = 0; 3325 } 3326 fmep->suspects = NULL; 3327 fmep->nsuspects = 0; 3328 } 3329 3330 /* 3331 * Retrieve the fme's suspect list from its psuspects list. 3332 */ 3333 static void 3334 restore_suspects(struct fme *fmep) 3335 { 3336 struct event *ep; 3337 struct event *nextep; 3338 3339 fmep->nsuspects = 0; 3340 fmep->suspects = fmep->psuspects; 3341 for (ep = fmep->psuspects; ep; ep = nextep) { 3342 fmep->nsuspects++; 3343 nextep = ep->psuspects; 3344 ep->suspects = ep->psuspects; 3345 } 3346 } 3347 3348 /* 3349 * this is what we use to call the Emrys prototype code instead of main() 3350 */ 3351 static void 3352 fme_eval(struct fme *fmep, fmd_event_t *ffep) 3353 { 3354 struct event *ep; 3355 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 3356 struct rsl *srl = NULL; 3357 struct rsl *srl2 = NULL; 3358 int mess_zero_count; 3359 int rpcnt; 3360 3361 save_suspects(fmep); 3362 3363 out(O_ALTFP, "Evaluate FME %d", fmep->id); 3364 indent_set(" "); 3365 3366 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 3367 fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 3368 3369 out(O_ALTFP|O_NONL, "FME%d state: %s, suspect list:", fmep->id, 3370 fme_state2str(fmep->state)); 3371 for (ep = fmep->suspects; ep; ep = ep->suspects) { 3372 out(O_ALTFP|O_NONL, " "); 3373 itree_pevent_brief(O_ALTFP|O_NONL, ep); 3374 } 3375 out(O_ALTFP, NULL); 3376 3377 switch (fmep->state) { 3378 
case FME_CREDIBLE: 3379 print_suspects(SLNEW, fmep); 3380 (void) upsets_eval(fmep, ffep); 3381 3382 /* 3383 * we may have already posted suspects in upsets_eval() which 3384 * can recurse into fme_eval() again. If so then just return. 3385 */ 3386 if (fmep->posted_suspects) 3387 return; 3388 3389 stats_counter_bump(fmep->diags); 3390 rpcnt = fmep->nsuspects; 3391 save_suspects(fmep); 3392 3393 /* 3394 * create two lists, one for "message=1" faults and one for 3395 * "message=0" faults. If we have a mixture we will generate 3396 * two separate suspect lists. 3397 */ 3398 srl = MALLOC(rpcnt * sizeof (struct rsl)); 3399 bzero(srl, rpcnt * sizeof (struct rsl)); 3400 srl2 = MALLOC(rpcnt * sizeof (struct rsl)); 3401 bzero(srl2, rpcnt * sizeof (struct rsl)); 3402 mess_zero_count = trim_suspects(fmep, srl, srl2, ffep); 3403 3404 /* 3405 * If the resulting suspect list has no members, we're 3406 * done so simply close the case. Otherwise sort and publish. 3407 */ 3408 if (fmep->nsuspects == 0 && mess_zero_count == 0) { 3409 out(O_ALTFP, 3410 "[FME%d, case %s (all suspects are upsets)]", 3411 fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3412 fmd_case_close(fmep->hdl, fmep->fmcase); 3413 } else if (fmep->nsuspects != 0 && mess_zero_count == 0) { 3414 publish_suspects(fmep, srl); 3415 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3416 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3417 fmd_case_solve(fmep->hdl, fmep->fmcase); 3418 } else if (fmep->nsuspects == 0 && mess_zero_count != 0) { 3419 fmep->nsuspects = mess_zero_count; 3420 publish_suspects(fmep, srl2); 3421 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3422 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3423 fmd_case_solve(fmep->hdl, fmep->fmcase); 3424 } else { 3425 struct event *obsp; 3426 struct fme *nfmep; 3427 3428 publish_suspects(fmep, srl); 3429 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3430 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3431 fmd_case_solve(fmep->hdl, fmep->fmcase); 3432 3433 /* 3434 * 
Got both message=0 and message=1 so create a 3435 * duplicate case. Also need a temporary duplicate fme 3436 * structure for use by publish_suspects(). 3437 */ 3438 nfmep = alloc_fme(); 3439 nfmep->id = Nextid++; 3440 nfmep->hdl = fmep->hdl; 3441 nfmep->nsuspects = mess_zero_count; 3442 nfmep->fmcase = fmd_case_open(fmep->hdl, NULL); 3443 out(O_ALTFP|O_STAMP, 3444 "[creating parallel FME%d, case %s]", nfmep->id, 3445 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 3446 Open_fme_count++; 3447 if (ffep) { 3448 fmd_case_setprincipal(nfmep->hdl, 3449 nfmep->fmcase, ffep); 3450 fmd_case_add_ereport(nfmep->hdl, 3451 nfmep->fmcase, ffep); 3452 } 3453 for (obsp = fmep->observations; obsp; 3454 obsp = obsp->observations) 3455 if (obsp->ffep && obsp->ffep != ffep) 3456 fmd_case_add_ereport(nfmep->hdl, 3457 nfmep->fmcase, obsp->ffep); 3458 3459 publish_suspects(nfmep, srl2); 3460 out(O_ALTFP, "[solving FME%d, case %s]", nfmep->id, 3461 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 3462 fmd_case_solve(nfmep->hdl, nfmep->fmcase); 3463 FREE(nfmep); 3464 } 3465 FREE(srl); 3466 FREE(srl2); 3467 restore_suspects(fmep); 3468 3469 fmep->posted_suspects = 1; 3470 fmd_buf_write(fmep->hdl, fmep->fmcase, 3471 WOBUF_POSTD, 3472 (void *)&fmep->posted_suspects, 3473 sizeof (fmep->posted_suspects)); 3474 3475 /* 3476 * Now the suspects have been posted, we can clear up 3477 * the instance tree as we won't be looking at it again. 3478 * Also cancel the timer as the case is now solved. 
3479 */ 3480 if (fmep->wull != 0) { 3481 fmd_timer_remove(fmep->hdl, fmep->timer); 3482 fmep->wull = 0; 3483 } 3484 break; 3485 3486 case FME_WAIT: 3487 ASSERT(my_delay > fmep->ull); 3488 (void) fme_set_timer(fmep, my_delay); 3489 print_suspects(SLWAIT, fmep); 3490 itree_prune(fmep->eventtree); 3491 return; 3492 3493 case FME_DISPROVED: 3494 print_suspects(SLDISPROVED, fmep); 3495 Undiag_reason = UD_VAL_UNSOLVD; 3496 fme_undiagnosable(fmep); 3497 break; 3498 } 3499 3500 itree_free(fmep->eventtree); 3501 fmep->eventtree = NULL; 3502 structconfig_free(fmep->config); 3503 fmep->config = NULL; 3504 destroy_fme_bufs(fmep); 3505 } 3506 3507 static void indent(void); 3508 static int triggered(struct fme *fmep, struct event *ep, int mark); 3509 static enum fme_state effects_test(struct fme *fmep, 3510 struct event *fault_event, unsigned long long at_latest_by, 3511 unsigned long long *pdelay); 3512 static enum fme_state requirements_test(struct fme *fmep, struct event *ep, 3513 unsigned long long at_latest_by, unsigned long long *pdelay); 3514 static enum fme_state causes_test(struct fme *fmep, struct event *ep, 3515 unsigned long long at_latest_by, unsigned long long *pdelay); 3516 3517 static int 3518 checkconstraints(struct fme *fmep, struct arrow *arrowp) 3519 { 3520 struct constraintlist *ctp; 3521 struct evalue value; 3522 char *sep = ""; 3523 3524 if (arrowp->forever_false) { 3525 indent(); 3526 out(O_ALTFP|O_VERB|O_NONL, " Forever false constraint: "); 3527 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3528 out(O_ALTFP|O_VERB|O_NONL, sep); 3529 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3530 sep = ", "; 3531 } 3532 out(O_ALTFP|O_VERB, NULL); 3533 return (0); 3534 } 3535 if (arrowp->forever_true) { 3536 indent(); 3537 out(O_ALTFP|O_VERB|O_NONL, " Forever true constraint: "); 3538 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3539 out(O_ALTFP|O_VERB|O_NONL, sep); 3540 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3541 sep = 
", "; 3542 } 3543 out(O_ALTFP|O_VERB, NULL); 3544 return (1); 3545 } 3546 3547 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3548 if (eval_expr(ctp->cnode, NULL, NULL, 3549 &fmep->globals, fmep->config, 3550 arrowp, 0, &value)) { 3551 /* evaluation successful */ 3552 if (value.t == UNDEFINED || value.v == 0) { 3553 /* known false */ 3554 arrowp->forever_false = 1; 3555 indent(); 3556 out(O_ALTFP|O_VERB|O_NONL, 3557 " False constraint: "); 3558 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3559 out(O_ALTFP|O_VERB, NULL); 3560 return (0); 3561 } 3562 } else { 3563 /* evaluation unsuccessful -- unknown value */ 3564 indent(); 3565 out(O_ALTFP|O_VERB|O_NONL, 3566 " Deferred constraint: "); 3567 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3568 out(O_ALTFP|O_VERB, NULL); 3569 return (1); 3570 } 3571 } 3572 /* known true */ 3573 arrowp->forever_true = 1; 3574 indent(); 3575 out(O_ALTFP|O_VERB|O_NONL, " True constraint: "); 3576 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3577 out(O_ALTFP|O_VERB|O_NONL, sep); 3578 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3579 sep = ", "; 3580 } 3581 out(O_ALTFP|O_VERB, NULL); 3582 return (1); 3583 } 3584 3585 static int 3586 triggered(struct fme *fmep, struct event *ep, int mark) 3587 { 3588 struct bubble *bp; 3589 struct arrowlist *ap; 3590 int count = 0; 3591 3592 stats_counter_bump(fmep->Tcallcount); 3593 for (bp = itree_next_bubble(ep, NULL); bp; 3594 bp = itree_next_bubble(ep, bp)) { 3595 if (bp->t != B_TO) 3596 continue; 3597 for (ap = itree_next_arrow(bp, NULL); ap; 3598 ap = itree_next_arrow(bp, ap)) { 3599 /* check count of marks against K in the bubble */ 3600 if ((ap->arrowp->mark & mark) && 3601 ++count >= bp->nork) 3602 return (1); 3603 } 3604 } 3605 return (0); 3606 } 3607 3608 static int 3609 mark_arrows(struct fme *fmep, struct event *ep, int mark, 3610 unsigned long long at_latest_by, unsigned long long *pdelay, int keep) 3611 { 3612 struct bubble *bp; 3613 struct 
arrowlist *ap; 3614 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3615 unsigned long long my_delay; 3616 enum fme_state result; 3617 int retval = 0; 3618 3619 for (bp = itree_next_bubble(ep, NULL); bp; 3620 bp = itree_next_bubble(ep, bp)) { 3621 if (bp->t != B_FROM) 3622 continue; 3623 stats_counter_bump(fmep->Marrowcount); 3624 for (ap = itree_next_arrow(bp, NULL); ap; 3625 ap = itree_next_arrow(bp, ap)) { 3626 struct event *ep2 = ap->arrowp->head->myevent; 3627 /* 3628 * if we're clearing marks, we can avoid doing 3629 * all that work evaluating constraints. 3630 */ 3631 if (mark == 0) { 3632 if (ap->arrowp->arrow_marked == 0) 3633 continue; 3634 ap->arrowp->arrow_marked = 0; 3635 ap->arrowp->mark &= ~EFFECTS_COUNTER; 3636 if (keep && (ep2->cached_state & 3637 (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT))) 3638 ep2->keep_in_tree = 1; 3639 ep2->cached_state &= 3640 ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT); 3641 (void) mark_arrows(fmep, ep2, mark, 0, NULL, 3642 keep); 3643 continue; 3644 } 3645 ap->arrowp->arrow_marked = 1; 3646 if (ep2->cached_state & REQMNTS_DISPROVED) { 3647 indent(); 3648 out(O_ALTFP|O_VERB|O_NONL, 3649 " ALREADY DISPROVED "); 3650 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3651 out(O_ALTFP|O_VERB, NULL); 3652 continue; 3653 } 3654 if (ep2->cached_state & WAIT_EFFECT) { 3655 indent(); 3656 out(O_ALTFP|O_VERB|O_NONL, 3657 " ALREADY EFFECTS WAIT "); 3658 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3659 out(O_ALTFP|O_VERB, NULL); 3660 continue; 3661 } 3662 if (ep2->cached_state & CREDIBLE_EFFECT) { 3663 indent(); 3664 out(O_ALTFP|O_VERB|O_NONL, 3665 " ALREADY EFFECTS CREDIBLE "); 3666 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3667 out(O_ALTFP|O_VERB, NULL); 3668 continue; 3669 } 3670 if ((ep2->cached_state & PARENT_WAIT) && 3671 (mark & PARENT_WAIT)) { 3672 indent(); 3673 out(O_ALTFP|O_VERB|O_NONL, 3674 " ALREADY PARENT EFFECTS WAIT "); 3675 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3676 out(O_ALTFP|O_VERB, NULL); 3677 
continue; 3678 } 3679 platform_set_payloadnvp(ep2->nvp); 3680 if (checkconstraints(fmep, ap->arrowp) == 0) { 3681 platform_set_payloadnvp(NULL); 3682 indent(); 3683 out(O_ALTFP|O_VERB|O_NONL, 3684 " CONSTRAINTS FAIL "); 3685 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3686 out(O_ALTFP|O_VERB, NULL); 3687 continue; 3688 } 3689 platform_set_payloadnvp(NULL); 3690 ap->arrowp->mark |= EFFECTS_COUNTER; 3691 if (!triggered(fmep, ep2, EFFECTS_COUNTER)) { 3692 indent(); 3693 out(O_ALTFP|O_VERB|O_NONL, 3694 " K-COUNT NOT YET MET "); 3695 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3696 out(O_ALTFP|O_VERB, NULL); 3697 continue; 3698 } 3699 ep2->cached_state &= ~PARENT_WAIT; 3700 /* 3701 * if we've reached an ereport and no propagation time 3702 * is specified, use the Hesitate value 3703 */ 3704 if (ep2->t == N_EREPORT && at_latest_by == 0ULL && 3705 ap->arrowp->maxdelay == 0ULL) { 3706 out(O_ALTFP|O_VERB|O_NONL, " default wait "); 3707 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3708 out(O_ALTFP|O_VERB, NULL); 3709 result = requirements_test(fmep, ep2, Hesitate, 3710 &my_delay); 3711 } else { 3712 result = requirements_test(fmep, ep2, 3713 at_latest_by + ap->arrowp->maxdelay, 3714 &my_delay); 3715 } 3716 if (result == FME_WAIT) { 3717 retval = WAIT_EFFECT; 3718 if (overall_delay > my_delay) 3719 overall_delay = my_delay; 3720 ep2->cached_state |= WAIT_EFFECT; 3721 indent(); 3722 out(O_ALTFP|O_VERB|O_NONL, " EFFECTS WAIT "); 3723 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3724 out(O_ALTFP|O_VERB, NULL); 3725 indent_push(" E"); 3726 if (mark_arrows(fmep, ep2, PARENT_WAIT, 3727 at_latest_by, &my_delay, 0) == 3728 WAIT_EFFECT) { 3729 retval = WAIT_EFFECT; 3730 if (overall_delay > my_delay) 3731 overall_delay = my_delay; 3732 } 3733 indent_pop(); 3734 } else if (result == FME_DISPROVED) { 3735 indent(); 3736 out(O_ALTFP|O_VERB|O_NONL, 3737 " EFFECTS DISPROVED "); 3738 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3739 out(O_ALTFP|O_VERB, NULL); 3740 } else { 
3741 ep2->cached_state |= mark; 3742 indent(); 3743 if (mark == CREDIBLE_EFFECT) 3744 out(O_ALTFP|O_VERB|O_NONL, 3745 " EFFECTS CREDIBLE "); 3746 else 3747 out(O_ALTFP|O_VERB|O_NONL, 3748 " PARENT EFFECTS WAIT "); 3749 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3750 out(O_ALTFP|O_VERB, NULL); 3751 indent_push(" E"); 3752 if (mark_arrows(fmep, ep2, mark, at_latest_by, 3753 &my_delay, 0) == WAIT_EFFECT) { 3754 retval = WAIT_EFFECT; 3755 if (overall_delay > my_delay) 3756 overall_delay = my_delay; 3757 } 3758 indent_pop(); 3759 } 3760 } 3761 } 3762 if (retval == WAIT_EFFECT) 3763 *pdelay = overall_delay; 3764 return (retval); 3765 } 3766 3767 static enum fme_state 3768 effects_test(struct fme *fmep, struct event *fault_event, 3769 unsigned long long at_latest_by, unsigned long long *pdelay) 3770 { 3771 struct event *error_event; 3772 enum fme_state return_value = FME_CREDIBLE; 3773 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3774 unsigned long long my_delay; 3775 3776 stats_counter_bump(fmep->Ecallcount); 3777 indent_push(" E"); 3778 indent(); 3779 out(O_ALTFP|O_VERB|O_NONL, "->"); 3780 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3781 out(O_ALTFP|O_VERB, NULL); 3782 3783 if (mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by, 3784 &my_delay, 0) == WAIT_EFFECT) { 3785 return_value = FME_WAIT; 3786 if (overall_delay > my_delay) 3787 overall_delay = my_delay; 3788 } 3789 for (error_event = fmep->observations; 3790 error_event; error_event = error_event->observations) { 3791 indent(); 3792 out(O_ALTFP|O_VERB|O_NONL, " "); 3793 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event); 3794 if (!(error_event->cached_state & CREDIBLE_EFFECT)) { 3795 if (error_event->cached_state & 3796 (PARENT_WAIT|WAIT_EFFECT)) { 3797 out(O_ALTFP|O_VERB, " NOT YET triggered"); 3798 continue; 3799 } 3800 return_value = FME_DISPROVED; 3801 out(O_ALTFP|O_VERB, " NOT triggered"); 3802 break; 3803 } else { 3804 out(O_ALTFP|O_VERB, " triggered"); 3805 } 3806 } 
3807 if (return_value == FME_DISPROVED) { 3808 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0); 3809 } else { 3810 fault_event->keep_in_tree = 1; 3811 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1); 3812 } 3813 3814 indent(); 3815 out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ", 3816 fme_state2str(return_value)); 3817 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3818 out(O_ALTFP|O_VERB, NULL); 3819 indent_pop(); 3820 if (return_value == FME_WAIT) 3821 *pdelay = overall_delay; 3822 return (return_value); 3823 } 3824 3825 static enum fme_state 3826 requirements_test(struct fme *fmep, struct event *ep, 3827 unsigned long long at_latest_by, unsigned long long *pdelay) 3828 { 3829 int waiting_events; 3830 int credible_events; 3831 int deferred_events; 3832 enum fme_state return_value = FME_CREDIBLE; 3833 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3834 unsigned long long arrow_delay; 3835 unsigned long long my_delay; 3836 struct event *ep2; 3837 struct bubble *bp; 3838 struct arrowlist *ap; 3839 3840 if (ep->cached_state & REQMNTS_CREDIBLE) { 3841 indent(); 3842 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY CREDIBLE "); 3843 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3844 out(O_ALTFP|O_VERB, NULL); 3845 return (FME_CREDIBLE); 3846 } 3847 if (ep->cached_state & REQMNTS_DISPROVED) { 3848 indent(); 3849 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY DISPROVED "); 3850 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3851 out(O_ALTFP|O_VERB, NULL); 3852 return (FME_DISPROVED); 3853 } 3854 if (ep->cached_state & REQMNTS_WAIT) { 3855 indent(); 3856 *pdelay = ep->cached_delay; 3857 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY WAIT "); 3858 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3859 out(O_ALTFP|O_VERB|O_NONL, ", wait for: "); 3860 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3861 out(O_ALTFP|O_VERB, NULL); 3862 return (FME_WAIT); 3863 } 3864 stats_counter_bump(fmep->Rcallcount); 3865 indent_push(" R"); 3866 indent(); 3867 
out(O_ALTFP|O_VERB|O_NONL, "->"); 3868 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3869 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 3870 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3871 out(O_ALTFP|O_VERB, NULL); 3872 3873 if (ep->t == N_EREPORT) { 3874 if (ep->count == 0) { 3875 if (fmep->pull >= at_latest_by) { 3876 return_value = FME_DISPROVED; 3877 } else { 3878 ep->cached_delay = *pdelay = at_latest_by; 3879 return_value = FME_WAIT; 3880 } 3881 } 3882 3883 indent(); 3884 switch (return_value) { 3885 case FME_CREDIBLE: 3886 ep->cached_state |= REQMNTS_CREDIBLE; 3887 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE "); 3888 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3889 break; 3890 case FME_DISPROVED: 3891 ep->cached_state |= REQMNTS_DISPROVED; 3892 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 3893 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3894 break; 3895 case FME_WAIT: 3896 ep->cached_state |= REQMNTS_WAIT; 3897 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT "); 3898 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3899 out(O_ALTFP|O_VERB|O_NONL, " to "); 3900 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3901 break; 3902 default: 3903 out(O_DIE, "requirements_test: unexpected fme_state"); 3904 break; 3905 } 3906 out(O_ALTFP|O_VERB, NULL); 3907 indent_pop(); 3908 3909 return (return_value); 3910 } 3911 3912 /* this event is not a report, descend the tree */ 3913 for (bp = itree_next_bubble(ep, NULL); bp; 3914 bp = itree_next_bubble(ep, bp)) { 3915 int n; 3916 3917 if (bp->t != B_FROM) 3918 continue; 3919 3920 n = bp->nork; 3921 3922 credible_events = 0; 3923 waiting_events = 0; 3924 deferred_events = 0; 3925 arrow_delay = TIMEVAL_EVENTUALLY; 3926 /* 3927 * n is -1 for 'A' so adjust it. 3928 * XXX just count up the arrows for now. 
3929 */ 3930 if (n < 0) { 3931 n = 0; 3932 for (ap = itree_next_arrow(bp, NULL); ap; 3933 ap = itree_next_arrow(bp, ap)) 3934 n++; 3935 indent(); 3936 out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n); 3937 } else { 3938 indent(); 3939 out(O_ALTFP|O_VERB, " Bubble N=%d", n); 3940 } 3941 3942 if (n == 0) 3943 continue; 3944 if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) { 3945 for (ap = itree_next_arrow(bp, NULL); ap; 3946 ap = itree_next_arrow(bp, ap)) { 3947 ep2 = ap->arrowp->head->myevent; 3948 platform_set_payloadnvp(ep2->nvp); 3949 (void) checkconstraints(fmep, ap->arrowp); 3950 if (!ap->arrowp->forever_false) { 3951 /* 3952 * if all arrows are invalidated by the 3953 * constraints, then we should elide the 3954 * whole bubble to be consistant with 3955 * the tree creation time behaviour 3956 */ 3957 bp->mark |= BUBBLE_OK; 3958 platform_set_payloadnvp(NULL); 3959 break; 3960 } 3961 platform_set_payloadnvp(NULL); 3962 } 3963 } 3964 for (ap = itree_next_arrow(bp, NULL); ap; 3965 ap = itree_next_arrow(bp, ap)) { 3966 ep2 = ap->arrowp->head->myevent; 3967 if (n <= credible_events) 3968 break; 3969 3970 ap->arrowp->mark |= REQMNTS_COUNTER; 3971 if (triggered(fmep, ep2, REQMNTS_COUNTER)) 3972 /* XXX adding max timevals! 
*/ 3973 switch (requirements_test(fmep, ep2, 3974 at_latest_by + ap->arrowp->maxdelay, 3975 &my_delay)) { 3976 case FME_DEFERRED: 3977 deferred_events++; 3978 break; 3979 case FME_CREDIBLE: 3980 credible_events++; 3981 break; 3982 case FME_DISPROVED: 3983 break; 3984 case FME_WAIT: 3985 if (my_delay < arrow_delay) 3986 arrow_delay = my_delay; 3987 waiting_events++; 3988 break; 3989 default: 3990 out(O_DIE, 3991 "Bug in requirements_test."); 3992 } 3993 else 3994 deferred_events++; 3995 } 3996 if (!(bp->mark & BUBBLE_OK) && waiting_events == 0) { 3997 bp->mark |= BUBBLE_ELIDED; 3998 continue; 3999 } 4000 indent(); 4001 out(O_ALTFP|O_VERB, " Credible: %d Waiting %d", 4002 credible_events + deferred_events, waiting_events); 4003 if (credible_events + deferred_events + waiting_events < n) { 4004 /* Can never meet requirements */ 4005 ep->cached_state |= REQMNTS_DISPROVED; 4006 indent(); 4007 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 4008 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4009 out(O_ALTFP|O_VERB, NULL); 4010 indent_pop(); 4011 return (FME_DISPROVED); 4012 } 4013 if (credible_events + deferred_events < n) { 4014 /* will have to wait */ 4015 /* wait time is shortest known */ 4016 if (arrow_delay < overall_delay) 4017 overall_delay = arrow_delay; 4018 return_value = FME_WAIT; 4019 } else if (credible_events < n) { 4020 if (return_value != FME_WAIT) 4021 return_value = FME_DEFERRED; 4022 } 4023 } 4024 4025 /* 4026 * don't mark as FME_DEFERRED. If this event isn't reached by another 4027 * path, then this will be considered FME_CREDIBLE. But if it is 4028 * reached by a different path so the K-count is met, then might 4029 * get overridden by FME_WAIT or FME_DISPROVED. 
4030 */ 4031 if (return_value == FME_WAIT) { 4032 ep->cached_state |= REQMNTS_WAIT; 4033 ep->cached_delay = *pdelay = overall_delay; 4034 } else if (return_value == FME_CREDIBLE) { 4035 ep->cached_state |= REQMNTS_CREDIBLE; 4036 } 4037 indent(); 4038 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ", 4039 fme_state2str(return_value)); 4040 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4041 out(O_ALTFP|O_VERB, NULL); 4042 indent_pop(); 4043 return (return_value); 4044 } 4045 4046 static enum fme_state 4047 causes_test(struct fme *fmep, struct event *ep, 4048 unsigned long long at_latest_by, unsigned long long *pdelay) 4049 { 4050 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 4051 unsigned long long my_delay; 4052 int credible_results = 0; 4053 int waiting_results = 0; 4054 enum fme_state fstate; 4055 struct event *tail_event; 4056 struct bubble *bp; 4057 struct arrowlist *ap; 4058 int k = 1; 4059 4060 stats_counter_bump(fmep->Ccallcount); 4061 indent_push(" C"); 4062 indent(); 4063 out(O_ALTFP|O_VERB|O_NONL, "->"); 4064 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4065 out(O_ALTFP|O_VERB, NULL); 4066 4067 for (bp = itree_next_bubble(ep, NULL); bp; 4068 bp = itree_next_bubble(ep, bp)) { 4069 if (bp->t != B_TO) 4070 continue; 4071 k = bp->nork; /* remember the K value */ 4072 for (ap = itree_next_arrow(bp, NULL); ap; 4073 ap = itree_next_arrow(bp, ap)) { 4074 int do_not_follow = 0; 4075 4076 /* 4077 * if we get to the same event multiple times 4078 * only worry about the first one. 
4079 */ 4080 if (ap->arrowp->tail->myevent->cached_state & 4081 CAUSES_TESTED) { 4082 indent(); 4083 out(O_ALTFP|O_VERB|O_NONL, 4084 " causes test already run for "); 4085 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 4086 ap->arrowp->tail->myevent); 4087 out(O_ALTFP|O_VERB, NULL); 4088 continue; 4089 } 4090 4091 /* 4092 * see if false constraint prevents us 4093 * from traversing this arrow 4094 */ 4095 platform_set_payloadnvp(ep->nvp); 4096 if (checkconstraints(fmep, ap->arrowp) == 0) 4097 do_not_follow = 1; 4098 platform_set_payloadnvp(NULL); 4099 if (do_not_follow) { 4100 indent(); 4101 out(O_ALTFP|O_VERB|O_NONL, 4102 " False arrow from "); 4103 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 4104 ap->arrowp->tail->myevent); 4105 out(O_ALTFP|O_VERB, NULL); 4106 continue; 4107 } 4108 4109 ap->arrowp->tail->myevent->cached_state |= 4110 CAUSES_TESTED; 4111 tail_event = ap->arrowp->tail->myevent; 4112 fstate = hypothesise(fmep, tail_event, at_latest_by, 4113 &my_delay); 4114 4115 switch (fstate) { 4116 case FME_WAIT: 4117 if (my_delay < overall_delay) 4118 overall_delay = my_delay; 4119 waiting_results++; 4120 break; 4121 case FME_CREDIBLE: 4122 credible_results++; 4123 break; 4124 case FME_DISPROVED: 4125 break; 4126 default: 4127 out(O_DIE, "Bug in causes_test"); 4128 } 4129 } 4130 } 4131 /* compare against K */ 4132 if (credible_results + waiting_results < k) { 4133 indent(); 4134 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED "); 4135 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4136 out(O_ALTFP|O_VERB, NULL); 4137 indent_pop(); 4138 return (FME_DISPROVED); 4139 } 4140 if (waiting_results != 0) { 4141 *pdelay = overall_delay; 4142 indent(); 4143 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT "); 4144 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4145 out(O_ALTFP|O_VERB|O_NONL, " to "); 4146 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 4147 out(O_ALTFP|O_VERB, NULL); 4148 indent_pop(); 4149 return (FME_WAIT); 4150 } 4151 indent(); 4152 out(O_ALTFP|O_VERB|O_NONL, 
"<-CAUSES CREDIBLE "); 4153 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4154 out(O_ALTFP|O_VERB, NULL); 4155 indent_pop(); 4156 return (FME_CREDIBLE); 4157 } 4158 4159 static enum fme_state 4160 hypothesise(struct fme *fmep, struct event *ep, 4161 unsigned long long at_latest_by, unsigned long long *pdelay) 4162 { 4163 enum fme_state rtr, otr; 4164 unsigned long long my_delay; 4165 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 4166 4167 stats_counter_bump(fmep->Hcallcount); 4168 indent_push(" H"); 4169 indent(); 4170 out(O_ALTFP|O_VERB|O_NONL, "->"); 4171 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4172 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 4173 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 4174 out(O_ALTFP|O_VERB, NULL); 4175 4176 rtr = requirements_test(fmep, ep, at_latest_by, &my_delay); 4177 if ((rtr == FME_WAIT) && (my_delay < overall_delay)) 4178 overall_delay = my_delay; 4179 if (rtr != FME_DISPROVED) { 4180 if (is_problem(ep->t)) { 4181 otr = effects_test(fmep, ep, at_latest_by, &my_delay); 4182 if (otr != FME_DISPROVED) { 4183 if (fmep->peek == 0 && ep->is_suspect == 0) { 4184 ep->suspects = fmep->suspects; 4185 ep->is_suspect = 1; 4186 fmep->suspects = ep; 4187 fmep->nsuspects++; 4188 } 4189 } 4190 } else 4191 otr = causes_test(fmep, ep, at_latest_by, &my_delay); 4192 if ((otr == FME_WAIT) && (my_delay < overall_delay)) 4193 overall_delay = my_delay; 4194 if ((otr != FME_DISPROVED) && 4195 ((rtr == FME_WAIT) || (otr == FME_WAIT))) 4196 *pdelay = overall_delay; 4197 } 4198 if (rtr == FME_DISPROVED) { 4199 indent(); 4200 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4201 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4202 out(O_ALTFP|O_VERB, " (doesn't meet requirements)"); 4203 indent_pop(); 4204 return (FME_DISPROVED); 4205 } 4206 if ((otr == FME_DISPROVED) && is_problem(ep->t)) { 4207 indent(); 4208 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4209 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4210 out(O_ALTFP|O_VERB, " 
(doesn't explain all reports)"); 4211 indent_pop(); 4212 return (FME_DISPROVED); 4213 } 4214 if (otr == FME_DISPROVED) { 4215 indent(); 4216 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4217 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4218 out(O_ALTFP|O_VERB, " (causes are not credible)"); 4219 indent_pop(); 4220 return (FME_DISPROVED); 4221 } 4222 if ((rtr == FME_WAIT) || (otr == FME_WAIT)) { 4223 indent(); 4224 out(O_ALTFP|O_VERB|O_NONL, "<-WAIT "); 4225 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4226 out(O_ALTFP|O_VERB|O_NONL, " to "); 4227 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay); 4228 out(O_ALTFP|O_VERB, NULL); 4229 indent_pop(); 4230 return (FME_WAIT); 4231 } 4232 indent(); 4233 out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE "); 4234 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4235 out(O_ALTFP|O_VERB, NULL); 4236 indent_pop(); 4237 return (FME_CREDIBLE); 4238 } 4239 4240 /* 4241 * fme_istat_load -- reconstitute any persistent istats 4242 */ 4243 void 4244 fme_istat_load(fmd_hdl_t *hdl) 4245 { 4246 int sz; 4247 char *sbuf; 4248 char *ptr; 4249 4250 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) { 4251 out(O_ALTFP, "fme_istat_load: No stats"); 4252 return; 4253 } 4254 4255 sbuf = alloca(sz); 4256 4257 fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz); 4258 4259 /* 4260 * pick apart the serialized stats 4261 * 4262 * format is: 4263 * <class-name>, '@', <path>, '\0', <value>, '\0' 4264 * for example: 4265 * "stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0" 4266 * 4267 * since this is parsing our own serialized data, any parsing issues 4268 * are fatal, so we check for them all with ASSERT() below. 
4269 */ 4270 ptr = sbuf; 4271 while (ptr < &sbuf[sz]) { 4272 char *sepptr; 4273 struct node *np; 4274 int val; 4275 4276 sepptr = strchr(ptr, '@'); 4277 ASSERT(sepptr != NULL); 4278 *sepptr = '\0'; 4279 4280 /* construct the event */ 4281 np = newnode(T_EVENT, NULL, 0); 4282 np->u.event.ename = newnode(T_NAME, NULL, 0); 4283 np->u.event.ename->u.name.t = N_STAT; 4284 np->u.event.ename->u.name.s = stable(ptr); 4285 np->u.event.ename->u.name.it = IT_ENAME; 4286 np->u.event.ename->u.name.last = np->u.event.ename; 4287 4288 ptr = sepptr + 1; 4289 ASSERT(ptr < &sbuf[sz]); 4290 ptr += strlen(ptr); 4291 ptr++; /* move past the '\0' separating path from value */ 4292 ASSERT(ptr < &sbuf[sz]); 4293 ASSERT(isdigit(*ptr)); 4294 val = atoi(ptr); 4295 ASSERT(val > 0); 4296 ptr += strlen(ptr); 4297 ptr++; /* move past the final '\0' for this entry */ 4298 4299 np->u.event.epname = pathstring2epnamenp(sepptr + 1); 4300 ASSERT(np->u.event.epname != NULL); 4301 4302 istat_bump(np, val); 4303 tree_free(np); 4304 } 4305 4306 istat_save(); 4307 } 4308