1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * 26 * fme.c -- fault management exercise module 27 * 28 * this module provides the simulated fault management exercise. 29 */ 30 31 #include <stdio.h> 32 #include <stdlib.h> 33 #include <string.h> 34 #include <strings.h> 35 #include <ctype.h> 36 #include <alloca.h> 37 #include <libnvpair.h> 38 #include <sys/fm/protocol.h> 39 #include <fm/fmd_api.h> 40 #include "alloc.h" 41 #include "out.h" 42 #include "stats.h" 43 #include "stable.h" 44 #include "literals.h" 45 #include "lut.h" 46 #include "tree.h" 47 #include "ptree.h" 48 #include "itree.h" 49 #include "ipath.h" 50 #include "fme.h" 51 #include "evnv.h" 52 #include "eval.h" 53 #include "config.h" 54 #include "platform.h" 55 #include "esclex.h" 56 57 /* imported from eft.c... */ 58 extern hrtime_t Hesitate; 59 extern char *Serd_Override; 60 extern nv_alloc_t Eft_nv_hdl; 61 extern int Max_fme; 62 extern fmd_hdl_t *Hdl; 63 64 static int Istat_need_save; 65 static int Serd_need_save; 66 void istat_save(void); 67 void serd_save(void); 68 69 /* fme under construction is global so we can free it on module abort */ 70 static struct fme *Nfmep; 71 72 static int Undiag_reason = UD_VAL_UNKNOWN; 73 74 static int Nextid = 0; 75 76 static int Open_fme_count = 0; /* Count of open FMEs */ 77 78 /* list of fault management exercises underway */ 79 static struct fme { 80 struct fme *next; /* next exercise */ 81 unsigned long long ull; /* time when fme was created */ 82 int id; /* FME id */ 83 struct config *config; /* cooked configuration data */ 84 struct lut *eventtree; /* propagation tree for this FME */ 85 /* 86 * The initial error report that created this FME is kept in 87 * two forms. e0 points to the instance tree node and is used 88 * by fme_eval() as the starting point for the inference 89 * algorithm. e0r is the event handle FMD passed to us when 90 * the ereport first arrived and is used when setting timers, 91 * which are always relative to the time of this initial 92 * report. 93 */ 94 struct event *e0; 95 fmd_event_t *e0r; 96 97 id_t timer; /* for setting an fmd time-out */ 98 99 struct event *ecurrent; /* ereport under consideration */ 100 struct event *suspects; /* current suspect list */ 101 struct event *psuspects; /* previous suspect list */ 102 int nsuspects; /* count of suspects */ 103 int posted_suspects; /* true if we've posted a diagnosis */ 104 int uniqobs; /* number of unique events observed */ 105 int peek; /* just peeking, don't track suspects */ 106 int overflow; /* true if overflow FME */ 107 enum fme_state { 108 FME_NOTHING = 5000, /* not evaluated yet */ 109 FME_WAIT, /* need to wait for more info */ 110 FME_CREDIBLE, /* suspect list is credible */ 111 FME_DISPROVED, /* no valid suspects found */ 112 FME_DEFERRED /* don't know yet (k-count not met) */ 113 } state; 114 115 unsigned long long pull; /* time passed since created */ 116 unsigned long long wull; /* wait until this time for re-eval */ 117 struct event *observations; /* observation list */ 118 struct lut *globals; /* values of global variables */ 119 /* fmd interfacing */ 120 fmd_hdl_t *hdl; /* handle for talking with fmd */ 121 fmd_case_t *fmcase; /* what fmd 'case' we associate with */ 122 /* stats */ 123 struct stats *Rcount; 124 struct stats *Hcallcount; 125 struct stats *Rcallcount; 126 struct stats *Ccallcount; 127 struct stats *Ecallcount; 128 struct stats *Tcallcount; 129 struct stats *Marrowcount; 130 struct stats *diags; 131 } *FMElist, *EFMElist, *ClosedFMEs; 132 133 static struct case_list { 134 fmd_case_t *fmcase; 135 struct case_list *next; 136 } *Undiagablecaselist; 137 138 static void fme_eval(struct fme *fmep, fmd_event_t *ffep); 139 static enum fme_state hypothesise(struct fme *fmep, struct event *ep, 140 unsigned long long at_latest_by, unsigned long long *pdelay); 141 static struct node *eventprop_lookup(struct event *ep, const char *propname); 142 static struct node *pathstring2epnamenp(char *path); 143 static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, 144 fmd_case_t *fmcase); 145 static const char *undiag_2reason_str(int ud); 146 static const char *undiag_2defect_str(int ud); 147 static void restore_suspects(struct fme *fmep); 148 static void save_suspects(struct fme *fmep); 149 static void destroy_fme(struct fme *f); 150 static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 151 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl); 152 static void istat_counter_reset_cb(struct istat_entry *entp, 153 struct stats *statp, const struct ipath *ipp); 154 static void istat_counter_topo_chg_cb(struct istat_entry *entp, 155 struct stats *statp, void *unused); 156 static void serd_reset_cb(struct serd_entry *entp, void *unused, 157 const struct ipath *ipp); 158 static void serd_topo_chg_cb(struct serd_entry *entp, void *unused, 159 void *unused2); 160 static void destroy_fme_bufs(struct fme *fp); 161 162 static struct fme * 163 alloc_fme(void) 164 { 165 struct fme *fmep; 166 167 fmep = MALLOC(sizeof (*fmep)); 168 bzero(fmep, sizeof (*fmep)); 169 return (fmep); 170 } 171 172 /* 173 * fme_ready -- called when all initialization of the FME (except for 174 * stats) has completed successfully. Adds the fme to global lists 175 * and establishes its stats. 176 */ 177 static struct fme * 178 fme_ready(struct fme *fmep) 179 { 180 char nbuf[100]; 181 182 Nfmep = NULL; /* don't need to free this on module abort now */ 183 184 if (EFMElist) { 185 EFMElist->next = fmep; 186 EFMElist = fmep; 187 } else 188 FMElist = EFMElist = fmep; 189 190 (void) sprintf(nbuf, "fme%d.Rcount", fmep->id); 191 fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0); 192 (void) sprintf(nbuf, "fme%d.Hcall", fmep->id); 193 fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1); 194 (void) sprintf(nbuf, "fme%d.Rcall", fmep->id); 195 fmep->Rcallcount = stats_new_counter(nbuf, 196 "calls to requirements_test()", 1); 197 (void) sprintf(nbuf, "fme%d.Ccall", fmep->id); 198 fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1); 199 (void) sprintf(nbuf, "fme%d.Ecall", fmep->id); 200 fmep->Ecallcount = 201 stats_new_counter(nbuf, "calls to effects_test()", 1); 202 (void) sprintf(nbuf, "fme%d.Tcall", fmep->id); 203 fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1); 204 (void) sprintf(nbuf, "fme%d.Marrow", fmep->id); 205 fmep->Marrowcount = stats_new_counter(nbuf, 206 "arrows marked by mark_arrows()", 1); 207 (void) sprintf(nbuf, "fme%d.diags", fmep->id); 208 fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0); 209 210 out(O_ALTFP|O_VERB2, "newfme: config snapshot contains..."); 211 config_print(O_ALTFP|O_VERB2, fmep->config); 212 213 return (fmep); 214 } 215 216 extern void ipath_dummy_lut(struct arrow *); 217 extern struct lut *itree_create_dummy(const char *, const struct ipath *); 218 219 /* ARGSUSED */ 220 static void 221 set_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 222 { 223 struct bubble *bp; 224 struct arrowlist *ap; 225 226 for (bp = itree_next_bubble(ep, NULL); bp; 227 bp = itree_next_bubble(ep, bp)) { 228 if (bp->t != B_FROM) 229 continue; 230 for (ap = itree_next_arrow(bp, NULL); ap; 231 ap = itree_next_arrow(bp, ap)) { 232 ap->arrowp->pnode->u.arrow.needed = 1; 233 ipath_dummy_lut(ap->arrowp); 234 } 235 } 236 } 237 238 /* ARGSUSED */ 239 static void 240 unset_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 241 { 242 struct bubble *bp; 243 struct arrowlist *ap; 244 245 for (bp = itree_next_bubble(ep, NULL); bp; 246 bp = itree_next_bubble(ep, bp)) { 247 if (bp->t != B_FROM) 248 continue; 249 for (ap = itree_next_arrow(bp, NULL); ap; 250 ap = itree_next_arrow(bp, ap)) 251 ap->arrowp->pnode->u.arrow.needed = 0; 252 } 253 } 254 255 static void globals_destructor(void *left, void *right, void *arg); 256 static void clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep); 257 258 static void 259 prune_propagations(const char *e0class, const struct ipath *e0ipp) 260 { 261 char nbuf[100]; 262 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 263 extern struct lut *Usednames; 264 265 Nfmep = alloc_fme(); 266 Nfmep->id = Nextid; 267 Nfmep->state = FME_NOTHING; 268 Nfmep->eventtree = itree_create_dummy(e0class, e0ipp); 269 if ((Nfmep->e0 = 270 itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) { 271 out(O_ALTFP, "prune_propagations: e0 not in instance tree"); 272 itree_free(Nfmep->eventtree); 273 FREE(Nfmep); 274 Nfmep = NULL; 275 return; 276 } 277 Nfmep->ecurrent = Nfmep->observations = Nfmep->e0; 278 Nfmep->e0->count++; 279 280 (void) sprintf(nbuf, "fme%d.Rcount", Nfmep->id); 281 Nfmep->Rcount = stats_new_counter(nbuf, "ereports received", 0); 282 (void) sprintf(nbuf, "fme%d.Hcall", Nfmep->id); 283 Nfmep->Hcallcount = 284 stats_new_counter(nbuf, "calls to hypothesise()", 1); 285 (void) sprintf(nbuf, "fme%d.Rcall", Nfmep->id); 286 Nfmep->Rcallcount = stats_new_counter(nbuf, 287 "calls to requirements_test()", 1); 288 (void) sprintf(nbuf, "fme%d.Ccall", Nfmep->id); 289 Nfmep->Ccallcount = 290 stats_new_counter(nbuf, "calls to causes_test()", 1); 291 (void) sprintf(nbuf, "fme%d.Ecall", Nfmep->id); 292 Nfmep->Ecallcount = 293 stats_new_counter(nbuf, "calls to effects_test()", 1); 294 (void) sprintf(nbuf, "fme%d.Tcall", Nfmep->id); 295 Nfmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1); 296 (void) sprintf(nbuf, "fme%d.Marrow", Nfmep->id); 297 Nfmep->Marrowcount = stats_new_counter(nbuf, 298 "arrows marked by mark_arrows()", 1); 299 (void) sprintf(nbuf, "fme%d.diags", Nfmep->id); 300 Nfmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0); 301 302 Nfmep->peek = 1; 303 lut_walk(Nfmep->eventtree, (lut_cb)unset_needed_arrows, (void *)Nfmep); 304 lut_free(Usednames, NULL, NULL); 305 Usednames = NULL; 306 lut_walk(Nfmep->eventtree, (lut_cb)clear_arrows, (void *)Nfmep); 307 (void) hypothesise(Nfmep, Nfmep->e0, Nfmep->ull, &my_delay); 308 itree_prune(Nfmep->eventtree); 309 lut_walk(Nfmep->eventtree, (lut_cb)set_needed_arrows, (void *)Nfmep); 310 311 stats_delete(Nfmep->Rcount); 312 stats_delete(Nfmep->Hcallcount); 313 stats_delete(Nfmep->Rcallcount); 314 stats_delete(Nfmep->Ccallcount); 315 stats_delete(Nfmep->Ecallcount); 316 stats_delete(Nfmep->Tcallcount); 317 stats_delete(Nfmep->Marrowcount); 318 stats_delete(Nfmep->diags); 319 itree_free(Nfmep->eventtree); 320 lut_free(Nfmep->globals, globals_destructor, NULL); 321 FREE(Nfmep); 322 } 323 324 static struct fme * 325 newfme(const char *e0class, const struct ipath *e0ipp, fmd_hdl_t *hdl, 326 fmd_case_t *fmcase) 327 { 328 struct cfgdata *cfgdata; 329 int init_size; 330 extern int alloc_total(); 331 332 init_size = alloc_total(); 333 out(O_ALTFP|O_STAMP, "start config_snapshot using %d bytes", init_size); 334 cfgdata = config_snapshot(); 335 platform_save_config(hdl, fmcase); 336 out(O_ALTFP|O_STAMP, "config_snapshot added %d bytes", 337 alloc_total() - init_size); 338 339 Nfmep = alloc_fme(); 340 341 Nfmep->id = Nextid++; 342 Nfmep->config = cfgdata->cooked; 343 config_free(cfgdata); 344 Nfmep->posted_suspects = 0; 345 Nfmep->uniqobs = 0; 346 Nfmep->state = FME_NOTHING; 347 Nfmep->pull = 0ULL; 348 Nfmep->overflow = 0; 349 350 Nfmep->fmcase = fmcase; 351 Nfmep->hdl = hdl; 352 353 if ((Nfmep->eventtree = itree_create(Nfmep->config)) == NULL) { 354 out(O_ALTFP, "newfme: NULL instance tree"); 355 Undiag_reason = UD_VAL_INSTFAIL; 356 structconfig_free(Nfmep->config); 357 destroy_fme_bufs(Nfmep); 358 FREE(Nfmep); 359 Nfmep = NULL; 360 return (NULL); 361 } 362 363 itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree); 364 365 if ((Nfmep->e0 = 366 itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) { 367 out(O_ALTFP, "newfme: e0 not in instance tree"); 368 Undiag_reason = UD_VAL_BADEVENTI; 369 itree_free(Nfmep->eventtree); 370 structconfig_free(Nfmep->config); 371 destroy_fme_bufs(Nfmep); 372 FREE(Nfmep); 373 Nfmep = NULL; 374 return (NULL); 375 } 376 377 return (fme_ready(Nfmep)); 378 } 379 380 void 381 fme_fini(void) 382 { 383 struct fme *sfp, *fp; 384 struct case_list *ucasep, *nextcasep; 385 386 ucasep = Undiagablecaselist; 387 while (ucasep != NULL) { 388 nextcasep = ucasep->next; 389 FREE(ucasep); 390 ucasep = nextcasep; 391 } 392 Undiagablecaselist = NULL; 393 394 /* clean up closed fmes */ 395 fp = ClosedFMEs; 396 while (fp != NULL) { 397 sfp = fp->next; 398 destroy_fme(fp); 399 fp = sfp; 400 } 401 ClosedFMEs = NULL; 402 403 fp = FMElist; 404 while (fp != NULL) { 405 sfp = fp->next; 406 destroy_fme(fp); 407 fp = sfp; 408 } 409 FMElist = EFMElist = NULL; 410 411 /* if we were in the middle of creating an fme, free it now */ 412 if (Nfmep) { 413 destroy_fme(Nfmep); 414 Nfmep = NULL; 415 } 416 } 417 418 /* 419 * Allocated space for a buffer name. 20 bytes allows for 420 * a ridiculous 9,999,999 unique observations. 421 */ 422 #define OBBUFNMSZ 20 423 424 /* 425 * serialize_observation 426 * 427 * Create a recoverable version of the current observation 428 * (f->ecurrent). We keep a serialized version of each unique 429 * observation in order that we may resume correctly the fme in the 430 * correct state if eft or fmd crashes and we're restarted. 431 */ 432 static void 433 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp) 434 { 435 size_t pkdlen; 436 char tmpbuf[OBBUFNMSZ]; 437 char *pkd = NULL; 438 char *estr; 439 440 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs); 441 estr = ipath2str(cls, ipp); 442 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1); 443 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr, 444 strlen(estr) + 1); 445 FREE(estr); 446 447 if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) { 448 (void) snprintf(tmpbuf, 449 OBBUFNMSZ, "observed%d.nvp", fp->uniqobs); 450 if (nvlist_xpack(fp->ecurrent->nvp, 451 &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0) 452 out(O_DIE|O_SYS, "pack of observed nvl failed"); 453 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen); 454 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen); 455 FREE(pkd); 456 } 457 458 fp->uniqobs++; 459 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 460 sizeof (fp->uniqobs)); 461 } 462 463 /* 464 * init_fme_bufs -- We keep several bits of state about an fme for 465 * use if eft or fmd crashes and we're restarted. 466 */ 467 static void 468 init_fme_bufs(struct fme *fp) 469 { 470 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull)); 471 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull, 472 sizeof (fp->pull)); 473 474 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id)); 475 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id, 476 sizeof (fp->id)); 477 478 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs)); 479 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 480 sizeof (fp->uniqobs)); 481 482 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD, 483 sizeof (fp->posted_suspects)); 484 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD, 485 (void *)&fp->posted_suspects, sizeof (fp->posted_suspects)); 486 } 487 488 static void 489 destroy_fme_bufs(struct fme *fp) 490 { 491 char tmpbuf[OBBUFNMSZ]; 492 int o; 493 494 platform_restore_config(fp->hdl, fp->fmcase); 495 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN); 496 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG); 497 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL); 498 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID); 499 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD); 500 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS); 501 502 for (o = 0; o < fp->uniqobs; o++) { 503 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o); 504 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 505 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o); 506 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 507 } 508 } 509 510 /* 511 * reconstitute_observations -- convert a case's serialized observations 512 * back into struct events. Returns zero if all observations are 513 * successfully reconstituted. 514 */ 515 static int 516 reconstitute_observations(struct fme *fmep) 517 { 518 struct event *ep; 519 struct node *epnamenp = NULL; 520 size_t pkdlen; 521 char *pkd = NULL; 522 char *tmpbuf = alloca(OBBUFNMSZ); 523 char *sepptr; 524 char *estr; 525 int ocnt; 526 int elen; 527 528 for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) { 529 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt); 530 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 531 if (elen == 0) { 532 out(O_ALTFP, 533 "reconstitute_observation: no %s buffer found.", 534 tmpbuf); 535 Undiag_reason = UD_VAL_MISSINGOBS; 536 break; 537 } 538 539 estr = MALLOC(elen); 540 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 541 sepptr = strchr(estr, '@'); 542 if (sepptr == NULL) { 543 out(O_ALTFP, 544 "reconstitute_observation: %s: " 545 "missing @ separator in %s.", 546 tmpbuf, estr); 547 Undiag_reason = UD_VAL_MISSINGPATH; 548 FREE(estr); 549 break; 550 } 551 552 *sepptr = '\0'; 553 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 554 out(O_ALTFP, 555 "reconstitute_observation: %s: " 556 "trouble converting path string \"%s\" " 557 "to internal representation.", 558 tmpbuf, sepptr + 1); 559 Undiag_reason = UD_VAL_MISSINGPATH; 560 FREE(estr); 561 break; 562 } 563 564 /* construct the event */ 565 ep = itree_lookup(fmep->eventtree, 566 stable(estr), ipath(epnamenp)); 567 if (ep == NULL) { 568 out(O_ALTFP, 569 "reconstitute_observation: %s: " 570 "lookup of \"%s\" in itree failed.", 571 tmpbuf, ipath2str(estr, ipath(epnamenp))); 572 Undiag_reason = UD_VAL_BADOBS; 573 tree_free(epnamenp); 574 FREE(estr); 575 break; 576 } 577 tree_free(epnamenp); 578 579 /* 580 * We may or may not have a saved nvlist for the observation 581 */ 582 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt); 583 pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 584 if (pkdlen != 0) { 585 pkd = MALLOC(pkdlen); 586 fmd_buf_read(fmep->hdl, 587 fmep->fmcase, tmpbuf, pkd, pkdlen); 588 ASSERT(ep->nvp == NULL); 589 if (nvlist_xunpack(pkd, 590 pkdlen, &ep->nvp, &Eft_nv_hdl) != 0) 591 out(O_DIE|O_SYS, "pack of observed nvl failed"); 592 FREE(pkd); 593 } 594 595 if (ocnt == 0) 596 fmep->e0 = ep; 597 598 FREE(estr); 599 fmep->ecurrent = ep; 600 ep->count++; 601 602 /* link it into list of observations seen */ 603 ep->observations = fmep->observations; 604 fmep->observations = ep; 605 } 606 607 if (ocnt == fmep->uniqobs) { 608 (void) fme_ready(fmep); 609 return (0); 610 } 611 612 return (1); 613 } 614 615 /* 616 * restart_fme -- called during eft initialization. Reconstitutes 617 * an in-progress fme. 618 */ 619 void 620 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress) 621 { 622 nvlist_t *defect; 623 struct case_list *bad; 624 struct fme *fmep; 625 struct cfgdata *cfgdata; 626 size_t rawsz; 627 struct event *ep; 628 char *tmpbuf = alloca(OBBUFNMSZ); 629 char *sepptr; 630 char *estr; 631 int elen; 632 struct node *epnamenp = NULL; 633 int init_size; 634 extern int alloc_total(); 635 636 /* 637 * ignore solved or closed cases 638 */ 639 if (fmd_case_solved(hdl, inprogress) || 640 fmd_case_closed(hdl, inprogress)) 641 return; 642 643 fmep = alloc_fme(); 644 fmep->fmcase = inprogress; 645 fmep->hdl = hdl; 646 647 if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) { 648 out(O_ALTFP, "restart_fme: no saved posted status"); 649 Undiag_reason = UD_VAL_MISSINGINFO; 650 goto badcase; 651 } else { 652 fmd_buf_read(hdl, inprogress, WOBUF_POSTD, 653 (void *)&fmep->posted_suspects, 654 sizeof (fmep->posted_suspects)); 655 } 656 657 if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) { 658 out(O_ALTFP, "restart_fme: no saved id"); 659 Undiag_reason = UD_VAL_MISSINGINFO; 660 goto badcase; 661 } else { 662 fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id, 663 sizeof (fmep->id)); 664 } 665 if (Nextid <= fmep->id) 666 Nextid = fmep->id + 1; 667 668 out(O_ALTFP, "Replay FME %d", fmep->id); 669 670 if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) { 671 out(O_ALTFP, "restart_fme: No config data"); 672 Undiag_reason = UD_VAL_MISSINGINFO; 673 goto badcase; 674 } 675 fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz, 676 sizeof (size_t)); 677 678 if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) { 679 out(O_ALTFP, "restart_fme: No event zero"); 680 Undiag_reason = UD_VAL_MISSINGZERO; 681 goto badcase; 682 } 683 684 if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) { 685 out(O_ALTFP, "restart_fme: no saved wait time"); 686 Undiag_reason = UD_VAL_MISSINGINFO; 687 goto badcase; 688 } else { 689 fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull, 690 sizeof (fmep->pull)); 691 } 692 693 if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) { 694 out(O_ALTFP, "restart_fme: no count of observations"); 695 Undiag_reason = UD_VAL_MISSINGINFO; 696 goto badcase; 697 } else { 698 fmd_buf_read(hdl, inprogress, WOBUF_NOBS, 699 (void *)&fmep->uniqobs, sizeof (fmep->uniqobs)); 700 } 701 702 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed0"); 703 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 704 if (elen == 0) { 705 out(O_ALTFP, "reconstitute_observation: no %s buffer found.", 706 tmpbuf); 707 Undiag_reason = UD_VAL_MISSINGOBS; 708 goto badcase; 709 } 710 estr = MALLOC(elen); 711 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 712 sepptr = strchr(estr, '@'); 713 if (sepptr == NULL) { 714 out(O_ALTFP, "reconstitute_observation: %s: " 715 "missing @ separator in %s.", 716 tmpbuf, estr); 717 Undiag_reason = UD_VAL_MISSINGPATH; 718 FREE(estr); 719 goto badcase; 720 } 721 *sepptr = '\0'; 722 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 723 out(O_ALTFP, "reconstitute_observation: %s: " 724 "trouble converting path string \"%s\" " 725 "to internal representation.", tmpbuf, sepptr + 1); 726 Undiag_reason = UD_VAL_MISSINGPATH; 727 FREE(estr); 728 goto badcase; 729 } 730 prune_propagations(stable(estr), ipath(epnamenp)); 731 tree_free(epnamenp); 732 FREE(estr); 733 734 init_size = alloc_total(); 735 out(O_ALTFP|O_STAMP, "start config_restore using %d bytes", init_size); 736 cfgdata = MALLOC(sizeof (struct cfgdata)); 737 cfgdata->cooked = NULL; 738 cfgdata->devcache = NULL; 739 cfgdata->devidcache = NULL; 740 cfgdata->cpucache = NULL; 741 cfgdata->raw_refcnt = 1; 742 743 if (rawsz > 0) { 744 if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) { 745 out(O_ALTFP, "restart_fme: Config data size mismatch"); 746 Undiag_reason = UD_VAL_CFGMISMATCH; 747 goto badcase; 748 } 749 cfgdata->begin = MALLOC(rawsz); 750 cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz; 751 fmd_buf_read(hdl, 752 inprogress, WOBUF_CFG, cfgdata->begin, rawsz); 753 } else { 754 cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL; 755 } 756 757 config_cook(cfgdata); 758 fmep->config = cfgdata->cooked; 759 config_free(cfgdata); 760 out(O_ALTFP|O_STAMP, "config_restore added %d bytes", 761 alloc_total() - init_size); 762 763 if ((fmep->eventtree = itree_create(fmep->config)) == NULL) { 764 /* case not properly saved or irretrievable */ 765 out(O_ALTFP, "restart_fme: NULL instance tree"); 766 Undiag_reason = UD_VAL_INSTFAIL; 767 goto badcase; 768 } 769 770 itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree); 771 772 if (reconstitute_observations(fmep) != 0) 773 goto badcase; 774 775 out(O_ALTFP|O_NONL, "FME %d replay observations: ", fmep->id); 776 for (ep = fmep->observations; ep; ep = ep->observations) { 777 out(O_ALTFP|O_NONL, " "); 778 itree_pevent_brief(O_ALTFP|O_NONL, ep); 779 } 780 out(O_ALTFP, NULL); 781 782 Open_fme_count++; 783 784 /* give the diagnosis algorithm a shot at the new FME state */ 785 fme_eval(fmep, fmep->e0r); 786 return; 787 788 badcase: 789 if (fmep->eventtree != NULL) 790 itree_free(fmep->eventtree); 791 if (fmep->config) 792 structconfig_free(fmep->config); 793 destroy_fme_bufs(fmep); 794 FREE(fmep); 795 796 /* 797 * Since we're unable to restart the case, add it to the undiagable 798 * list and solve and close it as appropriate. 799 */ 800 bad = MALLOC(sizeof (struct case_list)); 801 bad->next = NULL; 802 803 if (Undiagablecaselist != NULL) 804 bad->next = Undiagablecaselist; 805 Undiagablecaselist = bad; 806 bad->fmcase = inprogress; 807 808 out(O_ALTFP|O_NONL, "[case %s (unable to restart), ", 809 fmd_case_uuid(hdl, bad->fmcase)); 810 811 if (fmd_case_solved(hdl, bad->fmcase)) { 812 out(O_ALTFP|O_NONL, "already solved, "); 813 } else { 814 out(O_ALTFP|O_NONL, "solving, "); 815 defect = fmd_nvl_create_fault(hdl, 816 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 817 (void) nvlist_add_string(defect, UNDIAG_REASON, 818 undiag_2reason_str(Undiag_reason)); 819 fmd_case_add_suspect(hdl, bad->fmcase, defect); 820 fmd_case_solve(hdl, bad->fmcase); 821 Undiag_reason = UD_VAL_UNKNOWN; 822 } 823 824 if (fmd_case_closed(hdl, bad->fmcase)) { 825 out(O_ALTFP, "already closed ]"); 826 } else { 827 out(O_ALTFP, "closing ]"); 828 fmd_case_close(hdl, bad->fmcase); 829 } 830 } 831 832 /*ARGSUSED*/ 833 static void 834 globals_destructor(void *left, void *right, void *arg) 835 { 836 struct evalue *evp = (struct evalue *)right; 837 if (evp->t == NODEPTR) 838 tree_free((struct node *)(uintptr_t)evp->v); 839 evp->v = (uintptr_t)NULL; 840 FREE(evp); 841 } 842 843 void 844 destroy_fme(struct fme *f) 845 { 846 stats_delete(f->Rcount); 847 stats_delete(f->Hcallcount); 848 stats_delete(f->Rcallcount); 849 stats_delete(f->Ccallcount); 850 stats_delete(f->Ecallcount); 851 stats_delete(f->Tcallcount); 852 stats_delete(f->Marrowcount); 853 stats_delete(f->diags); 854 855 if (f->eventtree != NULL) 856 itree_free(f->eventtree); 857 if (f->config) 858 structconfig_free(f->config); 859 lut_free(f->globals, globals_destructor, NULL); 860 FREE(f); 861 } 862 863 static const char * 864 fme_state2str(enum fme_state s) 865 { 866 switch (s) { 867 case FME_NOTHING: return ("NOTHING"); 868 case FME_WAIT: return ("WAIT"); 869 case FME_CREDIBLE: return ("CREDIBLE"); 870 case FME_DISPROVED: return ("DISPROVED"); 871 case FME_DEFERRED: return ("DEFERRED"); 872 default: return ("UNKNOWN"); 873 } 874 } 875 876 static int 877 is_problem(enum nametype t) 878 { 879 return (t == N_FAULT || t == N_DEFECT || t == N_UPSET); 880 } 881 882 static int 883 is_defect(enum nametype t) 884 { 885 return (t == N_DEFECT); 886 } 887 888 static int 889 is_upset(enum nametype t) 890 { 891 return (t == N_UPSET); 892 } 893 894 static void 895 fme_print(int flags, struct fme *fmep) 896 { 897 struct event *ep; 898 899 out(flags, "Fault Management Exercise %d", fmep->id); 900 out(flags, "\t State: %s", fme_state2str(fmep->state)); 901 out(flags|O_NONL, "\t Start time: "); 902 ptree_timeval(flags|O_NONL, &fmep->ull); 903 out(flags, NULL); 904 if (fmep->wull) { 905 out(flags|O_NONL, "\t Wait time: "); 906 ptree_timeval(flags|O_NONL, &fmep->wull); 907 out(flags, NULL); 908 } 909 out(flags|O_NONL, "\t E0: "); 910 if (fmep->e0) 911 itree_pevent_brief(flags|O_NONL, fmep->e0); 912 else 913 out(flags|O_NONL, "NULL"); 914 out(flags, NULL); 915 out(flags|O_NONL, "\tObservations:"); 916 for (ep = fmep->observations; ep; ep = ep->observations) { 917 out(flags|O_NONL, " "); 918 itree_pevent_brief(flags|O_NONL, ep); 919 } 920 out(flags, NULL); 921 out(flags|O_NONL, "\tSuspect list:"); 922 for (ep = fmep->suspects; ep; ep = ep->suspects) { 923 out(flags|O_NONL, " "); 924 itree_pevent_brief(flags|O_NONL, ep); 925 } 926 out(flags, NULL); 927 if (fmep->eventtree != NULL) { 928 out(flags|O_VERB2, "\t Tree:"); 929 itree_ptree(flags|O_VERB2, fmep->eventtree); 930 } 931 } 932 933 static struct node * 934 pathstring2epnamenp(char *path) 935 { 936 char *sep = "/"; 937 struct node *ret; 938 char *ptr; 939 940 if ((ptr = strtok(path, sep)) == NULL) 941 out(O_DIE, "pathstring2epnamenp: invalid empty class"); 942 943 ret = tree_iname(stable(ptr), NULL, 0); 944 945 while ((ptr = strtok(NULL, sep)) != NULL) 946 ret = tree_name_append(ret, 947 tree_iname(stable(ptr), NULL, 0)); 948 949 return (ret); 950 } 951 952 /* 953 * for a given upset sp, increment the corresponding SERD engine. if the 954 * SERD engine trips, return the ename and ipp of the resulting ereport. 955 * returns true if engine tripped and *enamep and *ippp were filled in. 956 */ 957 static int 958 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep, 959 fmd_case_t *fmcase, struct event *sp, const char **enamep, 960 const struct ipath **ippp) 961 { 962 struct node *serdinst; 963 char *serdname; 964 char *serdresource; 965 char *serdclass; 966 struct node *nid; 967 struct serd_entry *newentp; 968 int i, serdn = -1, serdincrement = 1, len = 0; 969 char *serdsuffix = NULL, *serdt = NULL; 970 struct evalue *ep; 971 972 ASSERT(sp->t == N_UPSET); 973 ASSERT(ffep != NULL); 974 975 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 976 (void *)"n", (lut_cmp)strcmp)) != NULL) { 977 ASSERT(ep->t == UINT64); 978 serdn = (int)ep->v; 979 } 980 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 981 (void *)"t", (lut_cmp)strcmp)) != NULL) { 982 ASSERT(ep->t == STRING); 983 serdt = (char *)(uintptr_t)ep->v; 984 } 985 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 986 (void *)"suffix", (lut_cmp)strcmp)) != NULL) { 987 ASSERT(ep->t == STRING); 988 serdsuffix = (char *)(uintptr_t)ep->v; 989 } 990 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 991 (void *)"increment", (lut_cmp)strcmp)) != NULL) { 992 ASSERT(ep->t == UINT64); 993 serdincrement = (int)ep->v; 994 } 995 996 /* 997 * obtain instanced SERD engine from the upset sp. from this 998 * derive serdname, the string used to identify the SERD engine. 999 */ 1000 serdinst = eventprop_lookup(sp, L_engine); 1001 1002 if (serdinst == NULL) 1003 return (-1); 1004 1005 len = strlen(serdinst->u.stmt.np->u.event.ename->u.name.s) + 1; 1006 if (serdsuffix != NULL) 1007 len += strlen(serdsuffix); 1008 serdclass = MALLOC(len); 1009 if (serdsuffix != NULL) 1010 (void) snprintf(serdclass, len, "%s%s", 1011 serdinst->u.stmt.np->u.event.ename->u.name.s, serdsuffix); 1012 else 1013 (void) snprintf(serdclass, len, "%s", 1014 serdinst->u.stmt.np->u.event.ename->u.name.s); 1015 serdresource = ipath2str(NULL, 1016 ipath(serdinst->u.stmt.np->u.event.epname)); 1017 len += strlen(serdresource) + 1; 1018 serdname = MALLOC(len); 1019 (void) snprintf(serdname, len, "%s@%s", serdclass, serdresource); 1020 FREE(serdresource); 1021 1022 /* handle serd engine "id" property, if there is one */ 1023 if ((nid = 1024 lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) { 1025 struct evalue *gval; 1026 char suffixbuf[200]; 1027 char *suffix; 1028 char *nserdname; 1029 size_t nname; 1030 1031 out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname); 1032 ptree_name_iter(O_ALTFP|O_NONL, nid); 1033 1034 ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t)); 1035 1036 if ((gval = lut_lookup(fmep->globals, 1037 (void *)nid->u.globid.s, NULL)) == NULL) { 1038 out(O_ALTFP, " undefined"); 1039 } else if (gval->t == UINT64) { 1040 out(O_ALTFP, " %llu", gval->v); 1041 (void) sprintf(suffixbuf, "%llu", gval->v); 1042 suffix = suffixbuf; 1043 } else { 1044 out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v); 1045 suffix = (char *)(uintptr_t)gval->v; 1046 } 1047 1048 nname = strlen(serdname) + strlen(suffix) + 2; 1049 nserdname = MALLOC(nname); 1050 (void) snprintf(nserdname, nname, "%s:%s", serdname, suffix); 1051 FREE(serdname); 1052 serdname = nserdname; 1053 } 1054 1055 /* 1056 * if the engine is empty, and we have an override for n/t then 1057 * destroy and recreate it. 1058 */ 1059 if ((serdn != -1 || serdt != NULL) && fmd_serd_exists(hdl, serdname) && 1060 fmd_serd_empty(hdl, serdname)) 1061 fmd_serd_destroy(hdl, serdname); 1062 1063 if (!fmd_serd_exists(hdl, serdname)) { 1064 struct node *nN, *nT; 1065 const char *s; 1066 struct node *nodep; 1067 struct config *cp; 1068 char *path; 1069 uint_t nval; 1070 hrtime_t tval; 1071 int i; 1072 char *ptr; 1073 int got_n_override = 0, got_t_override = 0; 1074 1075 /* no SERD engine yet, so create it */ 1076 nodep = serdinst->u.stmt.np->u.event.epname; 1077 path = ipath2str(NULL, ipath(nodep)); 1078 cp = config_lookup(fmep->config, path, 0); 1079 FREE((void *)path); 1080 1081 /* 1082 * We allow serd paramaters to be overridden, either from 1083 * eft.conf file values (if Serd_Override is set) or from 1084 * driver properties (for "serd.io.device" engines). 1085 */ 1086 if (Serd_Override != NULL) { 1087 char *save_ptr, *ptr1, *ptr2, *ptr3; 1088 ptr3 = save_ptr = STRDUP(Serd_Override); 1089 while (*ptr3 != '\0') { 1090 ptr1 = strchr(ptr3, ','); 1091 *ptr1 = '\0'; 1092 if (strcmp(ptr3, serdclass) == 0) { 1093 ptr2 = strchr(ptr1 + 1, ','); 1094 *ptr2 = '\0'; 1095 nval = atoi(ptr1 + 1); 1096 out(O_ALTFP, "serd override %s_n %d", 1097 serdclass, nval); 1098 ptr3 = strchr(ptr2 + 1, ' '); 1099 if (ptr3) 1100 *ptr3 = '\0'; 1101 ptr = STRDUP(ptr2 + 1); 1102 out(O_ALTFP, "serd override %s_t %s", 1103 serdclass, ptr); 1104 got_n_override = 1; 1105 got_t_override = 1; 1106 break; 1107 } else { 1108 ptr2 = strchr(ptr1 + 1, ','); 1109 ptr3 = strchr(ptr2 + 1, ' '); 1110 if (ptr3 == NULL) 1111 break; 1112 } 1113 ptr3++; 1114 } 1115 FREE(save_ptr); 1116 } 1117 1118 if (cp && got_n_override == 0) { 1119 /* 1120 * convert serd engine class into property name 1121 */ 1122 char *prop_name = MALLOC(strlen(serdclass) + 3); 1123 for (i = 0; i < strlen(serdclass); i++) { 1124 if (serdclass[i] == '.') 1125 prop_name[i] = '_'; 1126 else 1127 prop_name[i] = serdclass[i]; 1128 } 1129 prop_name[i++] = '_'; 1130 prop_name[i++] = 'n'; 1131 prop_name[i] = '\0'; 1132 if (s = config_getprop(cp, prop_name)) { 1133 nval = atoi(s); 1134 out(O_ALTFP, "serd override %s_n %s", 1135 serdclass, s); 1136 got_n_override = 1; 1137 } 1138 prop_name[i - 1] = 't'; 1139 if (s = config_getprop(cp, prop_name)) { 1140 ptr = STRDUP(s); 1141 out(O_ALTFP, "serd override %s_t %s", 1142 serdclass, s); 1143 got_t_override = 1; 1144 } 1145 FREE(prop_name); 1146 } 1147 1148 if (serdn != -1 && got_n_override == 0) { 1149 nval = serdn; 1150 out(O_ALTFP, "serd override %s_n %d", serdclass, serdn); 1151 got_n_override = 1; 1152 } 1153 if (serdt != NULL && got_t_override == 0) { 1154 ptr = STRDUP(serdt); 1155 out(O_ALTFP, "serd override %s_t %s", serdclass, serdt); 1156 got_t_override = 1; 1157 } 1158 1159 if (!got_n_override) { 1160 nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N, 1161 NULL); 1162 ASSERT(nN->t == T_NUM); 1163 nval = (uint_t)nN->u.ull; 1164 } 1165 if (!got_t_override) { 1166 nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T, 1167 NULL); 1168 ASSERT(nT->t == T_TIMEVAL); 1169 tval = (hrtime_t)nT->u.ull; 1170 } else { 1171 const unsigned long long *ullp; 1172 const char *suffix; 1173 int len; 1174 1175 len = strspn(ptr, "0123456789"); 1176 suffix = stable(&ptr[len]); 1177 ullp = (unsigned long long *)lut_lookup(Timesuffixlut, 1178 (void *)suffix, NULL); 1179 ptr[len] = '\0'; 1180 tval = strtoull(ptr, NULL, 0) * (ullp ? *ullp : 1ll); 1181 FREE(ptr); 1182 } 1183 fmd_serd_create(hdl, serdname, nval, tval); 1184 } 1185 1186 newentp = MALLOC(sizeof (*newentp)); 1187 newentp->ename = stable(serdclass); 1188 FREE(serdclass); 1189 newentp->ipath = ipath(serdinst->u.stmt.np->u.event.epname); 1190 newentp->hdl = hdl; 1191 if (lut_lookup(SerdEngines, newentp, (lut_cmp)serd_cmp) == NULL) { 1192 SerdEngines = lut_add(SerdEngines, (void *)newentp, 1193 (void *)newentp, (lut_cmp)serd_cmp); 1194 Serd_need_save = 1; 1195 serd_save(); 1196 } else { 1197 FREE(newentp); 1198 } 1199 1200 1201 /* 1202 * increment SERD engine. if engine fires, reset serd 1203 * engine and return trip_strcode if required. 1204 */ 1205 for (i = 0; i < serdincrement; i++) { 1206 if (fmd_serd_record(hdl, serdname, ffep)) { 1207 fmd_case_add_serd(hdl, fmcase, serdname); 1208 fmd_serd_reset(hdl, serdname); 1209 1210 if (ippp) { 1211 struct node *tripinst = 1212 lut_lookup(serdinst->u.stmt.lutp, 1213 (void *)L_trip, NULL); 1214 ASSERT(tripinst != NULL); 1215 *enamep = tripinst->u.event.ename->u.name.s; 1216 *ippp = ipath(tripinst->u.event.epname); 1217 out(O_ALTFP|O_NONL, 1218 "[engine fired: %s, sending: ", serdname); 1219 ipath_print(O_ALTFP|O_NONL, *enamep, *ippp); 1220 out(O_ALTFP, "]"); 1221 } else { 1222 out(O_ALTFP, "[engine fired: %s, no trip]", 1223 serdname); 1224 } 1225 FREE(serdname); 1226 return (1); 1227 } 1228 } 1229 1230 FREE(serdname); 1231 return (0); 1232 } 1233 1234 /* 1235 * search a suspect list for upsets. feed each upset to serd_eval() and 1236 * build up tripped[], an array of ereports produced by the firing of 1237 * any SERD engines. then feed each ereport back into 1238 * fme_receive_report(). 1239 * 1240 * returns ntrip, the number of these ereports produced. 1241 */ 1242 static int 1243 upsets_eval(struct fme *fmep, fmd_event_t *ffep) 1244 { 1245 /* we build an array of tripped ereports that we send ourselves */ 1246 struct { 1247 const char *ename; 1248 const struct ipath *ipp; 1249 } *tripped; 1250 struct event *sp; 1251 int ntrip, nupset, i; 1252 1253 /* 1254 * count the number of upsets to determine the upper limit on 1255 * expected trip ereport strings. remember that one upset can 1256 * lead to at most one ereport. 1257 */ 1258 nupset = 0; 1259 for (sp = fmep->suspects; sp; sp = sp->suspects) { 1260 if (sp->t == N_UPSET) 1261 nupset++; 1262 } 1263 1264 if (nupset == 0) 1265 return (0); 1266 1267 /* 1268 * get to this point if we have upsets and expect some trip 1269 * ereports 1270 */ 1271 tripped = alloca(sizeof (*tripped) * nupset); 1272 bzero((void *)tripped, sizeof (*tripped) * nupset); 1273 1274 ntrip = 0; 1275 for (sp = fmep->suspects; sp; sp = sp->suspects) 1276 if (sp->t == N_UPSET && 1277 serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp, 1278 &tripped[ntrip].ename, &tripped[ntrip].ipp) == 1) 1279 ntrip++; 1280 1281 for (i = 0; i < ntrip; i++) { 1282 struct event *ep, *nep; 1283 struct fme *nfmep; 1284 fmd_case_t *fmcase; 1285 const struct ipath *ipp; 1286 const char *eventstring; 1287 int prev_verbose; 1288 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 1289 enum fme_state state; 1290 1291 /* 1292 * First try and evaluate a case with the trip ereport plus 1293 * all the other ereports that cause the trip. If that fails 1294 * to evaluate then try again with just this ereport on its own. 1295 */ 1296 out(O_ALTFP|O_NONL, "fme_receive_report_serd: "); 1297 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp); 1298 out(O_ALTFP|O_STAMP, NULL); 1299 ep = fmep->e0; 1300 eventstring = ep->enode->u.event.ename->u.name.s; 1301 ipp = ep->ipp; 1302 prune_propagations(eventstring, ipp); 1303 1304 /* 1305 * create a duplicate fme and case 1306 */ 1307 fmcase = fmd_case_open(fmep->hdl, NULL); 1308 out(O_ALTFP|O_NONL, "duplicate fme for event ["); 1309 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1310 out(O_ALTFP, " ]"); 1311 if ((nfmep = newfme(eventstring, ipp, fmep->hdl, 1312 fmcase)) == NULL) { 1313 out(O_ALTFP|O_NONL, "["); 1314 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1315 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1316 publish_undiagnosable(fmep->hdl, ffep, fmcase); 1317 continue; 1318 } 1319 Open_fme_count++; 1320 nfmep->pull = fmep->pull; 1321 init_fme_bufs(nfmep); 1322 out(O_ALTFP|O_NONL, "["); 1323 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1324 out(O_ALTFP, " created FME%d, case %s]", nfmep->id, 1325 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 1326 if (ffep) { 1327 fmd_case_setprincipal(nfmep->hdl, nfmep->fmcase, ffep); 1328 fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, ffep); 1329 nfmep->e0r = ffep; 1330 } 1331 1332 /* 1333 * add the original ereports 1334 */ 1335 for (ep = fmep->observations; ep; ep = ep->observations) { 1336 eventstring = ep->enode->u.event.ename->u.name.s; 1337 ipp = ep->ipp; 1338 out(O_ALTFP|O_NONL, "adding event ["); 1339 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1340 out(O_ALTFP, " ]"); 1341 nep = itree_lookup(nfmep->eventtree, eventstring, ipp); 1342 if (nep->count++ == 0) { 1343 nep->observations = nfmep->observations; 1344 nfmep->observations = nep; 1345 serialize_observation(nfmep, eventstring, ipp); 1346 nep->nvp = evnv_dupnvl(ep->nvp); 1347 } 1348 if (ep->ffep && ep->ffep != ffep) 1349 fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, 1350 ep->ffep); 1351 stats_counter_bump(nfmep->Rcount); 1352 } 1353 1354 /* 1355 * add the serd trigger ereport 1356 */ 1357 if ((ep = itree_lookup(nfmep->eventtree, tripped[i].ename, 1358 tripped[i].ipp)) == NULL) { 1359 /* 1360 * The trigger ereport is not in the instance tree. It 1361 * was presumably removed by prune_propagations() as 1362 * this combination of events is not present in the 1363 * rules. 1364 */ 1365 out(O_ALTFP, "upsets_eval: e0 not in instance tree"); 1366 Undiag_reason = UD_VAL_BADEVENTI; 1367 goto retry_lone_ereport; 1368 } 1369 out(O_ALTFP|O_NONL, "adding event ["); 1370 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp); 1371 out(O_ALTFP, " ]"); 1372 nfmep->ecurrent = ep; 1373 ep->nvp = NULL; 1374 ep->count = 1; 1375 ep->observations = nfmep->observations; 1376 nfmep->observations = ep; 1377 1378 /* 1379 * just peek first. 1380 */ 1381 nfmep->peek = 1; 1382 prev_verbose = Verbose; 1383 if (Debug == 0) 1384 Verbose = 0; 1385 lut_walk(nfmep->eventtree, (lut_cb)clear_arrows, (void *)nfmep); 1386 state = hypothesise(nfmep, nfmep->e0, nfmep->ull, &my_delay); 1387 nfmep->peek = 0; 1388 Verbose = prev_verbose; 1389 if (state == FME_DISPROVED) { 1390 out(O_ALTFP, "upsets_eval: hypothesis disproved"); 1391 Undiag_reason = UD_VAL_UNSOLVD; 1392 retry_lone_ereport: 1393 /* 1394 * However the trigger ereport on its own might be 1395 * diagnosable, so check for that. Undo the new fme 1396 * and case we just created and call fme_receive_report. 1397 */ 1398 out(O_ALTFP|O_NONL, "["); 1399 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, 1400 tripped[i].ipp); 1401 out(O_ALTFP, " retrying with just trigger ereport]"); 1402 itree_free(nfmep->eventtree); 1403 nfmep->eventtree = NULL; 1404 structconfig_free(nfmep->config); 1405 nfmep->config = NULL; 1406 destroy_fme_bufs(nfmep); 1407 fmd_case_close(nfmep->hdl, nfmep->fmcase); 1408 fme_receive_report(fmep->hdl, ffep, 1409 tripped[i].ename, tripped[i].ipp, NULL); 1410 continue; 1411 } 1412 1413 /* 1414 * and evaluate 1415 */ 1416 serialize_observation(nfmep, tripped[i].ename, tripped[i].ipp); 1417 fme_eval(nfmep, ffep); 1418 } 1419 1420 return (ntrip); 1421 } 1422 1423 /* 1424 * fme_receive_external_report -- call when an external ereport comes in 1425 * 1426 * this routine just converts the relevant information from the ereport 1427 * into a format used internally and passes it on to fme_receive_report(). 1428 */ 1429 void 1430 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 1431 const char *class) 1432 { 1433 struct node *epnamenp; 1434 fmd_case_t *fmcase; 1435 const struct ipath *ipp; 1436 1437 class = stable(class); 1438 1439 /* Get the component path from the ereport */ 1440 epnamenp = platform_getpath(nvl); 1441 1442 /* See if we ended up without a path. */ 1443 if (epnamenp == NULL) { 1444 /* See if class permits silent discard on unknown component. */ 1445 if (lut_lookup(Ereportenames_discard, (void *)class, NULL)) { 1446 out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport " 1447 "to component path, but silent discard allowed.", 1448 class); 1449 } else { 1450 /* 1451 * XFILE: Failure to find a component is bad unless 1452 * 'discard_if_config_unknown=1' was specified in the 1453 * ereport definition. Indicate undiagnosable. 1454 */ 1455 out(O_ALTFP, "XFILE: Unable to map \"%s\" ereport " 1456 "to component path.", class); 1457 Undiag_reason = UD_VAL_NOPATH; 1458 fmcase = fmd_case_open(hdl, NULL); 1459 publish_undiagnosable(hdl, ffep, fmcase); 1460 } 1461 return; 1462 } 1463 1464 ipp = ipath(epnamenp); 1465 tree_free(epnamenp); 1466 fme_receive_report(hdl, ffep, class, ipp, nvl); 1467 } 1468 1469 /*ARGSUSED*/ 1470 void 1471 fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 1472 const char *eventstring) 1473 { 1474 char *uuid; 1475 nvlist_t **nva; 1476 uint_t nvc; 1477 const struct ipath *ipp; 1478 1479 if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 || 1480 nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 1481 &nva, &nvc) != 0) { 1482 out(O_ALTFP, "No uuid or fault list for list.repaired event"); 1483 return; 1484 } 1485 1486 out(O_ALTFP, "Processing list.repaired from case %s", uuid); 1487 1488 while (nvc-- != 0) { 1489 /* 1490 * Reset any istat or serd engine associated with this path. 1491 */ 1492 char *path; 1493 1494 if ((ipp = platform_fault2ipath(*nva++)) == NULL) 1495 continue; 1496 1497 path = ipath2str(NULL, ipp); 1498 out(O_ALTFP, "fme_receive_repair_list: resetting state for %s", 1499 path); 1500 FREE(path); 1501 1502 lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp); 1503 istat_save(); 1504 1505 lut_walk(SerdEngines, (lut_cb)serd_reset_cb, (void *)ipp); 1506 serd_save(); 1507 } 1508 } 1509 1510 /*ARGSUSED*/ 1511 void 1512 fme_receive_topology_change(void) 1513 { 1514 lut_walk(Istats, (lut_cb)istat_counter_topo_chg_cb, NULL); 1515 istat_save(); 1516 1517 lut_walk(SerdEngines, (lut_cb)serd_topo_chg_cb, NULL); 1518 serd_save(); 1519 } 1520 1521 static int mark_arrows(struct fme *fmep, struct event *ep, int mark, 1522 unsigned long long at_latest_by, unsigned long long *pdelay, int keep); 1523 1524 /* ARGSUSED */ 1525 static void 1526 clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 1527 { 1528 struct bubble *bp; 1529 struct arrowlist *ap; 1530 1531 ep->cached_state = 0; 1532 ep->keep_in_tree = 0; 1533 for (bp = itree_next_bubble(ep, NULL); bp; 1534 bp = itree_next_bubble(ep, bp)) { 1535 if (bp->t != B_FROM) 1536 continue; 1537 bp->mark = 0; 1538 for (ap = itree_next_arrow(bp, NULL); ap; 1539 ap = itree_next_arrow(bp, ap)) 1540 ap->arrowp->mark = 0; 1541 } 1542 } 1543 1544 static void 1545 fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 1546 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl) 1547 { 1548 struct event *ep; 1549 struct fme *fmep = NULL; 1550 struct fme *ofmep = NULL; 1551 struct fme *cfmep, *svfmep; 1552 int matched = 0; 1553 nvlist_t *defect; 1554 fmd_case_t *fmcase; 1555 1556 out(O_ALTFP|O_NONL, "fme_receive_report: "); 1557 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1558 out(O_ALTFP|O_STAMP, NULL); 1559 1560 /* decide which FME it goes to */ 1561 for (fmep = FMElist; fmep; fmep = fmep->next) { 1562 int prev_verbose; 1563 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 1564 enum fme_state state; 1565 nvlist_t *pre_peek_nvp = NULL; 1566 1567 if (fmep->overflow) { 1568 if (!(fmd_case_closed(fmep->hdl, fmep->fmcase))) 1569 ofmep = fmep; 1570 1571 continue; 1572 } 1573 1574 /* 1575 * ignore solved or closed cases 1576 */ 1577 if (fmep->posted_suspects || 1578 fmd_case_solved(fmep->hdl, fmep->fmcase) || 1579 fmd_case_closed(fmep->hdl, fmep->fmcase)) 1580 continue; 1581 1582 /* look up event in event tree for this FME */ 1583 if ((ep = itree_lookup(fmep->eventtree, 1584 eventstring, ipp)) == NULL) 1585 continue; 1586 1587 /* note observation */ 1588 fmep->ecurrent = ep; 1589 if (ep->count++ == 0) { 1590 /* link it into list of observations seen */ 1591 ep->observations = fmep->observations; 1592 fmep->observations = ep; 1593 ep->nvp = evnv_dupnvl(nvl); 1594 } else { 1595 /* use new payload values for peek */ 1596 pre_peek_nvp = ep->nvp; 1597 ep->nvp = evnv_dupnvl(nvl); 1598 } 1599 1600 /* tell hypothesise() not to mess with suspect list */ 1601 fmep->peek = 1; 1602 1603 /* don't want this to be verbose (unless Debug is set) */ 1604 prev_verbose = Verbose; 1605 if (Debug == 0) 1606 Verbose = 0; 1607 1608 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 1609 state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 1610 1611 fmep->peek = 0; 1612 1613 /* put verbose flag back */ 1614 Verbose = prev_verbose; 1615 1616 if (state != FME_DISPROVED) { 1617 /* found an FME that explains the ereport */ 1618 matched++; 1619 out(O_ALTFP|O_NONL, "["); 1620 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1621 out(O_ALTFP, " explained by FME%d]", fmep->id); 1622 1623 if (pre_peek_nvp) 1624 nvlist_free(pre_peek_nvp); 1625 1626 if (ep->count == 1) 1627 serialize_observation(fmep, eventstring, ipp); 1628 1629 if (ffep) { 1630 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1631 ep->ffep = ffep; 1632 } 1633 1634 stats_counter_bump(fmep->Rcount); 1635 1636 /* re-eval FME */ 1637 fme_eval(fmep, ffep); 1638 } else { 1639 1640 /* not a match, undo noting of observation */ 1641 fmep->ecurrent = NULL; 1642 if (--ep->count == 0) { 1643 /* unlink it from observations */ 1644 fmep->observations = ep->observations; 1645 ep->observations = NULL; 1646 nvlist_free(ep->nvp); 1647 ep->nvp = NULL; 1648 } else { 1649 nvlist_free(ep->nvp); 1650 ep->nvp = pre_peek_nvp; 1651 } 1652 } 1653 } 1654 1655 if (matched) 1656 return; /* explained by at least one existing FME */ 1657 1658 /* clean up closed fmes */ 1659 cfmep = ClosedFMEs; 1660 while (cfmep != NULL) { 1661 svfmep = cfmep->next; 1662 destroy_fme(cfmep); 1663 cfmep = svfmep; 1664 } 1665 ClosedFMEs = NULL; 1666 prune_propagations(eventstring, ipp); 1667 1668 if (ofmep) { 1669 out(O_ALTFP|O_NONL, "["); 1670 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1671 out(O_ALTFP, " ADDING TO OVERFLOW FME]"); 1672 if (ffep) 1673 fmd_case_add_ereport(hdl, ofmep->fmcase, ffep); 1674 1675 return; 1676 1677 } else if (Max_fme && (Open_fme_count >= Max_fme)) { 1678 out(O_ALTFP|O_NONL, "["); 1679 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1680 out(O_ALTFP, " MAX OPEN FME REACHED]"); 1681 1682 fmcase = fmd_case_open(hdl, NULL); 1683 1684 /* Create overflow fme */ 1685 if ((fmep = newfme(eventstring, ipp, hdl, fmcase)) == NULL) { 1686 out(O_ALTFP|O_NONL, "["); 1687 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1688 out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]"); 1689 publish_undiagnosable(hdl, ffep, fmcase); 1690 return; 1691 } 1692 1693 Open_fme_count++; 1694 1695 init_fme_bufs(fmep); 1696 fmep->overflow = B_TRUE; 1697 1698 if (ffep) 1699 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1700 1701 Undiag_reason = UD_VAL_MAXFME; 1702 defect = fmd_nvl_create_fault(hdl, 1703 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 1704 (void) nvlist_add_string(defect, UNDIAG_REASON, 1705 undiag_2reason_str(Undiag_reason)); 1706 fmd_case_add_suspect(hdl, fmep->fmcase, defect); 1707 fmd_case_solve(hdl, fmep->fmcase); 1708 Undiag_reason = UD_VAL_UNKNOWN; 1709 return; 1710 } 1711 1712 /* open a case */ 1713 fmcase = fmd_case_open(hdl, NULL); 1714 1715 /* start a new FME */ 1716 if ((fmep = newfme(eventstring, ipp, hdl, fmcase)) == NULL) { 1717 out(O_ALTFP|O_NONL, "["); 1718 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1719 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1720 publish_undiagnosable(hdl, ffep, fmcase); 1721 return; 1722 } 1723 1724 Open_fme_count++; 1725 1726 init_fme_bufs(fmep); 1727 1728 out(O_ALTFP|O_NONL, "["); 1729 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1730 out(O_ALTFP, " created FME%d, case %s]", fmep->id, 1731 fmd_case_uuid(hdl, fmep->fmcase)); 1732 1733 ep = fmep->e0; 1734 ASSERT(ep != NULL); 1735 1736 /* note observation */ 1737 fmep->ecurrent = ep; 1738 if (ep->count++ == 0) { 1739 /* link it into list of observations seen */ 1740 ep->observations = fmep->observations; 1741 fmep->observations = ep; 1742 ep->nvp = evnv_dupnvl(nvl); 1743 serialize_observation(fmep, eventstring, ipp); 1744 } else { 1745 /* new payload overrides any previous */ 1746 nvlist_free(ep->nvp); 1747 ep->nvp = evnv_dupnvl(nvl); 1748 } 1749 1750 stats_counter_bump(fmep->Rcount); 1751 1752 if (ffep) { 1753 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1754 fmd_case_setprincipal(hdl, fmep->fmcase, ffep); 1755 fmep->e0r = ffep; 1756 ep->ffep = ffep; 1757 } 1758 1759 /* give the diagnosis algorithm a shot at the new FME state */ 1760 fme_eval(fmep, ffep); 1761 } 1762 1763 void 1764 fme_status(int flags) 1765 { 1766 struct fme *fmep; 1767 1768 if (FMElist == NULL) { 1769 out(flags, "No fault management exercises underway."); 1770 return; 1771 } 1772 1773 for (fmep = FMElist; fmep; fmep = fmep->next) 1774 fme_print(flags, fmep); 1775 } 1776 1777 /* 1778 * "indent" routines used mostly for nicely formatted debug output, but also 1779 * for sanity checking for infinite recursion bugs. 1780 */ 1781 1782 #define MAX_INDENT 1024 1783 static const char *indent_s[MAX_INDENT]; 1784 static int current_indent; 1785 1786 static void 1787 indent_push(const char *s) 1788 { 1789 if (current_indent < MAX_INDENT) 1790 indent_s[current_indent++] = s; 1791 else 1792 out(O_DIE, "unexpected recursion depth (%d)", current_indent); 1793 } 1794 1795 static void 1796 indent_set(const char *s) 1797 { 1798 current_indent = 0; 1799 indent_push(s); 1800 } 1801 1802 static void 1803 indent_pop(void) 1804 { 1805 if (current_indent > 0) 1806 current_indent--; 1807 else 1808 out(O_DIE, "recursion underflow"); 1809 } 1810 1811 static void 1812 indent(void) 1813 { 1814 int i; 1815 if (!Verbose) 1816 return; 1817 for (i = 0; i < current_indent; i++) 1818 out(O_ALTFP|O_VERB|O_NONL, indent_s[i]); 1819 } 1820 1821 #define SLNEW 1 1822 #define SLCHANGED 2 1823 #define SLWAIT 3 1824 #define SLDISPROVED 4 1825 1826 static void 1827 print_suspects(int circumstance, struct fme *fmep) 1828 { 1829 struct event *ep; 1830 1831 out(O_ALTFP|O_NONL, "["); 1832 if (circumstance == SLCHANGED) { 1833 out(O_ALTFP|O_NONL, "FME%d diagnosis changed. state: %s, " 1834 "suspect list:", fmep->id, fme_state2str(fmep->state)); 1835 } else if (circumstance == SLWAIT) { 1836 out(O_ALTFP|O_NONL, "FME%d set wait timer %ld ", fmep->id, 1837 fmep->timer); 1838 ptree_timeval(O_ALTFP|O_NONL, &fmep->wull); 1839 } else if (circumstance == SLDISPROVED) { 1840 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id); 1841 } else { 1842 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id); 1843 } 1844 1845 if (circumstance == SLWAIT || circumstance == SLDISPROVED) { 1846 out(O_ALTFP, "]"); 1847 return; 1848 } 1849 1850 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1851 out(O_ALTFP|O_NONL, " "); 1852 itree_pevent_brief(O_ALTFP|O_NONL, ep); 1853 } 1854 out(O_ALTFP, "]"); 1855 } 1856 1857 static struct node * 1858 eventprop_lookup(struct event *ep, const char *propname) 1859 { 1860 return (lut_lookup(ep->props, (void *)propname, NULL)); 1861 } 1862 1863 #define MAXDIGITIDX 23 1864 static char numbuf[MAXDIGITIDX + 1]; 1865 1866 static int 1867 node2uint(struct node *n, uint_t *valp) 1868 { 1869 struct evalue value; 1870 struct lut *globals = NULL; 1871 1872 if (n == NULL) 1873 return (1); 1874 1875 /* 1876 * check value.v since we are being asked to convert an unsigned 1877 * long long int to an unsigned int 1878 */ 1879 if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) || 1880 value.t != UINT64 || value.v > (1ULL << 32)) 1881 return (1); 1882 1883 *valp = (uint_t)value.v; 1884 1885 return (0); 1886 } 1887 1888 static nvlist_t * 1889 node2fmri(struct node *n) 1890 { 1891 nvlist_t **pa, *f, *p; 1892 struct node *nc; 1893 uint_t depth = 0; 1894 char *numstr, *nullbyte; 1895 char *failure; 1896 int err, i; 1897 1898 /* XXX do we need to be able to handle a non-T_NAME node? */ 1899 if (n == NULL || n->t != T_NAME) 1900 return (NULL); 1901 1902 for (nc = n; nc != NULL; nc = nc->u.name.next) { 1903 if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM) 1904 break; 1905 depth++; 1906 } 1907 1908 if (nc != NULL) { 1909 /* We bailed early, something went wrong */ 1910 return (NULL); 1911 } 1912 1913 if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0) 1914 out(O_DIE|O_SYS, "alloc of fmri nvl failed"); 1915 pa = alloca(depth * sizeof (nvlist_t *)); 1916 for (i = 0; i < depth; i++) 1917 pa[i] = NULL; 1918 1919 err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC); 1920 err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION); 1921 err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, ""); 1922 err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth); 1923 if (err != 0) { 1924 failure = "basic construction of FMRI failed"; 1925 goto boom; 1926 } 1927 1928 numbuf[MAXDIGITIDX] = '\0'; 1929 nullbyte = &numbuf[MAXDIGITIDX]; 1930 i = 0; 1931 1932 for (nc = n; nc != NULL; nc = nc->u.name.next) { 1933 err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl); 1934 if (err != 0) { 1935 failure = "alloc of an hc-pair failed"; 1936 goto boom; 1937 } 1938 err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s); 1939 numstr = ulltostr(nc->u.name.child->u.ull, nullbyte); 1940 err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr); 1941 if (err != 0) { 1942 failure = "construction of an hc-pair failed"; 1943 goto boom; 1944 } 1945 pa[i++] = p; 1946 } 1947 1948 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 1949 if (err == 0) { 1950 for (i = 0; i < depth; i++) 1951 if (pa[i] != NULL) 1952 nvlist_free(pa[i]); 1953 return (f); 1954 } 1955 failure = "addition of hc-pair array to FMRI failed"; 1956 1957 boom: 1958 for (i = 0; i < depth; i++) 1959 if (pa[i] != NULL) 1960 nvlist_free(pa[i]); 1961 nvlist_free(f); 1962 out(O_DIE, "%s", failure); 1963 /*NOTREACHED*/ 1964 return (NULL); 1965 } 1966 1967 /* an ipath cache entry is an array of these, with s==NULL at the end */ 1968 struct ipath { 1969 const char *s; /* component name (in stable) */ 1970 int i; /* instance number */ 1971 }; 1972 1973 static nvlist_t * 1974 ipath2fmri(struct ipath *ipath) 1975 { 1976 nvlist_t **pa, *f, *p; 1977 uint_t depth = 0; 1978 char *numstr, *nullbyte; 1979 char *failure; 1980 int err, i; 1981 struct ipath *ipp; 1982 1983 for (ipp = ipath; ipp->s != NULL; ipp++) 1984 depth++; 1985 1986 if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0) 1987 out(O_DIE|O_SYS, "alloc of fmri nvl failed"); 1988 pa = alloca(depth * sizeof (nvlist_t *)); 1989 for (i = 0; i < depth; i++) 1990 pa[i] = NULL; 1991 1992 err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC); 1993 err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION); 1994 err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, ""); 1995 err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth); 1996 if (err != 0) { 1997 failure = "basic construction of FMRI failed"; 1998 goto boom; 1999 } 2000 2001 numbuf[MAXDIGITIDX] = '\0'; 2002 nullbyte = &numbuf[MAXDIGITIDX]; 2003 i = 0; 2004 2005 for (ipp = ipath; ipp->s != NULL; ipp++) { 2006 err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl); 2007 if (err != 0) { 2008 failure = "alloc of an hc-pair failed"; 2009 goto boom; 2010 } 2011 err = nvlist_add_string(p, FM_FMRI_HC_NAME, ipp->s); 2012 numstr = ulltostr(ipp->i, nullbyte); 2013 err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr); 2014 if (err != 0) { 2015 failure = "construction of an hc-pair failed"; 2016 goto boom; 2017 } 2018 pa[i++] = p; 2019 } 2020 2021 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 2022 if (err == 0) { 2023 for (i = 0; i < depth; i++) 2024 if (pa[i] != NULL) 2025 nvlist_free(pa[i]); 2026 return (f); 2027 } 2028 failure = "addition of hc-pair array to FMRI failed"; 2029 2030 boom: 2031 for (i = 0; i < depth; i++) 2032 if (pa[i] != NULL) 2033 nvlist_free(pa[i]); 2034 nvlist_free(f); 2035 out(O_DIE, "%s", failure); 2036 /*NOTREACHED*/ 2037 return (NULL); 2038 } 2039 2040 static uint8_t 2041 percentof(uint_t part, uint_t whole) 2042 { 2043 unsigned long long p = part * 1000; 2044 2045 return ((p / whole / 10) + (((p / whole % 10) >= 5) ? 1 : 0)); 2046 } 2047 2048 struct rsl { 2049 struct event *suspect; 2050 nvlist_t *asru; 2051 nvlist_t *fru; 2052 nvlist_t *rsrc; 2053 }; 2054 2055 static void publish_suspects(struct fme *fmep, struct rsl *srl); 2056 2057 /* 2058 * rslfree -- free internal members of struct rsl not expected to be 2059 * freed elsewhere. 2060 */ 2061 static void 2062 rslfree(struct rsl *freeme) 2063 { 2064 if (freeme->asru != NULL) 2065 nvlist_free(freeme->asru); 2066 if (freeme->fru != NULL) 2067 nvlist_free(freeme->fru); 2068 if (freeme->rsrc != NULL && freeme->rsrc != freeme->asru) 2069 nvlist_free(freeme->rsrc); 2070 } 2071 2072 /* 2073 * rslcmp -- compare two rsl structures. Use the following 2074 * comparisons to establish cardinality: 2075 * 2076 * 1. Name of the suspect's class. (simple strcmp) 2077 * 2. Name of the suspect's ASRU. (trickier, since nvlist) 2078 * 2079 */ 2080 static int 2081 rslcmp(const void *a, const void *b) 2082 { 2083 struct rsl *r1 = (struct rsl *)a; 2084 struct rsl *r2 = (struct rsl *)b; 2085 int rv; 2086 2087 rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s, 2088 r2->suspect->enode->u.event.ename->u.name.s); 2089 if (rv != 0) 2090 return (rv); 2091 2092 if (r1->rsrc == NULL && r2->rsrc == NULL) 2093 return (0); 2094 if (r1->rsrc == NULL) 2095 return (-1); 2096 if (r2->rsrc == NULL) 2097 return (1); 2098 return (evnv_cmpnvl(r1->rsrc, r2->rsrc, 0)); 2099 } 2100 2101 /* 2102 * get_resources -- for a given suspect, determine what ASRU, FRU and 2103 * RSRC nvlists should be advertised in the final suspect list. 2104 */ 2105 void 2106 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot) 2107 { 2108 struct node *asrudef, *frudef; 2109 nvlist_t *asru, *fru; 2110 nvlist_t *rsrc = NULL; 2111 char *pathstr; 2112 2113 /* 2114 * First find any ASRU and/or FRU defined in the 2115 * initial fault tree. 2116 */ 2117 asrudef = eventprop_lookup(sp, L_ASRU); 2118 frudef = eventprop_lookup(sp, L_FRU); 2119 2120 /* 2121 * Create FMRIs based on those definitions 2122 */ 2123 asru = node2fmri(asrudef); 2124 fru = node2fmri(frudef); 2125 pathstr = ipath2str(NULL, sp->ipp); 2126 2127 /* 2128 * Allow for platform translations of the FMRIs 2129 */ 2130 platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc, 2131 pathstr); 2132 2133 FREE(pathstr); 2134 rsrcs->suspect = sp; 2135 rsrcs->asru = asru; 2136 rsrcs->fru = fru; 2137 rsrcs->rsrc = rsrc; 2138 } 2139 2140 /* 2141 * trim_suspects -- prior to publishing, we may need to remove some 2142 * suspects from the list. If we're auto-closing upsets, we don't 2143 * want any of those in the published list. If the ASRUs for multiple 2144 * defects resolve to the same ASRU (driver) we only want to publish 2145 * that as a single suspect. 2146 */ 2147 static int 2148 trim_suspects(struct fme *fmep, struct rsl *begin, struct rsl *begin2, 2149 fmd_event_t *ffep) 2150 { 2151 struct event *ep; 2152 struct rsl *rp = begin; 2153 struct rsl *rp2 = begin2; 2154 int mess_zero_count = 0; 2155 int serd_rval; 2156 uint_t messval; 2157 2158 /* remove any unwanted upsets and populate our array */ 2159 for (ep = fmep->psuspects; ep; ep = ep->psuspects) { 2160 if (is_upset(ep->t)) 2161 continue; 2162 serd_rval = serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, ep, 2163 NULL, NULL); 2164 if (serd_rval == 0) 2165 continue; 2166 if (node2uint(eventprop_lookup(ep, L_message), 2167 &messval) == 0 && messval == 0) { 2168 get_resources(ep, rp2, fmep->config); 2169 rp2++; 2170 mess_zero_count++; 2171 } else { 2172 get_resources(ep, rp, fmep->config); 2173 rp++; 2174 fmep->nsuspects++; 2175 } 2176 } 2177 return (mess_zero_count); 2178 } 2179 2180 /* 2181 * addpayloadprop -- add a payload prop to a problem 2182 */ 2183 static void 2184 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault) 2185 { 2186 nvlist_t *rsrc, *hcs; 2187 2188 ASSERT(fault != NULL); 2189 ASSERT(lhs != NULL); 2190 ASSERT(rhs != NULL); 2191 2192 if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, &rsrc) != 0) 2193 out(O_DIE, "cannot add payloadprop \"%s\" to fault", lhs); 2194 2195 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) { 2196 out(O_ALTFP|O_VERB2, "addpayloadprop: create hc_specific"); 2197 if (nvlist_xalloc(&hcs, NV_UNIQUE_NAME, &Eft_nv_hdl) != 0) 2198 out(O_DIE, 2199 "cannot add payloadprop \"%s\" to fault", lhs); 2200 if (nvlist_add_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, hcs) != 0) 2201 out(O_DIE, 2202 "cannot add payloadprop \"%s\" to fault", lhs); 2203 nvlist_free(hcs); 2204 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) 2205 out(O_DIE, 2206 "cannot add payloadprop \"%s\" to fault", lhs); 2207 } else 2208 out(O_ALTFP|O_VERB2, "addpayloadprop: reuse hc_specific"); 2209 2210 if (rhs->t == UINT64) { 2211 out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v); 2212 2213 if (nvlist_add_uint64(hcs, lhs, rhs->v) != 0) 2214 out(O_DIE, 2215 "cannot add payloadprop \"%s\" to fault", lhs); 2216 } else { 2217 out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"", 2218 lhs, (char *)(uintptr_t)rhs->v); 2219 2220 if (nvlist_add_string(hcs, lhs, (char *)(uintptr_t)rhs->v) != 0) 2221 out(O_DIE, 2222 "cannot add payloadprop \"%s\" to fault", lhs); 2223 } 2224 } 2225 2226 static char *Istatbuf; 2227 static char *Istatbufptr; 2228 static int Istatsz; 2229 2230 /* 2231 * istataddsize -- calculate size of istat and add it to Istatsz 2232 */ 2233 /*ARGSUSED2*/ 2234 static void 2235 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2236 { 2237 int val; 2238 2239 ASSERT(lhs != NULL); 2240 ASSERT(rhs != NULL); 2241 2242 if ((val = stats_counter_value(rhs)) == 0) 2243 return; /* skip zero-valued stats */ 2244 2245 /* count up the size of the stat name */ 2246 Istatsz += ipath2strlen(lhs->ename, lhs->ipath); 2247 Istatsz++; /* for the trailing NULL byte */ 2248 2249 /* count up the size of the stat value */ 2250 Istatsz += snprintf(NULL, 0, "%d", val); 2251 Istatsz++; /* for the trailing NULL byte */ 2252 } 2253 2254 /* 2255 * istat2str -- serialize an istat, writing result to *Istatbufptr 2256 */ 2257 /*ARGSUSED2*/ 2258 static void 2259 istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2260 { 2261 char *str; 2262 int len; 2263 int val; 2264 2265 ASSERT(lhs != NULL); 2266 ASSERT(rhs != NULL); 2267 2268 if ((val = stats_counter_value(rhs)) == 0) 2269 return; /* skip zero-valued stats */ 2270 2271 /* serialize the stat name */ 2272 str = ipath2str(lhs->ename, lhs->ipath); 2273 len = strlen(str); 2274 2275 ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]); 2276 (void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr); 2277 Istatbufptr += len; 2278 FREE(str); 2279 *Istatbufptr++ = '\0'; 2280 2281 /* serialize the stat value */ 2282 Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr, 2283 "%d", val); 2284 *Istatbufptr++ = '\0'; 2285 2286 ASSERT(Istatbufptr <= &Istatbuf[Istatsz]); 2287 } 2288 2289 void 2290 istat_save() 2291 { 2292 if (Istat_need_save == 0) 2293 return; 2294 2295 /* figure out how big the serialzed info is */ 2296 Istatsz = 0; 2297 lut_walk(Istats, (lut_cb)istataddsize, NULL); 2298 2299 if (Istatsz == 0) { 2300 /* no stats to save */ 2301 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2302 return; 2303 } 2304 2305 /* create the serialized buffer */ 2306 Istatbufptr = Istatbuf = MALLOC(Istatsz); 2307 lut_walk(Istats, (lut_cb)istat2str, NULL); 2308 2309 /* clear out current saved stats */ 2310 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2311 2312 /* write out the new version */ 2313 fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz); 2314 FREE(Istatbuf); 2315 2316 Istat_need_save = 0; 2317 } 2318 2319 int 2320 istat_cmp(struct istat_entry *ent1, struct istat_entry *ent2) 2321 { 2322 if (ent1->ename != ent2->ename) 2323 return (ent2->ename - ent1->ename); 2324 if (ent1->ipath != ent2->ipath) 2325 return ((char *)ent2->ipath - (char *)ent1->ipath); 2326 2327 return (0); 2328 } 2329 2330 /* 2331 * istat-verify -- verify the component associated with a stat still exists 2332 * 2333 * if the component no longer exists, this routine resets the stat and 2334 * returns 0. if the component still exists, it returns 1. 2335 */ 2336 static int 2337 istat_verify(struct node *snp, struct istat_entry *entp) 2338 { 2339 struct stats *statp; 2340 nvlist_t *fmri; 2341 2342 fmri = node2fmri(snp->u.event.epname); 2343 if (platform_path_exists(fmri)) { 2344 nvlist_free(fmri); 2345 return (1); 2346 } 2347 nvlist_free(fmri); 2348 2349 /* component no longer in system. zero out the associated stats */ 2350 if ((statp = (struct stats *) 2351 lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL || 2352 stats_counter_value(statp) == 0) 2353 return (0); /* stat is already reset */ 2354 2355 Istat_need_save = 1; 2356 stats_counter_reset(statp); 2357 return (0); 2358 } 2359 2360 static void 2361 istat_bump(struct node *snp, int n) 2362 { 2363 struct stats *statp; 2364 struct istat_entry ent; 2365 2366 ASSERT(snp != NULL); 2367 ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t)); 2368 ASSERT(snp->u.event.epname != NULL); 2369 2370 /* class name should be hoisted into a single stable entry */ 2371 ASSERT(snp->u.event.ename->u.name.next == NULL); 2372 ent.ename = snp->u.event.ename->u.name.s; 2373 ent.ipath = ipath(snp->u.event.epname); 2374 2375 if (!istat_verify(snp, &ent)) { 2376 /* component no longer exists in system, nothing to do */ 2377 return; 2378 } 2379 2380 if ((statp = (struct stats *) 2381 lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) { 2382 /* need to create the counter */ 2383 int cnt = 0; 2384 struct node *np; 2385 char *sname; 2386 char *snamep; 2387 struct istat_entry *newentp; 2388 2389 /* count up the size of the stat name */ 2390 np = snp->u.event.ename; 2391 while (np != NULL) { 2392 cnt += strlen(np->u.name.s); 2393 cnt++; /* for the '.' or '@' */ 2394 np = np->u.name.next; 2395 } 2396 np = snp->u.event.epname; 2397 while (np != NULL) { 2398 cnt += snprintf(NULL, 0, "%s%llu", 2399 np->u.name.s, np->u.name.child->u.ull); 2400 cnt++; /* for the '/' or trailing NULL byte */ 2401 np = np->u.name.next; 2402 } 2403 2404 /* build the stat name */ 2405 snamep = sname = alloca(cnt); 2406 np = snp->u.event.ename; 2407 while (np != NULL) { 2408 snamep += snprintf(snamep, &sname[cnt] - snamep, 2409 "%s", np->u.name.s); 2410 np = np->u.name.next; 2411 if (np) 2412 *snamep++ = '.'; 2413 } 2414 *snamep++ = '@'; 2415 np = snp->u.event.epname; 2416 while (np != NULL) { 2417 snamep += snprintf(snamep, &sname[cnt] - snamep, 2418 "%s%llu", np->u.name.s, np->u.name.child->u.ull); 2419 np = np->u.name.next; 2420 if (np) 2421 *snamep++ = '/'; 2422 } 2423 *snamep++ = '\0'; 2424 2425 /* create the new stat & add it to our list */ 2426 newentp = MALLOC(sizeof (*newentp)); 2427 *newentp = ent; 2428 statp = stats_new_counter(NULL, sname, 0); 2429 Istats = lut_add(Istats, (void *)newentp, (void *)statp, 2430 (lut_cmp)istat_cmp); 2431 } 2432 2433 /* if n is non-zero, set that value instead of bumping */ 2434 if (n) { 2435 stats_counter_reset(statp); 2436 stats_counter_add(statp, n); 2437 } else 2438 stats_counter_bump(statp); 2439 Istat_need_save = 1; 2440 2441 ipath_print(O_ALTFP|O_VERB2, ent.ename, ent.ipath); 2442 out(O_ALTFP|O_VERB2, " %s to value %d", n ? "set" : "incremented", 2443 stats_counter_value(statp)); 2444 } 2445 2446 /*ARGSUSED*/ 2447 static void 2448 istat_destructor(void *left, void *right, void *arg) 2449 { 2450 struct istat_entry *entp = (struct istat_entry *)left; 2451 struct stats *statp = (struct stats *)right; 2452 FREE(entp); 2453 stats_delete(statp); 2454 } 2455 2456 /* 2457 * Callback used in a walk of the Istats to reset matching stat counters. 2458 */ 2459 static void 2460 istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp, 2461 const struct ipath *ipp) 2462 { 2463 char *path; 2464 2465 if (entp->ipath == ipp) { 2466 path = ipath2str(entp->ename, ipp); 2467 out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path); 2468 FREE(path); 2469 stats_counter_reset(statp); 2470 Istat_need_save = 1; 2471 } 2472 } 2473 2474 /*ARGSUSED*/ 2475 static void 2476 istat_counter_topo_chg_cb(struct istat_entry *entp, struct stats *statp, 2477 void *unused) 2478 { 2479 char *path; 2480 nvlist_t *fmri; 2481 2482 fmri = ipath2fmri((struct ipath *)(entp->ipath)); 2483 if (!platform_path_exists(fmri)) { 2484 path = ipath2str(entp->ename, entp->ipath); 2485 out(O_ALTFP, "istat_counter_topo_chg_cb: not present %s", path); 2486 FREE(path); 2487 stats_counter_reset(statp); 2488 Istat_need_save = 1; 2489 } 2490 nvlist_free(fmri); 2491 } 2492 2493 void 2494 istat_fini(void) 2495 { 2496 lut_free(Istats, istat_destructor, NULL); 2497 } 2498 2499 static char *Serdbuf; 2500 static char *Serdbufptr; 2501 static int Serdsz; 2502 2503 /* 2504 * serdaddsize -- calculate size of serd and add it to Serdsz 2505 */ 2506 /*ARGSUSED*/ 2507 static void 2508 serdaddsize(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2509 { 2510 ASSERT(lhs != NULL); 2511 2512 /* count up the size of the stat name */ 2513 Serdsz += ipath2strlen(lhs->ename, lhs->ipath); 2514 Serdsz++; /* for the trailing NULL byte */ 2515 } 2516 2517 /* 2518 * serd2str -- serialize a serd engine, writing result to *Serdbufptr 2519 */ 2520 /*ARGSUSED*/ 2521 static void 2522 serd2str(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2523 { 2524 char *str; 2525 int len; 2526 2527 ASSERT(lhs != NULL); 2528 2529 /* serialize the serd engine name */ 2530 str = ipath2str(lhs->ename, lhs->ipath); 2531 len = strlen(str); 2532 2533 ASSERT(Serdbufptr + len + 1 <= &Serdbuf[Serdsz]); 2534 (void) strlcpy(Serdbufptr, str, &Serdbuf[Serdsz] - Serdbufptr); 2535 Serdbufptr += len; 2536 FREE(str); 2537 *Serdbufptr++ = '\0'; 2538 ASSERT(Serdbufptr <= &Serdbuf[Serdsz]); 2539 } 2540 2541 void 2542 serd_save() 2543 { 2544 if (Serd_need_save == 0) 2545 return; 2546 2547 /* figure out how big the serialzed info is */ 2548 Serdsz = 0; 2549 lut_walk(SerdEngines, (lut_cb)serdaddsize, NULL); 2550 2551 if (Serdsz == 0) { 2552 /* no serd engines to save */ 2553 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2554 return; 2555 } 2556 2557 /* create the serialized buffer */ 2558 Serdbufptr = Serdbuf = MALLOC(Serdsz); 2559 lut_walk(SerdEngines, (lut_cb)serd2str, NULL); 2560 2561 /* clear out current saved stats */ 2562 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2563 2564 /* write out the new version */ 2565 fmd_buf_write(Hdl, NULL, WOBUF_SERDS, Serdbuf, Serdsz); 2566 FREE(Serdbuf); 2567 Serd_need_save = 0; 2568 } 2569 2570 int 2571 serd_cmp(struct serd_entry *ent1, struct serd_entry *ent2) 2572 { 2573 if (ent1->ename != ent2->ename) 2574 return (ent2->ename - ent1->ename); 2575 if (ent1->ipath != ent2->ipath) 2576 return ((char *)ent2->ipath - (char *)ent1->ipath); 2577 2578 return (0); 2579 } 2580 2581 void 2582 fme_serd_load(fmd_hdl_t *hdl) 2583 { 2584 int sz; 2585 char *sbuf; 2586 char *sepptr; 2587 char *ptr; 2588 struct serd_entry *newentp; 2589 struct node *epname; 2590 nvlist_t *fmri; 2591 char *namestring; 2592 2593 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_SERDS)) == 0) 2594 return; 2595 sbuf = alloca(sz); 2596 fmd_buf_read(hdl, NULL, WOBUF_SERDS, sbuf, sz); 2597 ptr = sbuf; 2598 while (ptr < &sbuf[sz]) { 2599 sepptr = strchr(ptr, '@'); 2600 *sepptr = '\0'; 2601 namestring = ptr; 2602 sepptr++; 2603 ptr = sepptr; 2604 ptr += strlen(ptr); 2605 ptr++; /* move past the '\0' separating paths */ 2606 epname = pathstring2epnamenp(sepptr); 2607 fmri = node2fmri(epname); 2608 if (platform_path_exists(fmri)) { 2609 newentp = MALLOC(sizeof (*newentp)); 2610 newentp->hdl = hdl; 2611 newentp->ipath = ipath(epname); 2612 newentp->ename = stable(namestring); 2613 SerdEngines = lut_add(SerdEngines, (void *)newentp, 2614 (void *)newentp, (lut_cmp)serd_cmp); 2615 } else 2616 Serd_need_save = 1; 2617 tree_free(epname); 2618 nvlist_free(fmri); 2619 } 2620 /* save it back again in case some of the paths no longer exist */ 2621 serd_save(); 2622 } 2623 2624 /*ARGSUSED*/ 2625 static void 2626 serd_destructor(void *left, void *right, void *arg) 2627 { 2628 struct serd_entry *entp = (struct serd_entry *)left; 2629 FREE(entp); 2630 } 2631 2632 /* 2633 * Callback used in a walk of the SerdEngines to reset matching serd engines. 2634 */ 2635 /*ARGSUSED*/ 2636 static void 2637 serd_reset_cb(struct serd_entry *entp, void *unused, const struct ipath *ipp) 2638 { 2639 char *path; 2640 2641 if (entp->ipath == ipp) { 2642 path = ipath2str(entp->ename, ipp); 2643 out(O_ALTFP, "serd_reset_cb: resetting %s", path); 2644 fmd_serd_reset(entp->hdl, path); 2645 FREE(path); 2646 Serd_need_save = 1; 2647 } 2648 } 2649 2650 /*ARGSUSED*/ 2651 static void 2652 serd_topo_chg_cb(struct serd_entry *entp, void *unused, void *unused2) 2653 { 2654 char *path; 2655 nvlist_t *fmri; 2656 2657 fmri = ipath2fmri((struct ipath *)(entp->ipath)); 2658 if (!platform_path_exists(fmri)) { 2659 path = ipath2str(entp->ename, entp->ipath); 2660 out(O_ALTFP, "serd_topo_chg_cb: not present %s", path); 2661 fmd_serd_reset(entp->hdl, path); 2662 FREE(path); 2663 Serd_need_save = 1; 2664 } 2665 nvlist_free(fmri); 2666 } 2667 2668 void 2669 serd_fini(void) 2670 { 2671 lut_free(SerdEngines, serd_destructor, NULL); 2672 } 2673 2674 static void 2675 publish_suspects(struct fme *fmep, struct rsl *srl) 2676 { 2677 struct rsl *rp; 2678 nvlist_t *fault; 2679 uint8_t cert; 2680 uint_t *frs; 2681 uint_t frsum, fr; 2682 uint_t messval; 2683 uint_t retireval; 2684 uint_t responseval; 2685 struct node *snp; 2686 int frcnt, fridx; 2687 boolean_t allfaulty = B_TRUE; 2688 struct rsl *erl = srl + fmep->nsuspects - 1; 2689 2690 /* 2691 * sort the array 2692 */ 2693 qsort(srl, fmep->nsuspects, sizeof (struct rsl), rslcmp); 2694 2695 /* sum the fitrates */ 2696 frs = alloca(fmep->nsuspects * sizeof (uint_t)); 2697 fridx = frcnt = frsum = 0; 2698 2699 for (rp = srl; rp <= erl; rp++) { 2700 struct node *n; 2701 2702 n = eventprop_lookup(rp->suspect, L_FITrate); 2703 if (node2uint(n, &fr) != 0) { 2704 out(O_DEBUG|O_NONL, "event "); 2705 ipath_print(O_DEBUG|O_NONL, 2706 rp->suspect->enode->u.event.ename->u.name.s, 2707 rp->suspect->ipp); 2708 out(O_DEBUG, " has no FITrate (using 1)"); 2709 fr = 1; 2710 } else if (fr == 0) { 2711 out(O_DEBUG|O_NONL, "event "); 2712 ipath_print(O_DEBUG|O_NONL, 2713 rp->suspect->enode->u.event.ename->u.name.s, 2714 rp->suspect->ipp); 2715 out(O_DEBUG, " has zero FITrate (using 1)"); 2716 fr = 1; 2717 } 2718 2719 frs[fridx++] = fr; 2720 frsum += fr; 2721 frcnt++; 2722 } 2723 2724 /* Add them in reverse order of our sort, as fmd reverses order */ 2725 for (rp = erl; rp >= srl; rp--) { 2726 cert = percentof(frs[--fridx], frsum); 2727 fault = fmd_nvl_create_fault(fmep->hdl, 2728 rp->suspect->enode->u.event.ename->u.name.s, 2729 cert, 2730 rp->asru, 2731 rp->fru, 2732 rp->rsrc); 2733 if (fault == NULL) 2734 out(O_DIE, "fault creation failed"); 2735 /* if "message" property exists, add it to the fault */ 2736 if (node2uint(eventprop_lookup(rp->suspect, L_message), 2737 &messval) == 0) { 2738 2739 out(O_ALTFP, 2740 "[FME%d, %s adds message=%d to suspect list]", 2741 fmep->id, 2742 rp->suspect->enode->u.event.ename->u.name.s, 2743 messval); 2744 if (nvlist_add_boolean_value(fault, 2745 FM_SUSPECT_MESSAGE, 2746 (messval) ? B_TRUE : B_FALSE) != 0) { 2747 out(O_DIE, "cannot add no-message to fault"); 2748 } 2749 } 2750 2751 /* if "retire" property exists, add it to the fault */ 2752 if (node2uint(eventprop_lookup(rp->suspect, L_retire), 2753 &retireval) == 0) { 2754 2755 out(O_ALTFP, 2756 "[FME%d, %s adds retire=%d to suspect list]", 2757 fmep->id, 2758 rp->suspect->enode->u.event.ename->u.name.s, 2759 retireval); 2760 if (nvlist_add_boolean_value(fault, 2761 FM_SUSPECT_RETIRE, 2762 (retireval) ? B_TRUE : B_FALSE) != 0) { 2763 out(O_DIE, "cannot add no-retire to fault"); 2764 } 2765 } 2766 2767 /* if "response" property exists, add it to the fault */ 2768 if (node2uint(eventprop_lookup(rp->suspect, L_response), 2769 &responseval) == 0) { 2770 2771 out(O_ALTFP, 2772 "[FME%d, %s adds response=%d to suspect list]", 2773 fmep->id, 2774 rp->suspect->enode->u.event.ename->u.name.s, 2775 responseval); 2776 if (nvlist_add_boolean_value(fault, 2777 FM_SUSPECT_RESPONSE, 2778 (responseval) ? B_TRUE : B_FALSE) != 0) { 2779 out(O_DIE, "cannot add no-response to fault"); 2780 } 2781 } 2782 2783 /* add any payload properties */ 2784 lut_walk(rp->suspect->payloadprops, 2785 (lut_cb)addpayloadprop, (void *)fault); 2786 rslfree(rp); 2787 2788 /* 2789 * If "action" property exists, evaluate it; this must be done 2790 * before the allfaulty check below since some actions may 2791 * modify the asru to be used in fmd_nvl_fmri_has_fault. This 2792 * needs to be restructured if any new actions are introduced 2793 * that have effects that we do not want to be visible if 2794 * we decide not to publish in the dupclose check below. 2795 */ 2796 if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) { 2797 struct evalue evalue; 2798 2799 out(O_ALTFP|O_NONL, 2800 "[FME%d, %s action ", fmep->id, 2801 rp->suspect->enode->u.event.ename->u.name.s); 2802 ptree_name_iter(O_ALTFP|O_NONL, snp); 2803 out(O_ALTFP, "]"); 2804 Action_nvl = fault; 2805 (void) eval_expr(snp, NULL, NULL, NULL, NULL, 2806 NULL, 0, &evalue); 2807 } 2808 2809 fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault); 2810 2811 /* 2812 * check if the asru is already marked as "faulty". 2813 */ 2814 if (allfaulty) { 2815 nvlist_t *asru; 2816 2817 out(O_ALTFP|O_VERB, "FME%d dup check ", fmep->id); 2818 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect); 2819 out(O_ALTFP|O_VERB|O_NONL, " "); 2820 if (nvlist_lookup_nvlist(fault, 2821 FM_FAULT_ASRU, &asru) != 0) { 2822 out(O_ALTFP|O_VERB, "NULL asru"); 2823 allfaulty = B_FALSE; 2824 } else if (fmd_nvl_fmri_has_fault(fmep->hdl, asru, 2825 FMD_HAS_FAULT_ASRU, NULL)) { 2826 out(O_ALTFP|O_VERB, "faulty"); 2827 } else { 2828 out(O_ALTFP|O_VERB, "not faulty"); 2829 allfaulty = B_FALSE; 2830 } 2831 } 2832 2833 } 2834 2835 if (!allfaulty) { 2836 /* 2837 * don't update the count stat if all asrus are already 2838 * present and unrepaired in the asru cache 2839 */ 2840 for (rp = erl; rp >= srl; rp--) { 2841 struct event *suspect = rp->suspect; 2842 2843 if (suspect == NULL) 2844 continue; 2845 2846 /* if "count" exists, increment the appropriate stat */ 2847 if ((snp = eventprop_lookup(suspect, 2848 L_count)) != NULL) { 2849 out(O_ALTFP|O_NONL, 2850 "[FME%d, %s count ", fmep->id, 2851 suspect->enode->u.event.ename->u.name.s); 2852 ptree_name_iter(O_ALTFP|O_NONL, snp); 2853 out(O_ALTFP, "]"); 2854 istat_bump(snp, 0); 2855 2856 } 2857 } 2858 istat_save(); /* write out any istat changes */ 2859 } 2860 } 2861 2862 static const char * 2863 undiag_2defect_str(int ud) 2864 { 2865 switch (ud) { 2866 case UD_VAL_MISSINGINFO: 2867 case UD_VAL_MISSINGOBS: 2868 case UD_VAL_MISSINGPATH: 2869 case UD_VAL_MISSINGZERO: 2870 case UD_VAL_BADOBS: 2871 case UD_VAL_CFGMISMATCH: 2872 return (UNDIAG_DEFECT_CHKPT); 2873 break; 2874 2875 case UD_VAL_BADEVENTI: 2876 case UD_VAL_INSTFAIL: 2877 case UD_VAL_NOPATH: 2878 case UD_VAL_UNSOLVD: 2879 return (UNDIAG_DEFECT_FME); 2880 break; 2881 2882 case UD_VAL_MAXFME: 2883 return (UNDIAG_DEFECT_LIMIT); 2884 break; 2885 2886 case UD_VAL_UNKNOWN: 2887 default: 2888 return (UNDIAG_DEFECT_UNKNOWN); 2889 break; 2890 } 2891 } 2892 2893 const char * 2894 undiag_2reason_str(int ud) 2895 { 2896 switch (ud) { 2897 case UD_VAL_BADEVENTI: 2898 return (UD_STR_BADEVENTI); 2899 case UD_VAL_BADOBS: 2900 return (UD_STR_BADOBS); 2901 case UD_VAL_CFGMISMATCH: 2902 return (UD_STR_CFGMISMATCH); 2903 case UD_VAL_INSTFAIL: 2904 return (UD_STR_INSTFAIL); 2905 case UD_VAL_MAXFME: 2906 return (UD_STR_MAXFME); 2907 case UD_VAL_MISSINGINFO: 2908 return (UD_STR_MISSINGINFO); 2909 case UD_VAL_MISSINGOBS: 2910 return (UD_STR_MISSINGOBS); 2911 case UD_VAL_MISSINGPATH: 2912 return (UD_STR_MISSINGPATH); 2913 case UD_VAL_MISSINGZERO: 2914 return (UD_STR_MISSINGZERO); 2915 case UD_VAL_NOPATH: 2916 return (UD_STR_NOPATH); 2917 case UD_VAL_UNSOLVD: 2918 return (UD_STR_UNSOLVD); 2919 case UD_VAL_UNKNOWN: 2920 default: 2921 return (UD_STR_UNKNOWN); 2922 } 2923 } 2924 2925 static void 2926 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase) 2927 { 2928 struct case_list *newcase; 2929 nvlist_t *defect; 2930 2931 out(O_ALTFP, 2932 "[undiagnosable ereport received, " 2933 "creating and closing a new case (%s)]", 2934 undiag_2reason_str(Undiag_reason)); 2935 2936 newcase = MALLOC(sizeof (struct case_list)); 2937 newcase->next = NULL; 2938 newcase->fmcase = fmcase; 2939 if (Undiagablecaselist != NULL) 2940 newcase->next = Undiagablecaselist; 2941 Undiagablecaselist = newcase; 2942 2943 if (ffep != NULL) 2944 fmd_case_add_ereport(hdl, newcase->fmcase, ffep); 2945 2946 defect = fmd_nvl_create_fault(hdl, 2947 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 2948 (void) nvlist_add_string(defect, UNDIAG_REASON, 2949 undiag_2reason_str(Undiag_reason)); 2950 fmd_case_add_suspect(hdl, newcase->fmcase, defect); 2951 2952 fmd_case_solve(hdl, newcase->fmcase); 2953 fmd_case_close(hdl, newcase->fmcase); 2954 Undiag_reason = UD_VAL_UNKNOWN; 2955 } 2956 2957 static void 2958 fme_undiagnosable(struct fme *f) 2959 { 2960 nvlist_t *defect; 2961 2962 out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]", 2963 f->id, fmd_case_uuid(f->hdl, f->fmcase), 2964 undiag_2reason_str(Undiag_reason)); 2965 2966 defect = fmd_nvl_create_fault(f->hdl, 2967 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 2968 (void) nvlist_add_string(defect, UNDIAG_REASON, 2969 undiag_2reason_str(Undiag_reason)); 2970 fmd_case_add_suspect(f->hdl, f->fmcase, defect); 2971 fmd_case_solve(f->hdl, f->fmcase); 2972 fmd_case_close(f->hdl, f->fmcase); 2973 Undiag_reason = UD_VAL_UNKNOWN; 2974 } 2975 2976 /* 2977 * fme_close_case 2978 * 2979 * Find the requested case amongst our fmes and close it. Free up 2980 * the related fme. 2981 */ 2982 void 2983 fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase) 2984 { 2985 struct case_list *ucasep, *prevcasep = NULL; 2986 struct fme *prev = NULL; 2987 struct fme *fmep; 2988 2989 for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) { 2990 if (fmcase != ucasep->fmcase) { 2991 prevcasep = ucasep; 2992 continue; 2993 } 2994 2995 if (prevcasep == NULL) 2996 Undiagablecaselist = Undiagablecaselist->next; 2997 else 2998 prevcasep->next = ucasep->next; 2999 3000 FREE(ucasep); 3001 return; 3002 } 3003 3004 for (fmep = FMElist; fmep; fmep = fmep->next) { 3005 if (fmep->hdl == hdl && fmep->fmcase == fmcase) 3006 break; 3007 prev = fmep; 3008 } 3009 3010 if (fmep == NULL) { 3011 out(O_WARN, "Eft asked to close unrecognized case [%s].", 3012 fmd_case_uuid(hdl, fmcase)); 3013 return; 3014 } 3015 3016 if (EFMElist == fmep) 3017 EFMElist = prev; 3018 3019 if (prev == NULL) 3020 FMElist = FMElist->next; 3021 else 3022 prev->next = fmep->next; 3023 3024 fmep->next = NULL; 3025 3026 /* Get rid of any timer this fme has set */ 3027 if (fmep->wull != 0) 3028 fmd_timer_remove(fmep->hdl, fmep->timer); 3029 3030 if (ClosedFMEs == NULL) { 3031 ClosedFMEs = fmep; 3032 } else { 3033 fmep->next = ClosedFMEs; 3034 ClosedFMEs = fmep; 3035 } 3036 3037 Open_fme_count--; 3038 3039 /* See if we can close the overflow FME */ 3040 if (Open_fme_count <= Max_fme) { 3041 for (fmep = FMElist; fmep; fmep = fmep->next) { 3042 if (fmep->overflow && !(fmd_case_closed(fmep->hdl, 3043 fmep->fmcase))) 3044 break; 3045 } 3046 3047 if (fmep != NULL) 3048 fmd_case_close(fmep->hdl, fmep->fmcase); 3049 } 3050 } 3051 3052 /* 3053 * fme_set_timer() 3054 * If the time we need to wait for the given FME is less than the 3055 * current timer, kick that old timer out and establish a new one. 3056 */ 3057 static int 3058 fme_set_timer(struct fme *fmep, unsigned long long wull) 3059 { 3060 out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait "); 3061 ptree_timeval(O_ALTFP|O_VERB, &wull); 3062 3063 if (wull <= fmep->pull) { 3064 out(O_ALTFP|O_VERB|O_NONL, "already have waited at least "); 3065 ptree_timeval(O_ALTFP|O_VERB, &fmep->pull); 3066 out(O_ALTFP|O_VERB, NULL); 3067 /* we've waited at least wull already, don't need timer */ 3068 return (0); 3069 } 3070 3071 out(O_ALTFP|O_VERB|O_NONL, " currently "); 3072 if (fmep->wull != 0) { 3073 out(O_ALTFP|O_VERB|O_NONL, "waiting "); 3074 ptree_timeval(O_ALTFP|O_VERB, &fmep->wull); 3075 out(O_ALTFP|O_VERB, NULL); 3076 } else { 3077 out(O_ALTFP|O_VERB|O_NONL, "not waiting"); 3078 out(O_ALTFP|O_VERB, NULL); 3079 } 3080 3081 if (fmep->wull != 0) 3082 if (wull >= fmep->wull) 3083 /* New timer would fire later than established timer */ 3084 return (0); 3085 3086 if (fmep->wull != 0) { 3087 fmd_timer_remove(fmep->hdl, fmep->timer); 3088 } 3089 3090 fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep, 3091 fmep->e0r, wull); 3092 out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer); 3093 fmep->wull = wull; 3094 return (1); 3095 } 3096 3097 void 3098 fme_timer_fired(struct fme *fmep, id_t tid) 3099 { 3100 struct fme *ffmep = NULL; 3101 3102 for (ffmep = FMElist; ffmep; ffmep = ffmep->next) 3103 if (ffmep == fmep) 3104 break; 3105 3106 if (ffmep == NULL) { 3107 out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.", 3108 (void *)fmep); 3109 return; 3110 } 3111 3112 out(O_ALTFP|O_VERB, "Timer fired %lx", tid); 3113 fmep->pull = fmep->wull; 3114 fmep->wull = 0; 3115 fmd_buf_write(fmep->hdl, fmep->fmcase, 3116 WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull)); 3117 3118 fme_eval(fmep, fmep->e0r); 3119 } 3120 3121 /* 3122 * Preserve the fme's suspect list in its psuspects list, NULLing the 3123 * suspects list in the meantime. 3124 */ 3125 static void 3126 save_suspects(struct fme *fmep) 3127 { 3128 struct event *ep; 3129 struct event *nextep; 3130 3131 /* zero out the previous suspect list */ 3132 for (ep = fmep->psuspects; ep; ep = nextep) { 3133 nextep = ep->psuspects; 3134 ep->psuspects = NULL; 3135 } 3136 fmep->psuspects = NULL; 3137 3138 /* zero out the suspect list, copying it to previous suspect list */ 3139 fmep->psuspects = fmep->suspects; 3140 for (ep = fmep->suspects; ep; ep = nextep) { 3141 nextep = ep->suspects; 3142 ep->psuspects = ep->suspects; 3143 ep->suspects = NULL; 3144 ep->is_suspect = 0; 3145 } 3146 fmep->suspects = NULL; 3147 fmep->nsuspects = 0; 3148 } 3149 3150 /* 3151 * Retrieve the fme's suspect list from its psuspects list. 3152 */ 3153 static void 3154 restore_suspects(struct fme *fmep) 3155 { 3156 struct event *ep; 3157 struct event *nextep; 3158 3159 fmep->nsuspects = 0; 3160 fmep->suspects = fmep->psuspects; 3161 for (ep = fmep->psuspects; ep; ep = nextep) { 3162 fmep->nsuspects++; 3163 nextep = ep->psuspects; 3164 ep->suspects = ep->psuspects; 3165 } 3166 } 3167 3168 /* 3169 * this is what we use to call the Emrys prototype code instead of main() 3170 */ 3171 static void 3172 fme_eval(struct fme *fmep, fmd_event_t *ffep) 3173 { 3174 struct event *ep; 3175 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 3176 struct rsl *srl = NULL; 3177 struct rsl *srl2 = NULL; 3178 int mess_zero_count; 3179 int rpcnt; 3180 3181 save_suspects(fmep); 3182 3183 out(O_ALTFP, "Evaluate FME %d", fmep->id); 3184 indent_set(" "); 3185 3186 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 3187 fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 3188 3189 out(O_ALTFP|O_NONL, "FME%d state: %s, suspect list:", fmep->id, 3190 fme_state2str(fmep->state)); 3191 for (ep = fmep->suspects; ep; ep = ep->suspects) { 3192 out(O_ALTFP|O_NONL, " "); 3193 itree_pevent_brief(O_ALTFP|O_NONL, ep); 3194 } 3195 out(O_ALTFP, NULL); 3196 3197 switch (fmep->state) { 3198 case FME_CREDIBLE: 3199 print_suspects(SLNEW, fmep); 3200 (void) upsets_eval(fmep, ffep); 3201 3202 /* 3203 * we may have already posted suspects in upsets_eval() which 3204 * can recurse into fme_eval() again. If so then just return. 3205 */ 3206 if (fmep->posted_suspects) 3207 return; 3208 3209 stats_counter_bump(fmep->diags); 3210 rpcnt = fmep->nsuspects; 3211 save_suspects(fmep); 3212 3213 /* 3214 * create two lists, one for "message=1" faults and one for 3215 * "message=0" faults. If we have a mixture we will generate 3216 * two separate suspect lists. 3217 */ 3218 srl = MALLOC(rpcnt * sizeof (struct rsl)); 3219 bzero(srl, rpcnt * sizeof (struct rsl)); 3220 srl2 = MALLOC(rpcnt * sizeof (struct rsl)); 3221 bzero(srl2, rpcnt * sizeof (struct rsl)); 3222 mess_zero_count = trim_suspects(fmep, srl, srl2, ffep); 3223 3224 /* 3225 * If the resulting suspect list has no members, we're 3226 * done so simply close the case. Otherwise sort and publish. 3227 */ 3228 if (fmep->nsuspects == 0 && mess_zero_count == 0) { 3229 out(O_ALTFP, 3230 "[FME%d, case %s (all suspects are upsets)]", 3231 fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3232 fmd_case_close(fmep->hdl, fmep->fmcase); 3233 } else if (fmep->nsuspects != 0 && mess_zero_count == 0) { 3234 publish_suspects(fmep, srl); 3235 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3236 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3237 fmd_case_solve(fmep->hdl, fmep->fmcase); 3238 } else if (fmep->nsuspects == 0 && mess_zero_count != 0) { 3239 fmep->nsuspects = mess_zero_count; 3240 publish_suspects(fmep, srl2); 3241 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3242 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3243 fmd_case_solve(fmep->hdl, fmep->fmcase); 3244 } else { 3245 struct event *obsp; 3246 struct fme *nfmep; 3247 3248 publish_suspects(fmep, srl); 3249 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3250 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3251 fmd_case_solve(fmep->hdl, fmep->fmcase); 3252 3253 /* 3254 * Got both message=0 and message=1 so create a 3255 * duplicate case. Also need a temporary duplicate fme 3256 * structure for use by publish_suspects(). 3257 */ 3258 nfmep = alloc_fme(); 3259 nfmep->id = Nextid++; 3260 nfmep->hdl = fmep->hdl; 3261 nfmep->nsuspects = mess_zero_count; 3262 nfmep->fmcase = fmd_case_open(fmep->hdl, NULL); 3263 out(O_ALTFP|O_STAMP, 3264 "[creating parallel FME%d, case %s]", nfmep->id, 3265 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 3266 Open_fme_count++; 3267 if (ffep) { 3268 fmd_case_setprincipal(nfmep->hdl, 3269 nfmep->fmcase, ffep); 3270 fmd_case_add_ereport(nfmep->hdl, 3271 nfmep->fmcase, ffep); 3272 } 3273 for (obsp = fmep->observations; obsp; 3274 obsp = obsp->observations) 3275 if (obsp->ffep && obsp->ffep != ffep) 3276 fmd_case_add_ereport(nfmep->hdl, 3277 nfmep->fmcase, obsp->ffep); 3278 3279 publish_suspects(nfmep, srl2); 3280 out(O_ALTFP, "[solving FME%d, case %s]", nfmep->id, 3281 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 3282 fmd_case_solve(nfmep->hdl, nfmep->fmcase); 3283 FREE(nfmep); 3284 } 3285 FREE(srl); 3286 FREE(srl2); 3287 restore_suspects(fmep); 3288 3289 fmep->posted_suspects = 1; 3290 fmd_buf_write(fmep->hdl, fmep->fmcase, 3291 WOBUF_POSTD, 3292 (void *)&fmep->posted_suspects, 3293 sizeof (fmep->posted_suspects)); 3294 3295 /* 3296 * Now the suspects have been posted, we can clear up 3297 * the instance tree as we won't be looking at it again. 3298 * Also cancel the timer as the case is now solved. 3299 */ 3300 if (fmep->wull != 0) { 3301 fmd_timer_remove(fmep->hdl, fmep->timer); 3302 fmep->wull = 0; 3303 } 3304 break; 3305 3306 case FME_WAIT: 3307 ASSERT(my_delay > fmep->ull); 3308 (void) fme_set_timer(fmep, my_delay); 3309 print_suspects(SLWAIT, fmep); 3310 itree_prune(fmep->eventtree); 3311 return; 3312 3313 case FME_DISPROVED: 3314 print_suspects(SLDISPROVED, fmep); 3315 Undiag_reason = UD_VAL_UNSOLVD; 3316 fme_undiagnosable(fmep); 3317 break; 3318 } 3319 3320 itree_free(fmep->eventtree); 3321 fmep->eventtree = NULL; 3322 structconfig_free(fmep->config); 3323 fmep->config = NULL; 3324 destroy_fme_bufs(fmep); 3325 } 3326 3327 static void indent(void); 3328 static int triggered(struct fme *fmep, struct event *ep, int mark); 3329 static enum fme_state effects_test(struct fme *fmep, 3330 struct event *fault_event, unsigned long long at_latest_by, 3331 unsigned long long *pdelay); 3332 static enum fme_state requirements_test(struct fme *fmep, struct event *ep, 3333 unsigned long long at_latest_by, unsigned long long *pdelay); 3334 static enum fme_state causes_test(struct fme *fmep, struct event *ep, 3335 unsigned long long at_latest_by, unsigned long long *pdelay); 3336 3337 static int 3338 checkconstraints(struct fme *fmep, struct arrow *arrowp) 3339 { 3340 struct constraintlist *ctp; 3341 struct evalue value; 3342 char *sep = ""; 3343 3344 if (arrowp->forever_false) { 3345 indent(); 3346 out(O_ALTFP|O_VERB|O_NONL, " Forever false constraint: "); 3347 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3348 out(O_ALTFP|O_VERB|O_NONL, sep); 3349 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3350 sep = ", "; 3351 } 3352 out(O_ALTFP|O_VERB, NULL); 3353 return (0); 3354 } 3355 if (arrowp->forever_true) { 3356 indent(); 3357 out(O_ALTFP|O_VERB|O_NONL, " Forever true constraint: "); 3358 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3359 out(O_ALTFP|O_VERB|O_NONL, sep); 3360 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3361 sep = ", "; 3362 } 3363 out(O_ALTFP|O_VERB, NULL); 3364 return (1); 3365 } 3366 3367 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3368 if (eval_expr(ctp->cnode, NULL, NULL, 3369 &fmep->globals, fmep->config, 3370 arrowp, 0, &value)) { 3371 /* evaluation successful */ 3372 if (value.t == UNDEFINED || value.v == 0) { 3373 /* known false */ 3374 arrowp->forever_false = 1; 3375 indent(); 3376 out(O_ALTFP|O_VERB|O_NONL, 3377 " False constraint: "); 3378 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3379 out(O_ALTFP|O_VERB, NULL); 3380 return (0); 3381 } 3382 } else { 3383 /* evaluation unsuccessful -- unknown value */ 3384 indent(); 3385 out(O_ALTFP|O_VERB|O_NONL, 3386 " Deferred constraint: "); 3387 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3388 out(O_ALTFP|O_VERB, NULL); 3389 return (1); 3390 } 3391 } 3392 /* known true */ 3393 arrowp->forever_true = 1; 3394 indent(); 3395 out(O_ALTFP|O_VERB|O_NONL, " True constraint: "); 3396 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3397 out(O_ALTFP|O_VERB|O_NONL, sep); 3398 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3399 sep = ", "; 3400 } 3401 out(O_ALTFP|O_VERB, NULL); 3402 return (1); 3403 } 3404 3405 static int 3406 triggered(struct fme *fmep, struct event *ep, int mark) 3407 { 3408 struct bubble *bp; 3409 struct arrowlist *ap; 3410 int count = 0; 3411 3412 stats_counter_bump(fmep->Tcallcount); 3413 for (bp = itree_next_bubble(ep, NULL); bp; 3414 bp = itree_next_bubble(ep, bp)) { 3415 if (bp->t != B_TO) 3416 continue; 3417 for (ap = itree_next_arrow(bp, NULL); ap; 3418 ap = itree_next_arrow(bp, ap)) { 3419 /* check count of marks against K in the bubble */ 3420 if ((ap->arrowp->mark & mark) && 3421 ++count >= bp->nork) 3422 return (1); 3423 } 3424 } 3425 return (0); 3426 } 3427 3428 static int 3429 mark_arrows(struct fme *fmep, struct event *ep, int mark, 3430 unsigned long long at_latest_by, unsigned long long *pdelay, int keep) 3431 { 3432 struct bubble *bp; 3433 struct arrowlist *ap; 3434 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3435 unsigned long long my_delay; 3436 enum fme_state result; 3437 int retval = 0; 3438 3439 for (bp = itree_next_bubble(ep, NULL); bp; 3440 bp = itree_next_bubble(ep, bp)) { 3441 if (bp->t != B_FROM) 3442 continue; 3443 stats_counter_bump(fmep->Marrowcount); 3444 for (ap = itree_next_arrow(bp, NULL); ap; 3445 ap = itree_next_arrow(bp, ap)) { 3446 struct event *ep2 = ap->arrowp->head->myevent; 3447 /* 3448 * if we're clearing marks, we can avoid doing 3449 * all that work evaluating constraints. 3450 */ 3451 if (mark == 0) { 3452 if (ap->arrowp->arrow_marked == 0) 3453 continue; 3454 ap->arrowp->arrow_marked = 0; 3455 ap->arrowp->mark &= ~EFFECTS_COUNTER; 3456 if (keep && (ep2->cached_state & 3457 (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT))) 3458 ep2->keep_in_tree = 1; 3459 ep2->cached_state &= 3460 ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT); 3461 (void) mark_arrows(fmep, ep2, mark, 0, NULL, 3462 keep); 3463 continue; 3464 } 3465 ap->arrowp->arrow_marked = 1; 3466 if (ep2->cached_state & REQMNTS_DISPROVED) { 3467 indent(); 3468 out(O_ALTFP|O_VERB|O_NONL, 3469 " ALREADY DISPROVED "); 3470 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3471 out(O_ALTFP|O_VERB, NULL); 3472 continue; 3473 } 3474 if (ep2->cached_state & WAIT_EFFECT) { 3475 indent(); 3476 out(O_ALTFP|O_VERB|O_NONL, 3477 " ALREADY EFFECTS WAIT "); 3478 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3479 out(O_ALTFP|O_VERB, NULL); 3480 continue; 3481 } 3482 if (ep2->cached_state & CREDIBLE_EFFECT) { 3483 indent(); 3484 out(O_ALTFP|O_VERB|O_NONL, 3485 " ALREADY EFFECTS CREDIBLE "); 3486 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3487 out(O_ALTFP|O_VERB, NULL); 3488 continue; 3489 } 3490 if ((ep2->cached_state & PARENT_WAIT) && 3491 (mark & PARENT_WAIT)) { 3492 indent(); 3493 out(O_ALTFP|O_VERB|O_NONL, 3494 " ALREADY PARENT EFFECTS WAIT "); 3495 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3496 out(O_ALTFP|O_VERB, NULL); 3497 continue; 3498 } 3499 platform_set_payloadnvp(ep2->nvp); 3500 if (checkconstraints(fmep, ap->arrowp) == 0) { 3501 platform_set_payloadnvp(NULL); 3502 indent(); 3503 out(O_ALTFP|O_VERB|O_NONL, 3504 " CONSTRAINTS FAIL "); 3505 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3506 out(O_ALTFP|O_VERB, NULL); 3507 continue; 3508 } 3509 platform_set_payloadnvp(NULL); 3510 ap->arrowp->mark |= EFFECTS_COUNTER; 3511 if (!triggered(fmep, ep2, EFFECTS_COUNTER)) { 3512 indent(); 3513 out(O_ALTFP|O_VERB|O_NONL, 3514 " K-COUNT NOT YET MET "); 3515 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3516 out(O_ALTFP|O_VERB, NULL); 3517 continue; 3518 } 3519 ep2->cached_state &= ~PARENT_WAIT; 3520 /* 3521 * if we've reached an ereport and no propagation time 3522 * is specified, use the Hesitate value 3523 */ 3524 if (ep2->t == N_EREPORT && at_latest_by == 0ULL && 3525 ap->arrowp->maxdelay == 0ULL) { 3526 out(O_ALTFP|O_VERB|O_NONL, " default wait "); 3527 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3528 out(O_ALTFP|O_VERB, NULL); 3529 result = requirements_test(fmep, ep2, Hesitate, 3530 &my_delay); 3531 } else { 3532 result = requirements_test(fmep, ep2, 3533 at_latest_by + ap->arrowp->maxdelay, 3534 &my_delay); 3535 } 3536 if (result == FME_WAIT) { 3537 retval = WAIT_EFFECT; 3538 if (overall_delay > my_delay) 3539 overall_delay = my_delay; 3540 ep2->cached_state |= WAIT_EFFECT; 3541 indent(); 3542 out(O_ALTFP|O_VERB|O_NONL, " EFFECTS WAIT "); 3543 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3544 out(O_ALTFP|O_VERB, NULL); 3545 indent_push(" E"); 3546 if (mark_arrows(fmep, ep2, PARENT_WAIT, 3547 at_latest_by, &my_delay, 0) == 3548 WAIT_EFFECT) { 3549 retval = WAIT_EFFECT; 3550 if (overall_delay > my_delay) 3551 overall_delay = my_delay; 3552 } 3553 indent_pop(); 3554 } else if (result == FME_DISPROVED) { 3555 indent(); 3556 out(O_ALTFP|O_VERB|O_NONL, 3557 " EFFECTS DISPROVED "); 3558 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3559 out(O_ALTFP|O_VERB, NULL); 3560 } else { 3561 ep2->cached_state |= mark; 3562 indent(); 3563 if (mark == CREDIBLE_EFFECT) 3564 out(O_ALTFP|O_VERB|O_NONL, 3565 " EFFECTS CREDIBLE "); 3566 else 3567 out(O_ALTFP|O_VERB|O_NONL, 3568 " PARENT EFFECTS WAIT "); 3569 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3570 out(O_ALTFP|O_VERB, NULL); 3571 indent_push(" E"); 3572 if (mark_arrows(fmep, ep2, mark, at_latest_by, 3573 &my_delay, 0) == WAIT_EFFECT) { 3574 retval = WAIT_EFFECT; 3575 if (overall_delay > my_delay) 3576 overall_delay = my_delay; 3577 } 3578 indent_pop(); 3579 } 3580 } 3581 } 3582 if (retval == WAIT_EFFECT) 3583 *pdelay = overall_delay; 3584 return (retval); 3585 } 3586 3587 static enum fme_state 3588 effects_test(struct fme *fmep, struct event *fault_event, 3589 unsigned long long at_latest_by, unsigned long long *pdelay) 3590 { 3591 struct event *error_event; 3592 enum fme_state return_value = FME_CREDIBLE; 3593 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3594 unsigned long long my_delay; 3595 3596 stats_counter_bump(fmep->Ecallcount); 3597 indent_push(" E"); 3598 indent(); 3599 out(O_ALTFP|O_VERB|O_NONL, "->"); 3600 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3601 out(O_ALTFP|O_VERB, NULL); 3602 3603 if (mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by, 3604 &my_delay, 0) == WAIT_EFFECT) { 3605 return_value = FME_WAIT; 3606 if (overall_delay > my_delay) 3607 overall_delay = my_delay; 3608 } 3609 for (error_event = fmep->observations; 3610 error_event; error_event = error_event->observations) { 3611 indent(); 3612 out(O_ALTFP|O_VERB|O_NONL, " "); 3613 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event); 3614 if (!(error_event->cached_state & CREDIBLE_EFFECT)) { 3615 if (error_event->cached_state & 3616 (PARENT_WAIT|WAIT_EFFECT)) { 3617 out(O_ALTFP|O_VERB, " NOT YET triggered"); 3618 continue; 3619 } 3620 return_value = FME_DISPROVED; 3621 out(O_ALTFP|O_VERB, " NOT triggered"); 3622 break; 3623 } else { 3624 out(O_ALTFP|O_VERB, " triggered"); 3625 } 3626 } 3627 if (return_value == FME_DISPROVED) { 3628 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0); 3629 } else { 3630 fault_event->keep_in_tree = 1; 3631 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1); 3632 } 3633 3634 indent(); 3635 out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ", 3636 fme_state2str(return_value)); 3637 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3638 out(O_ALTFP|O_VERB, NULL); 3639 indent_pop(); 3640 if (return_value == FME_WAIT) 3641 *pdelay = overall_delay; 3642 return (return_value); 3643 } 3644 3645 static enum fme_state 3646 requirements_test(struct fme *fmep, struct event *ep, 3647 unsigned long long at_latest_by, unsigned long long *pdelay) 3648 { 3649 int waiting_events; 3650 int credible_events; 3651 int deferred_events; 3652 enum fme_state return_value = FME_CREDIBLE; 3653 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3654 unsigned long long arrow_delay; 3655 unsigned long long my_delay; 3656 struct event *ep2; 3657 struct bubble *bp; 3658 struct arrowlist *ap; 3659 3660 if (ep->cached_state & REQMNTS_CREDIBLE) { 3661 indent(); 3662 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY CREDIBLE "); 3663 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3664 out(O_ALTFP|O_VERB, NULL); 3665 return (FME_CREDIBLE); 3666 } 3667 if (ep->cached_state & REQMNTS_DISPROVED) { 3668 indent(); 3669 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY DISPROVED "); 3670 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3671 out(O_ALTFP|O_VERB, NULL); 3672 return (FME_DISPROVED); 3673 } 3674 if (ep->cached_state & REQMNTS_WAIT) { 3675 indent(); 3676 *pdelay = ep->cached_delay; 3677 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY WAIT "); 3678 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3679 out(O_ALTFP|O_VERB|O_NONL, ", wait for: "); 3680 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3681 out(O_ALTFP|O_VERB, NULL); 3682 return (FME_WAIT); 3683 } 3684 stats_counter_bump(fmep->Rcallcount); 3685 indent_push(" R"); 3686 indent(); 3687 out(O_ALTFP|O_VERB|O_NONL, "->"); 3688 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3689 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 3690 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3691 out(O_ALTFP|O_VERB, NULL); 3692 3693 if (ep->t == N_EREPORT) { 3694 if (ep->count == 0) { 3695 if (fmep->pull >= at_latest_by) { 3696 return_value = FME_DISPROVED; 3697 } else { 3698 ep->cached_delay = *pdelay = at_latest_by; 3699 return_value = FME_WAIT; 3700 } 3701 } 3702 3703 indent(); 3704 switch (return_value) { 3705 case FME_CREDIBLE: 3706 ep->cached_state |= REQMNTS_CREDIBLE; 3707 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE "); 3708 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3709 break; 3710 case FME_DISPROVED: 3711 ep->cached_state |= REQMNTS_DISPROVED; 3712 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 3713 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3714 break; 3715 case FME_WAIT: 3716 ep->cached_state |= REQMNTS_WAIT; 3717 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT "); 3718 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3719 out(O_ALTFP|O_VERB|O_NONL, " to "); 3720 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3721 break; 3722 default: 3723 out(O_DIE, "requirements_test: unexpected fme_state"); 3724 break; 3725 } 3726 out(O_ALTFP|O_VERB, NULL); 3727 indent_pop(); 3728 3729 return (return_value); 3730 } 3731 3732 /* this event is not a report, descend the tree */ 3733 for (bp = itree_next_bubble(ep, NULL); bp; 3734 bp = itree_next_bubble(ep, bp)) { 3735 int n; 3736 3737 if (bp->t != B_FROM) 3738 continue; 3739 3740 n = bp->nork; 3741 3742 credible_events = 0; 3743 waiting_events = 0; 3744 deferred_events = 0; 3745 arrow_delay = TIMEVAL_EVENTUALLY; 3746 /* 3747 * n is -1 for 'A' so adjust it. 3748 * XXX just count up the arrows for now. 3749 */ 3750 if (n < 0) { 3751 n = 0; 3752 for (ap = itree_next_arrow(bp, NULL); ap; 3753 ap = itree_next_arrow(bp, ap)) 3754 n++; 3755 indent(); 3756 out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n); 3757 } else { 3758 indent(); 3759 out(O_ALTFP|O_VERB, " Bubble N=%d", n); 3760 } 3761 3762 if (n == 0) 3763 continue; 3764 if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) { 3765 for (ap = itree_next_arrow(bp, NULL); ap; 3766 ap = itree_next_arrow(bp, ap)) { 3767 ep2 = ap->arrowp->head->myevent; 3768 platform_set_payloadnvp(ep2->nvp); 3769 (void) checkconstraints(fmep, ap->arrowp); 3770 if (!ap->arrowp->forever_false) { 3771 /* 3772 * if all arrows are invalidated by the 3773 * constraints, then we should elide the 3774 * whole bubble to be consistant with 3775 * the tree creation time behaviour 3776 */ 3777 bp->mark |= BUBBLE_OK; 3778 platform_set_payloadnvp(NULL); 3779 break; 3780 } 3781 platform_set_payloadnvp(NULL); 3782 } 3783 } 3784 for (ap = itree_next_arrow(bp, NULL); ap; 3785 ap = itree_next_arrow(bp, ap)) { 3786 ep2 = ap->arrowp->head->myevent; 3787 if (n <= credible_events) 3788 break; 3789 3790 ap->arrowp->mark |= REQMNTS_COUNTER; 3791 if (triggered(fmep, ep2, REQMNTS_COUNTER)) 3792 /* XXX adding max timevals! */ 3793 switch (requirements_test(fmep, ep2, 3794 at_latest_by + ap->arrowp->maxdelay, 3795 &my_delay)) { 3796 case FME_DEFERRED: 3797 deferred_events++; 3798 break; 3799 case FME_CREDIBLE: 3800 credible_events++; 3801 break; 3802 case FME_DISPROVED: 3803 break; 3804 case FME_WAIT: 3805 if (my_delay < arrow_delay) 3806 arrow_delay = my_delay; 3807 waiting_events++; 3808 break; 3809 default: 3810 out(O_DIE, 3811 "Bug in requirements_test."); 3812 } 3813 else 3814 deferred_events++; 3815 } 3816 if (!(bp->mark & BUBBLE_OK) && waiting_events == 0) { 3817 bp->mark |= BUBBLE_ELIDED; 3818 continue; 3819 } 3820 indent(); 3821 out(O_ALTFP|O_VERB, " Credible: %d Waiting %d", 3822 credible_events + deferred_events, waiting_events); 3823 if (credible_events + deferred_events + waiting_events < n) { 3824 /* Can never meet requirements */ 3825 ep->cached_state |= REQMNTS_DISPROVED; 3826 indent(); 3827 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 3828 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3829 out(O_ALTFP|O_VERB, NULL); 3830 indent_pop(); 3831 return (FME_DISPROVED); 3832 } 3833 if (credible_events + deferred_events < n) { 3834 /* will have to wait */ 3835 /* wait time is shortest known */ 3836 if (arrow_delay < overall_delay) 3837 overall_delay = arrow_delay; 3838 return_value = FME_WAIT; 3839 } else if (credible_events < n) { 3840 if (return_value != FME_WAIT) 3841 return_value = FME_DEFERRED; 3842 } 3843 } 3844 3845 /* 3846 * don't mark as FME_DEFERRED. If this event isn't reached by another 3847 * path, then this will be considered FME_CREDIBLE. But if it is 3848 * reached by a different path so the K-count is met, then might 3849 * get overridden by FME_WAIT or FME_DISPROVED. 3850 */ 3851 if (return_value == FME_WAIT) { 3852 ep->cached_state |= REQMNTS_WAIT; 3853 ep->cached_delay = *pdelay = overall_delay; 3854 } else if (return_value == FME_CREDIBLE) { 3855 ep->cached_state |= REQMNTS_CREDIBLE; 3856 } 3857 indent(); 3858 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ", 3859 fme_state2str(return_value)); 3860 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3861 out(O_ALTFP|O_VERB, NULL); 3862 indent_pop(); 3863 return (return_value); 3864 } 3865 3866 static enum fme_state 3867 causes_test(struct fme *fmep, struct event *ep, 3868 unsigned long long at_latest_by, unsigned long long *pdelay) 3869 { 3870 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3871 unsigned long long my_delay; 3872 int credible_results = 0; 3873 int waiting_results = 0; 3874 enum fme_state fstate; 3875 struct event *tail_event; 3876 struct bubble *bp; 3877 struct arrowlist *ap; 3878 int k = 1; 3879 3880 stats_counter_bump(fmep->Ccallcount); 3881 indent_push(" C"); 3882 indent(); 3883 out(O_ALTFP|O_VERB|O_NONL, "->"); 3884 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3885 out(O_ALTFP|O_VERB, NULL); 3886 3887 for (bp = itree_next_bubble(ep, NULL); bp; 3888 bp = itree_next_bubble(ep, bp)) { 3889 if (bp->t != B_TO) 3890 continue; 3891 k = bp->nork; /* remember the K value */ 3892 for (ap = itree_next_arrow(bp, NULL); ap; 3893 ap = itree_next_arrow(bp, ap)) { 3894 int do_not_follow = 0; 3895 3896 /* 3897 * if we get to the same event multiple times 3898 * only worry about the first one. 3899 */ 3900 if (ap->arrowp->tail->myevent->cached_state & 3901 CAUSES_TESTED) { 3902 indent(); 3903 out(O_ALTFP|O_VERB|O_NONL, 3904 " causes test already run for "); 3905 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 3906 ap->arrowp->tail->myevent); 3907 out(O_ALTFP|O_VERB, NULL); 3908 continue; 3909 } 3910 3911 /* 3912 * see if false constraint prevents us 3913 * from traversing this arrow 3914 */ 3915 platform_set_payloadnvp(ep->nvp); 3916 if (checkconstraints(fmep, ap->arrowp) == 0) 3917 do_not_follow = 1; 3918 platform_set_payloadnvp(NULL); 3919 if (do_not_follow) { 3920 indent(); 3921 out(O_ALTFP|O_VERB|O_NONL, 3922 " False arrow from "); 3923 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 3924 ap->arrowp->tail->myevent); 3925 out(O_ALTFP|O_VERB, NULL); 3926 continue; 3927 } 3928 3929 ap->arrowp->tail->myevent->cached_state |= 3930 CAUSES_TESTED; 3931 tail_event = ap->arrowp->tail->myevent; 3932 fstate = hypothesise(fmep, tail_event, at_latest_by, 3933 &my_delay); 3934 3935 switch (fstate) { 3936 case FME_WAIT: 3937 if (my_delay < overall_delay) 3938 overall_delay = my_delay; 3939 waiting_results++; 3940 break; 3941 case FME_CREDIBLE: 3942 credible_results++; 3943 break; 3944 case FME_DISPROVED: 3945 break; 3946 default: 3947 out(O_DIE, "Bug in causes_test"); 3948 } 3949 } 3950 } 3951 /* compare against K */ 3952 if (credible_results + waiting_results < k) { 3953 indent(); 3954 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED "); 3955 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3956 out(O_ALTFP|O_VERB, NULL); 3957 indent_pop(); 3958 return (FME_DISPROVED); 3959 } 3960 if (waiting_results != 0) { 3961 *pdelay = overall_delay; 3962 indent(); 3963 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT "); 3964 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3965 out(O_ALTFP|O_VERB|O_NONL, " to "); 3966 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3967 out(O_ALTFP|O_VERB, NULL); 3968 indent_pop(); 3969 return (FME_WAIT); 3970 } 3971 indent(); 3972 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES CREDIBLE "); 3973 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3974 out(O_ALTFP|O_VERB, NULL); 3975 indent_pop(); 3976 return (FME_CREDIBLE); 3977 } 3978 3979 static enum fme_state 3980 hypothesise(struct fme *fmep, struct event *ep, 3981 unsigned long long at_latest_by, unsigned long long *pdelay) 3982 { 3983 enum fme_state rtr, otr; 3984 unsigned long long my_delay; 3985 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3986 3987 stats_counter_bump(fmep->Hcallcount); 3988 indent_push(" H"); 3989 indent(); 3990 out(O_ALTFP|O_VERB|O_NONL, "->"); 3991 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3992 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 3993 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3994 out(O_ALTFP|O_VERB, NULL); 3995 3996 rtr = requirements_test(fmep, ep, at_latest_by, &my_delay); 3997 if ((rtr == FME_WAIT) && (my_delay < overall_delay)) 3998 overall_delay = my_delay; 3999 if (rtr != FME_DISPROVED) { 4000 if (is_problem(ep->t)) { 4001 otr = effects_test(fmep, ep, at_latest_by, &my_delay); 4002 if (otr != FME_DISPROVED) { 4003 if (fmep->peek == 0 && ep->is_suspect == 0) { 4004 ep->suspects = fmep->suspects; 4005 ep->is_suspect = 1; 4006 fmep->suspects = ep; 4007 fmep->nsuspects++; 4008 } 4009 } 4010 } else 4011 otr = causes_test(fmep, ep, at_latest_by, &my_delay); 4012 if ((otr == FME_WAIT) && (my_delay < overall_delay)) 4013 overall_delay = my_delay; 4014 if ((otr != FME_DISPROVED) && 4015 ((rtr == FME_WAIT) || (otr == FME_WAIT))) 4016 *pdelay = overall_delay; 4017 } 4018 if (rtr == FME_DISPROVED) { 4019 indent(); 4020 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4021 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4022 out(O_ALTFP|O_VERB, " (doesn't meet requirements)"); 4023 indent_pop(); 4024 return (FME_DISPROVED); 4025 } 4026 if ((otr == FME_DISPROVED) && is_problem(ep->t)) { 4027 indent(); 4028 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4029 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4030 out(O_ALTFP|O_VERB, " (doesn't explain all reports)"); 4031 indent_pop(); 4032 return (FME_DISPROVED); 4033 } 4034 if (otr == FME_DISPROVED) { 4035 indent(); 4036 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4037 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4038 out(O_ALTFP|O_VERB, " (causes are not credible)"); 4039 indent_pop(); 4040 return (FME_DISPROVED); 4041 } 4042 if ((rtr == FME_WAIT) || (otr == FME_WAIT)) { 4043 indent(); 4044 out(O_ALTFP|O_VERB|O_NONL, "<-WAIT "); 4045 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4046 out(O_ALTFP|O_VERB|O_NONL, " to "); 4047 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay); 4048 out(O_ALTFP|O_VERB, NULL); 4049 indent_pop(); 4050 return (FME_WAIT); 4051 } 4052 indent(); 4053 out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE "); 4054 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4055 out(O_ALTFP|O_VERB, NULL); 4056 indent_pop(); 4057 return (FME_CREDIBLE); 4058 } 4059 4060 /* 4061 * fme_istat_load -- reconstitute any persistent istats 4062 */ 4063 void 4064 fme_istat_load(fmd_hdl_t *hdl) 4065 { 4066 int sz; 4067 char *sbuf; 4068 char *ptr; 4069 4070 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) { 4071 out(O_ALTFP, "fme_istat_load: No stats"); 4072 return; 4073 } 4074 4075 sbuf = alloca(sz); 4076 4077 fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz); 4078 4079 /* 4080 * pick apart the serialized stats 4081 * 4082 * format is: 4083 * <class-name>, '@', <path>, '\0', <value>, '\0' 4084 * for example: 4085 * "stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0" 4086 * 4087 * since this is parsing our own serialized data, any parsing issues 4088 * are fatal, so we check for them all with ASSERT() below. 4089 */ 4090 ptr = sbuf; 4091 while (ptr < &sbuf[sz]) { 4092 char *sepptr; 4093 struct node *np; 4094 int val; 4095 4096 sepptr = strchr(ptr, '@'); 4097 ASSERT(sepptr != NULL); 4098 *sepptr = '\0'; 4099 4100 /* construct the event */ 4101 np = newnode(T_EVENT, NULL, 0); 4102 np->u.event.ename = newnode(T_NAME, NULL, 0); 4103 np->u.event.ename->u.name.t = N_STAT; 4104 np->u.event.ename->u.name.s = stable(ptr); 4105 np->u.event.ename->u.name.it = IT_ENAME; 4106 np->u.event.ename->u.name.last = np->u.event.ename; 4107 4108 ptr = sepptr + 1; 4109 ASSERT(ptr < &sbuf[sz]); 4110 ptr += strlen(ptr); 4111 ptr++; /* move past the '\0' separating path from value */ 4112 ASSERT(ptr < &sbuf[sz]); 4113 ASSERT(isdigit(*ptr)); 4114 val = atoi(ptr); 4115 ASSERT(val > 0); 4116 ptr += strlen(ptr); 4117 ptr++; /* move past the final '\0' for this entry */ 4118 4119 np->u.event.epname = pathstring2epnamenp(sepptr + 1); 4120 ASSERT(np->u.event.epname != NULL); 4121 4122 istat_bump(np, val); 4123 tree_free(np); 4124 } 4125 4126 istat_save(); 4127 } 4128