1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 24 * 25 * fme.c -- fault management exercise module 26 * 27 * this module provides the simulated fault management exercise. 28 */ 29 30 #include <stdio.h> 31 #include <stdlib.h> 32 #include <string.h> 33 #include <strings.h> 34 #include <ctype.h> 35 #include <alloca.h> 36 #include <libnvpair.h> 37 #include <sys/fm/protocol.h> 38 #include <fm/fmd_api.h> 39 #include "alloc.h" 40 #include "out.h" 41 #include "stats.h" 42 #include "stable.h" 43 #include "literals.h" 44 #include "lut.h" 45 #include "tree.h" 46 #include "ptree.h" 47 #include "itree.h" 48 #include "ipath.h" 49 #include "fme.h" 50 #include "evnv.h" 51 #include "eval.h" 52 #include "config.h" 53 #include "platform.h" 54 #include "esclex.h" 55 56 /* imported from eft.c... 
*/ 57 extern hrtime_t Hesitate; 58 extern char *Serd_Override; 59 extern nv_alloc_t Eft_nv_hdl; 60 extern int Max_fme; 61 extern fmd_hdl_t *Hdl; 62 63 static int Istat_need_save; 64 static int Serd_need_save; 65 void istat_save(void); 66 void serd_save(void); 67 68 /* fme under construction is global so we can free it on module abort */ 69 static struct fme *Nfmep; 70 71 static int Undiag_reason = UD_VAL_UNKNOWN; 72 73 static int Nextid = 0; 74 75 static int Open_fme_count = 0; /* Count of open FMEs */ 76 77 /* list of fault management exercises underway */ 78 static struct fme { 79 struct fme *next; /* next exercise */ 80 unsigned long long ull; /* time when fme was created */ 81 int id; /* FME id */ 82 struct config *config; /* cooked configuration data */ 83 struct lut *eventtree; /* propagation tree for this FME */ 84 /* 85 * The initial error report that created this FME is kept in 86 * two forms. e0 points to the instance tree node and is used 87 * by fme_eval() as the starting point for the inference 88 * algorithm. e0r is the event handle FMD passed to us when 89 * the ereport first arrived and is used when setting timers, 90 * which are always relative to the time of this initial 91 * report. 
92 */ 93 struct event *e0; 94 fmd_event_t *e0r; 95 96 id_t timer; /* for setting an fmd time-out */ 97 98 struct event *ecurrent; /* ereport under consideration */ 99 struct event *suspects; /* current suspect list */ 100 struct event *psuspects; /* previous suspect list */ 101 int nsuspects; /* count of suspects */ 102 int posted_suspects; /* true if we've posted a diagnosis */ 103 int uniqobs; /* number of unique events observed */ 104 int peek; /* just peeking, don't track suspects */ 105 int overflow; /* true if overflow FME */ 106 enum fme_state { 107 FME_NOTHING = 5000, /* not evaluated yet */ 108 FME_WAIT, /* need to wait for more info */ 109 FME_CREDIBLE, /* suspect list is credible */ 110 FME_DISPROVED, /* no valid suspects found */ 111 FME_DEFERRED /* don't know yet (k-count not met) */ 112 } state; 113 114 unsigned long long pull; /* time passed since created */ 115 unsigned long long wull; /* wait until this time for re-eval */ 116 struct event *observations; /* observation list */ 117 struct lut *globals; /* values of global variables */ 118 /* fmd interfacing */ 119 fmd_hdl_t *hdl; /* handle for talking with fmd */ 120 fmd_case_t *fmcase; /* what fmd 'case' we associate with */ 121 /* stats */ 122 struct stats *Rcount; 123 struct stats *Hcallcount; 124 struct stats *Rcallcount; 125 struct stats *Ccallcount; 126 struct stats *Ecallcount; 127 struct stats *Tcallcount; 128 struct stats *Marrowcount; 129 struct stats *diags; 130 } *FMElist, *EFMElist, *ClosedFMEs; 131 132 static struct case_list { 133 fmd_case_t *fmcase; 134 struct case_list *next; 135 } *Undiagablecaselist; 136 137 static void fme_eval(struct fme *fmep, fmd_event_t *ffep); 138 static enum fme_state hypothesise(struct fme *fmep, struct event *ep, 139 unsigned long long at_latest_by, unsigned long long *pdelay); 140 static struct node *eventprop_lookup(struct event *ep, const char *propname); 141 static struct node *pathstring2epnamenp(char *path); 142 static void 
publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, 143 fmd_case_t *fmcase, nvlist_t *detector, char *arg); 144 static char *undiag_2reason_str(int ud, char *arg); 145 static const char *undiag_2defect_str(int ud); 146 static void restore_suspects(struct fme *fmep); 147 static void save_suspects(struct fme *fmep); 148 static void destroy_fme(struct fme *f); 149 static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 150 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl); 151 static void istat_counter_reset_cb(struct istat_entry *entp, 152 struct stats *statp, const struct ipath *ipp); 153 static void istat_counter_topo_chg_cb(struct istat_entry *entp, 154 struct stats *statp, void *unused); 155 static void serd_reset_cb(struct serd_entry *entp, void *unused, 156 const struct ipath *ipp); 157 static void serd_topo_chg_cb(struct serd_entry *entp, void *unused, 158 void *unused2); 159 static void destroy_fme_bufs(struct fme *fp); 160 161 static struct fme * 162 alloc_fme(void) 163 { 164 struct fme *fmep; 165 166 fmep = MALLOC(sizeof (*fmep)); 167 bzero(fmep, sizeof (*fmep)); 168 return (fmep); 169 } 170 171 /* 172 * fme_ready -- called when all initialization of the FME (except for 173 * stats) has completed successfully. Adds the fme to global lists 174 * and establishes its stats. 
175 */ 176 static struct fme * 177 fme_ready(struct fme *fmep) 178 { 179 char nbuf[100]; 180 181 Nfmep = NULL; /* don't need to free this on module abort now */ 182 183 if (EFMElist) { 184 EFMElist->next = fmep; 185 EFMElist = fmep; 186 } else 187 FMElist = EFMElist = fmep; 188 189 (void) sprintf(nbuf, "fme%d.Rcount", fmep->id); 190 fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0); 191 (void) sprintf(nbuf, "fme%d.Hcall", fmep->id); 192 fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1); 193 (void) sprintf(nbuf, "fme%d.Rcall", fmep->id); 194 fmep->Rcallcount = stats_new_counter(nbuf, 195 "calls to requirements_test()", 1); 196 (void) sprintf(nbuf, "fme%d.Ccall", fmep->id); 197 fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1); 198 (void) sprintf(nbuf, "fme%d.Ecall", fmep->id); 199 fmep->Ecallcount = 200 stats_new_counter(nbuf, "calls to effects_test()", 1); 201 (void) sprintf(nbuf, "fme%d.Tcall", fmep->id); 202 fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1); 203 (void) sprintf(nbuf, "fme%d.Marrow", fmep->id); 204 fmep->Marrowcount = stats_new_counter(nbuf, 205 "arrows marked by mark_arrows()", 1); 206 (void) sprintf(nbuf, "fme%d.diags", fmep->id); 207 fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0); 208 209 out(O_ALTFP|O_VERB2, "newfme: config snapshot contains..."); 210 config_print(O_ALTFP|O_VERB2, fmep->config); 211 212 return (fmep); 213 } 214 215 extern void ipath_dummy_lut(struct arrow *); 216 extern struct lut *itree_create_dummy(const char *, const struct ipath *); 217 218 /* ARGSUSED */ 219 static void 220 set_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 221 { 222 struct bubble *bp; 223 struct arrowlist *ap; 224 225 for (bp = itree_next_bubble(ep, NULL); bp; 226 bp = itree_next_bubble(ep, bp)) { 227 if (bp->t != B_FROM) 228 continue; 229 for (ap = itree_next_arrow(bp, NULL); ap; 230 ap = itree_next_arrow(bp, ap)) { 231 
ap->arrowp->pnode->u.arrow.needed = 1; 232 ipath_dummy_lut(ap->arrowp); 233 } 234 } 235 } 236 237 /* ARGSUSED */ 238 static void 239 unset_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 240 { 241 struct bubble *bp; 242 struct arrowlist *ap; 243 244 for (bp = itree_next_bubble(ep, NULL); bp; 245 bp = itree_next_bubble(ep, bp)) { 246 if (bp->t != B_FROM) 247 continue; 248 for (ap = itree_next_arrow(bp, NULL); ap; 249 ap = itree_next_arrow(bp, ap)) 250 ap->arrowp->pnode->u.arrow.needed = 0; 251 } 252 } 253 254 static void globals_destructor(void *left, void *right, void *arg); 255 static void clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep); 256 257 static boolean_t 258 prune_propagations(const char *e0class, const struct ipath *e0ipp) 259 { 260 char nbuf[100]; 261 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 262 extern struct lut *Usednames; 263 264 Nfmep = alloc_fme(); 265 Nfmep->id = Nextid; 266 Nfmep->state = FME_NOTHING; 267 Nfmep->eventtree = itree_create_dummy(e0class, e0ipp); 268 if ((Nfmep->e0 = 269 itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) { 270 itree_free(Nfmep->eventtree); 271 FREE(Nfmep); 272 Nfmep = NULL; 273 return (B_FALSE); 274 } 275 Nfmep->ecurrent = Nfmep->observations = Nfmep->e0; 276 Nfmep->e0->count++; 277 278 (void) sprintf(nbuf, "fme%d.Rcount", Nfmep->id); 279 Nfmep->Rcount = stats_new_counter(nbuf, "ereports received", 0); 280 (void) sprintf(nbuf, "fme%d.Hcall", Nfmep->id); 281 Nfmep->Hcallcount = 282 stats_new_counter(nbuf, "calls to hypothesise()", 1); 283 (void) sprintf(nbuf, "fme%d.Rcall", Nfmep->id); 284 Nfmep->Rcallcount = stats_new_counter(nbuf, 285 "calls to requirements_test()", 1); 286 (void) sprintf(nbuf, "fme%d.Ccall", Nfmep->id); 287 Nfmep->Ccallcount = 288 stats_new_counter(nbuf, "calls to causes_test()", 1); 289 (void) sprintf(nbuf, "fme%d.Ecall", Nfmep->id); 290 Nfmep->Ecallcount = 291 stats_new_counter(nbuf, "calls to effects_test()", 1); 292 (void) sprintf(nbuf, 
"fme%d.Tcall", Nfmep->id); 293 Nfmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1); 294 (void) sprintf(nbuf, "fme%d.Marrow", Nfmep->id); 295 Nfmep->Marrowcount = stats_new_counter(nbuf, 296 "arrows marked by mark_arrows()", 1); 297 (void) sprintf(nbuf, "fme%d.diags", Nfmep->id); 298 Nfmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0); 299 300 Nfmep->peek = 1; 301 lut_walk(Nfmep->eventtree, (lut_cb)unset_needed_arrows, (void *)Nfmep); 302 lut_free(Usednames, NULL, NULL); 303 Usednames = NULL; 304 lut_walk(Nfmep->eventtree, (lut_cb)clear_arrows, (void *)Nfmep); 305 (void) hypothesise(Nfmep, Nfmep->e0, Nfmep->ull, &my_delay); 306 itree_prune(Nfmep->eventtree); 307 lut_walk(Nfmep->eventtree, (lut_cb)set_needed_arrows, (void *)Nfmep); 308 309 stats_delete(Nfmep->Rcount); 310 stats_delete(Nfmep->Hcallcount); 311 stats_delete(Nfmep->Rcallcount); 312 stats_delete(Nfmep->Ccallcount); 313 stats_delete(Nfmep->Ecallcount); 314 stats_delete(Nfmep->Tcallcount); 315 stats_delete(Nfmep->Marrowcount); 316 stats_delete(Nfmep->diags); 317 itree_free(Nfmep->eventtree); 318 lut_free(Nfmep->globals, globals_destructor, NULL); 319 FREE(Nfmep); 320 return (B_TRUE); 321 } 322 323 static struct fme * 324 newfme(const char *e0class, const struct ipath *e0ipp, fmd_hdl_t *hdl, 325 fmd_case_t *fmcase, fmd_event_t *ffep, nvlist_t *nvl) 326 { 327 struct cfgdata *cfgdata; 328 int init_size; 329 extern int alloc_total(); 330 nvlist_t *detector = NULL; 331 char *pathstr; 332 char *arg; 333 334 /* 335 * First check if e0ipp is actually in the topology so we can give a 336 * more useful error message. 337 */ 338 ipathlastcomp(e0ipp); 339 pathstr = ipath2str(NULL, e0ipp); 340 cfgdata = config_snapshot(); 341 platform_units_translate(0, cfgdata->cooked, NULL, NULL, 342 &detector, pathstr); 343 FREE(pathstr); 344 structconfig_free(cfgdata->cooked); 345 config_free(cfgdata); 346 if (detector == NULL) { 347 /* See if class permits silent discard on unknown component. 
*/ 348 if (lut_lookup(Ereportenames_discard, (void *)e0class, NULL)) { 349 out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport " 350 "to component path, but silent discard allowed.", 351 e0class); 352 } else { 353 Undiag_reason = UD_VAL_BADEVENTPATH; 354 (void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, 355 &detector); 356 arg = ipath2str(e0class, e0ipp); 357 publish_undiagnosable(hdl, ffep, fmcase, detector, arg); 358 FREE(arg); 359 } 360 return (NULL); 361 } 362 363 /* 364 * Next run a quick first pass of the rules with a dummy config. This 365 * allows us to prune those rules which can't possibly cause this 366 * ereport. 367 */ 368 if (!prune_propagations(e0class, e0ipp)) { 369 /* 370 * The fault class must have been in the rules or we would 371 * not have registered for it (and got a "nosub"), and the 372 * pathname must be in the topology or we would have failed the 373 * previous test. So to get here means the combination of 374 * class and pathname in the ereport must be invalid. 375 */ 376 Undiag_reason = UD_VAL_BADEVENTCLASS; 377 arg = ipath2str(e0class, e0ipp); 378 publish_undiagnosable(hdl, ffep, fmcase, detector, arg); 379 nvlist_free(detector); 380 FREE(arg); 381 return (NULL); 382 } 383 384 /* 385 * Now go ahead and create the real fme using the pruned rules. 
386 */ 387 init_size = alloc_total(); 388 out(O_ALTFP|O_STAMP, "start config_snapshot using %d bytes", init_size); 389 nvlist_free(detector); 390 pathstr = ipath2str(NULL, e0ipp); 391 cfgdata = config_snapshot(); 392 platform_units_translate(0, cfgdata->cooked, NULL, NULL, 393 &detector, pathstr); 394 FREE(pathstr); 395 platform_save_config(hdl, fmcase); 396 out(O_ALTFP|O_STAMP, "config_snapshot added %d bytes", 397 alloc_total() - init_size); 398 399 Nfmep = alloc_fme(); 400 401 Nfmep->id = Nextid++; 402 Nfmep->config = cfgdata->cooked; 403 config_free(cfgdata); 404 Nfmep->posted_suspects = 0; 405 Nfmep->uniqobs = 0; 406 Nfmep->state = FME_NOTHING; 407 Nfmep->pull = 0ULL; 408 Nfmep->overflow = 0; 409 410 Nfmep->fmcase = fmcase; 411 Nfmep->hdl = hdl; 412 413 if ((Nfmep->eventtree = itree_create(Nfmep->config)) == NULL) { 414 Undiag_reason = UD_VAL_INSTFAIL; 415 arg = ipath2str(e0class, e0ipp); 416 publish_undiagnosable(hdl, ffep, fmcase, detector, arg); 417 nvlist_free(detector); 418 FREE(arg); 419 structconfig_free(Nfmep->config); 420 destroy_fme_bufs(Nfmep); 421 FREE(Nfmep); 422 Nfmep = NULL; 423 return (NULL); 424 } 425 426 itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree); 427 428 if ((Nfmep->e0 = 429 itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) { 430 Undiag_reason = UD_VAL_BADEVENTI; 431 arg = ipath2str(e0class, e0ipp); 432 publish_undiagnosable(hdl, ffep, fmcase, detector, arg); 433 nvlist_free(detector); 434 FREE(arg); 435 itree_free(Nfmep->eventtree); 436 structconfig_free(Nfmep->config); 437 destroy_fme_bufs(Nfmep); 438 FREE(Nfmep); 439 Nfmep = NULL; 440 return (NULL); 441 } 442 443 nvlist_free(detector); 444 return (fme_ready(Nfmep)); 445 } 446 447 void 448 fme_fini(void) 449 { 450 struct fme *sfp, *fp; 451 struct case_list *ucasep, *nextcasep; 452 453 ucasep = Undiagablecaselist; 454 while (ucasep != NULL) { 455 nextcasep = ucasep->next; 456 FREE(ucasep); 457 ucasep = nextcasep; 458 } 459 Undiagablecaselist = NULL; 460 461 /* clean up closed 
fmes */ 462 fp = ClosedFMEs; 463 while (fp != NULL) { 464 sfp = fp->next; 465 destroy_fme(fp); 466 fp = sfp; 467 } 468 ClosedFMEs = NULL; 469 470 fp = FMElist; 471 while (fp != NULL) { 472 sfp = fp->next; 473 destroy_fme(fp); 474 fp = sfp; 475 } 476 FMElist = EFMElist = NULL; 477 478 /* if we were in the middle of creating an fme, free it now */ 479 if (Nfmep) { 480 destroy_fme(Nfmep); 481 Nfmep = NULL; 482 } 483 } 484 485 /* 486 * Allocated space for a buffer name. 20 bytes allows for 487 * a ridiculous 9,999,999 unique observations. 488 */ 489 #define OBBUFNMSZ 20 490 491 /* 492 * serialize_observation 493 * 494 * Create a recoverable version of the current observation 495 * (f->ecurrent). We keep a serialized version of each unique 496 * observation in order that we may resume correctly the fme in the 497 * correct state if eft or fmd crashes and we're restarted. 498 */ 499 static void 500 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp) 501 { 502 size_t pkdlen; 503 char tmpbuf[OBBUFNMSZ]; 504 char *pkd = NULL; 505 char *estr; 506 507 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs); 508 estr = ipath2str(cls, ipp); 509 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1); 510 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr, 511 strlen(estr) + 1); 512 FREE(estr); 513 514 if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) { 515 (void) snprintf(tmpbuf, 516 OBBUFNMSZ, "observed%d.nvp", fp->uniqobs); 517 if (nvlist_xpack(fp->ecurrent->nvp, 518 &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0) 519 out(O_DIE|O_SYS, "pack of observed nvl failed"); 520 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen); 521 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen); 522 FREE(pkd); 523 } 524 525 fp->uniqobs++; 526 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 527 sizeof (fp->uniqobs)); 528 } 529 530 /* 531 * init_fme_bufs -- We keep several bits of state about an fme for 532 * use 
if eft or fmd crashes and we're restarted. 533 */ 534 static void 535 init_fme_bufs(struct fme *fp) 536 { 537 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull)); 538 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull, 539 sizeof (fp->pull)); 540 541 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id)); 542 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id, 543 sizeof (fp->id)); 544 545 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs)); 546 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 547 sizeof (fp->uniqobs)); 548 549 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD, 550 sizeof (fp->posted_suspects)); 551 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD, 552 (void *)&fp->posted_suspects, sizeof (fp->posted_suspects)); 553 } 554 555 static void 556 destroy_fme_bufs(struct fme *fp) 557 { 558 char tmpbuf[OBBUFNMSZ]; 559 int o; 560 561 platform_restore_config(fp->hdl, fp->fmcase); 562 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN); 563 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG); 564 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL); 565 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID); 566 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD); 567 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS); 568 569 for (o = 0; o < fp->uniqobs; o++) { 570 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o); 571 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 572 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o); 573 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 574 } 575 } 576 577 /* 578 * reconstitute_observations -- convert a case's serialized observations 579 * back into struct events. Returns zero if all observations are 580 * successfully reconstituted. 
581 */ 582 static int 583 reconstitute_observations(struct fme *fmep) 584 { 585 struct event *ep; 586 struct node *epnamenp = NULL; 587 size_t pkdlen; 588 char *pkd = NULL; 589 char *tmpbuf = alloca(OBBUFNMSZ); 590 char *sepptr; 591 char *estr; 592 int ocnt; 593 int elen; 594 595 for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) { 596 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt); 597 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 598 if (elen == 0) { 599 out(O_ALTFP, 600 "reconstitute_observation: no %s buffer found.", 601 tmpbuf); 602 Undiag_reason = UD_VAL_MISSINGOBS; 603 break; 604 } 605 606 estr = MALLOC(elen); 607 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 608 sepptr = strchr(estr, '@'); 609 if (sepptr == NULL) { 610 out(O_ALTFP, 611 "reconstitute_observation: %s: " 612 "missing @ separator in %s.", 613 tmpbuf, estr); 614 Undiag_reason = UD_VAL_MISSINGPATH; 615 FREE(estr); 616 break; 617 } 618 619 *sepptr = '\0'; 620 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 621 out(O_ALTFP, 622 "reconstitute_observation: %s: " 623 "trouble converting path string \"%s\" " 624 "to internal representation.", 625 tmpbuf, sepptr + 1); 626 Undiag_reason = UD_VAL_MISSINGPATH; 627 FREE(estr); 628 break; 629 } 630 631 /* construct the event */ 632 ep = itree_lookup(fmep->eventtree, 633 stable(estr), ipath(epnamenp)); 634 if (ep == NULL) { 635 out(O_ALTFP, 636 "reconstitute_observation: %s: " 637 "lookup of \"%s\" in itree failed.", 638 tmpbuf, ipath2str(estr, ipath(epnamenp))); 639 Undiag_reason = UD_VAL_BADOBS; 640 tree_free(epnamenp); 641 FREE(estr); 642 break; 643 } 644 tree_free(epnamenp); 645 646 /* 647 * We may or may not have a saved nvlist for the observation 648 */ 649 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt); 650 pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 651 if (pkdlen != 0) { 652 pkd = MALLOC(pkdlen); 653 fmd_buf_read(fmep->hdl, 654 fmep->fmcase, tmpbuf, pkd, pkdlen); 655 ASSERT(ep->nvp == NULL); 
656 if (nvlist_xunpack(pkd, 657 pkdlen, &ep->nvp, &Eft_nv_hdl) != 0) 658 out(O_DIE|O_SYS, "pack of observed nvl failed"); 659 FREE(pkd); 660 } 661 662 if (ocnt == 0) 663 fmep->e0 = ep; 664 665 FREE(estr); 666 fmep->ecurrent = ep; 667 ep->count++; 668 669 /* link it into list of observations seen */ 670 ep->observations = fmep->observations; 671 fmep->observations = ep; 672 } 673 674 if (ocnt == fmep->uniqobs) { 675 (void) fme_ready(fmep); 676 return (0); 677 } 678 679 return (1); 680 } 681 682 /* 683 * restart_fme -- called during eft initialization. Reconstitutes 684 * an in-progress fme. 685 */ 686 void 687 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress) 688 { 689 nvlist_t *defect; 690 struct case_list *bad; 691 struct fme *fmep; 692 struct cfgdata *cfgdata; 693 size_t rawsz; 694 struct event *ep; 695 char *tmpbuf = alloca(OBBUFNMSZ); 696 char *sepptr; 697 char *estr; 698 int elen; 699 struct node *epnamenp = NULL; 700 int init_size; 701 extern int alloc_total(); 702 char *reason; 703 704 /* 705 * ignore solved or closed cases 706 */ 707 if (fmd_case_solved(hdl, inprogress) || 708 fmd_case_closed(hdl, inprogress)) 709 return; 710 711 fmep = alloc_fme(); 712 fmep->fmcase = inprogress; 713 fmep->hdl = hdl; 714 715 if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) { 716 out(O_ALTFP, "restart_fme: no saved posted status"); 717 Undiag_reason = UD_VAL_MISSINGINFO; 718 goto badcase; 719 } else { 720 fmd_buf_read(hdl, inprogress, WOBUF_POSTD, 721 (void *)&fmep->posted_suspects, 722 sizeof (fmep->posted_suspects)); 723 } 724 725 if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) { 726 out(O_ALTFP, "restart_fme: no saved id"); 727 Undiag_reason = UD_VAL_MISSINGINFO; 728 goto badcase; 729 } else { 730 fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id, 731 sizeof (fmep->id)); 732 } 733 if (Nextid <= fmep->id) 734 Nextid = fmep->id + 1; 735 736 out(O_ALTFP, "Replay FME %d", fmep->id); 737 738 if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) { 
739 out(O_ALTFP, "restart_fme: No config data"); 740 Undiag_reason = UD_VAL_MISSINGINFO; 741 goto badcase; 742 } 743 fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz, 744 sizeof (size_t)); 745 746 if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) { 747 out(O_ALTFP, "restart_fme: No event zero"); 748 Undiag_reason = UD_VAL_MISSINGZERO; 749 goto badcase; 750 } 751 752 if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) { 753 out(O_ALTFP, "restart_fme: no saved wait time"); 754 Undiag_reason = UD_VAL_MISSINGINFO; 755 goto badcase; 756 } else { 757 fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull, 758 sizeof (fmep->pull)); 759 } 760 761 if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) { 762 out(O_ALTFP, "restart_fme: no count of observations"); 763 Undiag_reason = UD_VAL_MISSINGINFO; 764 goto badcase; 765 } else { 766 fmd_buf_read(hdl, inprogress, WOBUF_NOBS, 767 (void *)&fmep->uniqobs, sizeof (fmep->uniqobs)); 768 } 769 770 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed0"); 771 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 772 if (elen == 0) { 773 out(O_ALTFP, "reconstitute_observation: no %s buffer found.", 774 tmpbuf); 775 Undiag_reason = UD_VAL_MISSINGOBS; 776 goto badcase; 777 } 778 estr = MALLOC(elen); 779 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 780 sepptr = strchr(estr, '@'); 781 if (sepptr == NULL) { 782 out(O_ALTFP, "reconstitute_observation: %s: " 783 "missing @ separator in %s.", 784 tmpbuf, estr); 785 Undiag_reason = UD_VAL_MISSINGPATH; 786 FREE(estr); 787 goto badcase; 788 } 789 *sepptr = '\0'; 790 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 791 out(O_ALTFP, "reconstitute_observation: %s: " 792 "trouble converting path string \"%s\" " 793 "to internal representation.", tmpbuf, sepptr + 1); 794 Undiag_reason = UD_VAL_MISSINGPATH; 795 FREE(estr); 796 goto badcase; 797 } 798 (void) prune_propagations(stable(estr), ipath(epnamenp)); 799 tree_free(epnamenp); 800 FREE(estr); 801 
802 init_size = alloc_total(); 803 out(O_ALTFP|O_STAMP, "start config_restore using %d bytes", init_size); 804 cfgdata = MALLOC(sizeof (struct cfgdata)); 805 cfgdata->cooked = NULL; 806 cfgdata->devcache = NULL; 807 cfgdata->devidcache = NULL; 808 cfgdata->tpcache = NULL; 809 cfgdata->cpucache = NULL; 810 cfgdata->raw_refcnt = 1; 811 812 if (rawsz > 0) { 813 if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) { 814 out(O_ALTFP, "restart_fme: Config data size mismatch"); 815 Undiag_reason = UD_VAL_CFGMISMATCH; 816 goto badcase; 817 } 818 cfgdata->begin = MALLOC(rawsz); 819 cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz; 820 fmd_buf_read(hdl, 821 inprogress, WOBUF_CFG, cfgdata->begin, rawsz); 822 } else { 823 cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL; 824 } 825 826 config_cook(cfgdata); 827 fmep->config = cfgdata->cooked; 828 config_free(cfgdata); 829 out(O_ALTFP|O_STAMP, "config_restore added %d bytes", 830 alloc_total() - init_size); 831 832 if ((fmep->eventtree = itree_create(fmep->config)) == NULL) { 833 /* case not properly saved or irretrievable */ 834 out(O_ALTFP, "restart_fme: NULL instance tree"); 835 Undiag_reason = UD_VAL_INSTFAIL; 836 goto badcase; 837 } 838 839 itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree); 840 841 if (reconstitute_observations(fmep) != 0) 842 goto badcase; 843 844 out(O_ALTFP|O_NONL, "FME %d replay observations: ", fmep->id); 845 for (ep = fmep->observations; ep; ep = ep->observations) { 846 out(O_ALTFP|O_NONL, " "); 847 itree_pevent_brief(O_ALTFP|O_NONL, ep); 848 } 849 out(O_ALTFP, NULL); 850 851 Open_fme_count++; 852 853 /* give the diagnosis algorithm a shot at the new FME state */ 854 fme_eval(fmep, fmep->e0r); 855 return; 856 857 badcase: 858 if (fmep->eventtree != NULL) 859 itree_free(fmep->eventtree); 860 if (fmep->config) 861 structconfig_free(fmep->config); 862 destroy_fme_bufs(fmep); 863 FREE(fmep); 864 865 /* 866 * Since we're unable to restart the case, add it to the undiagable 867 * list and 
solve and close it as appropriate. 868 */ 869 bad = MALLOC(sizeof (struct case_list)); 870 bad->next = NULL; 871 872 if (Undiagablecaselist != NULL) 873 bad->next = Undiagablecaselist; 874 Undiagablecaselist = bad; 875 bad->fmcase = inprogress; 876 877 out(O_ALTFP|O_NONL, "[case %s (unable to restart), ", 878 fmd_case_uuid(hdl, bad->fmcase)); 879 880 if (fmd_case_solved(hdl, bad->fmcase)) { 881 out(O_ALTFP|O_NONL, "already solved, "); 882 } else { 883 out(O_ALTFP|O_NONL, "solving, "); 884 defect = fmd_nvl_create_fault(hdl, 885 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 886 reason = undiag_2reason_str(Undiag_reason, NULL); 887 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 888 FREE(reason); 889 fmd_case_add_suspect(hdl, bad->fmcase, defect); 890 fmd_case_solve(hdl, bad->fmcase); 891 Undiag_reason = UD_VAL_UNKNOWN; 892 } 893 894 if (fmd_case_closed(hdl, bad->fmcase)) { 895 out(O_ALTFP, "already closed ]"); 896 } else { 897 out(O_ALTFP, "closing ]"); 898 fmd_case_close(hdl, bad->fmcase); 899 } 900 } 901 902 /*ARGSUSED*/ 903 static void 904 globals_destructor(void *left, void *right, void *arg) 905 { 906 struct evalue *evp = (struct evalue *)right; 907 if (evp->t == NODEPTR) 908 tree_free((struct node *)(uintptr_t)evp->v); 909 evp->v = (uintptr_t)NULL; 910 FREE(evp); 911 } 912 913 void 914 destroy_fme(struct fme *f) 915 { 916 stats_delete(f->Rcount); 917 stats_delete(f->Hcallcount); 918 stats_delete(f->Rcallcount); 919 stats_delete(f->Ccallcount); 920 stats_delete(f->Ecallcount); 921 stats_delete(f->Tcallcount); 922 stats_delete(f->Marrowcount); 923 stats_delete(f->diags); 924 925 if (f->eventtree != NULL) 926 itree_free(f->eventtree); 927 if (f->config) 928 structconfig_free(f->config); 929 lut_free(f->globals, globals_destructor, NULL); 930 FREE(f); 931 } 932 933 static const char * 934 fme_state2str(enum fme_state s) 935 { 936 switch (s) { 937 case FME_NOTHING: return ("NOTHING"); 938 case FME_WAIT: return ("WAIT"); 939 case FME_CREDIBLE: 
return ("CREDIBLE"); 940 case FME_DISPROVED: return ("DISPROVED"); 941 case FME_DEFERRED: return ("DEFERRED"); 942 default: return ("UNKNOWN"); 943 } 944 } 945 946 static int 947 is_problem(enum nametype t) 948 { 949 return (t == N_FAULT || t == N_DEFECT || t == N_UPSET); 950 } 951 952 static int 953 is_defect(enum nametype t) 954 { 955 return (t == N_DEFECT); 956 } 957 958 static int 959 is_upset(enum nametype t) 960 { 961 return (t == N_UPSET); 962 } 963 964 static void 965 fme_print(int flags, struct fme *fmep) 966 { 967 struct event *ep; 968 969 out(flags, "Fault Management Exercise %d", fmep->id); 970 out(flags, "\t State: %s", fme_state2str(fmep->state)); 971 out(flags|O_NONL, "\t Start time: "); 972 ptree_timeval(flags|O_NONL, &fmep->ull); 973 out(flags, NULL); 974 if (fmep->wull) { 975 out(flags|O_NONL, "\t Wait time: "); 976 ptree_timeval(flags|O_NONL, &fmep->wull); 977 out(flags, NULL); 978 } 979 out(flags|O_NONL, "\t E0: "); 980 if (fmep->e0) 981 itree_pevent_brief(flags|O_NONL, fmep->e0); 982 else 983 out(flags|O_NONL, "NULL"); 984 out(flags, NULL); 985 out(flags|O_NONL, "\tObservations:"); 986 for (ep = fmep->observations; ep; ep = ep->observations) { 987 out(flags|O_NONL, " "); 988 itree_pevent_brief(flags|O_NONL, ep); 989 } 990 out(flags, NULL); 991 out(flags|O_NONL, "\tSuspect list:"); 992 for (ep = fmep->suspects; ep; ep = ep->suspects) { 993 out(flags|O_NONL, " "); 994 itree_pevent_brief(flags|O_NONL, ep); 995 } 996 out(flags, NULL); 997 if (fmep->eventtree != NULL) { 998 out(flags|O_VERB2, "\t Tree:"); 999 itree_ptree(flags|O_VERB2, fmep->eventtree); 1000 } 1001 } 1002 1003 static struct node * 1004 pathstring2epnamenp(char *path) 1005 { 1006 char *sep = "/"; 1007 struct node *ret; 1008 char *ptr; 1009 1010 if ((ptr = strtok(path, sep)) == NULL) 1011 out(O_DIE, "pathstring2epnamenp: invalid empty class"); 1012 1013 ret = tree_iname(stable(ptr), NULL, 0); 1014 1015 while ((ptr = strtok(NULL, sep)) != NULL) 1016 ret = tree_name_append(ret, 1017 
tree_iname(stable(ptr), NULL, 0)); 1018 1019 return (ret); 1020 } 1021 1022 /* 1023 * for a given upset sp, increment the corresponding SERD engine. if the 1024 * SERD engine trips, return the ename and ipp of the resulting ereport. 1025 * returns true if engine tripped and *enamep and *ippp were filled in. 1026 */ 1027 static int 1028 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep, 1029 fmd_case_t *fmcase, struct event *sp, const char **enamep, 1030 const struct ipath **ippp) 1031 { 1032 struct node *serdinst; 1033 char *serdname; 1034 char *serdresource; 1035 char *serdclass; 1036 struct node *nid; 1037 struct serd_entry *newentp; 1038 int i, serdn = -1, serdincrement = 1, len = 0; 1039 char *serdsuffix = NULL, *serdt = NULL; 1040 struct evalue *ep; 1041 1042 ASSERT(sp->t == N_UPSET); 1043 ASSERT(ffep != NULL); 1044 1045 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1046 (void *)"n", (lut_cmp)strcmp)) != NULL) { 1047 ASSERT(ep->t == UINT64); 1048 serdn = (int)ep->v; 1049 } 1050 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1051 (void *)"t", (lut_cmp)strcmp)) != NULL) { 1052 ASSERT(ep->t == STRING); 1053 serdt = (char *)(uintptr_t)ep->v; 1054 } 1055 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1056 (void *)"suffix", (lut_cmp)strcmp)) != NULL) { 1057 ASSERT(ep->t == STRING); 1058 serdsuffix = (char *)(uintptr_t)ep->v; 1059 } 1060 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1061 (void *)"increment", (lut_cmp)strcmp)) != NULL) { 1062 ASSERT(ep->t == UINT64); 1063 serdincrement = (int)ep->v; 1064 } 1065 1066 /* 1067 * obtain instanced SERD engine from the upset sp. from this 1068 * derive serdname, the string used to identify the SERD engine. 
1069 */ 1070 serdinst = eventprop_lookup(sp, L_engine); 1071 1072 if (serdinst == NULL) 1073 return (-1); 1074 1075 len = strlen(serdinst->u.stmt.np->u.event.ename->u.name.s) + 1; 1076 if (serdsuffix != NULL) 1077 len += strlen(serdsuffix); 1078 serdclass = MALLOC(len); 1079 if (serdsuffix != NULL) 1080 (void) snprintf(serdclass, len, "%s%s", 1081 serdinst->u.stmt.np->u.event.ename->u.name.s, serdsuffix); 1082 else 1083 (void) snprintf(serdclass, len, "%s", 1084 serdinst->u.stmt.np->u.event.ename->u.name.s); 1085 serdresource = ipath2str(NULL, 1086 ipath(serdinst->u.stmt.np->u.event.epname)); 1087 len += strlen(serdresource) + 1; 1088 serdname = MALLOC(len); 1089 (void) snprintf(serdname, len, "%s@%s", serdclass, serdresource); 1090 FREE(serdresource); 1091 1092 /* handle serd engine "id" property, if there is one */ 1093 if ((nid = 1094 lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) { 1095 struct evalue *gval; 1096 char suffixbuf[200]; 1097 char *suffix; 1098 char *nserdname; 1099 size_t nname; 1100 1101 out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname); 1102 ptree_name_iter(O_ALTFP|O_NONL, nid); 1103 1104 ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t)); 1105 1106 if ((gval = lut_lookup(fmep->globals, 1107 (void *)nid->u.globid.s, NULL)) == NULL) { 1108 out(O_ALTFP, " undefined"); 1109 } else if (gval->t == UINT64) { 1110 out(O_ALTFP, " %llu", gval->v); 1111 (void) sprintf(suffixbuf, "%llu", gval->v); 1112 suffix = suffixbuf; 1113 } else { 1114 out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v); 1115 suffix = (char *)(uintptr_t)gval->v; 1116 } 1117 1118 nname = strlen(serdname) + strlen(suffix) + 2; 1119 nserdname = MALLOC(nname); 1120 (void) snprintf(nserdname, nname, "%s:%s", serdname, suffix); 1121 FREE(serdname); 1122 serdname = nserdname; 1123 } 1124 1125 /* 1126 * if the engine is empty, and we have an override for n/t then 1127 * destroy and recreate it. 
1128 */ 1129 if ((serdn != -1 || serdt != NULL) && fmd_serd_exists(hdl, serdname) && 1130 fmd_serd_empty(hdl, serdname)) 1131 fmd_serd_destroy(hdl, serdname); 1132 1133 if (!fmd_serd_exists(hdl, serdname)) { 1134 struct node *nN, *nT; 1135 const char *s; 1136 struct node *nodep; 1137 struct config *cp; 1138 char *path; 1139 uint_t nval; 1140 hrtime_t tval; 1141 int i; 1142 char *ptr; 1143 int got_n_override = 0, got_t_override = 0; 1144 1145 /* no SERD engine yet, so create it */ 1146 nodep = serdinst->u.stmt.np->u.event.epname; 1147 path = ipath2str(NULL, ipath(nodep)); 1148 cp = config_lookup(fmep->config, path, 0); 1149 FREE((void *)path); 1150 1151 /* 1152 * We allow serd paramaters to be overridden, either from 1153 * eft.conf file values (if Serd_Override is set) or from 1154 * driver properties (for "serd.io.device" engines). 1155 */ 1156 if (Serd_Override != NULL) { 1157 char *save_ptr, *ptr1, *ptr2, *ptr3; 1158 ptr3 = save_ptr = STRDUP(Serd_Override); 1159 while (*ptr3 != '\0') { 1160 ptr1 = strchr(ptr3, ','); 1161 *ptr1 = '\0'; 1162 if (strcmp(ptr3, serdclass) == 0) { 1163 ptr2 = strchr(ptr1 + 1, ','); 1164 *ptr2 = '\0'; 1165 nval = atoi(ptr1 + 1); 1166 out(O_ALTFP, "serd override %s_n %d", 1167 serdclass, nval); 1168 ptr3 = strchr(ptr2 + 1, ' '); 1169 if (ptr3) 1170 *ptr3 = '\0'; 1171 ptr = STRDUP(ptr2 + 1); 1172 out(O_ALTFP, "serd override %s_t %s", 1173 serdclass, ptr); 1174 got_n_override = 1; 1175 got_t_override = 1; 1176 break; 1177 } else { 1178 ptr2 = strchr(ptr1 + 1, ','); 1179 ptr3 = strchr(ptr2 + 1, ' '); 1180 if (ptr3 == NULL) 1181 break; 1182 } 1183 ptr3++; 1184 } 1185 FREE(save_ptr); 1186 } 1187 1188 if (cp && got_n_override == 0) { 1189 /* 1190 * convert serd engine class into property name 1191 */ 1192 char *prop_name = MALLOC(strlen(serdclass) + 3); 1193 for (i = 0; i < strlen(serdclass); i++) { 1194 if (serdclass[i] == '.') 1195 prop_name[i] = '_'; 1196 else 1197 prop_name[i] = serdclass[i]; 1198 } 1199 prop_name[i++] = '_'; 1200 
prop_name[i++] = 'n'; 1201 prop_name[i] = '\0'; 1202 if (s = config_getprop(cp, prop_name)) { 1203 nval = atoi(s); 1204 out(O_ALTFP, "serd override %s_n %s", 1205 serdclass, s); 1206 got_n_override = 1; 1207 } 1208 prop_name[i - 1] = 't'; 1209 if (s = config_getprop(cp, prop_name)) { 1210 ptr = STRDUP(s); 1211 out(O_ALTFP, "serd override %s_t %s", 1212 serdclass, s); 1213 got_t_override = 1; 1214 } 1215 FREE(prop_name); 1216 } 1217 1218 if (serdn != -1 && got_n_override == 0) { 1219 nval = serdn; 1220 out(O_ALTFP, "serd override %s_n %d", serdclass, serdn); 1221 got_n_override = 1; 1222 } 1223 if (serdt != NULL && got_t_override == 0) { 1224 ptr = STRDUP(serdt); 1225 out(O_ALTFP, "serd override %s_t %s", serdclass, serdt); 1226 got_t_override = 1; 1227 } 1228 1229 if (!got_n_override) { 1230 nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N, 1231 NULL); 1232 ASSERT(nN->t == T_NUM); 1233 nval = (uint_t)nN->u.ull; 1234 } 1235 if (!got_t_override) { 1236 nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T, 1237 NULL); 1238 ASSERT(nT->t == T_TIMEVAL); 1239 tval = (hrtime_t)nT->u.ull; 1240 } else { 1241 const unsigned long long *ullp; 1242 const char *suffix; 1243 int len; 1244 1245 len = strspn(ptr, "0123456789"); 1246 suffix = stable(&ptr[len]); 1247 ullp = (unsigned long long *)lut_lookup(Timesuffixlut, 1248 (void *)suffix, NULL); 1249 ptr[len] = '\0'; 1250 tval = strtoull(ptr, NULL, 0) * (ullp ? 
*ullp : 1ll); 1251 FREE(ptr); 1252 } 1253 fmd_serd_create(hdl, serdname, nval, tval); 1254 } 1255 1256 newentp = MALLOC(sizeof (*newentp)); 1257 newentp->ename = stable(serdclass); 1258 FREE(serdclass); 1259 newentp->ipath = ipath(serdinst->u.stmt.np->u.event.epname); 1260 newentp->hdl = hdl; 1261 if (lut_lookup(SerdEngines, newentp, (lut_cmp)serd_cmp) == NULL) { 1262 SerdEngines = lut_add(SerdEngines, (void *)newentp, 1263 (void *)newentp, (lut_cmp)serd_cmp); 1264 Serd_need_save = 1; 1265 serd_save(); 1266 } else { 1267 FREE(newentp); 1268 } 1269 1270 1271 /* 1272 * increment SERD engine. if engine fires, reset serd 1273 * engine and return trip_strcode if required. 1274 */ 1275 for (i = 0; i < serdincrement; i++) { 1276 if (fmd_serd_record(hdl, serdname, ffep)) { 1277 fmd_case_add_serd(hdl, fmcase, serdname); 1278 fmd_serd_reset(hdl, serdname); 1279 1280 if (ippp) { 1281 struct node *tripinst = 1282 lut_lookup(serdinst->u.stmt.lutp, 1283 (void *)L_trip, NULL); 1284 ASSERT(tripinst != NULL); 1285 *enamep = tripinst->u.event.ename->u.name.s; 1286 *ippp = ipath(tripinst->u.event.epname); 1287 out(O_ALTFP|O_NONL, 1288 "[engine fired: %s, sending: ", serdname); 1289 ipath_print(O_ALTFP|O_NONL, *enamep, *ippp); 1290 out(O_ALTFP, "]"); 1291 } else { 1292 out(O_ALTFP, "[engine fired: %s, no trip]", 1293 serdname); 1294 } 1295 FREE(serdname); 1296 return (1); 1297 } 1298 } 1299 1300 FREE(serdname); 1301 return (0); 1302 } 1303 1304 /* 1305 * search a suspect list for upsets. feed each upset to serd_eval() and 1306 * build up tripped[], an array of ereports produced by the firing of 1307 * any SERD engines. then feed each ereport back into 1308 * fme_receive_report(). 1309 * 1310 * returns ntrip, the number of these ereports produced. 
1311 */ 1312 static int 1313 upsets_eval(struct fme *fmep, fmd_event_t *ffep) 1314 { 1315 /* we build an array of tripped ereports that we send ourselves */ 1316 struct { 1317 const char *ename; 1318 const struct ipath *ipp; 1319 } *tripped; 1320 struct event *sp; 1321 int ntrip, nupset, i; 1322 1323 /* 1324 * count the number of upsets to determine the upper limit on 1325 * expected trip ereport strings. remember that one upset can 1326 * lead to at most one ereport. 1327 */ 1328 nupset = 0; 1329 for (sp = fmep->suspects; sp; sp = sp->suspects) { 1330 if (sp->t == N_UPSET) 1331 nupset++; 1332 } 1333 1334 if (nupset == 0) 1335 return (0); 1336 1337 /* 1338 * get to this point if we have upsets and expect some trip 1339 * ereports 1340 */ 1341 tripped = alloca(sizeof (*tripped) * nupset); 1342 bzero((void *)tripped, sizeof (*tripped) * nupset); 1343 1344 ntrip = 0; 1345 for (sp = fmep->suspects; sp; sp = sp->suspects) 1346 if (sp->t == N_UPSET && 1347 serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp, 1348 &tripped[ntrip].ename, &tripped[ntrip].ipp) == 1) 1349 ntrip++; 1350 1351 for (i = 0; i < ntrip; i++) { 1352 struct event *ep, *nep; 1353 struct fme *nfmep; 1354 fmd_case_t *fmcase; 1355 const struct ipath *ipp; 1356 const char *eventstring; 1357 int prev_verbose; 1358 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 1359 enum fme_state state; 1360 1361 /* 1362 * First try and evaluate a case with the trip ereport plus 1363 * all the other ereports that cause the trip. If that fails 1364 * to evaluate then try again with just this ereport on its own. 
1365 */ 1366 out(O_ALTFP|O_NONL, "fme_receive_report_serd: "); 1367 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp); 1368 out(O_ALTFP|O_STAMP, NULL); 1369 ep = fmep->e0; 1370 eventstring = ep->enode->u.event.ename->u.name.s; 1371 ipp = ep->ipp; 1372 1373 /* 1374 * create a duplicate fme and case 1375 */ 1376 fmcase = fmd_case_open(fmep->hdl, NULL); 1377 out(O_ALTFP|O_NONL, "duplicate fme for event ["); 1378 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1379 out(O_ALTFP, " ]"); 1380 1381 if ((nfmep = newfme(eventstring, ipp, fmep->hdl, 1382 fmcase, ffep, ep->nvp)) == NULL) { 1383 out(O_ALTFP|O_NONL, "["); 1384 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1385 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1386 continue; 1387 } 1388 1389 Open_fme_count++; 1390 nfmep->pull = fmep->pull; 1391 init_fme_bufs(nfmep); 1392 out(O_ALTFP|O_NONL, "["); 1393 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1394 out(O_ALTFP, " created FME%d, case %s]", nfmep->id, 1395 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 1396 if (ffep) { 1397 fmd_case_setprincipal(nfmep->hdl, nfmep->fmcase, ffep); 1398 fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, ffep); 1399 nfmep->e0r = ffep; 1400 } 1401 1402 /* 1403 * add the original ereports 1404 */ 1405 for (ep = fmep->observations; ep; ep = ep->observations) { 1406 eventstring = ep->enode->u.event.ename->u.name.s; 1407 ipp = ep->ipp; 1408 out(O_ALTFP|O_NONL, "adding event ["); 1409 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1410 out(O_ALTFP, " ]"); 1411 nep = itree_lookup(nfmep->eventtree, eventstring, ipp); 1412 if (nep->count++ == 0) { 1413 nep->observations = nfmep->observations; 1414 nfmep->observations = nep; 1415 serialize_observation(nfmep, eventstring, ipp); 1416 nep->nvp = evnv_dupnvl(ep->nvp); 1417 } 1418 if (ep->ffep && ep->ffep != ffep) 1419 fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, 1420 ep->ffep); 1421 stats_counter_bump(nfmep->Rcount); 1422 } 1423 1424 /* 1425 * add the serd trigger ereport 1426 */ 1427 if ((ep = 
itree_lookup(nfmep->eventtree, tripped[i].ename, 1428 tripped[i].ipp)) == NULL) { 1429 /* 1430 * The trigger ereport is not in the instance tree. It 1431 * was presumably removed by prune_propagations() as 1432 * this combination of events is not present in the 1433 * rules. 1434 */ 1435 out(O_ALTFP, "upsets_eval: e0 not in instance tree"); 1436 Undiag_reason = UD_VAL_BADEVENTI; 1437 goto retry_lone_ereport; 1438 } 1439 out(O_ALTFP|O_NONL, "adding event ["); 1440 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp); 1441 out(O_ALTFP, " ]"); 1442 nfmep->ecurrent = ep; 1443 ep->nvp = NULL; 1444 ep->count = 1; 1445 ep->observations = nfmep->observations; 1446 nfmep->observations = ep; 1447 1448 /* 1449 * just peek first. 1450 */ 1451 nfmep->peek = 1; 1452 prev_verbose = Verbose; 1453 if (Debug == 0) 1454 Verbose = 0; 1455 lut_walk(nfmep->eventtree, (lut_cb)clear_arrows, (void *)nfmep); 1456 state = hypothesise(nfmep, nfmep->e0, nfmep->ull, &my_delay); 1457 nfmep->peek = 0; 1458 Verbose = prev_verbose; 1459 if (state == FME_DISPROVED) { 1460 out(O_ALTFP, "upsets_eval: hypothesis disproved"); 1461 Undiag_reason = UD_VAL_UNSOLVD; 1462 retry_lone_ereport: 1463 /* 1464 * However the trigger ereport on its own might be 1465 * diagnosable, so check for that. Undo the new fme 1466 * and case we just created and call fme_receive_report. 
1467 */ 1468 out(O_ALTFP|O_NONL, "["); 1469 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, 1470 tripped[i].ipp); 1471 out(O_ALTFP, " retrying with just trigger ereport]"); 1472 itree_free(nfmep->eventtree); 1473 nfmep->eventtree = NULL; 1474 structconfig_free(nfmep->config); 1475 nfmep->config = NULL; 1476 destroy_fme_bufs(nfmep); 1477 fmd_case_close(nfmep->hdl, nfmep->fmcase); 1478 fme_receive_report(fmep->hdl, ffep, 1479 tripped[i].ename, tripped[i].ipp, NULL); 1480 continue; 1481 } 1482 1483 /* 1484 * and evaluate 1485 */ 1486 serialize_observation(nfmep, tripped[i].ename, tripped[i].ipp); 1487 fme_eval(nfmep, ffep); 1488 } 1489 1490 return (ntrip); 1491 } 1492 1493 /* 1494 * fme_receive_external_report -- call when an external ereport comes in 1495 * 1496 * this routine just converts the relevant information from the ereport 1497 * into a format used internally and passes it on to fme_receive_report(). 1498 */ 1499 void 1500 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 1501 const char *class) 1502 { 1503 struct node *epnamenp; 1504 fmd_case_t *fmcase; 1505 const struct ipath *ipp; 1506 nvlist_t *detector = NULL; 1507 1508 class = stable(class); 1509 1510 /* Get the component path from the ereport */ 1511 epnamenp = platform_getpath(nvl); 1512 1513 /* See if we ended up without a path. */ 1514 if (epnamenp == NULL) { 1515 /* See if class permits silent discard on unknown component. */ 1516 if (lut_lookup(Ereportenames_discard, (void *)class, NULL)) { 1517 out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport " 1518 "to component path, but silent discard allowed.", 1519 class); 1520 } else { 1521 /* 1522 * XFILE: Failure to find a component is bad unless 1523 * 'discard_if_config_unknown=1' was specified in the 1524 * ereport definition. Indicate undiagnosable. 
1525 */ 1526 Undiag_reason = UD_VAL_NOPATH; 1527 fmcase = fmd_case_open(hdl, NULL); 1528 1529 /* 1530 * We don't have a component path here (which means that 1531 * the detector was not in hc-scheme and couldn't be 1532 * converted to hc-scheme. Report the raw detector as 1533 * the suspect resource if there is one. 1534 */ 1535 (void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, 1536 &detector); 1537 publish_undiagnosable(hdl, ffep, fmcase, detector, 1538 (char *)class); 1539 } 1540 return; 1541 } 1542 1543 ipp = ipath(epnamenp); 1544 tree_free(epnamenp); 1545 fme_receive_report(hdl, ffep, class, ipp, nvl); 1546 } 1547 1548 /*ARGSUSED*/ 1549 void 1550 fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 1551 const char *eventstring) 1552 { 1553 char *uuid; 1554 nvlist_t **nva; 1555 uint_t nvc; 1556 const struct ipath *ipp; 1557 1558 if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 || 1559 nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 1560 &nva, &nvc) != 0) { 1561 out(O_ALTFP, "No uuid or fault list for list.repaired event"); 1562 return; 1563 } 1564 1565 out(O_ALTFP, "Processing list.repaired from case %s", uuid); 1566 1567 while (nvc-- != 0) { 1568 /* 1569 * Reset any istat or serd engine associated with this path. 
1570 */ 1571 char *path; 1572 1573 if ((ipp = platform_fault2ipath(*nva++)) == NULL) 1574 continue; 1575 1576 path = ipath2str(NULL, ipp); 1577 out(O_ALTFP, "fme_receive_repair_list: resetting state for %s", 1578 path); 1579 FREE(path); 1580 1581 lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp); 1582 istat_save(); 1583 1584 lut_walk(SerdEngines, (lut_cb)serd_reset_cb, (void *)ipp); 1585 serd_save(); 1586 } 1587 } 1588 1589 /*ARGSUSED*/ 1590 void 1591 fme_receive_topology_change(void) 1592 { 1593 lut_walk(Istats, (lut_cb)istat_counter_topo_chg_cb, NULL); 1594 istat_save(); 1595 1596 lut_walk(SerdEngines, (lut_cb)serd_topo_chg_cb, NULL); 1597 serd_save(); 1598 } 1599 1600 static int mark_arrows(struct fme *fmep, struct event *ep, int mark, 1601 unsigned long long at_latest_by, unsigned long long *pdelay, int keep); 1602 1603 /* ARGSUSED */ 1604 static void 1605 clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 1606 { 1607 struct bubble *bp; 1608 struct arrowlist *ap; 1609 1610 ep->cached_state = 0; 1611 ep->keep_in_tree = 0; 1612 for (bp = itree_next_bubble(ep, NULL); bp; 1613 bp = itree_next_bubble(ep, bp)) { 1614 if (bp->t != B_FROM) 1615 continue; 1616 bp->mark = 0; 1617 for (ap = itree_next_arrow(bp, NULL); ap; 1618 ap = itree_next_arrow(bp, ap)) 1619 ap->arrowp->mark = 0; 1620 } 1621 } 1622 1623 static void 1624 fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 1625 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl) 1626 { 1627 struct event *ep; 1628 struct fme *fmep = NULL; 1629 struct fme *ofmep = NULL; 1630 struct fme *cfmep, *svfmep; 1631 int matched = 0; 1632 nvlist_t *defect; 1633 fmd_case_t *fmcase; 1634 char *reason; 1635 1636 out(O_ALTFP|O_NONL, "fme_receive_report: "); 1637 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1638 out(O_ALTFP|O_STAMP, NULL); 1639 1640 /* decide which FME it goes to */ 1641 for (fmep = FMElist; fmep; fmep = fmep->next) { 1642 int prev_verbose; 1643 unsigned long long my_delay = 
TIMEVAL_EVENTUALLY; 1644 enum fme_state state; 1645 nvlist_t *pre_peek_nvp = NULL; 1646 1647 if (fmep->overflow) { 1648 if (!(fmd_case_closed(fmep->hdl, fmep->fmcase))) 1649 ofmep = fmep; 1650 1651 continue; 1652 } 1653 1654 /* 1655 * ignore solved or closed cases 1656 */ 1657 if (fmep->posted_suspects || 1658 fmd_case_solved(fmep->hdl, fmep->fmcase) || 1659 fmd_case_closed(fmep->hdl, fmep->fmcase)) 1660 continue; 1661 1662 /* look up event in event tree for this FME */ 1663 if ((ep = itree_lookup(fmep->eventtree, 1664 eventstring, ipp)) == NULL) 1665 continue; 1666 1667 /* note observation */ 1668 fmep->ecurrent = ep; 1669 if (ep->count++ == 0) { 1670 /* link it into list of observations seen */ 1671 ep->observations = fmep->observations; 1672 fmep->observations = ep; 1673 ep->nvp = evnv_dupnvl(nvl); 1674 } else { 1675 /* use new payload values for peek */ 1676 pre_peek_nvp = ep->nvp; 1677 ep->nvp = evnv_dupnvl(nvl); 1678 } 1679 1680 /* tell hypothesise() not to mess with suspect list */ 1681 fmep->peek = 1; 1682 1683 /* don't want this to be verbose (unless Debug is set) */ 1684 prev_verbose = Verbose; 1685 if (Debug == 0) 1686 Verbose = 0; 1687 1688 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 1689 state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 1690 1691 fmep->peek = 0; 1692 1693 /* put verbose flag back */ 1694 Verbose = prev_verbose; 1695 1696 if (state != FME_DISPROVED) { 1697 /* found an FME that explains the ereport */ 1698 matched++; 1699 out(O_ALTFP|O_NONL, "["); 1700 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1701 out(O_ALTFP, " explained by FME%d]", fmep->id); 1702 1703 if (pre_peek_nvp) 1704 nvlist_free(pre_peek_nvp); 1705 1706 if (ep->count == 1) 1707 serialize_observation(fmep, eventstring, ipp); 1708 1709 if (ffep) { 1710 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1711 ep->ffep = ffep; 1712 } 1713 1714 stats_counter_bump(fmep->Rcount); 1715 1716 /* re-eval FME */ 1717 fme_eval(fmep, ffep); 1718 } else { 1719 
1720 /* not a match, undo noting of observation */ 1721 fmep->ecurrent = NULL; 1722 if (--ep->count == 0) { 1723 /* unlink it from observations */ 1724 fmep->observations = ep->observations; 1725 ep->observations = NULL; 1726 nvlist_free(ep->nvp); 1727 ep->nvp = NULL; 1728 } else { 1729 nvlist_free(ep->nvp); 1730 ep->nvp = pre_peek_nvp; 1731 } 1732 } 1733 } 1734 1735 if (matched) 1736 return; /* explained by at least one existing FME */ 1737 1738 /* clean up closed fmes */ 1739 cfmep = ClosedFMEs; 1740 while (cfmep != NULL) { 1741 svfmep = cfmep->next; 1742 destroy_fme(cfmep); 1743 cfmep = svfmep; 1744 } 1745 ClosedFMEs = NULL; 1746 1747 if (ofmep) { 1748 out(O_ALTFP|O_NONL, "["); 1749 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1750 out(O_ALTFP, " ADDING TO OVERFLOW FME]"); 1751 if (ffep) 1752 fmd_case_add_ereport(hdl, ofmep->fmcase, ffep); 1753 1754 return; 1755 1756 } else if (Max_fme && (Open_fme_count >= Max_fme)) { 1757 out(O_ALTFP|O_NONL, "["); 1758 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1759 out(O_ALTFP, " MAX OPEN FME REACHED]"); 1760 1761 fmcase = fmd_case_open(hdl, NULL); 1762 1763 /* Create overflow fme */ 1764 if ((fmep = newfme(eventstring, ipp, hdl, fmcase, ffep, 1765 nvl)) == NULL) { 1766 out(O_ALTFP|O_NONL, "["); 1767 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1768 out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]"); 1769 return; 1770 } 1771 1772 Open_fme_count++; 1773 1774 init_fme_bufs(fmep); 1775 fmep->overflow = B_TRUE; 1776 1777 if (ffep) 1778 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1779 1780 Undiag_reason = UD_VAL_MAXFME; 1781 defect = fmd_nvl_create_fault(hdl, 1782 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 1783 reason = undiag_2reason_str(Undiag_reason, NULL); 1784 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 1785 FREE(reason); 1786 fmd_case_add_suspect(hdl, fmep->fmcase, defect); 1787 fmd_case_solve(hdl, fmep->fmcase); 1788 Undiag_reason = UD_VAL_UNKNOWN; 1789 return; 1790 } 1791 1792 /* open a case 
*/ 1793 fmcase = fmd_case_open(hdl, NULL); 1794 1795 /* start a new FME */ 1796 if ((fmep = newfme(eventstring, ipp, hdl, fmcase, ffep, nvl)) == NULL) { 1797 out(O_ALTFP|O_NONL, "["); 1798 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1799 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1800 return; 1801 } 1802 1803 Open_fme_count++; 1804 1805 init_fme_bufs(fmep); 1806 1807 out(O_ALTFP|O_NONL, "["); 1808 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1809 out(O_ALTFP, " created FME%d, case %s]", fmep->id, 1810 fmd_case_uuid(hdl, fmep->fmcase)); 1811 1812 ep = fmep->e0; 1813 ASSERT(ep != NULL); 1814 1815 /* note observation */ 1816 fmep->ecurrent = ep; 1817 if (ep->count++ == 0) { 1818 /* link it into list of observations seen */ 1819 ep->observations = fmep->observations; 1820 fmep->observations = ep; 1821 ep->nvp = evnv_dupnvl(nvl); 1822 serialize_observation(fmep, eventstring, ipp); 1823 } else { 1824 /* new payload overrides any previous */ 1825 nvlist_free(ep->nvp); 1826 ep->nvp = evnv_dupnvl(nvl); 1827 } 1828 1829 stats_counter_bump(fmep->Rcount); 1830 1831 if (ffep) { 1832 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1833 fmd_case_setprincipal(hdl, fmep->fmcase, ffep); 1834 fmep->e0r = ffep; 1835 ep->ffep = ffep; 1836 } 1837 1838 /* give the diagnosis algorithm a shot at the new FME state */ 1839 fme_eval(fmep, ffep); 1840 } 1841 1842 void 1843 fme_status(int flags) 1844 { 1845 struct fme *fmep; 1846 1847 if (FMElist == NULL) { 1848 out(flags, "No fault management exercises underway."); 1849 return; 1850 } 1851 1852 for (fmep = FMElist; fmep; fmep = fmep->next) 1853 fme_print(flags, fmep); 1854 } 1855 1856 /* 1857 * "indent" routines used mostly for nicely formatted debug output, but also 1858 * for sanity checking for infinite recursion bugs. 
1859 */ 1860 1861 #define MAX_INDENT 1024 1862 static const char *indent_s[MAX_INDENT]; 1863 static int current_indent; 1864 1865 static void 1866 indent_push(const char *s) 1867 { 1868 if (current_indent < MAX_INDENT) 1869 indent_s[current_indent++] = s; 1870 else 1871 out(O_DIE, "unexpected recursion depth (%d)", current_indent); 1872 } 1873 1874 static void 1875 indent_set(const char *s) 1876 { 1877 current_indent = 0; 1878 indent_push(s); 1879 } 1880 1881 static void 1882 indent_pop(void) 1883 { 1884 if (current_indent > 0) 1885 current_indent--; 1886 else 1887 out(O_DIE, "recursion underflow"); 1888 } 1889 1890 static void 1891 indent(void) 1892 { 1893 int i; 1894 if (!Verbose) 1895 return; 1896 for (i = 0; i < current_indent; i++) 1897 out(O_ALTFP|O_VERB|O_NONL, indent_s[i]); 1898 } 1899 1900 #define SLNEW 1 1901 #define SLCHANGED 2 1902 #define SLWAIT 3 1903 #define SLDISPROVED 4 1904 1905 static void 1906 print_suspects(int circumstance, struct fme *fmep) 1907 { 1908 struct event *ep; 1909 1910 out(O_ALTFP|O_NONL, "["); 1911 if (circumstance == SLCHANGED) { 1912 out(O_ALTFP|O_NONL, "FME%d diagnosis changed. 
state: %s, " 1913 "suspect list:", fmep->id, fme_state2str(fmep->state)); 1914 } else if (circumstance == SLWAIT) { 1915 out(O_ALTFP|O_NONL, "FME%d set wait timer %ld ", fmep->id, 1916 fmep->timer); 1917 ptree_timeval(O_ALTFP|O_NONL, &fmep->wull); 1918 } else if (circumstance == SLDISPROVED) { 1919 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id); 1920 } else { 1921 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id); 1922 } 1923 1924 if (circumstance == SLWAIT || circumstance == SLDISPROVED) { 1925 out(O_ALTFP, "]"); 1926 return; 1927 } 1928 1929 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1930 out(O_ALTFP|O_NONL, " "); 1931 itree_pevent_brief(O_ALTFP|O_NONL, ep); 1932 } 1933 out(O_ALTFP, "]"); 1934 } 1935 1936 static struct node * 1937 eventprop_lookup(struct event *ep, const char *propname) 1938 { 1939 return (lut_lookup(ep->props, (void *)propname, NULL)); 1940 } 1941 1942 #define MAXDIGITIDX 23 1943 static char numbuf[MAXDIGITIDX + 1]; 1944 1945 static int 1946 node2uint(struct node *n, uint_t *valp) 1947 { 1948 struct evalue value; 1949 struct lut *globals = NULL; 1950 1951 if (n == NULL) 1952 return (1); 1953 1954 /* 1955 * check value.v since we are being asked to convert an unsigned 1956 * long long int to an unsigned int 1957 */ 1958 if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) || 1959 value.t != UINT64 || value.v > (1ULL << 32)) 1960 return (1); 1961 1962 *valp = (uint_t)value.v; 1963 1964 return (0); 1965 } 1966 1967 static nvlist_t * 1968 node2fmri(struct node *n) 1969 { 1970 nvlist_t **pa, *f, *p; 1971 struct node *nc; 1972 uint_t depth = 0; 1973 char *numstr, *nullbyte; 1974 char *failure; 1975 int err, i; 1976 1977 /* XXX do we need to be able to handle a non-T_NAME node? 
*/ 1978 if (n == NULL || n->t != T_NAME) 1979 return (NULL); 1980 1981 for (nc = n; nc != NULL; nc = nc->u.name.next) { 1982 if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM) 1983 break; 1984 depth++; 1985 } 1986 1987 if (nc != NULL) { 1988 /* We bailed early, something went wrong */ 1989 return (NULL); 1990 } 1991 1992 if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0) 1993 out(O_DIE|O_SYS, "alloc of fmri nvl failed"); 1994 pa = alloca(depth * sizeof (nvlist_t *)); 1995 for (i = 0; i < depth; i++) 1996 pa[i] = NULL; 1997 1998 err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC); 1999 err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION); 2000 err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, ""); 2001 err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth); 2002 if (err != 0) { 2003 failure = "basic construction of FMRI failed"; 2004 goto boom; 2005 } 2006 2007 numbuf[MAXDIGITIDX] = '\0'; 2008 nullbyte = &numbuf[MAXDIGITIDX]; 2009 i = 0; 2010 2011 for (nc = n; nc != NULL; nc = nc->u.name.next) { 2012 err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl); 2013 if (err != 0) { 2014 failure = "alloc of an hc-pair failed"; 2015 goto boom; 2016 } 2017 err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s); 2018 numstr = ulltostr(nc->u.name.child->u.ull, nullbyte); 2019 err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr); 2020 if (err != 0) { 2021 failure = "construction of an hc-pair failed"; 2022 goto boom; 2023 } 2024 pa[i++] = p; 2025 } 2026 2027 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 2028 if (err == 0) { 2029 for (i = 0; i < depth; i++) 2030 if (pa[i] != NULL) 2031 nvlist_free(pa[i]); 2032 return (f); 2033 } 2034 failure = "addition of hc-pair array to FMRI failed"; 2035 2036 boom: 2037 for (i = 0; i < depth; i++) 2038 if (pa[i] != NULL) 2039 nvlist_free(pa[i]); 2040 nvlist_free(f); 2041 out(O_DIE, "%s", failure); 2042 /*NOTREACHED*/ 2043 return (NULL); 2044 } 2045 2046 /* an ipath cache entry is an array 
of these, with s==NULL at the end */ 2047 struct ipath { 2048 const char *s; /* component name (in stable) */ 2049 int i; /* instance number */ 2050 }; 2051 2052 static nvlist_t * 2053 ipath2fmri(struct ipath *ipath) 2054 { 2055 nvlist_t **pa, *f, *p; 2056 uint_t depth = 0; 2057 char *numstr, *nullbyte; 2058 char *failure; 2059 int err, i; 2060 struct ipath *ipp; 2061 2062 for (ipp = ipath; ipp->s != NULL; ipp++) 2063 depth++; 2064 2065 if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0) 2066 out(O_DIE|O_SYS, "alloc of fmri nvl failed"); 2067 pa = alloca(depth * sizeof (nvlist_t *)); 2068 for (i = 0; i < depth; i++) 2069 pa[i] = NULL; 2070 2071 err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC); 2072 err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION); 2073 err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, ""); 2074 err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth); 2075 if (err != 0) { 2076 failure = "basic construction of FMRI failed"; 2077 goto boom; 2078 } 2079 2080 numbuf[MAXDIGITIDX] = '\0'; 2081 nullbyte = &numbuf[MAXDIGITIDX]; 2082 i = 0; 2083 2084 for (ipp = ipath; ipp->s != NULL; ipp++) { 2085 err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl); 2086 if (err != 0) { 2087 failure = "alloc of an hc-pair failed"; 2088 goto boom; 2089 } 2090 err = nvlist_add_string(p, FM_FMRI_HC_NAME, ipp->s); 2091 numstr = ulltostr(ipp->i, nullbyte); 2092 err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr); 2093 if (err != 0) { 2094 failure = "construction of an hc-pair failed"; 2095 goto boom; 2096 } 2097 pa[i++] = p; 2098 } 2099 2100 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 2101 if (err == 0) { 2102 for (i = 0; i < depth; i++) 2103 if (pa[i] != NULL) 2104 nvlist_free(pa[i]); 2105 return (f); 2106 } 2107 failure = "addition of hc-pair array to FMRI failed"; 2108 2109 boom: 2110 for (i = 0; i < depth; i++) 2111 if (pa[i] != NULL) 2112 nvlist_free(pa[i]); 2113 nvlist_free(f); 2114 out(O_DIE, "%s", failure); 2115 
/*NOTREACHED*/ 2116 return (NULL); 2117 } 2118 2119 static uint8_t 2120 percentof(uint_t part, uint_t whole) 2121 { 2122 unsigned long long p = part * 1000; 2123 2124 return ((p / whole / 10) + (((p / whole % 10) >= 5) ? 1 : 0)); 2125 } 2126 2127 struct rsl { 2128 struct event *suspect; 2129 nvlist_t *asru; 2130 nvlist_t *fru; 2131 nvlist_t *rsrc; 2132 }; 2133 2134 static void publish_suspects(struct fme *fmep, struct rsl *srl); 2135 2136 /* 2137 * rslfree -- free internal members of struct rsl not expected to be 2138 * freed elsewhere. 2139 */ 2140 static void 2141 rslfree(struct rsl *freeme) 2142 { 2143 if (freeme->asru != NULL) 2144 nvlist_free(freeme->asru); 2145 if (freeme->fru != NULL) 2146 nvlist_free(freeme->fru); 2147 if (freeme->rsrc != NULL && freeme->rsrc != freeme->asru) 2148 nvlist_free(freeme->rsrc); 2149 } 2150 2151 /* 2152 * rslcmp -- compare two rsl structures. Use the following 2153 * comparisons to establish cardinality: 2154 * 2155 * 1. Name of the suspect's class. (simple strcmp) 2156 * 2. Name of the suspect's ASRU. (trickier, since nvlist) 2157 * 2158 */ 2159 static int 2160 rslcmp(const void *a, const void *b) 2161 { 2162 struct rsl *r1 = (struct rsl *)a; 2163 struct rsl *r2 = (struct rsl *)b; 2164 int rv; 2165 2166 rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s, 2167 r2->suspect->enode->u.event.ename->u.name.s); 2168 if (rv != 0) 2169 return (rv); 2170 2171 if (r1->rsrc == NULL && r2->rsrc == NULL) 2172 return (0); 2173 if (r1->rsrc == NULL) 2174 return (-1); 2175 if (r2->rsrc == NULL) 2176 return (1); 2177 return (evnv_cmpnvl(r1->rsrc, r2->rsrc, 0)); 2178 } 2179 2180 /* 2181 * get_resources -- for a given suspect, determine what ASRU, FRU and 2182 * RSRC nvlists should be advertised in the final suspect list. 
2183 */ 2184 void 2185 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot) 2186 { 2187 struct node *asrudef, *frudef; 2188 nvlist_t *asru, *fru; 2189 nvlist_t *rsrc = NULL; 2190 char *pathstr; 2191 2192 /* 2193 * First find any ASRU and/or FRU defined in the 2194 * initial fault tree. 2195 */ 2196 asrudef = eventprop_lookup(sp, L_ASRU); 2197 frudef = eventprop_lookup(sp, L_FRU); 2198 2199 /* 2200 * Create FMRIs based on those definitions 2201 */ 2202 asru = node2fmri(asrudef); 2203 fru = node2fmri(frudef); 2204 pathstr = ipath2str(NULL, sp->ipp); 2205 2206 /* 2207 * Allow for platform translations of the FMRIs 2208 */ 2209 platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc, 2210 pathstr); 2211 2212 FREE(pathstr); 2213 rsrcs->suspect = sp; 2214 rsrcs->asru = asru; 2215 rsrcs->fru = fru; 2216 rsrcs->rsrc = rsrc; 2217 } 2218 2219 /* 2220 * trim_suspects -- prior to publishing, we may need to remove some 2221 * suspects from the list. If we're auto-closing upsets, we don't 2222 * want any of those in the published list. If the ASRUs for multiple 2223 * defects resolve to the same ASRU (driver) we only want to publish 2224 * that as a single suspect. 
2225 */ 2226 static int 2227 trim_suspects(struct fme *fmep, struct rsl *begin, struct rsl *begin2, 2228 fmd_event_t *ffep) 2229 { 2230 struct event *ep; 2231 struct rsl *rp = begin; 2232 struct rsl *rp2 = begin2; 2233 int mess_zero_count = 0; 2234 int serd_rval; 2235 uint_t messval; 2236 2237 /* remove any unwanted upsets and populate our array */ 2238 for (ep = fmep->psuspects; ep; ep = ep->psuspects) { 2239 if (is_upset(ep->t)) 2240 continue; 2241 serd_rval = serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, ep, 2242 NULL, NULL); 2243 if (serd_rval == 0) 2244 continue; 2245 if (node2uint(eventprop_lookup(ep, L_message), 2246 &messval) == 0 && messval == 0) { 2247 get_resources(ep, rp2, fmep->config); 2248 rp2++; 2249 mess_zero_count++; 2250 } else { 2251 get_resources(ep, rp, fmep->config); 2252 rp++; 2253 fmep->nsuspects++; 2254 } 2255 } 2256 return (mess_zero_count); 2257 } 2258 2259 /* 2260 * addpayloadprop -- add a payload prop to a problem 2261 */ 2262 static void 2263 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault) 2264 { 2265 nvlist_t *rsrc, *hcs; 2266 2267 ASSERT(fault != NULL); 2268 ASSERT(lhs != NULL); 2269 ASSERT(rhs != NULL); 2270 2271 if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, &rsrc) != 0) 2272 out(O_DIE, "cannot add payloadprop \"%s\" to fault", lhs); 2273 2274 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) { 2275 out(O_ALTFP|O_VERB2, "addpayloadprop: create hc_specific"); 2276 if (nvlist_xalloc(&hcs, NV_UNIQUE_NAME, &Eft_nv_hdl) != 0) 2277 out(O_DIE, 2278 "cannot add payloadprop \"%s\" to fault", lhs); 2279 if (nvlist_add_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, hcs) != 0) 2280 out(O_DIE, 2281 "cannot add payloadprop \"%s\" to fault", lhs); 2282 nvlist_free(hcs); 2283 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) 2284 out(O_DIE, 2285 "cannot add payloadprop \"%s\" to fault", lhs); 2286 } else 2287 out(O_ALTFP|O_VERB2, "addpayloadprop: reuse hc_specific"); 2288 2289 if (rhs->t == UINT64) { 2290 
out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v); 2291 2292 if (nvlist_add_uint64(hcs, lhs, rhs->v) != 0) 2293 out(O_DIE, 2294 "cannot add payloadprop \"%s\" to fault", lhs); 2295 } else { 2296 out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"", 2297 lhs, (char *)(uintptr_t)rhs->v); 2298 2299 if (nvlist_add_string(hcs, lhs, (char *)(uintptr_t)rhs->v) != 0) 2300 out(O_DIE, 2301 "cannot add payloadprop \"%s\" to fault", lhs); 2302 } 2303 } 2304 2305 static char *Istatbuf; 2306 static char *Istatbufptr; 2307 static int Istatsz; 2308 2309 /* 2310 * istataddsize -- calculate size of istat and add it to Istatsz 2311 */ 2312 /*ARGSUSED2*/ 2313 static void 2314 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2315 { 2316 int val; 2317 2318 ASSERT(lhs != NULL); 2319 ASSERT(rhs != NULL); 2320 2321 if ((val = stats_counter_value(rhs)) == 0) 2322 return; /* skip zero-valued stats */ 2323 2324 /* count up the size of the stat name */ 2325 Istatsz += ipath2strlen(lhs->ename, lhs->ipath); 2326 Istatsz++; /* for the trailing NULL byte */ 2327 2328 /* count up the size of the stat value */ 2329 Istatsz += snprintf(NULL, 0, "%d", val); 2330 Istatsz++; /* for the trailing NULL byte */ 2331 } 2332 2333 /* 2334 * istat2str -- serialize an istat, writing result to *Istatbufptr 2335 */ 2336 /*ARGSUSED2*/ 2337 static void 2338 istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2339 { 2340 char *str; 2341 int len; 2342 int val; 2343 2344 ASSERT(lhs != NULL); 2345 ASSERT(rhs != NULL); 2346 2347 if ((val = stats_counter_value(rhs)) == 0) 2348 return; /* skip zero-valued stats */ 2349 2350 /* serialize the stat name */ 2351 str = ipath2str(lhs->ename, lhs->ipath); 2352 len = strlen(str); 2353 2354 ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]); 2355 (void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr); 2356 Istatbufptr += len; 2357 FREE(str); 2358 *Istatbufptr++ = '\0'; 2359 2360 /* serialize the stat value */ 2361 
Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr, 2362 "%d", val); 2363 *Istatbufptr++ = '\0'; 2364 2365 ASSERT(Istatbufptr <= &Istatbuf[Istatsz]); 2366 } 2367 2368 void 2369 istat_save() 2370 { 2371 if (Istat_need_save == 0) 2372 return; 2373 2374 /* figure out how big the serialzed info is */ 2375 Istatsz = 0; 2376 lut_walk(Istats, (lut_cb)istataddsize, NULL); 2377 2378 if (Istatsz == 0) { 2379 /* no stats to save */ 2380 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2381 return; 2382 } 2383 2384 /* create the serialized buffer */ 2385 Istatbufptr = Istatbuf = MALLOC(Istatsz); 2386 lut_walk(Istats, (lut_cb)istat2str, NULL); 2387 2388 /* clear out current saved stats */ 2389 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2390 2391 /* write out the new version */ 2392 fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz); 2393 FREE(Istatbuf); 2394 2395 Istat_need_save = 0; 2396 } 2397 2398 int 2399 istat_cmp(struct istat_entry *ent1, struct istat_entry *ent2) 2400 { 2401 if (ent1->ename != ent2->ename) 2402 return (ent2->ename - ent1->ename); 2403 if (ent1->ipath != ent2->ipath) 2404 return ((char *)ent2->ipath - (char *)ent1->ipath); 2405 2406 return (0); 2407 } 2408 2409 /* 2410 * istat-verify -- verify the component associated with a stat still exists 2411 * 2412 * if the component no longer exists, this routine resets the stat and 2413 * returns 0. if the component still exists, it returns 1. 2414 */ 2415 static int 2416 istat_verify(struct node *snp, struct istat_entry *entp) 2417 { 2418 struct stats *statp; 2419 nvlist_t *fmri; 2420 2421 fmri = node2fmri(snp->u.event.epname); 2422 if (platform_path_exists(fmri)) { 2423 nvlist_free(fmri); 2424 return (1); 2425 } 2426 nvlist_free(fmri); 2427 2428 /* component no longer in system. 
zero out the associated stats */ 2429 if ((statp = (struct stats *) 2430 lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL || 2431 stats_counter_value(statp) == 0) 2432 return (0); /* stat is already reset */ 2433 2434 Istat_need_save = 1; 2435 stats_counter_reset(statp); 2436 return (0); 2437 } 2438 2439 static void 2440 istat_bump(struct node *snp, int n) 2441 { 2442 struct stats *statp; 2443 struct istat_entry ent; 2444 2445 ASSERT(snp != NULL); 2446 ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t)); 2447 ASSERT(snp->u.event.epname != NULL); 2448 2449 /* class name should be hoisted into a single stable entry */ 2450 ASSERT(snp->u.event.ename->u.name.next == NULL); 2451 ent.ename = snp->u.event.ename->u.name.s; 2452 ent.ipath = ipath(snp->u.event.epname); 2453 2454 if (!istat_verify(snp, &ent)) { 2455 /* component no longer exists in system, nothing to do */ 2456 return; 2457 } 2458 2459 if ((statp = (struct stats *) 2460 lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) { 2461 /* need to create the counter */ 2462 int cnt = 0; 2463 struct node *np; 2464 char *sname; 2465 char *snamep; 2466 struct istat_entry *newentp; 2467 2468 /* count up the size of the stat name */ 2469 np = snp->u.event.ename; 2470 while (np != NULL) { 2471 cnt += strlen(np->u.name.s); 2472 cnt++; /* for the '.' 
or '@' */ 2473 np = np->u.name.next; 2474 } 2475 np = snp->u.event.epname; 2476 while (np != NULL) { 2477 cnt += snprintf(NULL, 0, "%s%llu", 2478 np->u.name.s, np->u.name.child->u.ull); 2479 cnt++; /* for the '/' or trailing NULL byte */ 2480 np = np->u.name.next; 2481 } 2482 2483 /* build the stat name */ 2484 snamep = sname = alloca(cnt); 2485 np = snp->u.event.ename; 2486 while (np != NULL) { 2487 snamep += snprintf(snamep, &sname[cnt] - snamep, 2488 "%s", np->u.name.s); 2489 np = np->u.name.next; 2490 if (np) 2491 *snamep++ = '.'; 2492 } 2493 *snamep++ = '@'; 2494 np = snp->u.event.epname; 2495 while (np != NULL) { 2496 snamep += snprintf(snamep, &sname[cnt] - snamep, 2497 "%s%llu", np->u.name.s, np->u.name.child->u.ull); 2498 np = np->u.name.next; 2499 if (np) 2500 *snamep++ = '/'; 2501 } 2502 *snamep++ = '\0'; 2503 2504 /* create the new stat & add it to our list */ 2505 newentp = MALLOC(sizeof (*newentp)); 2506 *newentp = ent; 2507 statp = stats_new_counter(NULL, sname, 0); 2508 Istats = lut_add(Istats, (void *)newentp, (void *)statp, 2509 (lut_cmp)istat_cmp); 2510 } 2511 2512 /* if n is non-zero, set that value instead of bumping */ 2513 if (n) { 2514 stats_counter_reset(statp); 2515 stats_counter_add(statp, n); 2516 } else 2517 stats_counter_bump(statp); 2518 Istat_need_save = 1; 2519 2520 ipath_print(O_ALTFP|O_VERB2, ent.ename, ent.ipath); 2521 out(O_ALTFP|O_VERB2, " %s to value %d", n ? "set" : "incremented", 2522 stats_counter_value(statp)); 2523 } 2524 2525 /*ARGSUSED*/ 2526 static void 2527 istat_destructor(void *left, void *right, void *arg) 2528 { 2529 struct istat_entry *entp = (struct istat_entry *)left; 2530 struct stats *statp = (struct stats *)right; 2531 FREE(entp); 2532 stats_delete(statp); 2533 } 2534 2535 /* 2536 * Callback used in a walk of the Istats to reset matching stat counters. 
2537 */ 2538 static void 2539 istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp, 2540 const struct ipath *ipp) 2541 { 2542 char *path; 2543 2544 if (entp->ipath == ipp) { 2545 path = ipath2str(entp->ename, ipp); 2546 out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path); 2547 FREE(path); 2548 stats_counter_reset(statp); 2549 Istat_need_save = 1; 2550 } 2551 } 2552 2553 /*ARGSUSED*/ 2554 static void 2555 istat_counter_topo_chg_cb(struct istat_entry *entp, struct stats *statp, 2556 void *unused) 2557 { 2558 char *path; 2559 nvlist_t *fmri; 2560 2561 fmri = ipath2fmri((struct ipath *)(entp->ipath)); 2562 if (!platform_path_exists(fmri)) { 2563 path = ipath2str(entp->ename, entp->ipath); 2564 out(O_ALTFP, "istat_counter_topo_chg_cb: not present %s", path); 2565 FREE(path); 2566 stats_counter_reset(statp); 2567 Istat_need_save = 1; 2568 } 2569 nvlist_free(fmri); 2570 } 2571 2572 void 2573 istat_fini(void) 2574 { 2575 lut_free(Istats, istat_destructor, NULL); 2576 } 2577 2578 static char *Serdbuf; 2579 static char *Serdbufptr; 2580 static int Serdsz; 2581 2582 /* 2583 * serdaddsize -- calculate size of serd and add it to Serdsz 2584 */ 2585 /*ARGSUSED*/ 2586 static void 2587 serdaddsize(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2588 { 2589 ASSERT(lhs != NULL); 2590 2591 /* count up the size of the stat name */ 2592 Serdsz += ipath2strlen(lhs->ename, lhs->ipath); 2593 Serdsz++; /* for the trailing NULL byte */ 2594 } 2595 2596 /* 2597 * serd2str -- serialize a serd engine, writing result to *Serdbufptr 2598 */ 2599 /*ARGSUSED*/ 2600 static void 2601 serd2str(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2602 { 2603 char *str; 2604 int len; 2605 2606 ASSERT(lhs != NULL); 2607 2608 /* serialize the serd engine name */ 2609 str = ipath2str(lhs->ename, lhs->ipath); 2610 len = strlen(str); 2611 2612 ASSERT(Serdbufptr + len + 1 <= &Serdbuf[Serdsz]); 2613 (void) strlcpy(Serdbufptr, str, &Serdbuf[Serdsz] - Serdbufptr); 2614 
Serdbufptr += len; 2615 FREE(str); 2616 *Serdbufptr++ = '\0'; 2617 ASSERT(Serdbufptr <= &Serdbuf[Serdsz]); 2618 } 2619 2620 void 2621 serd_save() 2622 { 2623 if (Serd_need_save == 0) 2624 return; 2625 2626 /* figure out how big the serialzed info is */ 2627 Serdsz = 0; 2628 lut_walk(SerdEngines, (lut_cb)serdaddsize, NULL); 2629 2630 if (Serdsz == 0) { 2631 /* no serd engines to save */ 2632 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2633 return; 2634 } 2635 2636 /* create the serialized buffer */ 2637 Serdbufptr = Serdbuf = MALLOC(Serdsz); 2638 lut_walk(SerdEngines, (lut_cb)serd2str, NULL); 2639 2640 /* clear out current saved stats */ 2641 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2642 2643 /* write out the new version */ 2644 fmd_buf_write(Hdl, NULL, WOBUF_SERDS, Serdbuf, Serdsz); 2645 FREE(Serdbuf); 2646 Serd_need_save = 0; 2647 } 2648 2649 int 2650 serd_cmp(struct serd_entry *ent1, struct serd_entry *ent2) 2651 { 2652 if (ent1->ename != ent2->ename) 2653 return (ent2->ename - ent1->ename); 2654 if (ent1->ipath != ent2->ipath) 2655 return ((char *)ent2->ipath - (char *)ent1->ipath); 2656 2657 return (0); 2658 } 2659 2660 void 2661 fme_serd_load(fmd_hdl_t *hdl) 2662 { 2663 int sz; 2664 char *sbuf; 2665 char *sepptr; 2666 char *ptr; 2667 struct serd_entry *newentp; 2668 struct node *epname; 2669 nvlist_t *fmri; 2670 char *namestring; 2671 2672 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_SERDS)) == 0) 2673 return; 2674 sbuf = alloca(sz); 2675 fmd_buf_read(hdl, NULL, WOBUF_SERDS, sbuf, sz); 2676 ptr = sbuf; 2677 while (ptr < &sbuf[sz]) { 2678 sepptr = strchr(ptr, '@'); 2679 *sepptr = '\0'; 2680 namestring = ptr; 2681 sepptr++; 2682 ptr = sepptr; 2683 ptr += strlen(ptr); 2684 ptr++; /* move past the '\0' separating paths */ 2685 epname = pathstring2epnamenp(sepptr); 2686 fmri = node2fmri(epname); 2687 if (platform_path_exists(fmri)) { 2688 newentp = MALLOC(sizeof (*newentp)); 2689 newentp->hdl = hdl; 2690 newentp->ipath = ipath(epname); 2691 newentp->ename = 
stable(namestring); 2692 SerdEngines = lut_add(SerdEngines, (void *)newentp, 2693 (void *)newentp, (lut_cmp)serd_cmp); 2694 } else 2695 Serd_need_save = 1; 2696 tree_free(epname); 2697 nvlist_free(fmri); 2698 } 2699 /* save it back again in case some of the paths no longer exist */ 2700 serd_save(); 2701 } 2702 2703 /*ARGSUSED*/ 2704 static void 2705 serd_destructor(void *left, void *right, void *arg) 2706 { 2707 struct serd_entry *entp = (struct serd_entry *)left; 2708 FREE(entp); 2709 } 2710 2711 /* 2712 * Callback used in a walk of the SerdEngines to reset matching serd engines. 2713 */ 2714 /*ARGSUSED*/ 2715 static void 2716 serd_reset_cb(struct serd_entry *entp, void *unused, const struct ipath *ipp) 2717 { 2718 char *path; 2719 2720 if (entp->ipath == ipp) { 2721 path = ipath2str(entp->ename, ipp); 2722 out(O_ALTFP, "serd_reset_cb: resetting %s", path); 2723 fmd_serd_reset(entp->hdl, path); 2724 FREE(path); 2725 Serd_need_save = 1; 2726 } 2727 } 2728 2729 /*ARGSUSED*/ 2730 static void 2731 serd_topo_chg_cb(struct serd_entry *entp, void *unused, void *unused2) 2732 { 2733 char *path; 2734 nvlist_t *fmri; 2735 2736 fmri = ipath2fmri((struct ipath *)(entp->ipath)); 2737 if (!platform_path_exists(fmri)) { 2738 path = ipath2str(entp->ename, entp->ipath); 2739 out(O_ALTFP, "serd_topo_chg_cb: not present %s", path); 2740 fmd_serd_reset(entp->hdl, path); 2741 FREE(path); 2742 Serd_need_save = 1; 2743 } 2744 nvlist_free(fmri); 2745 } 2746 2747 void 2748 serd_fini(void) 2749 { 2750 lut_free(SerdEngines, serd_destructor, NULL); 2751 } 2752 2753 static void 2754 publish_suspects(struct fme *fmep, struct rsl *srl) 2755 { 2756 struct rsl *rp; 2757 nvlist_t *fault; 2758 uint8_t cert; 2759 uint_t *frs; 2760 uint_t frsum, fr; 2761 uint_t messval; 2762 uint_t retireval; 2763 uint_t responseval; 2764 struct node *snp; 2765 int frcnt, fridx; 2766 boolean_t allfaulty = B_TRUE; 2767 struct rsl *erl = srl + fmep->nsuspects - 1; 2768 2769 /* 2770 * sort the array 2771 */ 2772 
qsort(srl, fmep->nsuspects, sizeof (struct rsl), rslcmp); 2773 2774 /* sum the fitrates */ 2775 frs = alloca(fmep->nsuspects * sizeof (uint_t)); 2776 fridx = frcnt = frsum = 0; 2777 2778 for (rp = srl; rp <= erl; rp++) { 2779 struct node *n; 2780 2781 n = eventprop_lookup(rp->suspect, L_FITrate); 2782 if (node2uint(n, &fr) != 0) { 2783 out(O_DEBUG|O_NONL, "event "); 2784 ipath_print(O_DEBUG|O_NONL, 2785 rp->suspect->enode->u.event.ename->u.name.s, 2786 rp->suspect->ipp); 2787 out(O_VERB, " has no FITrate (using 1)"); 2788 fr = 1; 2789 } else if (fr == 0) { 2790 out(O_DEBUG|O_NONL, "event "); 2791 ipath_print(O_DEBUG|O_NONL, 2792 rp->suspect->enode->u.event.ename->u.name.s, 2793 rp->suspect->ipp); 2794 out(O_VERB, " has zero FITrate (using 1)"); 2795 fr = 1; 2796 } 2797 2798 frs[fridx++] = fr; 2799 frsum += fr; 2800 frcnt++; 2801 } 2802 2803 /* Add them in reverse order of our sort, as fmd reverses order */ 2804 for (rp = erl; rp >= srl; rp--) { 2805 cert = percentof(frs[--fridx], frsum); 2806 fault = fmd_nvl_create_fault(fmep->hdl, 2807 rp->suspect->enode->u.event.ename->u.name.s, 2808 cert, 2809 rp->asru, 2810 rp->fru, 2811 rp->rsrc); 2812 if (fault == NULL) 2813 out(O_DIE, "fault creation failed"); 2814 /* if "message" property exists, add it to the fault */ 2815 if (node2uint(eventprop_lookup(rp->suspect, L_message), 2816 &messval) == 0) { 2817 2818 out(O_ALTFP, 2819 "[FME%d, %s adds message=%d to suspect list]", 2820 fmep->id, 2821 rp->suspect->enode->u.event.ename->u.name.s, 2822 messval); 2823 if (nvlist_add_boolean_value(fault, 2824 FM_SUSPECT_MESSAGE, 2825 (messval) ? 
B_TRUE : B_FALSE) != 0) { 2826 out(O_DIE, "cannot add no-message to fault"); 2827 } 2828 } 2829 2830 /* if "retire" property exists, add it to the fault */ 2831 if (node2uint(eventprop_lookup(rp->suspect, L_retire), 2832 &retireval) == 0) { 2833 2834 out(O_ALTFP, 2835 "[FME%d, %s adds retire=%d to suspect list]", 2836 fmep->id, 2837 rp->suspect->enode->u.event.ename->u.name.s, 2838 retireval); 2839 if (nvlist_add_boolean_value(fault, 2840 FM_SUSPECT_RETIRE, 2841 (retireval) ? B_TRUE : B_FALSE) != 0) { 2842 out(O_DIE, "cannot add no-retire to fault"); 2843 } 2844 } 2845 2846 /* if "response" property exists, add it to the fault */ 2847 if (node2uint(eventprop_lookup(rp->suspect, L_response), 2848 &responseval) == 0) { 2849 2850 out(O_ALTFP, 2851 "[FME%d, %s adds response=%d to suspect list]", 2852 fmep->id, 2853 rp->suspect->enode->u.event.ename->u.name.s, 2854 responseval); 2855 if (nvlist_add_boolean_value(fault, 2856 FM_SUSPECT_RESPONSE, 2857 (responseval) ? B_TRUE : B_FALSE) != 0) { 2858 out(O_DIE, "cannot add no-response to fault"); 2859 } 2860 } 2861 2862 /* add any payload properties */ 2863 lut_walk(rp->suspect->payloadprops, 2864 (lut_cb)addpayloadprop, (void *)fault); 2865 rslfree(rp); 2866 2867 /* 2868 * If "action" property exists, evaluate it; this must be done 2869 * before the allfaulty check below since some actions may 2870 * modify the asru to be used in fmd_nvl_fmri_has_fault. This 2871 * needs to be restructured if any new actions are introduced 2872 * that have effects that we do not want to be visible if 2873 * we decide not to publish in the dupclose check below. 
2874 */ 2875 if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) { 2876 struct evalue evalue; 2877 2878 out(O_ALTFP|O_NONL, 2879 "[FME%d, %s action ", fmep->id, 2880 rp->suspect->enode->u.event.ename->u.name.s); 2881 ptree_name_iter(O_ALTFP|O_NONL, snp); 2882 out(O_ALTFP, "]"); 2883 Action_nvl = fault; 2884 (void) eval_expr(snp, NULL, NULL, NULL, NULL, 2885 NULL, 0, &evalue); 2886 } 2887 2888 fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault); 2889 2890 /* 2891 * check if the asru is already marked as "faulty". 2892 */ 2893 if (allfaulty) { 2894 nvlist_t *asru; 2895 2896 out(O_ALTFP|O_VERB, "FME%d dup check ", fmep->id); 2897 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect); 2898 out(O_ALTFP|O_VERB|O_NONL, " "); 2899 if (nvlist_lookup_nvlist(fault, 2900 FM_FAULT_ASRU, &asru) != 0) { 2901 out(O_ALTFP|O_VERB, "NULL asru"); 2902 allfaulty = B_FALSE; 2903 } else if (fmd_nvl_fmri_has_fault(fmep->hdl, asru, 2904 FMD_HAS_FAULT_ASRU, NULL)) { 2905 out(O_ALTFP|O_VERB, "faulty"); 2906 } else { 2907 out(O_ALTFP|O_VERB, "not faulty"); 2908 allfaulty = B_FALSE; 2909 } 2910 } 2911 2912 } 2913 2914 if (!allfaulty) { 2915 /* 2916 * don't update the count stat if all asrus are already 2917 * present and unrepaired in the asru cache 2918 */ 2919 for (rp = erl; rp >= srl; rp--) { 2920 struct event *suspect = rp->suspect; 2921 2922 if (suspect == NULL) 2923 continue; 2924 2925 /* if "count" exists, increment the appropriate stat */ 2926 if ((snp = eventprop_lookup(suspect, 2927 L_count)) != NULL) { 2928 out(O_ALTFP|O_NONL, 2929 "[FME%d, %s count ", fmep->id, 2930 suspect->enode->u.event.ename->u.name.s); 2931 ptree_name_iter(O_ALTFP|O_NONL, snp); 2932 out(O_ALTFP, "]"); 2933 istat_bump(snp, 0); 2934 2935 } 2936 } 2937 istat_save(); /* write out any istat changes */ 2938 } 2939 } 2940 2941 static const char * 2942 undiag_2defect_str(int ud) 2943 { 2944 switch (ud) { 2945 case UD_VAL_MISSINGINFO: 2946 case UD_VAL_MISSINGOBS: 2947 case UD_VAL_MISSINGPATH: 2948 case 
UD_VAL_MISSINGZERO: 2949 case UD_VAL_BADOBS: 2950 case UD_VAL_CFGMISMATCH: 2951 return (UNDIAG_DEFECT_CHKPT); 2952 break; 2953 2954 case UD_VAL_BADEVENTI: 2955 case UD_VAL_BADEVENTPATH: 2956 case UD_VAL_BADEVENTCLASS: 2957 case UD_VAL_INSTFAIL: 2958 case UD_VAL_NOPATH: 2959 case UD_VAL_UNSOLVD: 2960 return (UNDIAG_DEFECT_FME); 2961 break; 2962 2963 case UD_VAL_MAXFME: 2964 return (UNDIAG_DEFECT_LIMIT); 2965 break; 2966 2967 case UD_VAL_UNKNOWN: 2968 default: 2969 return (UNDIAG_DEFECT_UNKNOWN); 2970 break; 2971 } 2972 } 2973 2974 static const char * 2975 undiag_2fault_str(int ud) 2976 { 2977 switch (ud) { 2978 case UD_VAL_BADEVENTI: 2979 case UD_VAL_BADEVENTPATH: 2980 case UD_VAL_BADEVENTCLASS: 2981 case UD_VAL_INSTFAIL: 2982 case UD_VAL_NOPATH: 2983 case UD_VAL_UNSOLVD: 2984 return (UNDIAG_FAULT_FME); 2985 default: 2986 return (NULL); 2987 } 2988 } 2989 2990 static char * 2991 undiag_2reason_str(int ud, char *arg) 2992 { 2993 const char *ptr; 2994 char *buf; 2995 int with_arg = 0; 2996 2997 switch (ud) { 2998 case UD_VAL_BADEVENTPATH: 2999 ptr = UD_STR_BADEVENTPATH; 3000 with_arg = 1; 3001 break; 3002 case UD_VAL_BADEVENTCLASS: 3003 ptr = UD_STR_BADEVENTCLASS; 3004 with_arg = 1; 3005 break; 3006 case UD_VAL_BADEVENTI: 3007 ptr = UD_STR_BADEVENTI; 3008 with_arg = 1; 3009 break; 3010 case UD_VAL_BADOBS: 3011 ptr = UD_STR_BADOBS; 3012 break; 3013 case UD_VAL_CFGMISMATCH: 3014 ptr = UD_STR_CFGMISMATCH; 3015 break; 3016 case UD_VAL_INSTFAIL: 3017 ptr = UD_STR_INSTFAIL; 3018 with_arg = 1; 3019 break; 3020 case UD_VAL_MAXFME: 3021 ptr = UD_STR_MAXFME; 3022 break; 3023 case UD_VAL_MISSINGINFO: 3024 ptr = UD_STR_MISSINGINFO; 3025 break; 3026 case UD_VAL_MISSINGOBS: 3027 ptr = UD_STR_MISSINGOBS; 3028 break; 3029 case UD_VAL_MISSINGPATH: 3030 ptr = UD_STR_MISSINGPATH; 3031 break; 3032 case UD_VAL_MISSINGZERO: 3033 ptr = UD_STR_MISSINGZERO; 3034 break; 3035 case UD_VAL_NOPATH: 3036 ptr = UD_STR_NOPATH; 3037 with_arg = 1; 3038 break; 3039 case UD_VAL_UNSOLVD: 3040 ptr = 
UD_STR_UNSOLVD; 3041 break; 3042 case UD_VAL_UNKNOWN: 3043 default: 3044 ptr = UD_STR_UNKNOWN; 3045 break; 3046 } 3047 if (with_arg) { 3048 buf = MALLOC(strlen(ptr) + strlen(arg) - 1); 3049 (void) sprintf(buf, ptr, arg); 3050 } else { 3051 buf = MALLOC(strlen(ptr) + 1); 3052 (void) sprintf(buf, ptr); 3053 } 3054 return (buf); 3055 } 3056 3057 static void 3058 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase, 3059 nvlist_t *detector, char *arg) 3060 { 3061 struct case_list *newcase; 3062 nvlist_t *defect, *fault; 3063 const char *faultstr; 3064 char *reason = undiag_2reason_str(Undiag_reason, arg); 3065 3066 out(O_ALTFP, 3067 "[undiagnosable ereport received, " 3068 "creating and closing a new case (%s)]", reason); 3069 3070 newcase = MALLOC(sizeof (struct case_list)); 3071 newcase->next = NULL; 3072 newcase->fmcase = fmcase; 3073 if (Undiagablecaselist != NULL) 3074 newcase->next = Undiagablecaselist; 3075 Undiagablecaselist = newcase; 3076 3077 if (ffep != NULL) 3078 fmd_case_add_ereport(hdl, newcase->fmcase, ffep); 3079 3080 /* add defect */ 3081 defect = fmd_nvl_create_fault(hdl, 3082 undiag_2defect_str(Undiag_reason), 50, NULL, NULL, detector); 3083 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 3084 (void) nvlist_add_boolean_value(defect, FM_SUSPECT_RETIRE, B_FALSE); 3085 (void) nvlist_add_boolean_value(defect, FM_SUSPECT_RESPONSE, B_FALSE); 3086 fmd_case_add_suspect(hdl, newcase->fmcase, defect); 3087 3088 /* add fault if appropriate */ 3089 faultstr = undiag_2fault_str(Undiag_reason); 3090 if (faultstr != NULL) { 3091 fault = fmd_nvl_create_fault(hdl, faultstr, 50, NULL, NULL, 3092 detector); 3093 (void) nvlist_add_string(fault, UNDIAG_REASON, reason); 3094 (void) nvlist_add_boolean_value(fault, FM_SUSPECT_RETIRE, 3095 B_FALSE); 3096 (void) nvlist_add_boolean_value(fault, FM_SUSPECT_RESPONSE, 3097 B_FALSE); 3098 fmd_case_add_suspect(hdl, newcase->fmcase, fault); 3099 } 3100 FREE(reason); 3101 3102 /* solve and close case 
*/ 3103 fmd_case_solve(hdl, newcase->fmcase); 3104 fmd_case_close(hdl, newcase->fmcase); 3105 Undiag_reason = UD_VAL_UNKNOWN; 3106 } 3107 3108 static void 3109 fme_undiagnosable(struct fme *f) 3110 { 3111 nvlist_t *defect, *fault, *detector = NULL; 3112 struct event *ep; 3113 char *pathstr; 3114 const char *faultstr; 3115 char *reason = undiag_2reason_str(Undiag_reason, NULL); 3116 3117 out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]", 3118 f->id, fmd_case_uuid(f->hdl, f->fmcase), reason); 3119 3120 for (ep = f->observations; ep; ep = ep->observations) { 3121 3122 if (ep->ffep != f->e0r) 3123 fmd_case_add_ereport(f->hdl, f->fmcase, ep->ffep); 3124 3125 pathstr = ipath2str(NULL, ipath(platform_getpath(ep->nvp))); 3126 platform_units_translate(0, f->config, NULL, NULL, &detector, 3127 pathstr); 3128 FREE(pathstr); 3129 3130 /* add defect */ 3131 defect = fmd_nvl_create_fault(f->hdl, 3132 undiag_2defect_str(Undiag_reason), 50 / f->uniqobs, 3133 NULL, NULL, detector); 3134 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 3135 (void) nvlist_add_boolean_value(defect, FM_SUSPECT_RETIRE, 3136 B_FALSE); 3137 (void) nvlist_add_boolean_value(defect, FM_SUSPECT_RESPONSE, 3138 B_FALSE); 3139 fmd_case_add_suspect(f->hdl, f->fmcase, defect); 3140 3141 /* add fault if appropriate */ 3142 faultstr = undiag_2fault_str(Undiag_reason); 3143 if (faultstr == NULL) 3144 continue; 3145 fault = fmd_nvl_create_fault(f->hdl, faultstr, 50 / f->uniqobs, 3146 NULL, NULL, detector); 3147 (void) nvlist_add_string(fault, UNDIAG_REASON, reason); 3148 (void) nvlist_add_boolean_value(fault, FM_SUSPECT_RETIRE, 3149 B_FALSE); 3150 (void) nvlist_add_boolean_value(fault, FM_SUSPECT_RESPONSE, 3151 B_FALSE); 3152 fmd_case_add_suspect(f->hdl, f->fmcase, fault); 3153 nvlist_free(detector); 3154 } 3155 FREE(reason); 3156 fmd_case_solve(f->hdl, f->fmcase); 3157 fmd_case_close(f->hdl, f->fmcase); 3158 Undiag_reason = UD_VAL_UNKNOWN; 3159 } 3160 3161 /* 3162 * fme_close_case 3163 * 3164 * Find the 
requested case amongst our fmes and close it. Free up
 * the related fme.
 */
void
fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase)
{
	struct case_list *ucp, *ucprev;
	struct fme *target, *before;
	struct fme *ovfl;

	/* an undiagnosable case only needs unlinking from its list */
	ucprev = NULL;
	ucp = Undiagablecaselist;
	while (ucp != NULL) {
		if (ucp->fmcase == fmcase) {
			if (ucprev != NULL)
				ucprev->next = ucp->next;
			else
				Undiagablecaselist = Undiagablecaselist->next;
			FREE(ucp);
			return;
		}
		ucprev = ucp;
		ucp = ucp->next;
	}

	/* otherwise it should belong to one of the open FMEs */
	before = NULL;
	target = FMElist;
	while (target != NULL) {
		if (target->hdl == hdl && target->fmcase == fmcase)
			break;
		before = target;
		target = target->next;
	}

	if (target == NULL) {
		out(O_WARN, "Eft asked to close unrecognized case [%s].",
		    fmd_case_uuid(hdl, fmcase));
		return;
	}

	/* unlink the FME, keeping the end-of-list pointer valid */
	if (EFMElist == target)
		EFMElist = before;

	if (before != NULL)
		before->next = target->next;
	else
		FMElist = FMElist->next;

	target->next = NULL;

	/* Get rid of any timer this fme has set */
	if (target->wull != 0)
		fmd_timer_remove(target->hdl, target->timer);

	/* push it onto the front of the closed list */
	target->next = ClosedFMEs;
	ClosedFMEs = target;

	Open_fme_count--;

	/* See if we can close the overflow FME */
	if (Open_fme_count > Max_fme)
		return;

	for (ovfl = FMElist; ovfl != NULL; ovfl = ovfl->next) {
		if (!ovfl->overflow)
			continue;
		if (fmd_case_closed(ovfl->hdl, ovfl->fmcase))
			continue;
		/* first overflow FME whose case is still open */
		fmd_case_close(ovfl->hdl, ovfl->fmcase);
		break;
	}
}

/*
 * fme_set_timer()
 * If the time we need to wait for the given FME is less than the
 * current timer, kick that old timer out and establish a new one.
3241 */ 3242 static int 3243 fme_set_timer(struct fme *fmep, unsigned long long wull) 3244 { 3245 out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait "); 3246 ptree_timeval(O_ALTFP|O_VERB, &wull); 3247 3248 if (wull <= fmep->pull) { 3249 out(O_ALTFP|O_VERB|O_NONL, "already have waited at least "); 3250 ptree_timeval(O_ALTFP|O_VERB, &fmep->pull); 3251 out(O_ALTFP|O_VERB, NULL); 3252 /* we've waited at least wull already, don't need timer */ 3253 return (0); 3254 } 3255 3256 out(O_ALTFP|O_VERB|O_NONL, " currently "); 3257 if (fmep->wull != 0) { 3258 out(O_ALTFP|O_VERB|O_NONL, "waiting "); 3259 ptree_timeval(O_ALTFP|O_VERB, &fmep->wull); 3260 out(O_ALTFP|O_VERB, NULL); 3261 } else { 3262 out(O_ALTFP|O_VERB|O_NONL, "not waiting"); 3263 out(O_ALTFP|O_VERB, NULL); 3264 } 3265 3266 if (fmep->wull != 0) 3267 if (wull >= fmep->wull) 3268 /* New timer would fire later than established timer */ 3269 return (0); 3270 3271 if (fmep->wull != 0) { 3272 fmd_timer_remove(fmep->hdl, fmep->timer); 3273 } 3274 3275 fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep, 3276 fmep->e0r, wull); 3277 out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer); 3278 fmep->wull = wull; 3279 return (1); 3280 } 3281 3282 void 3283 fme_timer_fired(struct fme *fmep, id_t tid) 3284 { 3285 struct fme *ffmep = NULL; 3286 3287 for (ffmep = FMElist; ffmep; ffmep = ffmep->next) 3288 if (ffmep == fmep) 3289 break; 3290 3291 if (ffmep == NULL) { 3292 out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.", 3293 (void *)fmep); 3294 return; 3295 } 3296 3297 out(O_ALTFP|O_VERB, "Timer fired %lx", tid); 3298 fmep->pull = fmep->wull; 3299 fmep->wull = 0; 3300 fmd_buf_write(fmep->hdl, fmep->fmcase, 3301 WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull)); 3302 3303 fme_eval(fmep, fmep->e0r); 3304 } 3305 3306 /* 3307 * Preserve the fme's suspect list in its psuspects list, NULLing the 3308 * suspects list in the meantime. 
3309 */ 3310 static void 3311 save_suspects(struct fme *fmep) 3312 { 3313 struct event *ep; 3314 struct event *nextep; 3315 3316 /* zero out the previous suspect list */ 3317 for (ep = fmep->psuspects; ep; ep = nextep) { 3318 nextep = ep->psuspects; 3319 ep->psuspects = NULL; 3320 } 3321 fmep->psuspects = NULL; 3322 3323 /* zero out the suspect list, copying it to previous suspect list */ 3324 fmep->psuspects = fmep->suspects; 3325 for (ep = fmep->suspects; ep; ep = nextep) { 3326 nextep = ep->suspects; 3327 ep->psuspects = ep->suspects; 3328 ep->suspects = NULL; 3329 ep->is_suspect = 0; 3330 } 3331 fmep->suspects = NULL; 3332 fmep->nsuspects = 0; 3333 } 3334 3335 /* 3336 * Retrieve the fme's suspect list from its psuspects list. 3337 */ 3338 static void 3339 restore_suspects(struct fme *fmep) 3340 { 3341 struct event *ep; 3342 struct event *nextep; 3343 3344 fmep->nsuspects = 0; 3345 fmep->suspects = fmep->psuspects; 3346 for (ep = fmep->psuspects; ep; ep = nextep) { 3347 fmep->nsuspects++; 3348 nextep = ep->psuspects; 3349 ep->suspects = ep->psuspects; 3350 } 3351 } 3352 3353 /* 3354 * this is what we use to call the Emrys prototype code instead of main() 3355 */ 3356 static void 3357 fme_eval(struct fme *fmep, fmd_event_t *ffep) 3358 { 3359 struct event *ep; 3360 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 3361 struct rsl *srl = NULL; 3362 struct rsl *srl2 = NULL; 3363 int mess_zero_count; 3364 int rpcnt; 3365 3366 save_suspects(fmep); 3367 3368 out(O_ALTFP, "Evaluate FME %d", fmep->id); 3369 indent_set(" "); 3370 3371 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 3372 fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 3373 3374 out(O_ALTFP|O_NONL, "FME%d state: %s, suspect list:", fmep->id, 3375 fme_state2str(fmep->state)); 3376 for (ep = fmep->suspects; ep; ep = ep->suspects) { 3377 out(O_ALTFP|O_NONL, " "); 3378 itree_pevent_brief(O_ALTFP|O_NONL, ep); 3379 } 3380 out(O_ALTFP, NULL); 3381 3382 switch (fmep->state) { 3383 
case FME_CREDIBLE: 3384 print_suspects(SLNEW, fmep); 3385 (void) upsets_eval(fmep, ffep); 3386 3387 /* 3388 * we may have already posted suspects in upsets_eval() which 3389 * can recurse into fme_eval() again. If so then just return. 3390 */ 3391 if (fmep->posted_suspects) 3392 return; 3393 3394 stats_counter_bump(fmep->diags); 3395 rpcnt = fmep->nsuspects; 3396 save_suspects(fmep); 3397 3398 /* 3399 * create two lists, one for "message=1" faults and one for 3400 * "message=0" faults. If we have a mixture we will generate 3401 * two separate suspect lists. 3402 */ 3403 srl = MALLOC(rpcnt * sizeof (struct rsl)); 3404 bzero(srl, rpcnt * sizeof (struct rsl)); 3405 srl2 = MALLOC(rpcnt * sizeof (struct rsl)); 3406 bzero(srl2, rpcnt * sizeof (struct rsl)); 3407 mess_zero_count = trim_suspects(fmep, srl, srl2, ffep); 3408 3409 /* 3410 * If the resulting suspect list has no members, we're 3411 * done so simply close the case. Otherwise sort and publish. 3412 */ 3413 if (fmep->nsuspects == 0 && mess_zero_count == 0) { 3414 out(O_ALTFP, 3415 "[FME%d, case %s (all suspects are upsets)]", 3416 fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3417 fmd_case_close(fmep->hdl, fmep->fmcase); 3418 } else if (fmep->nsuspects != 0 && mess_zero_count == 0) { 3419 publish_suspects(fmep, srl); 3420 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3421 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3422 fmd_case_solve(fmep->hdl, fmep->fmcase); 3423 } else if (fmep->nsuspects == 0 && mess_zero_count != 0) { 3424 fmep->nsuspects = mess_zero_count; 3425 publish_suspects(fmep, srl2); 3426 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3427 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3428 fmd_case_solve(fmep->hdl, fmep->fmcase); 3429 } else { 3430 struct event *obsp; 3431 struct fme *nfmep; 3432 3433 publish_suspects(fmep, srl); 3434 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3435 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3436 fmd_case_solve(fmep->hdl, fmep->fmcase); 3437 3438 /* 3439 * 
Got both message=0 and message=1 so create a 3440 * duplicate case. Also need a temporary duplicate fme 3441 * structure for use by publish_suspects(). 3442 */ 3443 nfmep = alloc_fme(); 3444 nfmep->id = Nextid++; 3445 nfmep->hdl = fmep->hdl; 3446 nfmep->nsuspects = mess_zero_count; 3447 nfmep->fmcase = fmd_case_open(fmep->hdl, NULL); 3448 out(O_ALTFP|O_STAMP, 3449 "[creating parallel FME%d, case %s]", nfmep->id, 3450 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 3451 Open_fme_count++; 3452 if (ffep) { 3453 fmd_case_setprincipal(nfmep->hdl, 3454 nfmep->fmcase, ffep); 3455 fmd_case_add_ereport(nfmep->hdl, 3456 nfmep->fmcase, ffep); 3457 } 3458 for (obsp = fmep->observations; obsp; 3459 obsp = obsp->observations) 3460 if (obsp->ffep && obsp->ffep != ffep) 3461 fmd_case_add_ereport(nfmep->hdl, 3462 nfmep->fmcase, obsp->ffep); 3463 3464 publish_suspects(nfmep, srl2); 3465 out(O_ALTFP, "[solving FME%d, case %s]", nfmep->id, 3466 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 3467 fmd_case_solve(nfmep->hdl, nfmep->fmcase); 3468 FREE(nfmep); 3469 } 3470 FREE(srl); 3471 FREE(srl2); 3472 restore_suspects(fmep); 3473 3474 fmep->posted_suspects = 1; 3475 fmd_buf_write(fmep->hdl, fmep->fmcase, 3476 WOBUF_POSTD, 3477 (void *)&fmep->posted_suspects, 3478 sizeof (fmep->posted_suspects)); 3479 3480 /* 3481 * Now the suspects have been posted, we can clear up 3482 * the instance tree as we won't be looking at it again. 3483 * Also cancel the timer as the case is now solved. 
3484 */ 3485 if (fmep->wull != 0) { 3486 fmd_timer_remove(fmep->hdl, fmep->timer); 3487 fmep->wull = 0; 3488 } 3489 break; 3490 3491 case FME_WAIT: 3492 ASSERT(my_delay > fmep->ull); 3493 (void) fme_set_timer(fmep, my_delay); 3494 print_suspects(SLWAIT, fmep); 3495 itree_prune(fmep->eventtree); 3496 return; 3497 3498 case FME_DISPROVED: 3499 print_suspects(SLDISPROVED, fmep); 3500 Undiag_reason = UD_VAL_UNSOLVD; 3501 fme_undiagnosable(fmep); 3502 break; 3503 } 3504 3505 itree_free(fmep->eventtree); 3506 fmep->eventtree = NULL; 3507 structconfig_free(fmep->config); 3508 fmep->config = NULL; 3509 destroy_fme_bufs(fmep); 3510 } 3511 3512 static void indent(void); 3513 static int triggered(struct fme *fmep, struct event *ep, int mark); 3514 static enum fme_state effects_test(struct fme *fmep, 3515 struct event *fault_event, unsigned long long at_latest_by, 3516 unsigned long long *pdelay); 3517 static enum fme_state requirements_test(struct fme *fmep, struct event *ep, 3518 unsigned long long at_latest_by, unsigned long long *pdelay); 3519 static enum fme_state causes_test(struct fme *fmep, struct event *ep, 3520 unsigned long long at_latest_by, unsigned long long *pdelay); 3521 3522 static int 3523 checkconstraints(struct fme *fmep, struct arrow *arrowp) 3524 { 3525 struct constraintlist *ctp; 3526 struct evalue value; 3527 char *sep = ""; 3528 3529 if (arrowp->forever_false) { 3530 indent(); 3531 out(O_ALTFP|O_VERB|O_NONL, " Forever false constraint: "); 3532 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3533 out(O_ALTFP|O_VERB|O_NONL, sep); 3534 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3535 sep = ", "; 3536 } 3537 out(O_ALTFP|O_VERB, NULL); 3538 return (0); 3539 } 3540 if (arrowp->forever_true) { 3541 indent(); 3542 out(O_ALTFP|O_VERB|O_NONL, " Forever true constraint: "); 3543 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3544 out(O_ALTFP|O_VERB|O_NONL, sep); 3545 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3546 sep = 
", "; 3547 } 3548 out(O_ALTFP|O_VERB, NULL); 3549 return (1); 3550 } 3551 3552 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3553 if (eval_expr(ctp->cnode, NULL, NULL, 3554 &fmep->globals, fmep->config, 3555 arrowp, 0, &value)) { 3556 /* evaluation successful */ 3557 if (value.t == UNDEFINED || value.v == 0) { 3558 /* known false */ 3559 arrowp->forever_false = 1; 3560 indent(); 3561 out(O_ALTFP|O_VERB|O_NONL, 3562 " False constraint: "); 3563 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3564 out(O_ALTFP|O_VERB, NULL); 3565 return (0); 3566 } 3567 } else { 3568 /* evaluation unsuccessful -- unknown value */ 3569 indent(); 3570 out(O_ALTFP|O_VERB|O_NONL, 3571 " Deferred constraint: "); 3572 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3573 out(O_ALTFP|O_VERB, NULL); 3574 return (1); 3575 } 3576 } 3577 /* known true */ 3578 arrowp->forever_true = 1; 3579 indent(); 3580 out(O_ALTFP|O_VERB|O_NONL, " True constraint: "); 3581 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3582 out(O_ALTFP|O_VERB|O_NONL, sep); 3583 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3584 sep = ", "; 3585 } 3586 out(O_ALTFP|O_VERB, NULL); 3587 return (1); 3588 } 3589 3590 static int 3591 triggered(struct fme *fmep, struct event *ep, int mark) 3592 { 3593 struct bubble *bp; 3594 struct arrowlist *ap; 3595 int count = 0; 3596 3597 stats_counter_bump(fmep->Tcallcount); 3598 for (bp = itree_next_bubble(ep, NULL); bp; 3599 bp = itree_next_bubble(ep, bp)) { 3600 if (bp->t != B_TO) 3601 continue; 3602 for (ap = itree_next_arrow(bp, NULL); ap; 3603 ap = itree_next_arrow(bp, ap)) { 3604 /* check count of marks against K in the bubble */ 3605 if ((ap->arrowp->mark & mark) && 3606 ++count >= bp->nork) 3607 return (1); 3608 } 3609 } 3610 return (0); 3611 } 3612 3613 static int 3614 mark_arrows(struct fme *fmep, struct event *ep, int mark, 3615 unsigned long long at_latest_by, unsigned long long *pdelay, int keep) 3616 { 3617 struct bubble *bp; 3618 struct 
arrowlist *ap; 3619 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3620 unsigned long long my_delay; 3621 enum fme_state result; 3622 int retval = 0; 3623 3624 for (bp = itree_next_bubble(ep, NULL); bp; 3625 bp = itree_next_bubble(ep, bp)) { 3626 if (bp->t != B_FROM) 3627 continue; 3628 stats_counter_bump(fmep->Marrowcount); 3629 for (ap = itree_next_arrow(bp, NULL); ap; 3630 ap = itree_next_arrow(bp, ap)) { 3631 struct event *ep2 = ap->arrowp->head->myevent; 3632 /* 3633 * if we're clearing marks, we can avoid doing 3634 * all that work evaluating constraints. 3635 */ 3636 if (mark == 0) { 3637 if (ap->arrowp->arrow_marked == 0) 3638 continue; 3639 ap->arrowp->arrow_marked = 0; 3640 ap->arrowp->mark &= ~EFFECTS_COUNTER; 3641 if (keep && (ep2->cached_state & 3642 (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT))) 3643 ep2->keep_in_tree = 1; 3644 ep2->cached_state &= 3645 ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT); 3646 (void) mark_arrows(fmep, ep2, mark, 0, NULL, 3647 keep); 3648 continue; 3649 } 3650 ap->arrowp->arrow_marked = 1; 3651 if (ep2->cached_state & REQMNTS_DISPROVED) { 3652 indent(); 3653 out(O_ALTFP|O_VERB|O_NONL, 3654 " ALREADY DISPROVED "); 3655 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3656 out(O_ALTFP|O_VERB, NULL); 3657 continue; 3658 } 3659 if (ep2->cached_state & WAIT_EFFECT) { 3660 indent(); 3661 out(O_ALTFP|O_VERB|O_NONL, 3662 " ALREADY EFFECTS WAIT "); 3663 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3664 out(O_ALTFP|O_VERB, NULL); 3665 continue; 3666 } 3667 if (ep2->cached_state & CREDIBLE_EFFECT) { 3668 indent(); 3669 out(O_ALTFP|O_VERB|O_NONL, 3670 " ALREADY EFFECTS CREDIBLE "); 3671 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3672 out(O_ALTFP|O_VERB, NULL); 3673 continue; 3674 } 3675 if ((ep2->cached_state & PARENT_WAIT) && 3676 (mark & PARENT_WAIT)) { 3677 indent(); 3678 out(O_ALTFP|O_VERB|O_NONL, 3679 " ALREADY PARENT EFFECTS WAIT "); 3680 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3681 out(O_ALTFP|O_VERB, NULL); 3682 
continue; 3683 } 3684 platform_set_payloadnvp(ep2->nvp); 3685 if (checkconstraints(fmep, ap->arrowp) == 0) { 3686 platform_set_payloadnvp(NULL); 3687 indent(); 3688 out(O_ALTFP|O_VERB|O_NONL, 3689 " CONSTRAINTS FAIL "); 3690 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3691 out(O_ALTFP|O_VERB, NULL); 3692 continue; 3693 } 3694 platform_set_payloadnvp(NULL); 3695 ap->arrowp->mark |= EFFECTS_COUNTER; 3696 if (!triggered(fmep, ep2, EFFECTS_COUNTER)) { 3697 indent(); 3698 out(O_ALTFP|O_VERB|O_NONL, 3699 " K-COUNT NOT YET MET "); 3700 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3701 out(O_ALTFP|O_VERB, NULL); 3702 continue; 3703 } 3704 ep2->cached_state &= ~PARENT_WAIT; 3705 /* 3706 * if we've reached an ereport and no propagation time 3707 * is specified, use the Hesitate value 3708 */ 3709 if (ep2->t == N_EREPORT && at_latest_by == 0ULL && 3710 ap->arrowp->maxdelay == 0ULL) { 3711 out(O_ALTFP|O_VERB|O_NONL, " default wait "); 3712 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3713 out(O_ALTFP|O_VERB, NULL); 3714 result = requirements_test(fmep, ep2, Hesitate, 3715 &my_delay); 3716 } else { 3717 result = requirements_test(fmep, ep2, 3718 at_latest_by + ap->arrowp->maxdelay, 3719 &my_delay); 3720 } 3721 if (result == FME_WAIT) { 3722 retval = WAIT_EFFECT; 3723 if (overall_delay > my_delay) 3724 overall_delay = my_delay; 3725 ep2->cached_state |= WAIT_EFFECT; 3726 indent(); 3727 out(O_ALTFP|O_VERB|O_NONL, " EFFECTS WAIT "); 3728 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3729 out(O_ALTFP|O_VERB, NULL); 3730 indent_push(" E"); 3731 if (mark_arrows(fmep, ep2, PARENT_WAIT, 3732 at_latest_by, &my_delay, 0) == 3733 WAIT_EFFECT) { 3734 retval = WAIT_EFFECT; 3735 if (overall_delay > my_delay) 3736 overall_delay = my_delay; 3737 } 3738 indent_pop(); 3739 } else if (result == FME_DISPROVED) { 3740 indent(); 3741 out(O_ALTFP|O_VERB|O_NONL, 3742 " EFFECTS DISPROVED "); 3743 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3744 out(O_ALTFP|O_VERB, NULL); 3745 } else { 
3746 ep2->cached_state |= mark; 3747 indent(); 3748 if (mark == CREDIBLE_EFFECT) 3749 out(O_ALTFP|O_VERB|O_NONL, 3750 " EFFECTS CREDIBLE "); 3751 else 3752 out(O_ALTFP|O_VERB|O_NONL, 3753 " PARENT EFFECTS WAIT "); 3754 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3755 out(O_ALTFP|O_VERB, NULL); 3756 indent_push(" E"); 3757 if (mark_arrows(fmep, ep2, mark, at_latest_by, 3758 &my_delay, 0) == WAIT_EFFECT) { 3759 retval = WAIT_EFFECT; 3760 if (overall_delay > my_delay) 3761 overall_delay = my_delay; 3762 } 3763 indent_pop(); 3764 } 3765 } 3766 } 3767 if (retval == WAIT_EFFECT) 3768 *pdelay = overall_delay; 3769 return (retval); 3770 } 3771 3772 static enum fme_state 3773 effects_test(struct fme *fmep, struct event *fault_event, 3774 unsigned long long at_latest_by, unsigned long long *pdelay) 3775 { 3776 struct event *error_event; 3777 enum fme_state return_value = FME_CREDIBLE; 3778 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3779 unsigned long long my_delay; 3780 3781 stats_counter_bump(fmep->Ecallcount); 3782 indent_push(" E"); 3783 indent(); 3784 out(O_ALTFP|O_VERB|O_NONL, "->"); 3785 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3786 out(O_ALTFP|O_VERB, NULL); 3787 3788 if (mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by, 3789 &my_delay, 0) == WAIT_EFFECT) { 3790 return_value = FME_WAIT; 3791 if (overall_delay > my_delay) 3792 overall_delay = my_delay; 3793 } 3794 for (error_event = fmep->observations; 3795 error_event; error_event = error_event->observations) { 3796 indent(); 3797 out(O_ALTFP|O_VERB|O_NONL, " "); 3798 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event); 3799 if (!(error_event->cached_state & CREDIBLE_EFFECT)) { 3800 if (error_event->cached_state & 3801 (PARENT_WAIT|WAIT_EFFECT)) { 3802 out(O_ALTFP|O_VERB, " NOT YET triggered"); 3803 continue; 3804 } 3805 return_value = FME_DISPROVED; 3806 out(O_ALTFP|O_VERB, " NOT triggered"); 3807 break; 3808 } else { 3809 out(O_ALTFP|O_VERB, " triggered"); 3810 } 3811 } 
3812 if (return_value == FME_DISPROVED) { 3813 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0); 3814 } else { 3815 fault_event->keep_in_tree = 1; 3816 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1); 3817 } 3818 3819 indent(); 3820 out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ", 3821 fme_state2str(return_value)); 3822 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3823 out(O_ALTFP|O_VERB, NULL); 3824 indent_pop(); 3825 if (return_value == FME_WAIT) 3826 *pdelay = overall_delay; 3827 return (return_value); 3828 } 3829 3830 static enum fme_state 3831 requirements_test(struct fme *fmep, struct event *ep, 3832 unsigned long long at_latest_by, unsigned long long *pdelay) 3833 { 3834 int waiting_events; 3835 int credible_events; 3836 int deferred_events; 3837 enum fme_state return_value = FME_CREDIBLE; 3838 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3839 unsigned long long arrow_delay; 3840 unsigned long long my_delay; 3841 struct event *ep2; 3842 struct bubble *bp; 3843 struct arrowlist *ap; 3844 3845 if (ep->cached_state & REQMNTS_CREDIBLE) { 3846 indent(); 3847 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY CREDIBLE "); 3848 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3849 out(O_ALTFP|O_VERB, NULL); 3850 return (FME_CREDIBLE); 3851 } 3852 if (ep->cached_state & REQMNTS_DISPROVED) { 3853 indent(); 3854 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY DISPROVED "); 3855 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3856 out(O_ALTFP|O_VERB, NULL); 3857 return (FME_DISPROVED); 3858 } 3859 if (ep->cached_state & REQMNTS_WAIT) { 3860 indent(); 3861 *pdelay = ep->cached_delay; 3862 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY WAIT "); 3863 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3864 out(O_ALTFP|O_VERB|O_NONL, ", wait for: "); 3865 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3866 out(O_ALTFP|O_VERB, NULL); 3867 return (FME_WAIT); 3868 } 3869 stats_counter_bump(fmep->Rcallcount); 3870 indent_push(" R"); 3871 indent(); 3872 
out(O_ALTFP|O_VERB|O_NONL, "->"); 3873 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3874 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 3875 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3876 out(O_ALTFP|O_VERB, NULL); 3877 3878 if (ep->t == N_EREPORT) { 3879 if (ep->count == 0) { 3880 if (fmep->pull >= at_latest_by) { 3881 return_value = FME_DISPROVED; 3882 } else { 3883 ep->cached_delay = *pdelay = at_latest_by; 3884 return_value = FME_WAIT; 3885 } 3886 } 3887 3888 indent(); 3889 switch (return_value) { 3890 case FME_CREDIBLE: 3891 ep->cached_state |= REQMNTS_CREDIBLE; 3892 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE "); 3893 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3894 break; 3895 case FME_DISPROVED: 3896 ep->cached_state |= REQMNTS_DISPROVED; 3897 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 3898 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3899 break; 3900 case FME_WAIT: 3901 ep->cached_state |= REQMNTS_WAIT; 3902 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT "); 3903 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3904 out(O_ALTFP|O_VERB|O_NONL, " to "); 3905 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3906 break; 3907 default: 3908 out(O_DIE, "requirements_test: unexpected fme_state"); 3909 break; 3910 } 3911 out(O_ALTFP|O_VERB, NULL); 3912 indent_pop(); 3913 3914 return (return_value); 3915 } 3916 3917 /* this event is not a report, descend the tree */ 3918 for (bp = itree_next_bubble(ep, NULL); bp; 3919 bp = itree_next_bubble(ep, bp)) { 3920 int n; 3921 3922 if (bp->t != B_FROM) 3923 continue; 3924 3925 n = bp->nork; 3926 3927 credible_events = 0; 3928 waiting_events = 0; 3929 deferred_events = 0; 3930 arrow_delay = TIMEVAL_EVENTUALLY; 3931 /* 3932 * n is -1 for 'A' so adjust it. 3933 * XXX just count up the arrows for now. 
3934 */ 3935 if (n < 0) { 3936 n = 0; 3937 for (ap = itree_next_arrow(bp, NULL); ap; 3938 ap = itree_next_arrow(bp, ap)) 3939 n++; 3940 indent(); 3941 out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n); 3942 } else { 3943 indent(); 3944 out(O_ALTFP|O_VERB, " Bubble N=%d", n); 3945 } 3946 3947 if (n == 0) 3948 continue; 3949 if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) { 3950 for (ap = itree_next_arrow(bp, NULL); ap; 3951 ap = itree_next_arrow(bp, ap)) { 3952 ep2 = ap->arrowp->head->myevent; 3953 platform_set_payloadnvp(ep2->nvp); 3954 (void) checkconstraints(fmep, ap->arrowp); 3955 if (!ap->arrowp->forever_false) { 3956 /* 3957 * if all arrows are invalidated by the 3958 * constraints, then we should elide the 3959 * whole bubble to be consistant with 3960 * the tree creation time behaviour 3961 */ 3962 bp->mark |= BUBBLE_OK; 3963 platform_set_payloadnvp(NULL); 3964 break; 3965 } 3966 platform_set_payloadnvp(NULL); 3967 } 3968 } 3969 for (ap = itree_next_arrow(bp, NULL); ap; 3970 ap = itree_next_arrow(bp, ap)) { 3971 ep2 = ap->arrowp->head->myevent; 3972 if (n <= credible_events) 3973 break; 3974 3975 ap->arrowp->mark |= REQMNTS_COUNTER; 3976 if (triggered(fmep, ep2, REQMNTS_COUNTER)) 3977 /* XXX adding max timevals! 
*/ 3978 switch (requirements_test(fmep, ep2, 3979 at_latest_by + ap->arrowp->maxdelay, 3980 &my_delay)) { 3981 case FME_DEFERRED: 3982 deferred_events++; 3983 break; 3984 case FME_CREDIBLE: 3985 credible_events++; 3986 break; 3987 case FME_DISPROVED: 3988 break; 3989 case FME_WAIT: 3990 if (my_delay < arrow_delay) 3991 arrow_delay = my_delay; 3992 waiting_events++; 3993 break; 3994 default: 3995 out(O_DIE, 3996 "Bug in requirements_test."); 3997 } 3998 else 3999 deferred_events++; 4000 } 4001 if (!(bp->mark & BUBBLE_OK) && waiting_events == 0) { 4002 bp->mark |= BUBBLE_ELIDED; 4003 continue; 4004 } 4005 indent(); 4006 out(O_ALTFP|O_VERB, " Credible: %d Waiting %d", 4007 credible_events + deferred_events, waiting_events); 4008 if (credible_events + deferred_events + waiting_events < n) { 4009 /* Can never meet requirements */ 4010 ep->cached_state |= REQMNTS_DISPROVED; 4011 indent(); 4012 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 4013 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4014 out(O_ALTFP|O_VERB, NULL); 4015 indent_pop(); 4016 return (FME_DISPROVED); 4017 } 4018 if (credible_events + deferred_events < n) { 4019 /* will have to wait */ 4020 /* wait time is shortest known */ 4021 if (arrow_delay < overall_delay) 4022 overall_delay = arrow_delay; 4023 return_value = FME_WAIT; 4024 } else if (credible_events < n) { 4025 if (return_value != FME_WAIT) 4026 return_value = FME_DEFERRED; 4027 } 4028 } 4029 4030 /* 4031 * don't mark as FME_DEFERRED. If this event isn't reached by another 4032 * path, then this will be considered FME_CREDIBLE. But if it is 4033 * reached by a different path so the K-count is met, then might 4034 * get overridden by FME_WAIT or FME_DISPROVED. 
4035 */ 4036 if (return_value == FME_WAIT) { 4037 ep->cached_state |= REQMNTS_WAIT; 4038 ep->cached_delay = *pdelay = overall_delay; 4039 } else if (return_value == FME_CREDIBLE) { 4040 ep->cached_state |= REQMNTS_CREDIBLE; 4041 } 4042 indent(); 4043 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ", 4044 fme_state2str(return_value)); 4045 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4046 out(O_ALTFP|O_VERB, NULL); 4047 indent_pop(); 4048 return (return_value); 4049 } 4050 4051 static enum fme_state 4052 causes_test(struct fme *fmep, struct event *ep, 4053 unsigned long long at_latest_by, unsigned long long *pdelay) 4054 { 4055 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 4056 unsigned long long my_delay; 4057 int credible_results = 0; 4058 int waiting_results = 0; 4059 enum fme_state fstate; 4060 struct event *tail_event; 4061 struct bubble *bp; 4062 struct arrowlist *ap; 4063 int k = 1; 4064 4065 stats_counter_bump(fmep->Ccallcount); 4066 indent_push(" C"); 4067 indent(); 4068 out(O_ALTFP|O_VERB|O_NONL, "->"); 4069 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4070 out(O_ALTFP|O_VERB, NULL); 4071 4072 for (bp = itree_next_bubble(ep, NULL); bp; 4073 bp = itree_next_bubble(ep, bp)) { 4074 if (bp->t != B_TO) 4075 continue; 4076 k = bp->nork; /* remember the K value */ 4077 for (ap = itree_next_arrow(bp, NULL); ap; 4078 ap = itree_next_arrow(bp, ap)) { 4079 int do_not_follow = 0; 4080 4081 /* 4082 * if we get to the same event multiple times 4083 * only worry about the first one. 
4084 */ 4085 if (ap->arrowp->tail->myevent->cached_state & 4086 CAUSES_TESTED) { 4087 indent(); 4088 out(O_ALTFP|O_VERB|O_NONL, 4089 " causes test already run for "); 4090 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 4091 ap->arrowp->tail->myevent); 4092 out(O_ALTFP|O_VERB, NULL); 4093 continue; 4094 } 4095 4096 /* 4097 * see if false constraint prevents us 4098 * from traversing this arrow 4099 */ 4100 platform_set_payloadnvp(ep->nvp); 4101 if (checkconstraints(fmep, ap->arrowp) == 0) 4102 do_not_follow = 1; 4103 platform_set_payloadnvp(NULL); 4104 if (do_not_follow) { 4105 indent(); 4106 out(O_ALTFP|O_VERB|O_NONL, 4107 " False arrow from "); 4108 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 4109 ap->arrowp->tail->myevent); 4110 out(O_ALTFP|O_VERB, NULL); 4111 continue; 4112 } 4113 4114 ap->arrowp->tail->myevent->cached_state |= 4115 CAUSES_TESTED; 4116 tail_event = ap->arrowp->tail->myevent; 4117 fstate = hypothesise(fmep, tail_event, at_latest_by, 4118 &my_delay); 4119 4120 switch (fstate) { 4121 case FME_WAIT: 4122 if (my_delay < overall_delay) 4123 overall_delay = my_delay; 4124 waiting_results++; 4125 break; 4126 case FME_CREDIBLE: 4127 credible_results++; 4128 break; 4129 case FME_DISPROVED: 4130 break; 4131 default: 4132 out(O_DIE, "Bug in causes_test"); 4133 } 4134 } 4135 } 4136 /* compare against K */ 4137 if (credible_results + waiting_results < k) { 4138 indent(); 4139 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED "); 4140 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4141 out(O_ALTFP|O_VERB, NULL); 4142 indent_pop(); 4143 return (FME_DISPROVED); 4144 } 4145 if (waiting_results != 0) { 4146 *pdelay = overall_delay; 4147 indent(); 4148 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT "); 4149 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4150 out(O_ALTFP|O_VERB|O_NONL, " to "); 4151 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 4152 out(O_ALTFP|O_VERB, NULL); 4153 indent_pop(); 4154 return (FME_WAIT); 4155 } 4156 indent(); 4157 out(O_ALTFP|O_VERB|O_NONL, 
"<-CAUSES CREDIBLE "); 4158 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4159 out(O_ALTFP|O_VERB, NULL); 4160 indent_pop(); 4161 return (FME_CREDIBLE); 4162 } 4163 4164 static enum fme_state 4165 hypothesise(struct fme *fmep, struct event *ep, 4166 unsigned long long at_latest_by, unsigned long long *pdelay) 4167 { 4168 enum fme_state rtr, otr; 4169 unsigned long long my_delay; 4170 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 4171 4172 stats_counter_bump(fmep->Hcallcount); 4173 indent_push(" H"); 4174 indent(); 4175 out(O_ALTFP|O_VERB|O_NONL, "->"); 4176 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4177 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 4178 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 4179 out(O_ALTFP|O_VERB, NULL); 4180 4181 rtr = requirements_test(fmep, ep, at_latest_by, &my_delay); 4182 if ((rtr == FME_WAIT) && (my_delay < overall_delay)) 4183 overall_delay = my_delay; 4184 if (rtr != FME_DISPROVED) { 4185 if (is_problem(ep->t)) { 4186 otr = effects_test(fmep, ep, at_latest_by, &my_delay); 4187 if (otr != FME_DISPROVED) { 4188 if (fmep->peek == 0 && ep->is_suspect == 0) { 4189 ep->suspects = fmep->suspects; 4190 ep->is_suspect = 1; 4191 fmep->suspects = ep; 4192 fmep->nsuspects++; 4193 } 4194 } 4195 } else 4196 otr = causes_test(fmep, ep, at_latest_by, &my_delay); 4197 if ((otr == FME_WAIT) && (my_delay < overall_delay)) 4198 overall_delay = my_delay; 4199 if ((otr != FME_DISPROVED) && 4200 ((rtr == FME_WAIT) || (otr == FME_WAIT))) 4201 *pdelay = overall_delay; 4202 } 4203 if (rtr == FME_DISPROVED) { 4204 indent(); 4205 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4206 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4207 out(O_ALTFP|O_VERB, " (doesn't meet requirements)"); 4208 indent_pop(); 4209 return (FME_DISPROVED); 4210 } 4211 if ((otr == FME_DISPROVED) && is_problem(ep->t)) { 4212 indent(); 4213 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4214 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4215 out(O_ALTFP|O_VERB, " 
(doesn't explain all reports)"); 4216 indent_pop(); 4217 return (FME_DISPROVED); 4218 } 4219 if (otr == FME_DISPROVED) { 4220 indent(); 4221 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4222 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4223 out(O_ALTFP|O_VERB, " (causes are not credible)"); 4224 indent_pop(); 4225 return (FME_DISPROVED); 4226 } 4227 if ((rtr == FME_WAIT) || (otr == FME_WAIT)) { 4228 indent(); 4229 out(O_ALTFP|O_VERB|O_NONL, "<-WAIT "); 4230 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4231 out(O_ALTFP|O_VERB|O_NONL, " to "); 4232 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay); 4233 out(O_ALTFP|O_VERB, NULL); 4234 indent_pop(); 4235 return (FME_WAIT); 4236 } 4237 indent(); 4238 out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE "); 4239 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4240 out(O_ALTFP|O_VERB, NULL); 4241 indent_pop(); 4242 return (FME_CREDIBLE); 4243 } 4244 4245 /* 4246 * fme_istat_load -- reconstitute any persistent istats 4247 */ 4248 void 4249 fme_istat_load(fmd_hdl_t *hdl) 4250 { 4251 int sz; 4252 char *sbuf; 4253 char *ptr; 4254 4255 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) { 4256 out(O_ALTFP, "fme_istat_load: No stats"); 4257 return; 4258 } 4259 4260 sbuf = alloca(sz); 4261 4262 fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz); 4263 4264 /* 4265 * pick apart the serialized stats 4266 * 4267 * format is: 4268 * <class-name>, '@', <path>, '\0', <value>, '\0' 4269 * for example: 4270 * "stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0" 4271 * 4272 * since this is parsing our own serialized data, any parsing issues 4273 * are fatal, so we check for them all with ASSERT() below. 
4274 */ 4275 ptr = sbuf; 4276 while (ptr < &sbuf[sz]) { 4277 char *sepptr; 4278 struct node *np; 4279 int val; 4280 4281 sepptr = strchr(ptr, '@'); 4282 ASSERT(sepptr != NULL); 4283 *sepptr = '\0'; 4284 4285 /* construct the event */ 4286 np = newnode(T_EVENT, NULL, 0); 4287 np->u.event.ename = newnode(T_NAME, NULL, 0); 4288 np->u.event.ename->u.name.t = N_STAT; 4289 np->u.event.ename->u.name.s = stable(ptr); 4290 np->u.event.ename->u.name.it = IT_ENAME; 4291 np->u.event.ename->u.name.last = np->u.event.ename; 4292 4293 ptr = sepptr + 1; 4294 ASSERT(ptr < &sbuf[sz]); 4295 ptr += strlen(ptr); 4296 ptr++; /* move past the '\0' separating path from value */ 4297 ASSERT(ptr < &sbuf[sz]); 4298 ASSERT(isdigit(*ptr)); 4299 val = atoi(ptr); 4300 ASSERT(val > 0); 4301 ptr += strlen(ptr); 4302 ptr++; /* move past the final '\0' for this entry */ 4303 4304 np->u.event.epname = pathstring2epnamenp(sepptr + 1); 4305 ASSERT(np->u.event.epname != NULL); 4306 4307 istat_bump(np, val); 4308 tree_free(np); 4309 } 4310 4311 istat_save(); 4312 } 4313