1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2012 Milan Jurik. All rights reserved. 25 * 26 * fme.c -- fault management exercise module 27 * 28 * this module provides the simulated fault management exercise. 29 */ 30 31 #include <stdio.h> 32 #include <stdlib.h> 33 #include <string.h> 34 #include <strings.h> 35 #include <ctype.h> 36 #include <alloca.h> 37 #include <libnvpair.h> 38 #include <sys/fm/protocol.h> 39 #include <fm/fmd_api.h> 40 #include "alloc.h" 41 #include "out.h" 42 #include "stats.h" 43 #include "stable.h" 44 #include "literals.h" 45 #include "lut.h" 46 #include "tree.h" 47 #include "ptree.h" 48 #include "itree.h" 49 #include "ipath.h" 50 #include "fme.h" 51 #include "evnv.h" 52 #include "eval.h" 53 #include "config.h" 54 #include "platform.h" 55 #include "esclex.h" 56 57 /* imported from eft.c... */ 58 extern hrtime_t Hesitate; 59 extern char *Serd_Override; 60 extern nv_alloc_t Eft_nv_hdl; 61 extern int Max_fme; 62 extern fmd_hdl_t *Hdl; 63 64 static int Istat_need_save; 65 static int Serd_need_save; 66 void istat_save(void); 67 void serd_save(void); 68 69 /* fme under construction is global so we can free it on module abort */ 70 static struct fme *Nfmep; 71 72 static int Undiag_reason = UD_VAL_UNKNOWN; 73 74 static int Nextid = 0; 75 76 static int Open_fme_count = 0; /* Count of open FMEs */ 77 78 /* list of fault management exercises underway */ 79 static struct fme { 80 struct fme *next; /* next exercise */ 81 unsigned long long ull; /* time when fme was created */ 82 int id; /* FME id */ 83 struct config *config; /* cooked configuration data */ 84 struct lut *eventtree; /* propagation tree for this FME */ 85 /* 86 * The initial error report that created this FME is kept in 87 * two forms. e0 points to the instance tree node and is used 88 * by fme_eval() as the starting point for the inference 89 * algorithm. e0r is the event handle FMD passed to us when 90 * the ereport first arrived and is used when setting timers, 91 * which are always relative to the time of this initial 92 * report. 93 */ 94 struct event *e0; 95 fmd_event_t *e0r; 96 97 id_t timer; /* for setting an fmd time-out */ 98 99 struct event *ecurrent; /* ereport under consideration */ 100 struct event *suspects; /* current suspect list */ 101 struct event *psuspects; /* previous suspect list */ 102 int nsuspects; /* count of suspects */ 103 int posted_suspects; /* true if we've posted a diagnosis */ 104 int uniqobs; /* number of unique events observed */ 105 int peek; /* just peeking, don't track suspects */ 106 int overflow; /* true if overflow FME */ 107 enum fme_state { 108 FME_NOTHING = 5000, /* not evaluated yet */ 109 FME_WAIT, /* need to wait for more info */ 110 FME_CREDIBLE, /* suspect list is credible */ 111 FME_DISPROVED, /* no valid suspects found */ 112 FME_DEFERRED /* don't know yet (k-count not met) */ 113 } state; 114 115 unsigned long long pull; /* time passed since created */ 116 unsigned long long wull; /* wait until this time for re-eval */ 117 struct event *observations; /* observation list */ 118 struct lut *globals; /* values of global variables */ 119 /* fmd interfacing */ 120 fmd_hdl_t *hdl; /* handle for talking with fmd */ 121 fmd_case_t *fmcase; /* what fmd 'case' we associate with */ 122 /* stats */ 123 struct stats *Rcount; 124 struct stats *Hcallcount; 125 struct stats *Rcallcount; 126 struct stats *Ccallcount; 127 struct stats *Ecallcount; 128 struct stats *Tcallcount; 129 struct stats *Marrowcount; 130 struct stats *diags; 131 } *FMElist, *EFMElist, *ClosedFMEs; 132 133 static struct case_list { 134 fmd_case_t *fmcase; 135 struct case_list *next; 136 } *Undiagablecaselist; 137 138 static void fme_eval(struct fme *fmep, fmd_event_t *ffep); 139 static enum fme_state hypothesise(struct fme *fmep, struct event *ep, 140 unsigned long long at_latest_by, unsigned long long *pdelay); 141 static struct node *eventprop_lookup(struct event *ep, const char *propname); 142 static struct node *pathstring2epnamenp(char *path); 143 static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, 144 fmd_case_t *fmcase, nvlist_t *detector, char *arg); 145 static char *undiag_2reason_str(int ud, char *arg); 146 static const char *undiag_2defect_str(int ud); 147 static void restore_suspects(struct fme *fmep); 148 static void save_suspects(struct fme *fmep); 149 static void destroy_fme(struct fme *f); 150 static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 151 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl); 152 static void istat_counter_reset_cb(struct istat_entry *entp, 153 struct stats *statp, const struct ipath *ipp); 154 static void istat_counter_topo_chg_cb(struct istat_entry *entp, 155 struct stats *statp, void *unused); 156 static void serd_reset_cb(struct serd_entry *entp, void *unused, 157 const struct ipath *ipp); 158 static void serd_topo_chg_cb(struct serd_entry *entp, void *unused, 159 void *unused2); 160 static void destroy_fme_bufs(struct fme *fp); 161 162 static struct fme * 163 alloc_fme(void) 164 { 165 struct fme *fmep; 166 167 fmep = MALLOC(sizeof (*fmep)); 168 bzero(fmep, sizeof (*fmep)); 169 return (fmep); 170 } 171 172 /* 173 * fme_ready -- called when all initialization of the FME (except for 174 * stats) has completed successfully. Adds the fme to global lists 175 * and establishes its stats. 176 */ 177 static struct fme * 178 fme_ready(struct fme *fmep) 179 { 180 char nbuf[100]; 181 182 Nfmep = NULL; /* don't need to free this on module abort now */ 183 184 if (EFMElist) { 185 EFMElist->next = fmep; 186 EFMElist = fmep; 187 } else 188 FMElist = EFMElist = fmep; 189 190 (void) sprintf(nbuf, "fme%d.Rcount", fmep->id); 191 fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0); 192 (void) sprintf(nbuf, "fme%d.Hcall", fmep->id); 193 fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1); 194 (void) sprintf(nbuf, "fme%d.Rcall", fmep->id); 195 fmep->Rcallcount = stats_new_counter(nbuf, 196 "calls to requirements_test()", 1); 197 (void) sprintf(nbuf, "fme%d.Ccall", fmep->id); 198 fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1); 199 (void) sprintf(nbuf, "fme%d.Ecall", fmep->id); 200 fmep->Ecallcount = 201 stats_new_counter(nbuf, "calls to effects_test()", 1); 202 (void) sprintf(nbuf, "fme%d.Tcall", fmep->id); 203 fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1); 204 (void) sprintf(nbuf, "fme%d.Marrow", fmep->id); 205 fmep->Marrowcount = stats_new_counter(nbuf, 206 "arrows marked by mark_arrows()", 1); 207 (void) sprintf(nbuf, "fme%d.diags", fmep->id); 208 fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0); 209 210 out(O_ALTFP|O_VERB2, "newfme: config snapshot contains..."); 211 config_print(O_ALTFP|O_VERB2, fmep->config); 212 213 return (fmep); 214 } 215 216 extern void ipath_dummy_lut(struct arrow *); 217 extern struct lut *itree_create_dummy(const char *, const struct ipath *); 218 219 /* ARGSUSED */ 220 static void 221 set_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 222 { 223 struct bubble *bp; 224 struct arrowlist *ap; 225 226 for (bp = itree_next_bubble(ep, NULL); bp; 227 bp = itree_next_bubble(ep, bp)) { 228 if (bp->t != B_FROM) 229 continue; 230 for (ap = itree_next_arrow(bp, NULL); ap; 231 ap = itree_next_arrow(bp, ap)) { 232 ap->arrowp->pnode->u.arrow.needed = 1; 233 ipath_dummy_lut(ap->arrowp); 234 } 235 } 236 } 237 238 /* ARGSUSED */ 239 static void 240 unset_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 241 { 242 struct bubble *bp; 243 struct arrowlist *ap; 244 245 for (bp = itree_next_bubble(ep, NULL); bp; 246 bp = itree_next_bubble(ep, bp)) { 247 if (bp->t != B_FROM) 248 continue; 249 for (ap = itree_next_arrow(bp, NULL); ap; 250 ap = itree_next_arrow(bp, ap)) 251 ap->arrowp->pnode->u.arrow.needed = 0; 252 } 253 } 254 255 static void globals_destructor(void *left, void *right, void *arg); 256 static void clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep); 257 258 static boolean_t 259 prune_propagations(const char *e0class, const struct ipath *e0ipp) 260 { 261 char nbuf[100]; 262 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 263 extern struct lut *Usednames; 264 265 Nfmep = alloc_fme(); 266 Nfmep->id = Nextid; 267 Nfmep->state = FME_NOTHING; 268 Nfmep->eventtree = itree_create_dummy(e0class, e0ipp); 269 if ((Nfmep->e0 = 270 itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) { 271 itree_free(Nfmep->eventtree); 272 FREE(Nfmep); 273 Nfmep = NULL; 274 return (B_FALSE); 275 } 276 Nfmep->ecurrent = Nfmep->observations = Nfmep->e0; 277 Nfmep->e0->count++; 278 279 (void) sprintf(nbuf, "fme%d.Rcount", Nfmep->id); 280 Nfmep->Rcount = stats_new_counter(nbuf, "ereports received", 0); 281 (void) sprintf(nbuf, "fme%d.Hcall", Nfmep->id); 282 Nfmep->Hcallcount = 283 stats_new_counter(nbuf, "calls to hypothesise()", 1); 284 (void) sprintf(nbuf, "fme%d.Rcall", Nfmep->id); 285 Nfmep->Rcallcount = stats_new_counter(nbuf, 286 "calls to requirements_test()", 1); 287 (void) sprintf(nbuf, "fme%d.Ccall", Nfmep->id); 288 Nfmep->Ccallcount = 289 stats_new_counter(nbuf, "calls to causes_test()", 1); 290 (void) sprintf(nbuf, "fme%d.Ecall", Nfmep->id); 291 Nfmep->Ecallcount = 292 stats_new_counter(nbuf, "calls to effects_test()", 1); 293 (void) sprintf(nbuf, "fme%d.Tcall", Nfmep->id); 294 Nfmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1); 295 (void) sprintf(nbuf, "fme%d.Marrow", Nfmep->id); 296 Nfmep->Marrowcount = stats_new_counter(nbuf, 297 "arrows marked by mark_arrows()", 1); 298 (void) sprintf(nbuf, "fme%d.diags", Nfmep->id); 299 Nfmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0); 300 301 Nfmep->peek = 1; 302 lut_walk(Nfmep->eventtree, (lut_cb)unset_needed_arrows, (void *)Nfmep); 303 lut_free(Usednames, NULL, NULL); 304 Usednames = NULL; 305 lut_walk(Nfmep->eventtree, (lut_cb)clear_arrows, (void *)Nfmep); 306 (void) hypothesise(Nfmep, Nfmep->e0, Nfmep->ull, &my_delay); 307 itree_prune(Nfmep->eventtree); 308 lut_walk(Nfmep->eventtree, (lut_cb)set_needed_arrows, (void *)Nfmep); 309 310 stats_delete(Nfmep->Rcount); 311 stats_delete(Nfmep->Hcallcount); 312 stats_delete(Nfmep->Rcallcount); 313 stats_delete(Nfmep->Ccallcount); 314 stats_delete(Nfmep->Ecallcount); 315 stats_delete(Nfmep->Tcallcount); 316 stats_delete(Nfmep->Marrowcount); 317 stats_delete(Nfmep->diags); 318 itree_free(Nfmep->eventtree); 319 lut_free(Nfmep->globals, globals_destructor, NULL); 320 FREE(Nfmep); 321 return (B_TRUE); 322 } 323 324 static struct fme * 325 newfme(const char *e0class, const struct ipath *e0ipp, fmd_hdl_t *hdl, 326 fmd_case_t *fmcase, fmd_event_t *ffep, nvlist_t *nvl) 327 { 328 struct cfgdata *cfgdata; 329 int init_size; 330 extern int alloc_total(); 331 nvlist_t *detector = NULL; 332 char *pathstr; 333 char *arg; 334 335 /* 336 * First check if e0ipp is actually in the topology so we can give a 337 * more useful error message. 338 */ 339 ipathlastcomp(e0ipp); 340 pathstr = ipath2str(NULL, e0ipp); 341 cfgdata = config_snapshot(); 342 platform_units_translate(0, cfgdata->cooked, NULL, NULL, 343 &detector, pathstr); 344 FREE(pathstr); 345 structconfig_free(cfgdata->cooked); 346 config_free(cfgdata); 347 if (detector == NULL) { 348 /* See if class permits silent discard on unknown component. */ 349 if (lut_lookup(Ereportenames_discard, (void *)e0class, NULL)) { 350 out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport " 351 "to component path, but silent discard allowed.", 352 e0class); 353 } else { 354 Undiag_reason = UD_VAL_BADEVENTPATH; 355 (void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, 356 &detector); 357 arg = ipath2str(e0class, e0ipp); 358 publish_undiagnosable(hdl, ffep, fmcase, detector, arg); 359 FREE(arg); 360 } 361 return (NULL); 362 } 363 364 /* 365 * Next run a quick first pass of the rules with a dummy config. This 366 * allows us to prune those rules which can't possibly cause this 367 * ereport. 368 */ 369 if (!prune_propagations(e0class, e0ipp)) { 370 /* 371 * The fault class must have been in the rules or we would 372 * not have registered for it (and got a "nosub"), and the 373 * pathname must be in the topology or we would have failed the 374 * previous test. So to get here means the combination of 375 * class and pathname in the ereport must be invalid. 376 */ 377 Undiag_reason = UD_VAL_BADEVENTCLASS; 378 arg = ipath2str(e0class, e0ipp); 379 publish_undiagnosable(hdl, ffep, fmcase, detector, arg); 380 nvlist_free(detector); 381 FREE(arg); 382 return (NULL); 383 } 384 385 /* 386 * Now go ahead and create the real fme using the pruned rules. 387 */ 388 init_size = alloc_total(); 389 out(O_ALTFP|O_STAMP, "start config_snapshot using %d bytes", init_size); 390 nvlist_free(detector); 391 pathstr = ipath2str(NULL, e0ipp); 392 cfgdata = config_snapshot(); 393 platform_units_translate(0, cfgdata->cooked, NULL, NULL, 394 &detector, pathstr); 395 FREE(pathstr); 396 platform_save_config(hdl, fmcase); 397 out(O_ALTFP|O_STAMP, "config_snapshot added %d bytes", 398 alloc_total() - init_size); 399 400 Nfmep = alloc_fme(); 401 402 Nfmep->id = Nextid++; 403 Nfmep->config = cfgdata->cooked; 404 config_free(cfgdata); 405 Nfmep->posted_suspects = 0; 406 Nfmep->uniqobs = 0; 407 Nfmep->state = FME_NOTHING; 408 Nfmep->pull = 0ULL; 409 Nfmep->overflow = 0; 410 411 Nfmep->fmcase = fmcase; 412 Nfmep->hdl = hdl; 413 414 if ((Nfmep->eventtree = itree_create(Nfmep->config)) == NULL) { 415 Undiag_reason = UD_VAL_INSTFAIL; 416 arg = ipath2str(e0class, e0ipp); 417 publish_undiagnosable(hdl, ffep, fmcase, detector, arg); 418 nvlist_free(detector); 419 FREE(arg); 420 structconfig_free(Nfmep->config); 421 destroy_fme_bufs(Nfmep); 422 FREE(Nfmep); 423 Nfmep = NULL; 424 return (NULL); 425 } 426 427 itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree); 428 429 if ((Nfmep->e0 = 430 itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) { 431 Undiag_reason = UD_VAL_BADEVENTI; 432 arg = ipath2str(e0class, e0ipp); 433 publish_undiagnosable(hdl, ffep, fmcase, detector, arg); 434 nvlist_free(detector); 435 FREE(arg); 436 itree_free(Nfmep->eventtree); 437 structconfig_free(Nfmep->config); 438 destroy_fme_bufs(Nfmep); 439 FREE(Nfmep); 440 Nfmep = NULL; 441 return (NULL); 442 } 443 444 nvlist_free(detector); 445 return (fme_ready(Nfmep)); 446 } 447 448 void 449 fme_fini(void) 450 { 451 struct fme *sfp, *fp; 452 struct case_list *ucasep, *nextcasep; 453 454 ucasep = Undiagablecaselist; 455 while (ucasep != NULL) { 456 nextcasep = ucasep->next; 457 FREE(ucasep); 458 ucasep = nextcasep; 459 } 460 Undiagablecaselist = NULL; 461 462 /* clean up closed fmes */ 463 fp = ClosedFMEs; 464 while (fp != NULL) { 465 sfp = fp->next; 466 destroy_fme(fp); 467 fp = sfp; 468 } 469 ClosedFMEs = NULL; 470 471 fp = FMElist; 472 while (fp != NULL) { 473 sfp = fp->next; 474 destroy_fme(fp); 475 fp = sfp; 476 } 477 FMElist = EFMElist = NULL; 478 479 /* if we were in the middle of creating an fme, free it now */ 480 if (Nfmep) { 481 destroy_fme(Nfmep); 482 Nfmep = NULL; 483 } 484 } 485 486 /* 487 * Allocated space for a buffer name. 20 bytes allows for 488 * a ridiculous 9,999,999 unique observations. 489 */ 490 #define OBBUFNMSZ 20 491 492 /* 493 * serialize_observation 494 * 495 * Create a recoverable version of the current observation 496 * (f->ecurrent). We keep a serialized version of each unique 497 * observation in order that we may resume correctly the fme in the 498 * correct state if eft or fmd crashes and we're restarted. 499 */ 500 static void 501 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp) 502 { 503 size_t pkdlen; 504 char tmpbuf[OBBUFNMSZ]; 505 char *pkd = NULL; 506 char *estr; 507 508 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs); 509 estr = ipath2str(cls, ipp); 510 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1); 511 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr, 512 strlen(estr) + 1); 513 FREE(estr); 514 515 if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) { 516 (void) snprintf(tmpbuf, 517 OBBUFNMSZ, "observed%d.nvp", fp->uniqobs); 518 if (nvlist_xpack(fp->ecurrent->nvp, 519 &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0) 520 out(O_DIE|O_SYS, "pack of observed nvl failed"); 521 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen); 522 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen); 523 FREE(pkd); 524 } 525 526 fp->uniqobs++; 527 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 528 sizeof (fp->uniqobs)); 529 } 530 531 /* 532 * init_fme_bufs -- We keep several bits of state about an fme for 533 * use if eft or fmd crashes and we're restarted. 534 */ 535 static void 536 init_fme_bufs(struct fme *fp) 537 { 538 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull)); 539 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull, 540 sizeof (fp->pull)); 541 542 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id)); 543 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id, 544 sizeof (fp->id)); 545 546 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs)); 547 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 548 sizeof (fp->uniqobs)); 549 550 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD, 551 sizeof (fp->posted_suspects)); 552 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD, 553 (void *)&fp->posted_suspects, sizeof (fp->posted_suspects)); 554 } 555 556 static void 557 destroy_fme_bufs(struct fme *fp) 558 { 559 char tmpbuf[OBBUFNMSZ]; 560 int o; 561 562 platform_restore_config(fp->hdl, fp->fmcase); 563 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN); 564 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG); 565 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL); 566 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID); 567 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD); 568 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS); 569 570 for (o = 0; o < fp->uniqobs; o++) { 571 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o); 572 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 573 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o); 574 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 575 } 576 } 577 578 /* 579 * reconstitute_observations -- convert a case's serialized observations 580 * back into struct events. Returns zero if all observations are 581 * successfully reconstituted. 582 */ 583 static int 584 reconstitute_observations(struct fme *fmep) 585 { 586 struct event *ep; 587 struct node *epnamenp = NULL; 588 size_t pkdlen; 589 char *pkd = NULL; 590 char *tmpbuf = alloca(OBBUFNMSZ); 591 char *sepptr; 592 char *estr; 593 int ocnt; 594 int elen; 595 596 for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) { 597 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt); 598 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 599 if (elen == 0) { 600 out(O_ALTFP, 601 "reconstitute_observation: no %s buffer found.", 602 tmpbuf); 603 Undiag_reason = UD_VAL_MISSINGOBS; 604 break; 605 } 606 607 estr = MALLOC(elen); 608 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 609 sepptr = strchr(estr, '@'); 610 if (sepptr == NULL) { 611 out(O_ALTFP, 612 "reconstitute_observation: %s: " 613 "missing @ separator in %s.", 614 tmpbuf, estr); 615 Undiag_reason = UD_VAL_MISSINGPATH; 616 FREE(estr); 617 break; 618 } 619 620 *sepptr = '\0'; 621 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 622 out(O_ALTFP, 623 "reconstitute_observation: %s: " 624 "trouble converting path string \"%s\" " 625 "to internal representation.", 626 tmpbuf, sepptr + 1); 627 Undiag_reason = UD_VAL_MISSINGPATH; 628 FREE(estr); 629 break; 630 } 631 632 /* construct the event */ 633 ep = itree_lookup(fmep->eventtree, 634 stable(estr), ipath(epnamenp)); 635 if (ep == NULL) { 636 out(O_ALTFP, 637 "reconstitute_observation: %s: " 638 "lookup of \"%s\" in itree failed.", 639 tmpbuf, ipath2str(estr, ipath(epnamenp))); 640 Undiag_reason = UD_VAL_BADOBS; 641 tree_free(epnamenp); 642 FREE(estr); 643 break; 644 } 645 tree_free(epnamenp); 646 647 /* 648 * We may or may not have a saved nvlist for the observation 649 */ 650 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt); 651 pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 652 if (pkdlen != 0) { 653 pkd = MALLOC(pkdlen); 654 fmd_buf_read(fmep->hdl, 655 fmep->fmcase, tmpbuf, pkd, pkdlen); 656 ASSERT(ep->nvp == NULL); 657 if (nvlist_xunpack(pkd, 658 pkdlen, &ep->nvp, &Eft_nv_hdl) != 0) 659 out(O_DIE|O_SYS, "pack of observed nvl failed"); 660 FREE(pkd); 661 } 662 663 if (ocnt == 0) 664 fmep->e0 = ep; 665 666 FREE(estr); 667 fmep->ecurrent = ep; 668 ep->count++; 669 670 /* link it into list of observations seen */ 671 ep->observations = fmep->observations; 672 fmep->observations = ep; 673 } 674 675 if (ocnt == fmep->uniqobs) { 676 (void) fme_ready(fmep); 677 return (0); 678 } 679 680 return (1); 681 } 682 683 /* 684 * restart_fme -- called during eft initialization. Reconstitutes 685 * an in-progress fme. 686 */ 687 void 688 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress) 689 { 690 nvlist_t *defect; 691 struct case_list *bad; 692 struct fme *fmep; 693 struct cfgdata *cfgdata; 694 size_t rawsz; 695 struct event *ep; 696 char *tmpbuf = alloca(OBBUFNMSZ); 697 char *sepptr; 698 char *estr; 699 int elen; 700 struct node *epnamenp = NULL; 701 int init_size; 702 extern int alloc_total(); 703 char *reason; 704 705 /* 706 * ignore solved or closed cases 707 */ 708 if (fmd_case_solved(hdl, inprogress) || 709 fmd_case_closed(hdl, inprogress)) 710 return; 711 712 fmep = alloc_fme(); 713 fmep->fmcase = inprogress; 714 fmep->hdl = hdl; 715 716 if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) { 717 out(O_ALTFP, "restart_fme: no saved posted status"); 718 Undiag_reason = UD_VAL_MISSINGINFO; 719 goto badcase; 720 } else { 721 fmd_buf_read(hdl, inprogress, WOBUF_POSTD, 722 (void *)&fmep->posted_suspects, 723 sizeof (fmep->posted_suspects)); 724 } 725 726 if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) { 727 out(O_ALTFP, "restart_fme: no saved id"); 728 Undiag_reason = UD_VAL_MISSINGINFO; 729 goto badcase; 730 } else { 731 fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id, 732 sizeof (fmep->id)); 733 } 734 if (Nextid <= fmep->id) 735 Nextid = fmep->id + 1; 736 737 out(O_ALTFP, "Replay FME %d", fmep->id); 738 739 if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) { 740 out(O_ALTFP, "restart_fme: No config data"); 741 Undiag_reason = UD_VAL_MISSINGINFO; 742 goto badcase; 743 } 744 fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz, 745 sizeof (size_t)); 746 747 if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) { 748 out(O_ALTFP, "restart_fme: No event zero"); 749 Undiag_reason = UD_VAL_MISSINGZERO; 750 goto badcase; 751 } 752 753 if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) { 754 out(O_ALTFP, "restart_fme: no saved wait time"); 755 Undiag_reason = UD_VAL_MISSINGINFO; 756 goto badcase; 757 } else { 758 fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull, 759 sizeof (fmep->pull)); 760 } 761 762 if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) { 763 out(O_ALTFP, "restart_fme: no count of observations"); 764 Undiag_reason = UD_VAL_MISSINGINFO; 765 goto badcase; 766 } else { 767 fmd_buf_read(hdl, inprogress, WOBUF_NOBS, 768 (void *)&fmep->uniqobs, sizeof (fmep->uniqobs)); 769 } 770 771 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed0"); 772 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 773 if (elen == 0) { 774 out(O_ALTFP, "reconstitute_observation: no %s buffer found.", 775 tmpbuf); 776 Undiag_reason = UD_VAL_MISSINGOBS; 777 goto badcase; 778 } 779 estr = MALLOC(elen); 780 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 781 sepptr = strchr(estr, '@'); 782 if (sepptr == NULL) { 783 out(O_ALTFP, "reconstitute_observation: %s: " 784 "missing @ separator in %s.", 785 tmpbuf, estr); 786 Undiag_reason = UD_VAL_MISSINGPATH; 787 FREE(estr); 788 goto badcase; 789 } 790 *sepptr = '\0'; 791 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 792 out(O_ALTFP, "reconstitute_observation: %s: " 793 "trouble converting path string \"%s\" " 794 "to internal representation.", tmpbuf, sepptr + 1); 795 Undiag_reason = UD_VAL_MISSINGPATH; 796 FREE(estr); 797 goto badcase; 798 } 799 (void) prune_propagations(stable(estr), ipath(epnamenp)); 800 tree_free(epnamenp); 801 FREE(estr); 802 803 init_size = alloc_total(); 804 out(O_ALTFP|O_STAMP, "start config_restore using %d bytes", init_size); 805 cfgdata = MALLOC(sizeof (struct cfgdata)); 806 cfgdata->cooked = NULL; 807 cfgdata->devcache = NULL; 808 cfgdata->devidcache = NULL; 809 cfgdata->tpcache = NULL; 810 cfgdata->cpucache = NULL; 811 cfgdata->raw_refcnt = 1; 812 813 if (rawsz > 0) { 814 if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) { 815 out(O_ALTFP, "restart_fme: Config data size mismatch"); 816 Undiag_reason = UD_VAL_CFGMISMATCH; 817 goto badcase; 818 } 819 cfgdata->begin = MALLOC(rawsz); 820 cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz; 821 fmd_buf_read(hdl, 822 inprogress, WOBUF_CFG, cfgdata->begin, rawsz); 823 } else { 824 cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL; 825 } 826 827 config_cook(cfgdata); 828 fmep->config = cfgdata->cooked; 829 config_free(cfgdata); 830 out(O_ALTFP|O_STAMP, "config_restore added %d bytes", 831 alloc_total() - init_size); 832 833 if ((fmep->eventtree = itree_create(fmep->config)) == NULL) { 834 /* case not properly saved or irretrievable */ 835 out(O_ALTFP, "restart_fme: NULL instance tree"); 836 Undiag_reason = UD_VAL_INSTFAIL; 837 goto badcase; 838 } 839 840 itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree); 841 842 if (reconstitute_observations(fmep) != 0) 843 goto badcase; 844 845 out(O_ALTFP|O_NONL, "FME %d replay observations: ", fmep->id); 846 for (ep = fmep->observations; ep; ep = ep->observations) { 847 out(O_ALTFP|O_NONL, " "); 848 itree_pevent_brief(O_ALTFP|O_NONL, ep); 849 } 850 out(O_ALTFP, NULL); 851 852 Open_fme_count++; 853 854 /* give the diagnosis algorithm a shot at the new FME state */ 855 fme_eval(fmep, fmep->e0r); 856 return; 857 858 badcase: 859 if (fmep->eventtree != NULL) 860 itree_free(fmep->eventtree); 861 if (fmep->config) 862 structconfig_free(fmep->config); 863 destroy_fme_bufs(fmep); 864 FREE(fmep); 865 866 /* 867 * Since we're unable to restart the case, add it to the undiagable 868 * list and solve and close it as appropriate. 869 */ 870 bad = MALLOC(sizeof (struct case_list)); 871 bad->next = NULL; 872 873 if (Undiagablecaselist != NULL) 874 bad->next = Undiagablecaselist; 875 Undiagablecaselist = bad; 876 bad->fmcase = inprogress; 877 878 out(O_ALTFP|O_NONL, "[case %s (unable to restart), ", 879 fmd_case_uuid(hdl, bad->fmcase)); 880 881 if (fmd_case_solved(hdl, bad->fmcase)) { 882 out(O_ALTFP|O_NONL, "already solved, "); 883 } else { 884 out(O_ALTFP|O_NONL, "solving, "); 885 defect = fmd_nvl_create_fault(hdl, 886 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 887 reason = undiag_2reason_str(Undiag_reason, NULL); 888 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 889 FREE(reason); 890 fmd_case_add_suspect(hdl, bad->fmcase, defect); 891 fmd_case_solve(hdl, bad->fmcase); 892 Undiag_reason = UD_VAL_UNKNOWN; 893 } 894 895 if (fmd_case_closed(hdl, bad->fmcase)) { 896 out(O_ALTFP, "already closed ]"); 897 } else { 898 out(O_ALTFP, "closing ]"); 899 fmd_case_close(hdl, bad->fmcase); 900 } 901 } 902 903 /*ARGSUSED*/ 904 static void 905 globals_destructor(void *left, void *right, void *arg) 906 { 907 struct evalue *evp = (struct evalue *)right; 908 if (evp->t == NODEPTR) 909 tree_free((struct node *)(uintptr_t)evp->v); 910 evp->v = (uintptr_t)NULL; 911 FREE(evp); 912 } 913 914 void 915 destroy_fme(struct fme *f) 916 { 917 stats_delete(f->Rcount); 918 stats_delete(f->Hcallcount); 919 stats_delete(f->Rcallcount); 920 stats_delete(f->Ccallcount); 921 stats_delete(f->Ecallcount); 922 stats_delete(f->Tcallcount); 923 stats_delete(f->Marrowcount); 924 stats_delete(f->diags); 925 926 if (f->eventtree != NULL) 927 itree_free(f->eventtree); 928 if (f->config) 929 structconfig_free(f->config); 930 lut_free(f->globals, globals_destructor, NULL); 931 FREE(f); 932 } 933 934 static const char * 935 fme_state2str(enum fme_state s) 936 { 937 switch (s) { 938 case FME_NOTHING: return ("NOTHING"); 939 case FME_WAIT: return ("WAIT"); 940 case FME_CREDIBLE: return ("CREDIBLE"); 941 case FME_DISPROVED: return ("DISPROVED"); 942 case FME_DEFERRED: return ("DEFERRED"); 943 default: return ("UNKNOWN"); 944 } 945 } 946 947 static int 948 is_problem(enum nametype t) 949 { 950 return (t == N_FAULT || t == N_DEFECT || t == N_UPSET); 951 } 952 953 static int 954 is_defect(enum nametype t) 955 { 956 return (t == N_DEFECT); 957 } 958 959 static int 960 is_upset(enum nametype t) 961 { 962 return (t == N_UPSET); 963 } 964 965 static void 966 fme_print(int flags, struct fme *fmep) 967 { 968 struct event *ep; 969 970 out(flags, "Fault Management Exercise %d", fmep->id); 971 out(flags, "\t State: %s", fme_state2str(fmep->state)); 972 out(flags|O_NONL, "\t Start time: "); 973 ptree_timeval(flags|O_NONL, &fmep->ull); 974 out(flags, NULL); 975 if (fmep->wull) { 976 out(flags|O_NONL, "\t Wait time: "); 977 ptree_timeval(flags|O_NONL, &fmep->wull); 978 out(flags, NULL); 979 } 980 out(flags|O_NONL, "\t E0: "); 981 if (fmep->e0) 982 itree_pevent_brief(flags|O_NONL, fmep->e0); 983 else 984 out(flags|O_NONL, "NULL"); 985 out(flags, NULL); 986 out(flags|O_NONL, "\tObservations:"); 987 for (ep = fmep->observations; ep; ep = ep->observations) { 988 out(flags|O_NONL, " "); 989 itree_pevent_brief(flags|O_NONL, ep); 990 } 991 out(flags, NULL); 992 out(flags|O_NONL, "\tSuspect list:"); 993 for (ep = fmep->suspects; ep; ep = ep->suspects) { 994 out(flags|O_NONL, " "); 995 itree_pevent_brief(flags|O_NONL, ep); 996 } 997 out(flags, NULL); 998 if (fmep->eventtree != NULL) { 999 out(flags|O_VERB2, "\t Tree:"); 1000 itree_ptree(flags|O_VERB2, fmep->eventtree); 1001 } 1002 } 1003 1004 static struct node * 1005 pathstring2epnamenp(char *path) 1006 { 1007 char *sep = "/"; 1008 struct node *ret; 1009 char *ptr; 1010 1011 if ((ptr = strtok(path, sep)) == NULL) 1012 out(O_DIE, "pathstring2epnamenp: invalid empty class"); 1013 1014 ret = tree_iname(stable(ptr), NULL, 0); 1015 1016 while ((ptr = strtok(NULL, sep)) != NULL) 1017 ret = tree_name_append(ret, 1018 tree_iname(stable(ptr), NULL, 0)); 1019 1020 return (ret); 1021 } 1022 1023 /* 1024 * for a given upset sp, increment the corresponding SERD engine. if the 1025 * SERD engine trips, return the ename and ipp of the resulting ereport. 1026 * returns true if engine tripped and *enamep and *ippp were filled in. 1027 */ 1028 static int 1029 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep, 1030 fmd_case_t *fmcase, struct event *sp, const char **enamep, 1031 const struct ipath **ippp) 1032 { 1033 struct node *serdinst; 1034 char *serdname; 1035 char *serdresource; 1036 char *serdclass; 1037 struct node *nid; 1038 struct serd_entry *newentp; 1039 int i, serdn = -1, serdincrement = 1, len = 0; 1040 char *serdsuffix = NULL, *serdt = NULL; 1041 struct evalue *ep; 1042 1043 ASSERT(sp->t == N_UPSET); 1044 ASSERT(ffep != NULL); 1045 1046 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1047 (void *)"n", (lut_cmp)strcmp)) != NULL) { 1048 ASSERT(ep->t == UINT64); 1049 serdn = (int)ep->v; 1050 } 1051 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1052 (void *)"t", (lut_cmp)strcmp)) != NULL) { 1053 ASSERT(ep->t == STRING); 1054 serdt = (char *)(uintptr_t)ep->v; 1055 } 1056 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1057 (void *)"suffix", (lut_cmp)strcmp)) != NULL) { 1058 ASSERT(ep->t == STRING); 1059 serdsuffix = (char *)(uintptr_t)ep->v; 1060 } 1061 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1062 (void *)"increment", (lut_cmp)strcmp)) != NULL) { 1063 ASSERT(ep->t == UINT64); 1064 serdincrement = (int)ep->v; 1065 } 1066 1067 /* 1068 * obtain instanced SERD engine from the upset sp. from this 1069 * derive serdname, the string used to identify the SERD engine. 1070 */ 1071 serdinst = eventprop_lookup(sp, L_engine); 1072 1073 if (serdinst == NULL) 1074 return (-1); 1075 1076 len = strlen(serdinst->u.stmt.np->u.event.ename->u.name.s) + 1; 1077 if (serdsuffix != NULL) 1078 len += strlen(serdsuffix); 1079 serdclass = MALLOC(len); 1080 if (serdsuffix != NULL) 1081 (void) snprintf(serdclass, len, "%s%s", 1082 serdinst->u.stmt.np->u.event.ename->u.name.s, serdsuffix); 1083 else 1084 (void) snprintf(serdclass, len, "%s", 1085 serdinst->u.stmt.np->u.event.ename->u.name.s); 1086 serdresource = ipath2str(NULL, 1087 ipath(serdinst->u.stmt.np->u.event.epname)); 1088 len += strlen(serdresource) + 1; 1089 serdname = MALLOC(len); 1090 (void) snprintf(serdname, len, "%s@%s", serdclass, serdresource); 1091 FREE(serdresource); 1092 1093 /* handle serd engine "id" property, if there is one */ 1094 if ((nid = 1095 lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) { 1096 struct evalue *gval; 1097 char suffixbuf[200]; 1098 char *suffix; 1099 char *nserdname; 1100 size_t nname; 1101 1102 out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname); 1103 ptree_name_iter(O_ALTFP|O_NONL, nid); 1104 1105 ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t)); 1106 1107 if ((gval = lut_lookup(fmep->globals, 1108 (void *)nid->u.globid.s, NULL)) == NULL) { 1109 out(O_ALTFP, " undefined"); 1110 } else if (gval->t == UINT64) { 1111 out(O_ALTFP, " %llu", gval->v); 1112 (void) sprintf(suffixbuf, "%llu", gval->v); 1113 suffix = suffixbuf; 1114 } else { 1115 out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v); 1116 suffix = (char *)(uintptr_t)gval->v; 1117 } 1118 1119 nname = strlen(serdname) + strlen(suffix) + 2; 1120 nserdname = MALLOC(nname); 1121 (void) snprintf(nserdname, nname, "%s:%s", serdname, suffix); 1122 FREE(serdname); 1123 serdname = nserdname; 1124 } 1125 1126 /* 1127 * if the engine is empty, and we have an override for n/t then 1128 * destroy and recreate it. 1129 */ 1130 if ((serdn != -1 || serdt != NULL) && fmd_serd_exists(hdl, serdname) && 1131 fmd_serd_empty(hdl, serdname)) 1132 fmd_serd_destroy(hdl, serdname); 1133 1134 if (!fmd_serd_exists(hdl, serdname)) { 1135 struct node *nN, *nT; 1136 const char *s; 1137 struct node *nodep; 1138 struct config *cp; 1139 char *path; 1140 uint_t nval; 1141 hrtime_t tval; 1142 int i; 1143 char *ptr; 1144 int got_n_override = 0, got_t_override = 0; 1145 1146 /* no SERD engine yet, so create it */ 1147 nodep = serdinst->u.stmt.np->u.event.epname; 1148 path = ipath2str(NULL, ipath(nodep)); 1149 cp = config_lookup(fmep->config, path, 0); 1150 FREE((void *)path); 1151 1152 /* 1153 * We allow serd paramaters to be overridden, either from 1154 * eft.conf file values (if Serd_Override is set) or from 1155 * driver properties (for "serd.io.device" engines). 1156 */ 1157 if (Serd_Override != NULL) { 1158 char *save_ptr, *ptr1, *ptr2, *ptr3; 1159 ptr3 = save_ptr = STRDUP(Serd_Override); 1160 while (*ptr3 != '\0') { 1161 ptr1 = strchr(ptr3, ','); 1162 *ptr1 = '\0'; 1163 if (strcmp(ptr3, serdclass) == 0) { 1164 ptr2 = strchr(ptr1 + 1, ','); 1165 *ptr2 = '\0'; 1166 nval = atoi(ptr1 + 1); 1167 out(O_ALTFP, "serd override %s_n %d", 1168 serdclass, nval); 1169 ptr3 = strchr(ptr2 + 1, ' '); 1170 if (ptr3) 1171 *ptr3 = '\0'; 1172 ptr = STRDUP(ptr2 + 1); 1173 out(O_ALTFP, "serd override %s_t %s", 1174 serdclass, ptr); 1175 got_n_override = 1; 1176 got_t_override = 1; 1177 break; 1178 } else { 1179 ptr2 = strchr(ptr1 + 1, ','); 1180 ptr3 = strchr(ptr2 + 1, ' '); 1181 if (ptr3 == NULL) 1182 break; 1183 } 1184 ptr3++; 1185 } 1186 FREE(save_ptr); 1187 } 1188 1189 if (cp && got_n_override == 0) { 1190 /* 1191 * convert serd engine class into property name 1192 */ 1193 char *prop_name = MALLOC(strlen(serdclass) + 3); 1194 for (i = 0; i < strlen(serdclass); i++) { 1195 if (serdclass[i] == '.') 1196 prop_name[i] = '_'; 1197 else 1198 prop_name[i] = serdclass[i]; 1199 } 1200 prop_name[i++] = '_'; 1201 prop_name[i++] = 'n'; 1202 prop_name[i] = '\0'; 1203 if (s = config_getprop(cp, prop_name)) { 1204 nval = atoi(s); 1205 out(O_ALTFP, "serd override %s_n %s", 1206 serdclass, s); 1207 got_n_override = 1; 1208 } 1209 prop_name[i - 1] = 't'; 1210 if (s = config_getprop(cp, prop_name)) { 1211 ptr = STRDUP(s); 1212 out(O_ALTFP, "serd override %s_t %s", 1213 serdclass, s); 1214 got_t_override = 1; 1215 } 1216 FREE(prop_name); 1217 } 1218 1219 if (serdn != -1 && got_n_override == 0) { 1220 nval = serdn; 1221 out(O_ALTFP, "serd override %s_n %d", serdclass, serdn); 1222 got_n_override = 1; 1223 } 1224 if (serdt != NULL && got_t_override == 0) { 1225 ptr = STRDUP(serdt); 1226 out(O_ALTFP, "serd override %s_t %s", serdclass, serdt); 1227 got_t_override = 1; 1228 } 1229 1230 if (!got_n_override) { 1231 nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N, 1232 NULL); 1233 ASSERT(nN->t == T_NUM); 1234 nval = (uint_t)nN->u.ull; 1235 } 1236 if (!got_t_override) { 1237 nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T, 1238 NULL); 1239 ASSERT(nT->t == T_TIMEVAL); 1240 tval = (hrtime_t)nT->u.ull; 1241 } else { 1242 const unsigned long long *ullp; 1243 const char *suffix; 1244 int len; 1245 1246 len = strspn(ptr, "0123456789"); 1247 suffix = stable(&ptr[len]); 1248 ullp = (unsigned long long *)lut_lookup(Timesuffixlut, 1249 (void *)suffix, NULL); 1250 ptr[len] = '\0'; 1251 tval = strtoull(ptr, NULL, 0) * (ullp ? *ullp : 1ll); 1252 FREE(ptr); 1253 } 1254 fmd_serd_create(hdl, serdname, nval, tval); 1255 } 1256 1257 newentp = MALLOC(sizeof (*newentp)); 1258 newentp->ename = stable(serdclass); 1259 FREE(serdclass); 1260 newentp->ipath = ipath(serdinst->u.stmt.np->u.event.epname); 1261 newentp->hdl = hdl; 1262 if (lut_lookup(SerdEngines, newentp, (lut_cmp)serd_cmp) == NULL) { 1263 SerdEngines = lut_add(SerdEngines, (void *)newentp, 1264 (void *)newentp, (lut_cmp)serd_cmp); 1265 Serd_need_save = 1; 1266 serd_save(); 1267 } else { 1268 FREE(newentp); 1269 } 1270 1271 1272 /* 1273 * increment SERD engine. if engine fires, reset serd 1274 * engine and return trip_strcode if required. 1275 */ 1276 for (i = 0; i < serdincrement; i++) { 1277 if (fmd_serd_record(hdl, serdname, ffep)) { 1278 fmd_case_add_serd(hdl, fmcase, serdname); 1279 fmd_serd_reset(hdl, serdname); 1280 1281 if (ippp) { 1282 struct node *tripinst = 1283 lut_lookup(serdinst->u.stmt.lutp, 1284 (void *)L_trip, NULL); 1285 ASSERT(tripinst != NULL); 1286 *enamep = tripinst->u.event.ename->u.name.s; 1287 *ippp = ipath(tripinst->u.event.epname); 1288 out(O_ALTFP|O_NONL, 1289 "[engine fired: %s, sending: ", serdname); 1290 ipath_print(O_ALTFP|O_NONL, *enamep, *ippp); 1291 out(O_ALTFP, "]"); 1292 } else { 1293 out(O_ALTFP, "[engine fired: %s, no trip]", 1294 serdname); 1295 } 1296 FREE(serdname); 1297 return (1); 1298 } 1299 } 1300 1301 FREE(serdname); 1302 return (0); 1303 } 1304 1305 /* 1306 * search a suspect list for upsets. feed each upset to serd_eval() and 1307 * build up tripped[], an array of ereports produced by the firing of 1308 * any SERD engines. then feed each ereport back into 1309 * fme_receive_report(). 1310 * 1311 * returns ntrip, the number of these ereports produced. 1312 */ 1313 static int 1314 upsets_eval(struct fme *fmep, fmd_event_t *ffep) 1315 { 1316 /* we build an array of tripped ereports that we send ourselves */ 1317 struct { 1318 const char *ename; 1319 const struct ipath *ipp; 1320 } *tripped; 1321 struct event *sp; 1322 int ntrip, nupset, i; 1323 1324 /* 1325 * count the number of upsets to determine the upper limit on 1326 * expected trip ereport strings. remember that one upset can 1327 * lead to at most one ereport. 1328 */ 1329 nupset = 0; 1330 for (sp = fmep->suspects; sp; sp = sp->suspects) { 1331 if (sp->t == N_UPSET) 1332 nupset++; 1333 } 1334 1335 if (nupset == 0) 1336 return (0); 1337 1338 /* 1339 * get to this point if we have upsets and expect some trip 1340 * ereports 1341 */ 1342 tripped = alloca(sizeof (*tripped) * nupset); 1343 bzero((void *)tripped, sizeof (*tripped) * nupset); 1344 1345 ntrip = 0; 1346 for (sp = fmep->suspects; sp; sp = sp->suspects) 1347 if (sp->t == N_UPSET && 1348 serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp, 1349 &tripped[ntrip].ename, &tripped[ntrip].ipp) == 1) 1350 ntrip++; 1351 1352 for (i = 0; i < ntrip; i++) { 1353 struct event *ep, *nep; 1354 struct fme *nfmep; 1355 fmd_case_t *fmcase; 1356 const struct ipath *ipp; 1357 const char *eventstring; 1358 int prev_verbose; 1359 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 1360 enum fme_state state; 1361 1362 /* 1363 * First try and evaluate a case with the trip ereport plus 1364 * all the other ereports that cause the trip. If that fails 1365 * to evaluate then try again with just this ereport on its own. 1366 */ 1367 out(O_ALTFP|O_NONL, "fme_receive_report_serd: "); 1368 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp); 1369 out(O_ALTFP|O_STAMP, NULL); 1370 ep = fmep->e0; 1371 eventstring = ep->enode->u.event.ename->u.name.s; 1372 ipp = ep->ipp; 1373 1374 /* 1375 * create a duplicate fme and case 1376 */ 1377 fmcase = fmd_case_open(fmep->hdl, NULL); 1378 out(O_ALTFP|O_NONL, "duplicate fme for event ["); 1379 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1380 out(O_ALTFP, " ]"); 1381 1382 if ((nfmep = newfme(eventstring, ipp, fmep->hdl, 1383 fmcase, ffep, ep->nvp)) == NULL) { 1384 out(O_ALTFP|O_NONL, "["); 1385 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1386 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1387 continue; 1388 } 1389 1390 Open_fme_count++; 1391 nfmep->pull = fmep->pull; 1392 init_fme_bufs(nfmep); 1393 out(O_ALTFP|O_NONL, "["); 1394 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1395 out(O_ALTFP, " created FME%d, case %s]", nfmep->id, 1396 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 1397 if (ffep) { 1398 fmd_case_setprincipal(nfmep->hdl, nfmep->fmcase, ffep); 1399 fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, ffep); 1400 nfmep->e0r = ffep; 1401 } 1402 1403 /* 1404 * add the original ereports 1405 */ 1406 for (ep = fmep->observations; ep; ep = ep->observations) { 1407 eventstring = ep->enode->u.event.ename->u.name.s; 1408 ipp = ep->ipp; 1409 out(O_ALTFP|O_NONL, "adding event ["); 1410 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1411 out(O_ALTFP, " ]"); 1412 nep = itree_lookup(nfmep->eventtree, eventstring, ipp); 1413 if (nep->count++ == 0) { 1414 nep->observations = nfmep->observations; 1415 nfmep->observations = nep; 1416 serialize_observation(nfmep, eventstring, ipp); 1417 nep->nvp = evnv_dupnvl(ep->nvp); 1418 } 1419 if (ep->ffep && ep->ffep != ffep) 1420 fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, 1421 ep->ffep); 1422 stats_counter_bump(nfmep->Rcount); 1423 } 1424 1425 /* 1426 * add the serd trigger ereport 1427 */ 1428 if ((ep = itree_lookup(nfmep->eventtree, tripped[i].ename, 1429 tripped[i].ipp)) == NULL) { 1430 /* 1431 * The trigger ereport is not in the instance tree. It 1432 * was presumably removed by prune_propagations() as 1433 * this combination of events is not present in the 1434 * rules. 1435 */ 1436 out(O_ALTFP, "upsets_eval: e0 not in instance tree"); 1437 Undiag_reason = UD_VAL_BADEVENTI; 1438 goto retry_lone_ereport; 1439 } 1440 out(O_ALTFP|O_NONL, "adding event ["); 1441 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp); 1442 out(O_ALTFP, " ]"); 1443 nfmep->ecurrent = ep; 1444 ep->nvp = NULL; 1445 ep->count = 1; 1446 ep->observations = nfmep->observations; 1447 nfmep->observations = ep; 1448 1449 /* 1450 * just peek first. 1451 */ 1452 nfmep->peek = 1; 1453 prev_verbose = Verbose; 1454 if (Debug == 0) 1455 Verbose = 0; 1456 lut_walk(nfmep->eventtree, (lut_cb)clear_arrows, (void *)nfmep); 1457 state = hypothesise(nfmep, nfmep->e0, nfmep->ull, &my_delay); 1458 nfmep->peek = 0; 1459 Verbose = prev_verbose; 1460 if (state == FME_DISPROVED) { 1461 out(O_ALTFP, "upsets_eval: hypothesis disproved"); 1462 Undiag_reason = UD_VAL_UNSOLVD; 1463 retry_lone_ereport: 1464 /* 1465 * However the trigger ereport on its own might be 1466 * diagnosable, so check for that. Undo the new fme 1467 * and case we just created and call fme_receive_report. 1468 */ 1469 out(O_ALTFP|O_NONL, "["); 1470 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, 1471 tripped[i].ipp); 1472 out(O_ALTFP, " retrying with just trigger ereport]"); 1473 itree_free(nfmep->eventtree); 1474 nfmep->eventtree = NULL; 1475 structconfig_free(nfmep->config); 1476 nfmep->config = NULL; 1477 destroy_fme_bufs(nfmep); 1478 fmd_case_close(nfmep->hdl, nfmep->fmcase); 1479 fme_receive_report(fmep->hdl, ffep, 1480 tripped[i].ename, tripped[i].ipp, NULL); 1481 continue; 1482 } 1483 1484 /* 1485 * and evaluate 1486 */ 1487 serialize_observation(nfmep, tripped[i].ename, tripped[i].ipp); 1488 fme_eval(nfmep, ffep); 1489 } 1490 1491 return (ntrip); 1492 } 1493 1494 /* 1495 * fme_receive_external_report -- call when an external ereport comes in 1496 * 1497 * this routine just converts the relevant information from the ereport 1498 * into a format used internally and passes it on to fme_receive_report(). 1499 */ 1500 void 1501 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 1502 const char *class) 1503 { 1504 struct node *epnamenp; 1505 fmd_case_t *fmcase; 1506 const struct ipath *ipp; 1507 nvlist_t *detector = NULL; 1508 1509 class = stable(class); 1510 1511 /* Get the component path from the ereport */ 1512 epnamenp = platform_getpath(nvl); 1513 1514 /* See if we ended up without a path. */ 1515 if (epnamenp == NULL) { 1516 /* See if class permits silent discard on unknown component. */ 1517 if (lut_lookup(Ereportenames_discard, (void *)class, NULL)) { 1518 out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport " 1519 "to component path, but silent discard allowed.", 1520 class); 1521 } else { 1522 /* 1523 * XFILE: Failure to find a component is bad unless 1524 * 'discard_if_config_unknown=1' was specified in the 1525 * ereport definition. Indicate undiagnosable. 1526 */ 1527 Undiag_reason = UD_VAL_NOPATH; 1528 fmcase = fmd_case_open(hdl, NULL); 1529 1530 /* 1531 * We don't have a component path here (which means that 1532 * the detector was not in hc-scheme and couldn't be 1533 * converted to hc-scheme. Report the raw detector as 1534 * the suspect resource if there is one. 1535 */ 1536 (void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, 1537 &detector); 1538 publish_undiagnosable(hdl, ffep, fmcase, detector, 1539 (char *)class); 1540 } 1541 return; 1542 } 1543 1544 ipp = ipath(epnamenp); 1545 tree_free(epnamenp); 1546 fme_receive_report(hdl, ffep, class, ipp, nvl); 1547 } 1548 1549 /*ARGSUSED*/ 1550 void 1551 fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 1552 const char *eventstring) 1553 { 1554 char *uuid; 1555 nvlist_t **nva; 1556 uint_t nvc; 1557 const struct ipath *ipp; 1558 1559 if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 || 1560 nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 1561 &nva, &nvc) != 0) { 1562 out(O_ALTFP, "No uuid or fault list for list.repaired event"); 1563 return; 1564 } 1565 1566 out(O_ALTFP, "Processing list.repaired from case %s", uuid); 1567 1568 while (nvc-- != 0) { 1569 /* 1570 * Reset any istat or serd engine associated with this path. 1571 */ 1572 char *path; 1573 1574 if ((ipp = platform_fault2ipath(*nva++)) == NULL) 1575 continue; 1576 1577 path = ipath2str(NULL, ipp); 1578 out(O_ALTFP, "fme_receive_repair_list: resetting state for %s", 1579 path); 1580 FREE(path); 1581 1582 lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp); 1583 istat_save(); 1584 1585 lut_walk(SerdEngines, (lut_cb)serd_reset_cb, (void *)ipp); 1586 serd_save(); 1587 } 1588 } 1589 1590 /*ARGSUSED*/ 1591 void 1592 fme_receive_topology_change(void) 1593 { 1594 lut_walk(Istats, (lut_cb)istat_counter_topo_chg_cb, NULL); 1595 istat_save(); 1596 1597 lut_walk(SerdEngines, (lut_cb)serd_topo_chg_cb, NULL); 1598 serd_save(); 1599 } 1600 1601 static int mark_arrows(struct fme *fmep, struct event *ep, int mark, 1602 unsigned long long at_latest_by, unsigned long long *pdelay, int keep); 1603 1604 /* ARGSUSED */ 1605 static void 1606 clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 1607 { 1608 struct bubble *bp; 1609 struct arrowlist *ap; 1610 1611 ep->cached_state = 0; 1612 ep->keep_in_tree = 0; 1613 for (bp = itree_next_bubble(ep, NULL); bp; 1614 bp = itree_next_bubble(ep, bp)) { 1615 if (bp->t != B_FROM) 1616 continue; 1617 bp->mark = 0; 1618 for (ap = itree_next_arrow(bp, NULL); ap; 1619 ap = itree_next_arrow(bp, ap)) 1620 ap->arrowp->mark = 0; 1621 } 1622 } 1623 1624 static void 1625 fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 1626 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl) 1627 { 1628 struct event *ep; 1629 struct fme *fmep = NULL; 1630 struct fme *ofmep = NULL; 1631 struct fme *cfmep, *svfmep; 1632 int matched = 0; 1633 nvlist_t *defect; 1634 fmd_case_t *fmcase; 1635 char *reason; 1636 1637 out(O_ALTFP|O_NONL, "fme_receive_report: "); 1638 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1639 out(O_ALTFP|O_STAMP, NULL); 1640 1641 /* decide which FME it goes to */ 1642 for (fmep = FMElist; fmep; fmep = fmep->next) { 1643 int prev_verbose; 1644 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 1645 enum fme_state state; 1646 nvlist_t *pre_peek_nvp = NULL; 1647 1648 if (fmep->overflow) { 1649 if (!(fmd_case_closed(fmep->hdl, fmep->fmcase))) 1650 ofmep = fmep; 1651 1652 continue; 1653 } 1654 1655 /* 1656 * ignore solved or closed cases 1657 */ 1658 if (fmep->posted_suspects || 1659 fmd_case_solved(fmep->hdl, fmep->fmcase) || 1660 fmd_case_closed(fmep->hdl, fmep->fmcase)) 1661 continue; 1662 1663 /* look up event in event tree for this FME */ 1664 if ((ep = itree_lookup(fmep->eventtree, 1665 eventstring, ipp)) == NULL) 1666 continue; 1667 1668 /* note observation */ 1669 fmep->ecurrent = ep; 1670 if (ep->count++ == 0) { 1671 /* link it into list of observations seen */ 1672 ep->observations = fmep->observations; 1673 fmep->observations = ep; 1674 ep->nvp = evnv_dupnvl(nvl); 1675 } else { 1676 /* use new payload values for peek */ 1677 pre_peek_nvp = ep->nvp; 1678 ep->nvp = evnv_dupnvl(nvl); 1679 } 1680 1681 /* tell hypothesise() not to mess with suspect list */ 1682 fmep->peek = 1; 1683 1684 /* don't want this to be verbose (unless Debug is set) */ 1685 prev_verbose = Verbose; 1686 if (Debug == 0) 1687 Verbose = 0; 1688 1689 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 1690 state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 1691 1692 fmep->peek = 0; 1693 1694 /* put verbose flag back */ 1695 Verbose = prev_verbose; 1696 1697 if (state != FME_DISPROVED) { 1698 /* found an FME that explains the ereport */ 1699 matched++; 1700 out(O_ALTFP|O_NONL, "["); 1701 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1702 out(O_ALTFP, " explained by FME%d]", fmep->id); 1703 1704 if (pre_peek_nvp) 1705 nvlist_free(pre_peek_nvp); 1706 1707 if (ep->count == 1) 1708 serialize_observation(fmep, eventstring, ipp); 1709 1710 if (ffep) { 1711 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1712 ep->ffep = ffep; 1713 } 1714 1715 stats_counter_bump(fmep->Rcount); 1716 1717 /* re-eval FME */ 1718 fme_eval(fmep, ffep); 1719 } else { 1720 1721 /* not a match, undo noting of observation */ 1722 fmep->ecurrent = NULL; 1723 if (--ep->count == 0) { 1724 /* unlink it from observations */ 1725 fmep->observations = ep->observations; 1726 ep->observations = NULL; 1727 nvlist_free(ep->nvp); 1728 ep->nvp = NULL; 1729 } else { 1730 nvlist_free(ep->nvp); 1731 ep->nvp = pre_peek_nvp; 1732 } 1733 } 1734 } 1735 1736 if (matched) 1737 return; /* explained by at least one existing FME */ 1738 1739 /* clean up closed fmes */ 1740 cfmep = ClosedFMEs; 1741 while (cfmep != NULL) { 1742 svfmep = cfmep->next; 1743 destroy_fme(cfmep); 1744 cfmep = svfmep; 1745 } 1746 ClosedFMEs = NULL; 1747 1748 if (ofmep) { 1749 out(O_ALTFP|O_NONL, "["); 1750 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1751 out(O_ALTFP, " ADDING TO OVERFLOW FME]"); 1752 if (ffep) 1753 fmd_case_add_ereport(hdl, ofmep->fmcase, ffep); 1754 1755 return; 1756 1757 } else if (Max_fme && (Open_fme_count >= Max_fme)) { 1758 out(O_ALTFP|O_NONL, "["); 1759 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1760 out(O_ALTFP, " MAX OPEN FME REACHED]"); 1761 1762 fmcase = fmd_case_open(hdl, NULL); 1763 1764 /* Create overflow fme */ 1765 if ((fmep = newfme(eventstring, ipp, hdl, fmcase, ffep, 1766 nvl)) == NULL) { 1767 out(O_ALTFP|O_NONL, "["); 1768 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1769 out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]"); 1770 return; 1771 } 1772 1773 Open_fme_count++; 1774 1775 init_fme_bufs(fmep); 1776 fmep->overflow = B_TRUE; 1777 1778 if (ffep) 1779 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1780 1781 Undiag_reason = UD_VAL_MAXFME; 1782 defect = fmd_nvl_create_fault(hdl, 1783 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 1784 reason = undiag_2reason_str(Undiag_reason, NULL); 1785 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 1786 FREE(reason); 1787 fmd_case_add_suspect(hdl, fmep->fmcase, defect); 1788 fmd_case_solve(hdl, fmep->fmcase); 1789 Undiag_reason = UD_VAL_UNKNOWN; 1790 return; 1791 } 1792 1793 /* open a case */ 1794 fmcase = fmd_case_open(hdl, NULL); 1795 1796 /* start a new FME */ 1797 if ((fmep = newfme(eventstring, ipp, hdl, fmcase, ffep, nvl)) == NULL) { 1798 out(O_ALTFP|O_NONL, "["); 1799 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1800 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1801 return; 1802 } 1803 1804 Open_fme_count++; 1805 1806 init_fme_bufs(fmep); 1807 1808 out(O_ALTFP|O_NONL, "["); 1809 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1810 out(O_ALTFP, " created FME%d, case %s]", fmep->id, 1811 fmd_case_uuid(hdl, fmep->fmcase)); 1812 1813 ep = fmep->e0; 1814 ASSERT(ep != NULL); 1815 1816 /* note observation */ 1817 fmep->ecurrent = ep; 1818 if (ep->count++ == 0) { 1819 /* link it into list of observations seen */ 1820 ep->observations = fmep->observations; 1821 fmep->observations = ep; 1822 ep->nvp = evnv_dupnvl(nvl); 1823 serialize_observation(fmep, eventstring, ipp); 1824 } else { 1825 /* new payload overrides any previous */ 1826 nvlist_free(ep->nvp); 1827 ep->nvp = evnv_dupnvl(nvl); 1828 } 1829 1830 stats_counter_bump(fmep->Rcount); 1831 1832 if (ffep) { 1833 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1834 fmd_case_setprincipal(hdl, fmep->fmcase, ffep); 1835 fmep->e0r = ffep; 1836 ep->ffep = ffep; 1837 } 1838 1839 /* give the diagnosis algorithm a shot at the new FME state */ 1840 fme_eval(fmep, ffep); 1841 } 1842 1843 void 1844 fme_status(int flags) 1845 { 1846 struct fme *fmep; 1847 1848 if (FMElist == NULL) { 1849 out(flags, "No fault management exercises underway."); 1850 return; 1851 } 1852 1853 for (fmep = FMElist; fmep; fmep = fmep->next) 1854 fme_print(flags, fmep); 1855 } 1856 1857 /* 1858 * "indent" routines used mostly for nicely formatted debug output, but also 1859 * for sanity checking for infinite recursion bugs. 1860 */ 1861 1862 #define MAX_INDENT 1024 1863 static const char *indent_s[MAX_INDENT]; 1864 static int current_indent; 1865 1866 static void 1867 indent_push(const char *s) 1868 { 1869 if (current_indent < MAX_INDENT) 1870 indent_s[current_indent++] = s; 1871 else 1872 out(O_DIE, "unexpected recursion depth (%d)", current_indent); 1873 } 1874 1875 static void 1876 indent_set(const char *s) 1877 { 1878 current_indent = 0; 1879 indent_push(s); 1880 } 1881 1882 static void 1883 indent_pop(void) 1884 { 1885 if (current_indent > 0) 1886 current_indent--; 1887 else 1888 out(O_DIE, "recursion underflow"); 1889 } 1890 1891 static void 1892 indent(void) 1893 { 1894 int i; 1895 if (!Verbose) 1896 return; 1897 for (i = 0; i < current_indent; i++) 1898 out(O_ALTFP|O_VERB|O_NONL, indent_s[i]); 1899 } 1900 1901 #define SLNEW 1 1902 #define SLCHANGED 2 1903 #define SLWAIT 3 1904 #define SLDISPROVED 4 1905 1906 static void 1907 print_suspects(int circumstance, struct fme *fmep) 1908 { 1909 struct event *ep; 1910 1911 out(O_ALTFP|O_NONL, "["); 1912 if (circumstance == SLCHANGED) { 1913 out(O_ALTFP|O_NONL, "FME%d diagnosis changed. state: %s, " 1914 "suspect list:", fmep->id, fme_state2str(fmep->state)); 1915 } else if (circumstance == SLWAIT) { 1916 out(O_ALTFP|O_NONL, "FME%d set wait timer %ld ", fmep->id, 1917 fmep->timer); 1918 ptree_timeval(O_ALTFP|O_NONL, &fmep->wull); 1919 } else if (circumstance == SLDISPROVED) { 1920 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id); 1921 } else { 1922 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id); 1923 } 1924 1925 if (circumstance == SLWAIT || circumstance == SLDISPROVED) { 1926 out(O_ALTFP, "]"); 1927 return; 1928 } 1929 1930 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1931 out(O_ALTFP|O_NONL, " "); 1932 itree_pevent_brief(O_ALTFP|O_NONL, ep); 1933 } 1934 out(O_ALTFP, "]"); 1935 } 1936 1937 static struct node * 1938 eventprop_lookup(struct event *ep, const char *propname) 1939 { 1940 return (lut_lookup(ep->props, (void *)propname, NULL)); 1941 } 1942 1943 #define MAXDIGITIDX 23 1944 static char numbuf[MAXDIGITIDX + 1]; 1945 1946 static int 1947 node2uint(struct node *n, uint_t *valp) 1948 { 1949 struct evalue value; 1950 struct lut *globals = NULL; 1951 1952 if (n == NULL) 1953 return (1); 1954 1955 /* 1956 * check value.v since we are being asked to convert an unsigned 1957 * long long int to an unsigned int 1958 */ 1959 if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) || 1960 value.t != UINT64 || value.v > (1ULL << 32)) 1961 return (1); 1962 1963 *valp = (uint_t)value.v; 1964 1965 return (0); 1966 } 1967 1968 static nvlist_t * 1969 node2fmri(struct node *n) 1970 { 1971 nvlist_t **pa, *f, *p; 1972 struct node *nc; 1973 uint_t depth = 0; 1974 char *numstr, *nullbyte; 1975 char *failure; 1976 int err, i; 1977 1978 /* XXX do we need to be able to handle a non-T_NAME node? */ 1979 if (n == NULL || n->t != T_NAME) 1980 return (NULL); 1981 1982 for (nc = n; nc != NULL; nc = nc->u.name.next) { 1983 if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM) 1984 break; 1985 depth++; 1986 } 1987 1988 if (nc != NULL) { 1989 /* We bailed early, something went wrong */ 1990 return (NULL); 1991 } 1992 1993 if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0) 1994 out(O_DIE|O_SYS, "alloc of fmri nvl failed"); 1995 pa = alloca(depth * sizeof (nvlist_t *)); 1996 for (i = 0; i < depth; i++) 1997 pa[i] = NULL; 1998 1999 err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC); 2000 err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION); 2001 err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, ""); 2002 err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth); 2003 if (err != 0) { 2004 failure = "basic construction of FMRI failed"; 2005 goto boom; 2006 } 2007 2008 numbuf[MAXDIGITIDX] = '\0'; 2009 nullbyte = &numbuf[MAXDIGITIDX]; 2010 i = 0; 2011 2012 for (nc = n; nc != NULL; nc = nc->u.name.next) { 2013 err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl); 2014 if (err != 0) { 2015 failure = "alloc of an hc-pair failed"; 2016 goto boom; 2017 } 2018 err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s); 2019 numstr = ulltostr(nc->u.name.child->u.ull, nullbyte); 2020 err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr); 2021 if (err != 0) { 2022 failure = "construction of an hc-pair failed"; 2023 goto boom; 2024 } 2025 pa[i++] = p; 2026 } 2027 2028 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 2029 if (err == 0) { 2030 for (i = 0; i < depth; i++) 2031 if (pa[i] != NULL) 2032 nvlist_free(pa[i]); 2033 return (f); 2034 } 2035 failure = "addition of hc-pair array to FMRI failed"; 2036 2037 boom: 2038 for (i = 0; i < depth; i++) 2039 if (pa[i] != NULL) 2040 nvlist_free(pa[i]); 2041 nvlist_free(f); 2042 out(O_DIE, "%s", failure); 2043 /*NOTREACHED*/ 2044 return (NULL); 2045 } 2046 2047 /* an ipath cache entry is an array of these, with s==NULL at the end */ 2048 struct ipath { 2049 const char *s; /* component name (in stable) */ 2050 int i; /* instance number */ 2051 }; 2052 2053 static nvlist_t * 2054 ipath2fmri(struct ipath *ipath) 2055 { 2056 nvlist_t **pa, *f, *p; 2057 uint_t depth = 0; 2058 char *numstr, *nullbyte; 2059 char *failure; 2060 int err, i; 2061 struct ipath *ipp; 2062 2063 for (ipp = ipath; ipp->s != NULL; ipp++) 2064 depth++; 2065 2066 if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0) 2067 out(O_DIE|O_SYS, "alloc of fmri nvl failed"); 2068 pa = alloca(depth * sizeof (nvlist_t *)); 2069 for (i = 0; i < depth; i++) 2070 pa[i] = NULL; 2071 2072 err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC); 2073 err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION); 2074 err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, ""); 2075 err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth); 2076 if (err != 0) { 2077 failure = "basic construction of FMRI failed"; 2078 goto boom; 2079 } 2080 2081 numbuf[MAXDIGITIDX] = '\0'; 2082 nullbyte = &numbuf[MAXDIGITIDX]; 2083 i = 0; 2084 2085 for (ipp = ipath; ipp->s != NULL; ipp++) { 2086 err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl); 2087 if (err != 0) { 2088 failure = "alloc of an hc-pair failed"; 2089 goto boom; 2090 } 2091 err = nvlist_add_string(p, FM_FMRI_HC_NAME, ipp->s); 2092 numstr = ulltostr(ipp->i, nullbyte); 2093 err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr); 2094 if (err != 0) { 2095 failure = "construction of an hc-pair failed"; 2096 goto boom; 2097 } 2098 pa[i++] = p; 2099 } 2100 2101 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 2102 if (err == 0) { 2103 for (i = 0; i < depth; i++) 2104 if (pa[i] != NULL) 2105 nvlist_free(pa[i]); 2106 return (f); 2107 } 2108 failure = "addition of hc-pair array to FMRI failed"; 2109 2110 boom: 2111 for (i = 0; i < depth; i++) 2112 if (pa[i] != NULL) 2113 nvlist_free(pa[i]); 2114 nvlist_free(f); 2115 out(O_DIE, "%s", failure); 2116 /*NOTREACHED*/ 2117 return (NULL); 2118 } 2119 2120 static uint8_t 2121 percentof(uint_t part, uint_t whole) 2122 { 2123 unsigned long long p = part * 1000; 2124 2125 return ((p / whole / 10) + (((p / whole % 10) >= 5) ? 1 : 0)); 2126 } 2127 2128 struct rsl { 2129 struct event *suspect; 2130 nvlist_t *asru; 2131 nvlist_t *fru; 2132 nvlist_t *rsrc; 2133 }; 2134 2135 static void publish_suspects(struct fme *fmep, struct rsl *srl); 2136 2137 /* 2138 * rslfree -- free internal members of struct rsl not expected to be 2139 * freed elsewhere. 2140 */ 2141 static void 2142 rslfree(struct rsl *freeme) 2143 { 2144 if (freeme->asru != NULL) 2145 nvlist_free(freeme->asru); 2146 if (freeme->fru != NULL) 2147 nvlist_free(freeme->fru); 2148 if (freeme->rsrc != NULL && freeme->rsrc != freeme->asru) 2149 nvlist_free(freeme->rsrc); 2150 } 2151 2152 /* 2153 * rslcmp -- compare two rsl structures. Use the following 2154 * comparisons to establish cardinality: 2155 * 2156 * 1. Name of the suspect's class. (simple strcmp) 2157 * 2. Name of the suspect's ASRU. (trickier, since nvlist) 2158 * 2159 */ 2160 static int 2161 rslcmp(const void *a, const void *b) 2162 { 2163 struct rsl *r1 = (struct rsl *)a; 2164 struct rsl *r2 = (struct rsl *)b; 2165 int rv; 2166 2167 rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s, 2168 r2->suspect->enode->u.event.ename->u.name.s); 2169 if (rv != 0) 2170 return (rv); 2171 2172 if (r1->rsrc == NULL && r2->rsrc == NULL) 2173 return (0); 2174 if (r1->rsrc == NULL) 2175 return (-1); 2176 if (r2->rsrc == NULL) 2177 return (1); 2178 return (evnv_cmpnvl(r1->rsrc, r2->rsrc, 0)); 2179 } 2180 2181 /* 2182 * get_resources -- for a given suspect, determine what ASRU, FRU and 2183 * RSRC nvlists should be advertised in the final suspect list. 2184 */ 2185 void 2186 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot) 2187 { 2188 struct node *asrudef, *frudef; 2189 nvlist_t *asru, *fru; 2190 nvlist_t *rsrc = NULL; 2191 char *pathstr; 2192 2193 /* 2194 * First find any ASRU and/or FRU defined in the 2195 * initial fault tree. 2196 */ 2197 asrudef = eventprop_lookup(sp, L_ASRU); 2198 frudef = eventprop_lookup(sp, L_FRU); 2199 2200 /* 2201 * Create FMRIs based on those definitions 2202 */ 2203 asru = node2fmri(asrudef); 2204 fru = node2fmri(frudef); 2205 pathstr = ipath2str(NULL, sp->ipp); 2206 2207 /* 2208 * Allow for platform translations of the FMRIs 2209 */ 2210 platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc, 2211 pathstr); 2212 2213 FREE(pathstr); 2214 rsrcs->suspect = sp; 2215 rsrcs->asru = asru; 2216 rsrcs->fru = fru; 2217 rsrcs->rsrc = rsrc; 2218 } 2219 2220 /* 2221 * trim_suspects -- prior to publishing, we may need to remove some 2222 * suspects from the list. If we're auto-closing upsets, we don't 2223 * want any of those in the published list. If the ASRUs for multiple 2224 * defects resolve to the same ASRU (driver) we only want to publish 2225 * that as a single suspect. 2226 */ 2227 static int 2228 trim_suspects(struct fme *fmep, struct rsl *begin, struct rsl *begin2, 2229 fmd_event_t *ffep) 2230 { 2231 struct event *ep; 2232 struct rsl *rp = begin; 2233 struct rsl *rp2 = begin2; 2234 int mess_zero_count = 0; 2235 int serd_rval; 2236 uint_t messval; 2237 2238 /* remove any unwanted upsets and populate our array */ 2239 for (ep = fmep->psuspects; ep; ep = ep->psuspects) { 2240 if (is_upset(ep->t)) 2241 continue; 2242 serd_rval = serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, ep, 2243 NULL, NULL); 2244 if (serd_rval == 0) 2245 continue; 2246 if (node2uint(eventprop_lookup(ep, L_message), 2247 &messval) == 0 && messval == 0) { 2248 get_resources(ep, rp2, fmep->config); 2249 rp2++; 2250 mess_zero_count++; 2251 } else { 2252 get_resources(ep, rp, fmep->config); 2253 rp++; 2254 fmep->nsuspects++; 2255 } 2256 } 2257 return (mess_zero_count); 2258 } 2259 2260 /* 2261 * addpayloadprop -- add a payload prop to a problem 2262 */ 2263 static void 2264 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault) 2265 { 2266 nvlist_t *rsrc, *hcs; 2267 2268 ASSERT(fault != NULL); 2269 ASSERT(lhs != NULL); 2270 ASSERT(rhs != NULL); 2271 2272 if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, &rsrc) != 0) 2273 out(O_DIE, "cannot add payloadprop \"%s\" to fault", lhs); 2274 2275 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) { 2276 out(O_ALTFP|O_VERB2, "addpayloadprop: create hc_specific"); 2277 if (nvlist_xalloc(&hcs, NV_UNIQUE_NAME, &Eft_nv_hdl) != 0) 2278 out(O_DIE, 2279 "cannot add payloadprop \"%s\" to fault", lhs); 2280 if (nvlist_add_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, hcs) != 0) 2281 out(O_DIE, 2282 "cannot add payloadprop \"%s\" to fault", lhs); 2283 nvlist_free(hcs); 2284 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) 2285 out(O_DIE, 2286 "cannot add payloadprop \"%s\" to fault", lhs); 2287 } else 2288 out(O_ALTFP|O_VERB2, "addpayloadprop: reuse hc_specific"); 2289 2290 if (rhs->t == UINT64) { 2291 out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v); 2292 2293 if (nvlist_add_uint64(hcs, lhs, rhs->v) != 0) 2294 out(O_DIE, 2295 "cannot add payloadprop \"%s\" to fault", lhs); 2296 } else { 2297 out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"", 2298 lhs, (char *)(uintptr_t)rhs->v); 2299 2300 if (nvlist_add_string(hcs, lhs, (char *)(uintptr_t)rhs->v) != 0) 2301 out(O_DIE, 2302 "cannot add payloadprop \"%s\" to fault", lhs); 2303 } 2304 } 2305 2306 static char *Istatbuf; 2307 static char *Istatbufptr; 2308 static int Istatsz; 2309 2310 /* 2311 * istataddsize -- calculate size of istat and add it to Istatsz 2312 */ 2313 /*ARGSUSED2*/ 2314 static void 2315 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2316 { 2317 int val; 2318 2319 ASSERT(lhs != NULL); 2320 ASSERT(rhs != NULL); 2321 2322 if ((val = stats_counter_value(rhs)) == 0) 2323 return; /* skip zero-valued stats */ 2324 2325 /* count up the size of the stat name */ 2326 Istatsz += ipath2strlen(lhs->ename, lhs->ipath); 2327 Istatsz++; /* for the trailing NULL byte */ 2328 2329 /* count up the size of the stat value */ 2330 Istatsz += snprintf(NULL, 0, "%d", val); 2331 Istatsz++; /* for the trailing NULL byte */ 2332 } 2333 2334 /* 2335 * istat2str -- serialize an istat, writing result to *Istatbufptr 2336 */ 2337 /*ARGSUSED2*/ 2338 static void 2339 istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2340 { 2341 char *str; 2342 int len; 2343 int val; 2344 2345 ASSERT(lhs != NULL); 2346 ASSERT(rhs != NULL); 2347 2348 if ((val = stats_counter_value(rhs)) == 0) 2349 return; /* skip zero-valued stats */ 2350 2351 /* serialize the stat name */ 2352 str = ipath2str(lhs->ename, lhs->ipath); 2353 len = strlen(str); 2354 2355 ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]); 2356 (void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr); 2357 Istatbufptr += len; 2358 FREE(str); 2359 *Istatbufptr++ = '\0'; 2360 2361 /* serialize the stat value */ 2362 Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr, 2363 "%d", val); 2364 *Istatbufptr++ = '\0'; 2365 2366 ASSERT(Istatbufptr <= &Istatbuf[Istatsz]); 2367 } 2368 2369 void 2370 istat_save() 2371 { 2372 if (Istat_need_save == 0) 2373 return; 2374 2375 /* figure out how big the serialzed info is */ 2376 Istatsz = 0; 2377 lut_walk(Istats, (lut_cb)istataddsize, NULL); 2378 2379 if (Istatsz == 0) { 2380 /* no stats to save */ 2381 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2382 return; 2383 } 2384 2385 /* create the serialized buffer */ 2386 Istatbufptr = Istatbuf = MALLOC(Istatsz); 2387 lut_walk(Istats, (lut_cb)istat2str, NULL); 2388 2389 /* clear out current saved stats */ 2390 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2391 2392 /* write out the new version */ 2393 fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz); 2394 FREE(Istatbuf); 2395 2396 Istat_need_save = 0; 2397 } 2398 2399 int 2400 istat_cmp(struct istat_entry *ent1, struct istat_entry *ent2) 2401 { 2402 if (ent1->ename != ent2->ename) 2403 return (ent2->ename - ent1->ename); 2404 if (ent1->ipath != ent2->ipath) 2405 return ((char *)ent2->ipath - (char *)ent1->ipath); 2406 2407 return (0); 2408 } 2409 2410 /* 2411 * istat-verify -- verify the component associated with a stat still exists 2412 * 2413 * if the component no longer exists, this routine resets the stat and 2414 * returns 0. if the component still exists, it returns 1. 2415 */ 2416 static int 2417 istat_verify(struct node *snp, struct istat_entry *entp) 2418 { 2419 struct stats *statp; 2420 nvlist_t *fmri; 2421 2422 fmri = node2fmri(snp->u.event.epname); 2423 if (platform_path_exists(fmri)) { 2424 nvlist_free(fmri); 2425 return (1); 2426 } 2427 nvlist_free(fmri); 2428 2429 /* component no longer in system. zero out the associated stats */ 2430 if ((statp = (struct stats *) 2431 lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL || 2432 stats_counter_value(statp) == 0) 2433 return (0); /* stat is already reset */ 2434 2435 Istat_need_save = 1; 2436 stats_counter_reset(statp); 2437 return (0); 2438 } 2439 2440 static void 2441 istat_bump(struct node *snp, int n) 2442 { 2443 struct stats *statp; 2444 struct istat_entry ent; 2445 2446 ASSERT(snp != NULL); 2447 ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t)); 2448 ASSERT(snp->u.event.epname != NULL); 2449 2450 /* class name should be hoisted into a single stable entry */ 2451 ASSERT(snp->u.event.ename->u.name.next == NULL); 2452 ent.ename = snp->u.event.ename->u.name.s; 2453 ent.ipath = ipath(snp->u.event.epname); 2454 2455 if (!istat_verify(snp, &ent)) { 2456 /* component no longer exists in system, nothing to do */ 2457 return; 2458 } 2459 2460 if ((statp = (struct stats *) 2461 lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) { 2462 /* need to create the counter */ 2463 int cnt = 0; 2464 struct node *np; 2465 char *sname; 2466 char *snamep; 2467 struct istat_entry *newentp; 2468 2469 /* count up the size of the stat name */ 2470 np = snp->u.event.ename; 2471 while (np != NULL) { 2472 cnt += strlen(np->u.name.s); 2473 cnt++; /* for the '.' or '@' */ 2474 np = np->u.name.next; 2475 } 2476 np = snp->u.event.epname; 2477 while (np != NULL) { 2478 cnt += snprintf(NULL, 0, "%s%llu", 2479 np->u.name.s, np->u.name.child->u.ull); 2480 cnt++; /* for the '/' or trailing NULL byte */ 2481 np = np->u.name.next; 2482 } 2483 2484 /* build the stat name */ 2485 snamep = sname = alloca(cnt); 2486 np = snp->u.event.ename; 2487 while (np != NULL) { 2488 snamep += snprintf(snamep, &sname[cnt] - snamep, 2489 "%s", np->u.name.s); 2490 np = np->u.name.next; 2491 if (np) 2492 *snamep++ = '.'; 2493 } 2494 *snamep++ = '@'; 2495 np = snp->u.event.epname; 2496 while (np != NULL) { 2497 snamep += snprintf(snamep, &sname[cnt] - snamep, 2498 "%s%llu", np->u.name.s, np->u.name.child->u.ull); 2499 np = np->u.name.next; 2500 if (np) 2501 *snamep++ = '/'; 2502 } 2503 *snamep++ = '\0'; 2504 2505 /* create the new stat & add it to our list */ 2506 newentp = MALLOC(sizeof (*newentp)); 2507 *newentp = ent; 2508 statp = stats_new_counter(NULL, sname, 0); 2509 Istats = lut_add(Istats, (void *)newentp, (void *)statp, 2510 (lut_cmp)istat_cmp); 2511 } 2512 2513 /* if n is non-zero, set that value instead of bumping */ 2514 if (n) { 2515 stats_counter_reset(statp); 2516 stats_counter_add(statp, n); 2517 } else 2518 stats_counter_bump(statp); 2519 Istat_need_save = 1; 2520 2521 ipath_print(O_ALTFP|O_VERB2, ent.ename, ent.ipath); 2522 out(O_ALTFP|O_VERB2, " %s to value %d", n ? "set" : "incremented", 2523 stats_counter_value(statp)); 2524 } 2525 2526 /*ARGSUSED*/ 2527 static void 2528 istat_destructor(void *left, void *right, void *arg) 2529 { 2530 struct istat_entry *entp = (struct istat_entry *)left; 2531 struct stats *statp = (struct stats *)right; 2532 FREE(entp); 2533 stats_delete(statp); 2534 } 2535 2536 /* 2537 * Callback used in a walk of the Istats to reset matching stat counters. 2538 */ 2539 static void 2540 istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp, 2541 const struct ipath *ipp) 2542 { 2543 char *path; 2544 2545 if (entp->ipath == ipp) { 2546 path = ipath2str(entp->ename, ipp); 2547 out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path); 2548 FREE(path); 2549 stats_counter_reset(statp); 2550 Istat_need_save = 1; 2551 } 2552 } 2553 2554 /*ARGSUSED*/ 2555 static void 2556 istat_counter_topo_chg_cb(struct istat_entry *entp, struct stats *statp, 2557 void *unused) 2558 { 2559 char *path; 2560 nvlist_t *fmri; 2561 2562 fmri = ipath2fmri((struct ipath *)(entp->ipath)); 2563 if (!platform_path_exists(fmri)) { 2564 path = ipath2str(entp->ename, entp->ipath); 2565 out(O_ALTFP, "istat_counter_topo_chg_cb: not present %s", path); 2566 FREE(path); 2567 stats_counter_reset(statp); 2568 Istat_need_save = 1; 2569 } 2570 nvlist_free(fmri); 2571 } 2572 2573 void 2574 istat_fini(void) 2575 { 2576 lut_free(Istats, istat_destructor, NULL); 2577 } 2578 2579 static char *Serdbuf; 2580 static char *Serdbufptr; 2581 static int Serdsz; 2582 2583 /* 2584 * serdaddsize -- calculate size of serd and add it to Serdsz 2585 */ 2586 /*ARGSUSED*/ 2587 static void 2588 serdaddsize(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2589 { 2590 ASSERT(lhs != NULL); 2591 2592 /* count up the size of the stat name */ 2593 Serdsz += ipath2strlen(lhs->ename, lhs->ipath); 2594 Serdsz++; /* for the trailing NULL byte */ 2595 } 2596 2597 /* 2598 * serd2str -- serialize a serd engine, writing result to *Serdbufptr 2599 */ 2600 /*ARGSUSED*/ 2601 static void 2602 serd2str(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2603 { 2604 char *str; 2605 int len; 2606 2607 ASSERT(lhs != NULL); 2608 2609 /* serialize the serd engine name */ 2610 str = ipath2str(lhs->ename, lhs->ipath); 2611 len = strlen(str); 2612 2613 ASSERT(Serdbufptr + len + 1 <= &Serdbuf[Serdsz]); 2614 (void) strlcpy(Serdbufptr, str, &Serdbuf[Serdsz] - Serdbufptr); 2615 Serdbufptr += len; 2616 FREE(str); 2617 *Serdbufptr++ = '\0'; 2618 ASSERT(Serdbufptr <= &Serdbuf[Serdsz]); 2619 } 2620 2621 void 2622 serd_save() 2623 { 2624 if (Serd_need_save == 0) 2625 return; 2626 2627 /* figure out how big the serialzed info is */ 2628 Serdsz = 0; 2629 lut_walk(SerdEngines, (lut_cb)serdaddsize, NULL); 2630 2631 if (Serdsz == 0) { 2632 /* no serd engines to save */ 2633 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2634 return; 2635 } 2636 2637 /* create the serialized buffer */ 2638 Serdbufptr = Serdbuf = MALLOC(Serdsz); 2639 lut_walk(SerdEngines, (lut_cb)serd2str, NULL); 2640 2641 /* clear out current saved stats */ 2642 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2643 2644 /* write out the new version */ 2645 fmd_buf_write(Hdl, NULL, WOBUF_SERDS, Serdbuf, Serdsz); 2646 FREE(Serdbuf); 2647 Serd_need_save = 0; 2648 } 2649 2650 int 2651 serd_cmp(struct serd_entry *ent1, struct serd_entry *ent2) 2652 { 2653 if (ent1->ename != ent2->ename) 2654 return (ent2->ename - ent1->ename); 2655 if (ent1->ipath != ent2->ipath) 2656 return ((char *)ent2->ipath - (char *)ent1->ipath); 2657 2658 return (0); 2659 } 2660 2661 void 2662 fme_serd_load(fmd_hdl_t *hdl) 2663 { 2664 int sz; 2665 char *sbuf; 2666 char *sepptr; 2667 char *ptr; 2668 struct serd_entry *newentp; 2669 struct node *epname; 2670 nvlist_t *fmri; 2671 char *namestring; 2672 2673 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_SERDS)) == 0) 2674 return; 2675 sbuf = alloca(sz); 2676 fmd_buf_read(hdl, NULL, WOBUF_SERDS, sbuf, sz); 2677 ptr = sbuf; 2678 while (ptr < &sbuf[sz]) { 2679 sepptr = strchr(ptr, '@'); 2680 *sepptr = '\0'; 2681 namestring = ptr; 2682 sepptr++; 2683 ptr = sepptr; 2684 ptr += strlen(ptr); 2685 ptr++; /* move past the '\0' separating paths */ 2686 epname = pathstring2epnamenp(sepptr); 2687 fmri = node2fmri(epname); 2688 if (platform_path_exists(fmri)) { 2689 newentp = MALLOC(sizeof (*newentp)); 2690 newentp->hdl = hdl; 2691 newentp->ipath = ipath(epname); 2692 newentp->ename = stable(namestring); 2693 SerdEngines = lut_add(SerdEngines, (void *)newentp, 2694 (void *)newentp, (lut_cmp)serd_cmp); 2695 } else 2696 Serd_need_save = 1; 2697 tree_free(epname); 2698 nvlist_free(fmri); 2699 } 2700 /* save it back again in case some of the paths no longer exist */ 2701 serd_save(); 2702 } 2703 2704 /*ARGSUSED*/ 2705 static void 2706 serd_destructor(void *left, void *right, void *arg) 2707 { 2708 struct serd_entry *entp = (struct serd_entry *)left; 2709 FREE(entp); 2710 } 2711 2712 /* 2713 * Callback used in a walk of the SerdEngines to reset matching serd engines. 2714 */ 2715 /*ARGSUSED*/ 2716 static void 2717 serd_reset_cb(struct serd_entry *entp, void *unused, const struct ipath *ipp) 2718 { 2719 char *path; 2720 2721 if (entp->ipath == ipp) { 2722 path = ipath2str(entp->ename, ipp); 2723 out(O_ALTFP, "serd_reset_cb: resetting %s", path); 2724 fmd_serd_reset(entp->hdl, path); 2725 FREE(path); 2726 Serd_need_save = 1; 2727 } 2728 } 2729 2730 /*ARGSUSED*/ 2731 static void 2732 serd_topo_chg_cb(struct serd_entry *entp, void *unused, void *unused2) 2733 { 2734 char *path; 2735 nvlist_t *fmri; 2736 2737 fmri = ipath2fmri((struct ipath *)(entp->ipath)); 2738 if (!platform_path_exists(fmri)) { 2739 path = ipath2str(entp->ename, entp->ipath); 2740 out(O_ALTFP, "serd_topo_chg_cb: not present %s", path); 2741 fmd_serd_reset(entp->hdl, path); 2742 FREE(path); 2743 Serd_need_save = 1; 2744 } 2745 nvlist_free(fmri); 2746 } 2747 2748 void 2749 serd_fini(void) 2750 { 2751 lut_free(SerdEngines, serd_destructor, NULL); 2752 } 2753 2754 static void 2755 publish_suspects(struct fme *fmep, struct rsl *srl) 2756 { 2757 struct rsl *rp; 2758 nvlist_t *fault; 2759 uint8_t cert; 2760 uint_t *frs; 2761 uint_t frsum, fr; 2762 uint_t messval; 2763 uint_t retireval; 2764 uint_t responseval; 2765 struct node *snp; 2766 int frcnt, fridx; 2767 boolean_t allfaulty = B_TRUE; 2768 struct rsl *erl = srl + fmep->nsuspects - 1; 2769 2770 /* 2771 * sort the array 2772 */ 2773 qsort(srl, fmep->nsuspects, sizeof (struct rsl), rslcmp); 2774 2775 /* sum the fitrates */ 2776 frs = alloca(fmep->nsuspects * sizeof (uint_t)); 2777 fridx = frcnt = frsum = 0; 2778 2779 for (rp = srl; rp <= erl; rp++) { 2780 struct node *n; 2781 2782 n = eventprop_lookup(rp->suspect, L_FITrate); 2783 if (node2uint(n, &fr) != 0) { 2784 out(O_DEBUG|O_NONL, "event "); 2785 ipath_print(O_DEBUG|O_NONL, 2786 rp->suspect->enode->u.event.ename->u.name.s, 2787 rp->suspect->ipp); 2788 out(O_VERB, " has no FITrate (using 1)"); 2789 fr = 1; 2790 } else if (fr == 0) { 2791 out(O_DEBUG|O_NONL, "event "); 2792 ipath_print(O_DEBUG|O_NONL, 2793 rp->suspect->enode->u.event.ename->u.name.s, 2794 rp->suspect->ipp); 2795 out(O_VERB, " has zero FITrate (using 1)"); 2796 fr = 1; 2797 } 2798 2799 frs[fridx++] = fr; 2800 frsum += fr; 2801 frcnt++; 2802 } 2803 2804 /* Add them in reverse order of our sort, as fmd reverses order */ 2805 for (rp = erl; rp >= srl; rp--) { 2806 cert = percentof(frs[--fridx], frsum); 2807 fault = fmd_nvl_create_fault(fmep->hdl, 2808 rp->suspect->enode->u.event.ename->u.name.s, 2809 cert, 2810 rp->asru, 2811 rp->fru, 2812 rp->rsrc); 2813 if (fault == NULL) 2814 out(O_DIE, "fault creation failed"); 2815 /* if "message" property exists, add it to the fault */ 2816 if (node2uint(eventprop_lookup(rp->suspect, L_message), 2817 &messval) == 0) { 2818 2819 out(O_ALTFP, 2820 "[FME%d, %s adds message=%d to suspect list]", 2821 fmep->id, 2822 rp->suspect->enode->u.event.ename->u.name.s, 2823 messval); 2824 if (nvlist_add_boolean_value(fault, 2825 FM_SUSPECT_MESSAGE, 2826 (messval) ? B_TRUE : B_FALSE) != 0) { 2827 out(O_DIE, "cannot add no-message to fault"); 2828 } 2829 } 2830 2831 /* if "retire" property exists, add it to the fault */ 2832 if (node2uint(eventprop_lookup(rp->suspect, L_retire), 2833 &retireval) == 0) { 2834 2835 out(O_ALTFP, 2836 "[FME%d, %s adds retire=%d to suspect list]", 2837 fmep->id, 2838 rp->suspect->enode->u.event.ename->u.name.s, 2839 retireval); 2840 if (nvlist_add_boolean_value(fault, 2841 FM_SUSPECT_RETIRE, 2842 (retireval) ? B_TRUE : B_FALSE) != 0) { 2843 out(O_DIE, "cannot add no-retire to fault"); 2844 } 2845 } 2846 2847 /* if "response" property exists, add it to the fault */ 2848 if (node2uint(eventprop_lookup(rp->suspect, L_response), 2849 &responseval) == 0) { 2850 2851 out(O_ALTFP, 2852 "[FME%d, %s adds response=%d to suspect list]", 2853 fmep->id, 2854 rp->suspect->enode->u.event.ename->u.name.s, 2855 responseval); 2856 if (nvlist_add_boolean_value(fault, 2857 FM_SUSPECT_RESPONSE, 2858 (responseval) ? B_TRUE : B_FALSE) != 0) { 2859 out(O_DIE, "cannot add no-response to fault"); 2860 } 2861 } 2862 2863 /* add any payload properties */ 2864 lut_walk(rp->suspect->payloadprops, 2865 (lut_cb)addpayloadprop, (void *)fault); 2866 rslfree(rp); 2867 2868 /* 2869 * If "action" property exists, evaluate it; this must be done 2870 * before the allfaulty check below since some actions may 2871 * modify the asru to be used in fmd_nvl_fmri_has_fault. This 2872 * needs to be restructured if any new actions are introduced 2873 * that have effects that we do not want to be visible if 2874 * we decide not to publish in the dupclose check below. 2875 */ 2876 if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) { 2877 struct evalue evalue; 2878 2879 out(O_ALTFP|O_NONL, 2880 "[FME%d, %s action ", fmep->id, 2881 rp->suspect->enode->u.event.ename->u.name.s); 2882 ptree_name_iter(O_ALTFP|O_NONL, snp); 2883 out(O_ALTFP, "]"); 2884 Action_nvl = fault; 2885 (void) eval_expr(snp, NULL, NULL, NULL, NULL, 2886 NULL, 0, &evalue); 2887 } 2888 2889 fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault); 2890 2891 /* 2892 * check if the asru is already marked as "faulty". 2893 */ 2894 if (allfaulty) { 2895 nvlist_t *asru; 2896 2897 out(O_ALTFP|O_VERB, "FME%d dup check ", fmep->id); 2898 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect); 2899 out(O_ALTFP|O_VERB|O_NONL, " "); 2900 if (nvlist_lookup_nvlist(fault, 2901 FM_FAULT_ASRU, &asru) != 0) { 2902 out(O_ALTFP|O_VERB, "NULL asru"); 2903 allfaulty = B_FALSE; 2904 } else if (fmd_nvl_fmri_has_fault(fmep->hdl, asru, 2905 FMD_HAS_FAULT_ASRU, NULL)) { 2906 out(O_ALTFP|O_VERB, "faulty"); 2907 } else { 2908 out(O_ALTFP|O_VERB, "not faulty"); 2909 allfaulty = B_FALSE; 2910 } 2911 } 2912 2913 } 2914 2915 if (!allfaulty) { 2916 /* 2917 * don't update the count stat if all asrus are already 2918 * present and unrepaired in the asru cache 2919 */ 2920 for (rp = erl; rp >= srl; rp--) { 2921 struct event *suspect = rp->suspect; 2922 2923 if (suspect == NULL) 2924 continue; 2925 2926 /* if "count" exists, increment the appropriate stat */ 2927 if ((snp = eventprop_lookup(suspect, 2928 L_count)) != NULL) { 2929 out(O_ALTFP|O_NONL, 2930 "[FME%d, %s count ", fmep->id, 2931 suspect->enode->u.event.ename->u.name.s); 2932 ptree_name_iter(O_ALTFP|O_NONL, snp); 2933 out(O_ALTFP, "]"); 2934 istat_bump(snp, 0); 2935 2936 } 2937 } 2938 istat_save(); /* write out any istat changes */ 2939 } 2940 } 2941 2942 static const char * 2943 undiag_2defect_str(int ud) 2944 { 2945 switch (ud) { 2946 case UD_VAL_MISSINGINFO: 2947 case UD_VAL_MISSINGOBS: 2948 case UD_VAL_MISSINGPATH: 2949 case UD_VAL_MISSINGZERO: 2950 case UD_VAL_BADOBS: 2951 case UD_VAL_CFGMISMATCH: 2952 return (UNDIAG_DEFECT_CHKPT); 2953 2954 case UD_VAL_BADEVENTI: 2955 case UD_VAL_BADEVENTPATH: 2956 case UD_VAL_BADEVENTCLASS: 2957 case UD_VAL_INSTFAIL: 2958 case UD_VAL_NOPATH: 2959 case UD_VAL_UNSOLVD: 2960 return (UNDIAG_DEFECT_FME); 2961 2962 case UD_VAL_MAXFME: 2963 return (UNDIAG_DEFECT_LIMIT); 2964 2965 case UD_VAL_UNKNOWN: 2966 default: 2967 return (UNDIAG_DEFECT_UNKNOWN); 2968 } 2969 } 2970 2971 static const char * 2972 undiag_2fault_str(int ud) 2973 { 2974 switch (ud) { 2975 case UD_VAL_BADEVENTI: 2976 case UD_VAL_BADEVENTPATH: 2977 case UD_VAL_BADEVENTCLASS: 2978 case UD_VAL_INSTFAIL: 2979 case UD_VAL_NOPATH: 2980 case UD_VAL_UNSOLVD: 2981 return (UNDIAG_FAULT_FME); 2982 default: 2983 return (NULL); 2984 } 2985 } 2986 2987 static char * 2988 undiag_2reason_str(int ud, char *arg) 2989 { 2990 const char *ptr; 2991 char *buf; 2992 int with_arg = 0; 2993 2994 switch (ud) { 2995 case UD_VAL_BADEVENTPATH: 2996 ptr = UD_STR_BADEVENTPATH; 2997 with_arg = 1; 2998 break; 2999 case UD_VAL_BADEVENTCLASS: 3000 ptr = UD_STR_BADEVENTCLASS; 3001 with_arg = 1; 3002 break; 3003 case UD_VAL_BADEVENTI: 3004 ptr = UD_STR_BADEVENTI; 3005 with_arg = 1; 3006 break; 3007 case UD_VAL_BADOBS: 3008 ptr = UD_STR_BADOBS; 3009 break; 3010 case UD_VAL_CFGMISMATCH: 3011 ptr = UD_STR_CFGMISMATCH; 3012 break; 3013 case UD_VAL_INSTFAIL: 3014 ptr = UD_STR_INSTFAIL; 3015 with_arg = 1; 3016 break; 3017 case UD_VAL_MAXFME: 3018 ptr = UD_STR_MAXFME; 3019 break; 3020 case UD_VAL_MISSINGINFO: 3021 ptr = UD_STR_MISSINGINFO; 3022 break; 3023 case UD_VAL_MISSINGOBS: 3024 ptr = UD_STR_MISSINGOBS; 3025 break; 3026 case UD_VAL_MISSINGPATH: 3027 ptr = UD_STR_MISSINGPATH; 3028 break; 3029 case UD_VAL_MISSINGZERO: 3030 ptr = UD_STR_MISSINGZERO; 3031 break; 3032 case UD_VAL_NOPATH: 3033 ptr = UD_STR_NOPATH; 3034 with_arg = 1; 3035 break; 3036 case UD_VAL_UNSOLVD: 3037 ptr = UD_STR_UNSOLVD; 3038 break; 3039 case UD_VAL_UNKNOWN: 3040 default: 3041 ptr = UD_STR_UNKNOWN; 3042 break; 3043 } 3044 if (with_arg) { 3045 buf = MALLOC(strlen(ptr) + strlen(arg) - 1); 3046 (void) sprintf(buf, ptr, arg); 3047 } else { 3048 buf = MALLOC(strlen(ptr) + 1); 3049 (void) sprintf(buf, ptr); 3050 } 3051 return (buf); 3052 } 3053 3054 static void 3055 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase, 3056 nvlist_t *detector, char *arg) 3057 { 3058 struct case_list *newcase; 3059 nvlist_t *defect, *fault; 3060 const char *faultstr; 3061 char *reason = undiag_2reason_str(Undiag_reason, arg); 3062 3063 out(O_ALTFP, 3064 "[undiagnosable ereport received, " 3065 "creating and closing a new case (%s)]", reason); 3066 3067 newcase = MALLOC(sizeof (struct case_list)); 3068 newcase->next = NULL; 3069 newcase->fmcase = fmcase; 3070 if (Undiagablecaselist != NULL) 3071 newcase->next = Undiagablecaselist; 3072 Undiagablecaselist = newcase; 3073 3074 if (ffep != NULL) 3075 fmd_case_add_ereport(hdl, newcase->fmcase, ffep); 3076 3077 /* add defect */ 3078 defect = fmd_nvl_create_fault(hdl, 3079 undiag_2defect_str(Undiag_reason), 50, NULL, NULL, detector); 3080 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 3081 (void) nvlist_add_boolean_value(defect, FM_SUSPECT_RETIRE, B_FALSE); 3082 (void) nvlist_add_boolean_value(defect, FM_SUSPECT_RESPONSE, B_FALSE); 3083 fmd_case_add_suspect(hdl, newcase->fmcase, defect); 3084 3085 /* add fault if appropriate */ 3086 faultstr = undiag_2fault_str(Undiag_reason); 3087 if (faultstr != NULL) { 3088 fault = fmd_nvl_create_fault(hdl, faultstr, 50, NULL, NULL, 3089 detector); 3090 (void) nvlist_add_string(fault, UNDIAG_REASON, reason); 3091 (void) nvlist_add_boolean_value(fault, FM_SUSPECT_RETIRE, 3092 B_FALSE); 3093 (void) nvlist_add_boolean_value(fault, FM_SUSPECT_RESPONSE, 3094 B_FALSE); 3095 fmd_case_add_suspect(hdl, newcase->fmcase, fault); 3096 } 3097 FREE(reason); 3098 3099 /* solve and close case */ 3100 fmd_case_solve(hdl, newcase->fmcase); 3101 fmd_case_close(hdl, newcase->fmcase); 3102 Undiag_reason = UD_VAL_UNKNOWN; 3103 } 3104 3105 static void 3106 fme_undiagnosable(struct fme *f) 3107 { 3108 nvlist_t *defect, *fault, *detector = NULL; 3109 struct event *ep; 3110 char *pathstr; 3111 const char *faultstr; 3112 char *reason = undiag_2reason_str(Undiag_reason, NULL); 3113 3114 out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]", 3115 f->id, fmd_case_uuid(f->hdl, f->fmcase), reason); 3116 3117 for (ep = f->observations; ep; ep = ep->observations) { 3118 3119 if (ep->ffep != f->e0r) 3120 fmd_case_add_ereport(f->hdl, f->fmcase, ep->ffep); 3121 3122 pathstr = ipath2str(NULL, ipath(platform_getpath(ep->nvp))); 3123 platform_units_translate(0, f->config, NULL, NULL, &detector, 3124 pathstr); 3125 FREE(pathstr); 3126 3127 /* add defect */ 3128 defect = fmd_nvl_create_fault(f->hdl, 3129 undiag_2defect_str(Undiag_reason), 50 / f->uniqobs, 3130 NULL, NULL, detector); 3131 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 3132 (void) nvlist_add_boolean_value(defect, FM_SUSPECT_RETIRE, 3133 B_FALSE); 3134 (void) nvlist_add_boolean_value(defect, FM_SUSPECT_RESPONSE, 3135 B_FALSE); 3136 fmd_case_add_suspect(f->hdl, f->fmcase, defect); 3137 3138 /* add fault if appropriate */ 3139 faultstr = undiag_2fault_str(Undiag_reason); 3140 if (faultstr == NULL) 3141 continue; 3142 fault = fmd_nvl_create_fault(f->hdl, faultstr, 50 / f->uniqobs, 3143 NULL, NULL, detector); 3144 (void) nvlist_add_string(fault, UNDIAG_REASON, reason); 3145 (void) nvlist_add_boolean_value(fault, FM_SUSPECT_RETIRE, 3146 B_FALSE); 3147 (void) nvlist_add_boolean_value(fault, FM_SUSPECT_RESPONSE, 3148 B_FALSE); 3149 fmd_case_add_suspect(f->hdl, f->fmcase, fault); 3150 nvlist_free(detector); 3151 } 3152 FREE(reason); 3153 fmd_case_solve(f->hdl, f->fmcase); 3154 fmd_case_close(f->hdl, f->fmcase); 3155 Undiag_reason = UD_VAL_UNKNOWN; 3156 } 3157 3158 /* 3159 * fme_close_case 3160 * 3161 * Find the requested case amongst our fmes and close it. Free up 3162 * the related fme. 3163 */ 3164 void 3165 fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase) 3166 { 3167 struct case_list *ucasep, *prevcasep = NULL; 3168 struct fme *prev = NULL; 3169 struct fme *fmep; 3170 3171 for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) { 3172 if (fmcase != ucasep->fmcase) { 3173 prevcasep = ucasep; 3174 continue; 3175 } 3176 3177 if (prevcasep == NULL) 3178 Undiagablecaselist = Undiagablecaselist->next; 3179 else 3180 prevcasep->next = ucasep->next; 3181 3182 FREE(ucasep); 3183 return; 3184 } 3185 3186 for (fmep = FMElist; fmep; fmep = fmep->next) { 3187 if (fmep->hdl == hdl && fmep->fmcase == fmcase) 3188 break; 3189 prev = fmep; 3190 } 3191 3192 if (fmep == NULL) { 3193 out(O_WARN, "Eft asked to close unrecognized case [%s].", 3194 fmd_case_uuid(hdl, fmcase)); 3195 return; 3196 } 3197 3198 if (EFMElist == fmep) 3199 EFMElist = prev; 3200 3201 if (prev == NULL) 3202 FMElist = FMElist->next; 3203 else 3204 prev->next = fmep->next; 3205 3206 fmep->next = NULL; 3207 3208 /* Get rid of any timer this fme has set */ 3209 if (fmep->wull != 0) 3210 fmd_timer_remove(fmep->hdl, fmep->timer); 3211 3212 if (ClosedFMEs == NULL) { 3213 ClosedFMEs = fmep; 3214 } else { 3215 fmep->next = ClosedFMEs; 3216 ClosedFMEs = fmep; 3217 } 3218 3219 Open_fme_count--; 3220 3221 /* See if we can close the overflow FME */ 3222 if (Open_fme_count <= Max_fme) { 3223 for (fmep = FMElist; fmep; fmep = fmep->next) { 3224 if (fmep->overflow && !(fmd_case_closed(fmep->hdl, 3225 fmep->fmcase))) 3226 break; 3227 } 3228 3229 if (fmep != NULL) 3230 fmd_case_close(fmep->hdl, fmep->fmcase); 3231 } 3232 } 3233 3234 /* 3235 * fme_set_timer() 3236 * If the time we need to wait for the given FME is less than the 3237 * current timer, kick that old timer out and establish a new one. 3238 */ 3239 static int 3240 fme_set_timer(struct fme *fmep, unsigned long long wull) 3241 { 3242 out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait "); 3243 ptree_timeval(O_ALTFP|O_VERB, &wull); 3244 3245 if (wull <= fmep->pull) { 3246 out(O_ALTFP|O_VERB|O_NONL, "already have waited at least "); 3247 ptree_timeval(O_ALTFP|O_VERB, &fmep->pull); 3248 out(O_ALTFP|O_VERB, NULL); 3249 /* we've waited at least wull already, don't need timer */ 3250 return (0); 3251 } 3252 3253 out(O_ALTFP|O_VERB|O_NONL, " currently "); 3254 if (fmep->wull != 0) { 3255 out(O_ALTFP|O_VERB|O_NONL, "waiting "); 3256 ptree_timeval(O_ALTFP|O_VERB, &fmep->wull); 3257 out(O_ALTFP|O_VERB, NULL); 3258 } else { 3259 out(O_ALTFP|O_VERB|O_NONL, "not waiting"); 3260 out(O_ALTFP|O_VERB, NULL); 3261 } 3262 3263 if (fmep->wull != 0) 3264 if (wull >= fmep->wull) 3265 /* New timer would fire later than established timer */ 3266 return (0); 3267 3268 if (fmep->wull != 0) { 3269 fmd_timer_remove(fmep->hdl, fmep->timer); 3270 } 3271 3272 fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep, 3273 fmep->e0r, wull); 3274 out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer); 3275 fmep->wull = wull; 3276 return (1); 3277 } 3278 3279 void 3280 fme_timer_fired(struct fme *fmep, id_t tid) 3281 { 3282 struct fme *ffmep = NULL; 3283 3284 for (ffmep = FMElist; ffmep; ffmep = ffmep->next) 3285 if (ffmep == fmep) 3286 break; 3287 3288 if (ffmep == NULL) { 3289 out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.", 3290 (void *)fmep); 3291 return; 3292 } 3293 3294 out(O_ALTFP|O_VERB, "Timer fired %lx", tid); 3295 fmep->pull = fmep->wull; 3296 fmep->wull = 0; 3297 fmd_buf_write(fmep->hdl, fmep->fmcase, 3298 WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull)); 3299 3300 fme_eval(fmep, fmep->e0r); 3301 } 3302 3303 /* 3304 * Preserve the fme's suspect list in its psuspects list, NULLing the 3305 * suspects list in the meantime. 3306 */ 3307 static void 3308 save_suspects(struct fme *fmep) 3309 { 3310 struct event *ep; 3311 struct event *nextep; 3312 3313 /* zero out the previous suspect list */ 3314 for (ep = fmep->psuspects; ep; ep = nextep) { 3315 nextep = ep->psuspects; 3316 ep->psuspects = NULL; 3317 } 3318 fmep->psuspects = NULL; 3319 3320 /* zero out the suspect list, copying it to previous suspect list */ 3321 fmep->psuspects = fmep->suspects; 3322 for (ep = fmep->suspects; ep; ep = nextep) { 3323 nextep = ep->suspects; 3324 ep->psuspects = ep->suspects; 3325 ep->suspects = NULL; 3326 ep->is_suspect = 0; 3327 } 3328 fmep->suspects = NULL; 3329 fmep->nsuspects = 0; 3330 } 3331 3332 /* 3333 * Retrieve the fme's suspect list from its psuspects list. 3334 */ 3335 static void 3336 restore_suspects(struct fme *fmep) 3337 { 3338 struct event *ep; 3339 struct event *nextep; 3340 3341 fmep->nsuspects = 0; 3342 fmep->suspects = fmep->psuspects; 3343 for (ep = fmep->psuspects; ep; ep = nextep) { 3344 fmep->nsuspects++; 3345 nextep = ep->psuspects; 3346 ep->suspects = ep->psuspects; 3347 } 3348 } 3349 3350 /* 3351 * this is what we use to call the Emrys prototype code instead of main() 3352 */ 3353 static void 3354 fme_eval(struct fme *fmep, fmd_event_t *ffep) 3355 { 3356 struct event *ep; 3357 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 3358 struct rsl *srl = NULL; 3359 struct rsl *srl2 = NULL; 3360 int mess_zero_count; 3361 int rpcnt; 3362 3363 save_suspects(fmep); 3364 3365 out(O_ALTFP, "Evaluate FME %d", fmep->id); 3366 indent_set(" "); 3367 3368 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 3369 fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 3370 3371 out(O_ALTFP|O_NONL, "FME%d state: %s, suspect list:", fmep->id, 3372 fme_state2str(fmep->state)); 3373 for (ep = fmep->suspects; ep; ep = ep->suspects) { 3374 out(O_ALTFP|O_NONL, " "); 3375 itree_pevent_brief(O_ALTFP|O_NONL, ep); 3376 } 3377 out(O_ALTFP, NULL); 3378 3379 switch (fmep->state) { 3380 case FME_CREDIBLE: 3381 print_suspects(SLNEW, fmep); 3382 (void) upsets_eval(fmep, ffep); 3383 3384 /* 3385 * we may have already posted suspects in upsets_eval() which 3386 * can recurse into fme_eval() again. If so then just return. 3387 */ 3388 if (fmep->posted_suspects) 3389 return; 3390 3391 stats_counter_bump(fmep->diags); 3392 rpcnt = fmep->nsuspects; 3393 save_suspects(fmep); 3394 3395 /* 3396 * create two lists, one for "message=1" faults and one for 3397 * "message=0" faults. If we have a mixture we will generate 3398 * two separate suspect lists. 3399 */ 3400 srl = MALLOC(rpcnt * sizeof (struct rsl)); 3401 bzero(srl, rpcnt * sizeof (struct rsl)); 3402 srl2 = MALLOC(rpcnt * sizeof (struct rsl)); 3403 bzero(srl2, rpcnt * sizeof (struct rsl)); 3404 mess_zero_count = trim_suspects(fmep, srl, srl2, ffep); 3405 3406 /* 3407 * If the resulting suspect list has no members, we're 3408 * done so simply close the case. Otherwise sort and publish. 3409 */ 3410 if (fmep->nsuspects == 0 && mess_zero_count == 0) { 3411 out(O_ALTFP, 3412 "[FME%d, case %s (all suspects are upsets)]", 3413 fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3414 fmd_case_close(fmep->hdl, fmep->fmcase); 3415 } else if (fmep->nsuspects != 0 && mess_zero_count == 0) { 3416 publish_suspects(fmep, srl); 3417 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3418 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3419 fmd_case_solve(fmep->hdl, fmep->fmcase); 3420 } else if (fmep->nsuspects == 0 && mess_zero_count != 0) { 3421 fmep->nsuspects = mess_zero_count; 3422 publish_suspects(fmep, srl2); 3423 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3424 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3425 fmd_case_solve(fmep->hdl, fmep->fmcase); 3426 } else { 3427 struct event *obsp; 3428 struct fme *nfmep; 3429 3430 publish_suspects(fmep, srl); 3431 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3432 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3433 fmd_case_solve(fmep->hdl, fmep->fmcase); 3434 3435 /* 3436 * Got both message=0 and message=1 so create a 3437 * duplicate case. Also need a temporary duplicate fme 3438 * structure for use by publish_suspects(). 3439 */ 3440 nfmep = alloc_fme(); 3441 nfmep->id = Nextid++; 3442 nfmep->hdl = fmep->hdl; 3443 nfmep->nsuspects = mess_zero_count; 3444 nfmep->fmcase = fmd_case_open(fmep->hdl, NULL); 3445 out(O_ALTFP|O_STAMP, 3446 "[creating parallel FME%d, case %s]", nfmep->id, 3447 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 3448 Open_fme_count++; 3449 if (ffep) { 3450 fmd_case_setprincipal(nfmep->hdl, 3451 nfmep->fmcase, ffep); 3452 fmd_case_add_ereport(nfmep->hdl, 3453 nfmep->fmcase, ffep); 3454 } 3455 for (obsp = fmep->observations; obsp; 3456 obsp = obsp->observations) 3457 if (obsp->ffep && obsp->ffep != ffep) 3458 fmd_case_add_ereport(nfmep->hdl, 3459 nfmep->fmcase, obsp->ffep); 3460 3461 publish_suspects(nfmep, srl2); 3462 out(O_ALTFP, "[solving FME%d, case %s]", nfmep->id, 3463 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 3464 fmd_case_solve(nfmep->hdl, nfmep->fmcase); 3465 FREE(nfmep); 3466 } 3467 FREE(srl); 3468 FREE(srl2); 3469 restore_suspects(fmep); 3470 3471 fmep->posted_suspects = 1; 3472 fmd_buf_write(fmep->hdl, fmep->fmcase, 3473 WOBUF_POSTD, 3474 (void *)&fmep->posted_suspects, 3475 sizeof (fmep->posted_suspects)); 3476 3477 /* 3478 * Now the suspects have been posted, we can clear up 3479 * the instance tree as we won't be looking at it again. 3480 * Also cancel the timer as the case is now solved. 3481 */ 3482 if (fmep->wull != 0) { 3483 fmd_timer_remove(fmep->hdl, fmep->timer); 3484 fmep->wull = 0; 3485 } 3486 break; 3487 3488 case FME_WAIT: 3489 ASSERT(my_delay > fmep->ull); 3490 (void) fme_set_timer(fmep, my_delay); 3491 print_suspects(SLWAIT, fmep); 3492 itree_prune(fmep->eventtree); 3493 return; 3494 3495 case FME_DISPROVED: 3496 print_suspects(SLDISPROVED, fmep); 3497 Undiag_reason = UD_VAL_UNSOLVD; 3498 fme_undiagnosable(fmep); 3499 break; 3500 } 3501 3502 itree_free(fmep->eventtree); 3503 fmep->eventtree = NULL; 3504 structconfig_free(fmep->config); 3505 fmep->config = NULL; 3506 destroy_fme_bufs(fmep); 3507 } 3508 3509 static void indent(void); 3510 static int triggered(struct fme *fmep, struct event *ep, int mark); 3511 static enum fme_state effects_test(struct fme *fmep, 3512 struct event *fault_event, unsigned long long at_latest_by, 3513 unsigned long long *pdelay); 3514 static enum fme_state requirements_test(struct fme *fmep, struct event *ep, 3515 unsigned long long at_latest_by, unsigned long long *pdelay); 3516 static enum fme_state causes_test(struct fme *fmep, struct event *ep, 3517 unsigned long long at_latest_by, unsigned long long *pdelay); 3518 3519 static int 3520 checkconstraints(struct fme *fmep, struct arrow *arrowp) 3521 { 3522 struct constraintlist *ctp; 3523 struct evalue value; 3524 char *sep = ""; 3525 3526 if (arrowp->forever_false) { 3527 indent(); 3528 out(O_ALTFP|O_VERB|O_NONL, " Forever false constraint: "); 3529 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3530 out(O_ALTFP|O_VERB|O_NONL, sep); 3531 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3532 sep = ", "; 3533 } 3534 out(O_ALTFP|O_VERB, NULL); 3535 return (0); 3536 } 3537 if (arrowp->forever_true) { 3538 indent(); 3539 out(O_ALTFP|O_VERB|O_NONL, " Forever true constraint: "); 3540 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3541 out(O_ALTFP|O_VERB|O_NONL, sep); 3542 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3543 sep = ", "; 3544 } 3545 out(O_ALTFP|O_VERB, NULL); 3546 return (1); 3547 } 3548 3549 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3550 if (eval_expr(ctp->cnode, NULL, NULL, 3551 &fmep->globals, fmep->config, 3552 arrowp, 0, &value)) { 3553 /* evaluation successful */ 3554 if (value.t == UNDEFINED || value.v == 0) { 3555 /* known false */ 3556 arrowp->forever_false = 1; 3557 indent(); 3558 out(O_ALTFP|O_VERB|O_NONL, 3559 " False constraint: "); 3560 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3561 out(O_ALTFP|O_VERB, NULL); 3562 return (0); 3563 } 3564 } else { 3565 /* evaluation unsuccessful -- unknown value */ 3566 indent(); 3567 out(O_ALTFP|O_VERB|O_NONL, 3568 " Deferred constraint: "); 3569 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3570 out(O_ALTFP|O_VERB, NULL); 3571 return (1); 3572 } 3573 } 3574 /* known true */ 3575 arrowp->forever_true = 1; 3576 indent(); 3577 out(O_ALTFP|O_VERB|O_NONL, " True constraint: "); 3578 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3579 out(O_ALTFP|O_VERB|O_NONL, sep); 3580 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3581 sep = ", "; 3582 } 3583 out(O_ALTFP|O_VERB, NULL); 3584 return (1); 3585 } 3586 3587 static int 3588 triggered(struct fme *fmep, struct event *ep, int mark) 3589 { 3590 struct bubble *bp; 3591 struct arrowlist *ap; 3592 int count = 0; 3593 3594 stats_counter_bump(fmep->Tcallcount); 3595 for (bp = itree_next_bubble(ep, NULL); bp; 3596 bp = itree_next_bubble(ep, bp)) { 3597 if (bp->t != B_TO) 3598 continue; 3599 for (ap = itree_next_arrow(bp, NULL); ap; 3600 ap = itree_next_arrow(bp, ap)) { 3601 /* check count of marks against K in the bubble */ 3602 if ((ap->arrowp->mark & mark) && 3603 ++count >= bp->nork) 3604 return (1); 3605 } 3606 } 3607 return (0); 3608 } 3609 3610 static int 3611 mark_arrows(struct fme *fmep, struct event *ep, int mark, 3612 unsigned long long at_latest_by, unsigned long long *pdelay, int keep) 3613 { 3614 struct bubble *bp; 3615 struct arrowlist *ap; 3616 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3617 unsigned long long my_delay; 3618 enum fme_state result; 3619 int retval = 0; 3620 3621 for (bp = itree_next_bubble(ep, NULL); bp; 3622 bp = itree_next_bubble(ep, bp)) { 3623 if (bp->t != B_FROM) 3624 continue; 3625 stats_counter_bump(fmep->Marrowcount); 3626 for (ap = itree_next_arrow(bp, NULL); ap; 3627 ap = itree_next_arrow(bp, ap)) { 3628 struct event *ep2 = ap->arrowp->head->myevent; 3629 /* 3630 * if we're clearing marks, we can avoid doing 3631 * all that work evaluating constraints. 3632 */ 3633 if (mark == 0) { 3634 if (ap->arrowp->arrow_marked == 0) 3635 continue; 3636 ap->arrowp->arrow_marked = 0; 3637 ap->arrowp->mark &= ~EFFECTS_COUNTER; 3638 if (keep && (ep2->cached_state & 3639 (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT))) 3640 ep2->keep_in_tree = 1; 3641 ep2->cached_state &= 3642 ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT); 3643 (void) mark_arrows(fmep, ep2, mark, 0, NULL, 3644 keep); 3645 continue; 3646 } 3647 ap->arrowp->arrow_marked = 1; 3648 if (ep2->cached_state & REQMNTS_DISPROVED) { 3649 indent(); 3650 out(O_ALTFP|O_VERB|O_NONL, 3651 " ALREADY DISPROVED "); 3652 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3653 out(O_ALTFP|O_VERB, NULL); 3654 continue; 3655 } 3656 if (ep2->cached_state & WAIT_EFFECT) { 3657 indent(); 3658 out(O_ALTFP|O_VERB|O_NONL, 3659 " ALREADY EFFECTS WAIT "); 3660 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3661 out(O_ALTFP|O_VERB, NULL); 3662 continue; 3663 } 3664 if (ep2->cached_state & CREDIBLE_EFFECT) { 3665 indent(); 3666 out(O_ALTFP|O_VERB|O_NONL, 3667 " ALREADY EFFECTS CREDIBLE "); 3668 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3669 out(O_ALTFP|O_VERB, NULL); 3670 continue; 3671 } 3672 if ((ep2->cached_state & PARENT_WAIT) && 3673 (mark & PARENT_WAIT)) { 3674 indent(); 3675 out(O_ALTFP|O_VERB|O_NONL, 3676 " ALREADY PARENT EFFECTS WAIT "); 3677 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3678 out(O_ALTFP|O_VERB, NULL); 3679 continue; 3680 } 3681 platform_set_payloadnvp(ep2->nvp); 3682 if (checkconstraints(fmep, ap->arrowp) == 0) { 3683 platform_set_payloadnvp(NULL); 3684 indent(); 3685 out(O_ALTFP|O_VERB|O_NONL, 3686 " CONSTRAINTS FAIL "); 3687 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3688 out(O_ALTFP|O_VERB, NULL); 3689 continue; 3690 } 3691 platform_set_payloadnvp(NULL); 3692 ap->arrowp->mark |= EFFECTS_COUNTER; 3693 if (!triggered(fmep, ep2, EFFECTS_COUNTER)) { 3694 indent(); 3695 out(O_ALTFP|O_VERB|O_NONL, 3696 " K-COUNT NOT YET MET "); 3697 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3698 out(O_ALTFP|O_VERB, NULL); 3699 continue; 3700 } 3701 ep2->cached_state &= ~PARENT_WAIT; 3702 /* 3703 * if we've reached an ereport and no propagation time 3704 * is specified, use the Hesitate value 3705 */ 3706 if (ep2->t == N_EREPORT && at_latest_by == 0ULL && 3707 ap->arrowp->maxdelay == 0ULL) { 3708 out(O_ALTFP|O_VERB|O_NONL, " default wait "); 3709 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3710 out(O_ALTFP|O_VERB, NULL); 3711 result = requirements_test(fmep, ep2, Hesitate, 3712 &my_delay); 3713 } else { 3714 result = requirements_test(fmep, ep2, 3715 at_latest_by + ap->arrowp->maxdelay, 3716 &my_delay); 3717 } 3718 if (result == FME_WAIT) { 3719 retval = WAIT_EFFECT; 3720 if (overall_delay > my_delay) 3721 overall_delay = my_delay; 3722 ep2->cached_state |= WAIT_EFFECT; 3723 indent(); 3724 out(O_ALTFP|O_VERB|O_NONL, " EFFECTS WAIT "); 3725 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3726 out(O_ALTFP|O_VERB, NULL); 3727 indent_push(" E"); 3728 if (mark_arrows(fmep, ep2, PARENT_WAIT, 3729 at_latest_by, &my_delay, 0) == 3730 WAIT_EFFECT) { 3731 retval = WAIT_EFFECT; 3732 if (overall_delay > my_delay) 3733 overall_delay = my_delay; 3734 } 3735 indent_pop(); 3736 } else if (result == FME_DISPROVED) { 3737 indent(); 3738 out(O_ALTFP|O_VERB|O_NONL, 3739 " EFFECTS DISPROVED "); 3740 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3741 out(O_ALTFP|O_VERB, NULL); 3742 } else { 3743 ep2->cached_state |= mark; 3744 indent(); 3745 if (mark == CREDIBLE_EFFECT) 3746 out(O_ALTFP|O_VERB|O_NONL, 3747 " EFFECTS CREDIBLE "); 3748 else 3749 out(O_ALTFP|O_VERB|O_NONL, 3750 " PARENT EFFECTS WAIT "); 3751 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3752 out(O_ALTFP|O_VERB, NULL); 3753 indent_push(" E"); 3754 if (mark_arrows(fmep, ep2, mark, at_latest_by, 3755 &my_delay, 0) == WAIT_EFFECT) { 3756 retval = WAIT_EFFECT; 3757 if (overall_delay > my_delay) 3758 overall_delay = my_delay; 3759 } 3760 indent_pop(); 3761 } 3762 } 3763 } 3764 if (retval == WAIT_EFFECT) 3765 *pdelay = overall_delay; 3766 return (retval); 3767 } 3768 3769 static enum fme_state 3770 effects_test(struct fme *fmep, struct event *fault_event, 3771 unsigned long long at_latest_by, unsigned long long *pdelay) 3772 { 3773 struct event *error_event; 3774 enum fme_state return_value = FME_CREDIBLE; 3775 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3776 unsigned long long my_delay; 3777 3778 stats_counter_bump(fmep->Ecallcount); 3779 indent_push(" E"); 3780 indent(); 3781 out(O_ALTFP|O_VERB|O_NONL, "->"); 3782 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3783 out(O_ALTFP|O_VERB, NULL); 3784 3785 if (mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by, 3786 &my_delay, 0) == WAIT_EFFECT) { 3787 return_value = FME_WAIT; 3788 if (overall_delay > my_delay) 3789 overall_delay = my_delay; 3790 } 3791 for (error_event = fmep->observations; 3792 error_event; error_event = error_event->observations) { 3793 indent(); 3794 out(O_ALTFP|O_VERB|O_NONL, " "); 3795 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event); 3796 if (!(error_event->cached_state & CREDIBLE_EFFECT)) { 3797 if (error_event->cached_state & 3798 (PARENT_WAIT|WAIT_EFFECT)) { 3799 out(O_ALTFP|O_VERB, " NOT YET triggered"); 3800 continue; 3801 } 3802 return_value = FME_DISPROVED; 3803 out(O_ALTFP|O_VERB, " NOT triggered"); 3804 break; 3805 } else { 3806 out(O_ALTFP|O_VERB, " triggered"); 3807 } 3808 } 3809 if (return_value == FME_DISPROVED) { 3810 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0); 3811 } else { 3812 fault_event->keep_in_tree = 1; 3813 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1); 3814 } 3815 3816 indent(); 3817 out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ", 3818 fme_state2str(return_value)); 3819 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3820 out(O_ALTFP|O_VERB, NULL); 3821 indent_pop(); 3822 if (return_value == FME_WAIT) 3823 *pdelay = overall_delay; 3824 return (return_value); 3825 } 3826 3827 static enum fme_state 3828 requirements_test(struct fme *fmep, struct event *ep, 3829 unsigned long long at_latest_by, unsigned long long *pdelay) 3830 { 3831 int waiting_events; 3832 int credible_events; 3833 int deferred_events; 3834 enum fme_state return_value = FME_CREDIBLE; 3835 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3836 unsigned long long arrow_delay; 3837 unsigned long long my_delay; 3838 struct event *ep2; 3839 struct bubble *bp; 3840 struct arrowlist *ap; 3841 3842 if (ep->cached_state & REQMNTS_CREDIBLE) { 3843 indent(); 3844 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY CREDIBLE "); 3845 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3846 out(O_ALTFP|O_VERB, NULL); 3847 return (FME_CREDIBLE); 3848 } 3849 if (ep->cached_state & REQMNTS_DISPROVED) { 3850 indent(); 3851 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY DISPROVED "); 3852 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3853 out(O_ALTFP|O_VERB, NULL); 3854 return (FME_DISPROVED); 3855 } 3856 if (ep->cached_state & REQMNTS_WAIT) { 3857 indent(); 3858 *pdelay = ep->cached_delay; 3859 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY WAIT "); 3860 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3861 out(O_ALTFP|O_VERB|O_NONL, ", wait for: "); 3862 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3863 out(O_ALTFP|O_VERB, NULL); 3864 return (FME_WAIT); 3865 } 3866 stats_counter_bump(fmep->Rcallcount); 3867 indent_push(" R"); 3868 indent(); 3869 out(O_ALTFP|O_VERB|O_NONL, "->"); 3870 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3871 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 3872 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3873 out(O_ALTFP|O_VERB, NULL); 3874 3875 if (ep->t == N_EREPORT) { 3876 if (ep->count == 0) { 3877 if (fmep->pull >= at_latest_by) { 3878 return_value = FME_DISPROVED; 3879 } else { 3880 ep->cached_delay = *pdelay = at_latest_by; 3881 return_value = FME_WAIT; 3882 } 3883 } 3884 3885 indent(); 3886 switch (return_value) { 3887 case FME_CREDIBLE: 3888 ep->cached_state |= REQMNTS_CREDIBLE; 3889 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE "); 3890 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3891 break; 3892 case FME_DISPROVED: 3893 ep->cached_state |= REQMNTS_DISPROVED; 3894 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 3895 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3896 break; 3897 case FME_WAIT: 3898 ep->cached_state |= REQMNTS_WAIT; 3899 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT "); 3900 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3901 out(O_ALTFP|O_VERB|O_NONL, " to "); 3902 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3903 break; 3904 default: 3905 out(O_DIE, "requirements_test: unexpected fme_state"); 3906 break; 3907 } 3908 out(O_ALTFP|O_VERB, NULL); 3909 indent_pop(); 3910 3911 return (return_value); 3912 } 3913 3914 /* this event is not a report, descend the tree */ 3915 for (bp = itree_next_bubble(ep, NULL); bp; 3916 bp = itree_next_bubble(ep, bp)) { 3917 int n; 3918 3919 if (bp->t != B_FROM) 3920 continue; 3921 3922 n = bp->nork; 3923 3924 credible_events = 0; 3925 waiting_events = 0; 3926 deferred_events = 0; 3927 arrow_delay = TIMEVAL_EVENTUALLY; 3928 /* 3929 * n is -1 for 'A' so adjust it. 3930 * XXX just count up the arrows for now. 3931 */ 3932 if (n < 0) { 3933 n = 0; 3934 for (ap = itree_next_arrow(bp, NULL); ap; 3935 ap = itree_next_arrow(bp, ap)) 3936 n++; 3937 indent(); 3938 out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n); 3939 } else { 3940 indent(); 3941 out(O_ALTFP|O_VERB, " Bubble N=%d", n); 3942 } 3943 3944 if (n == 0) 3945 continue; 3946 if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) { 3947 for (ap = itree_next_arrow(bp, NULL); ap; 3948 ap = itree_next_arrow(bp, ap)) { 3949 ep2 = ap->arrowp->head->myevent; 3950 platform_set_payloadnvp(ep2->nvp); 3951 (void) checkconstraints(fmep, ap->arrowp); 3952 if (!ap->arrowp->forever_false) { 3953 /* 3954 * if all arrows are invalidated by the 3955 * constraints, then we should elide the 3956 * whole bubble to be consistant with 3957 * the tree creation time behaviour 3958 */ 3959 bp->mark |= BUBBLE_OK; 3960 platform_set_payloadnvp(NULL); 3961 break; 3962 } 3963 platform_set_payloadnvp(NULL); 3964 } 3965 } 3966 for (ap = itree_next_arrow(bp, NULL); ap; 3967 ap = itree_next_arrow(bp, ap)) { 3968 ep2 = ap->arrowp->head->myevent; 3969 if (n <= credible_events) 3970 break; 3971 3972 ap->arrowp->mark |= REQMNTS_COUNTER; 3973 if (triggered(fmep, ep2, REQMNTS_COUNTER)) 3974 /* XXX adding max timevals! */ 3975 switch (requirements_test(fmep, ep2, 3976 at_latest_by + ap->arrowp->maxdelay, 3977 &my_delay)) { 3978 case FME_DEFERRED: 3979 deferred_events++; 3980 break; 3981 case FME_CREDIBLE: 3982 credible_events++; 3983 break; 3984 case FME_DISPROVED: 3985 break; 3986 case FME_WAIT: 3987 if (my_delay < arrow_delay) 3988 arrow_delay = my_delay; 3989 waiting_events++; 3990 break; 3991 default: 3992 out(O_DIE, 3993 "Bug in requirements_test."); 3994 } 3995 else 3996 deferred_events++; 3997 } 3998 if (!(bp->mark & BUBBLE_OK) && waiting_events == 0) { 3999 bp->mark |= BUBBLE_ELIDED; 4000 continue; 4001 } 4002 indent(); 4003 out(O_ALTFP|O_VERB, " Credible: %d Waiting %d", 4004 credible_events + deferred_events, waiting_events); 4005 if (credible_events + deferred_events + waiting_events < n) { 4006 /* Can never meet requirements */ 4007 ep->cached_state |= REQMNTS_DISPROVED; 4008 indent(); 4009 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 4010 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4011 out(O_ALTFP|O_VERB, NULL); 4012 indent_pop(); 4013 return (FME_DISPROVED); 4014 } 4015 if (credible_events + deferred_events < n) { 4016 /* will have to wait */ 4017 /* wait time is shortest known */ 4018 if (arrow_delay < overall_delay) 4019 overall_delay = arrow_delay; 4020 return_value = FME_WAIT; 4021 } else if (credible_events < n) { 4022 if (return_value != FME_WAIT) 4023 return_value = FME_DEFERRED; 4024 } 4025 } 4026 4027 /* 4028 * don't mark as FME_DEFERRED. If this event isn't reached by another 4029 * path, then this will be considered FME_CREDIBLE. But if it is 4030 * reached by a different path so the K-count is met, then might 4031 * get overridden by FME_WAIT or FME_DISPROVED. 4032 */ 4033 if (return_value == FME_WAIT) { 4034 ep->cached_state |= REQMNTS_WAIT; 4035 ep->cached_delay = *pdelay = overall_delay; 4036 } else if (return_value == FME_CREDIBLE) { 4037 ep->cached_state |= REQMNTS_CREDIBLE; 4038 } 4039 indent(); 4040 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ", 4041 fme_state2str(return_value)); 4042 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4043 out(O_ALTFP|O_VERB, NULL); 4044 indent_pop(); 4045 return (return_value); 4046 } 4047 4048 static enum fme_state 4049 causes_test(struct fme *fmep, struct event *ep, 4050 unsigned long long at_latest_by, unsigned long long *pdelay) 4051 { 4052 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 4053 unsigned long long my_delay; 4054 int credible_results = 0; 4055 int waiting_results = 0; 4056 enum fme_state fstate; 4057 struct event *tail_event; 4058 struct bubble *bp; 4059 struct arrowlist *ap; 4060 int k = 1; 4061 4062 stats_counter_bump(fmep->Ccallcount); 4063 indent_push(" C"); 4064 indent(); 4065 out(O_ALTFP|O_VERB|O_NONL, "->"); 4066 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4067 out(O_ALTFP|O_VERB, NULL); 4068 4069 for (bp = itree_next_bubble(ep, NULL); bp; 4070 bp = itree_next_bubble(ep, bp)) { 4071 if (bp->t != B_TO) 4072 continue; 4073 k = bp->nork; /* remember the K value */ 4074 for (ap = itree_next_arrow(bp, NULL); ap; 4075 ap = itree_next_arrow(bp, ap)) { 4076 int do_not_follow = 0; 4077 4078 /* 4079 * if we get to the same event multiple times 4080 * only worry about the first one. 4081 */ 4082 if (ap->arrowp->tail->myevent->cached_state & 4083 CAUSES_TESTED) { 4084 indent(); 4085 out(O_ALTFP|O_VERB|O_NONL, 4086 " causes test already run for "); 4087 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 4088 ap->arrowp->tail->myevent); 4089 out(O_ALTFP|O_VERB, NULL); 4090 continue; 4091 } 4092 4093 /* 4094 * see if false constraint prevents us 4095 * from traversing this arrow 4096 */ 4097 platform_set_payloadnvp(ep->nvp); 4098 if (checkconstraints(fmep, ap->arrowp) == 0) 4099 do_not_follow = 1; 4100 platform_set_payloadnvp(NULL); 4101 if (do_not_follow) { 4102 indent(); 4103 out(O_ALTFP|O_VERB|O_NONL, 4104 " False arrow from "); 4105 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 4106 ap->arrowp->tail->myevent); 4107 out(O_ALTFP|O_VERB, NULL); 4108 continue; 4109 } 4110 4111 ap->arrowp->tail->myevent->cached_state |= 4112 CAUSES_TESTED; 4113 tail_event = ap->arrowp->tail->myevent; 4114 fstate = hypothesise(fmep, tail_event, at_latest_by, 4115 &my_delay); 4116 4117 switch (fstate) { 4118 case FME_WAIT: 4119 if (my_delay < overall_delay) 4120 overall_delay = my_delay; 4121 waiting_results++; 4122 break; 4123 case FME_CREDIBLE: 4124 credible_results++; 4125 break; 4126 case FME_DISPROVED: 4127 break; 4128 default: 4129 out(O_DIE, "Bug in causes_test"); 4130 } 4131 } 4132 } 4133 /* compare against K */ 4134 if (credible_results + waiting_results < k) { 4135 indent(); 4136 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED "); 4137 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4138 out(O_ALTFP|O_VERB, NULL); 4139 indent_pop(); 4140 return (FME_DISPROVED); 4141 } 4142 if (waiting_results != 0) { 4143 *pdelay = overall_delay; 4144 indent(); 4145 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT "); 4146 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4147 out(O_ALTFP|O_VERB|O_NONL, " to "); 4148 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 4149 out(O_ALTFP|O_VERB, NULL); 4150 indent_pop(); 4151 return (FME_WAIT); 4152 } 4153 indent(); 4154 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES CREDIBLE "); 4155 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4156 out(O_ALTFP|O_VERB, NULL); 4157 indent_pop(); 4158 return (FME_CREDIBLE); 4159 } 4160 4161 static enum fme_state 4162 hypothesise(struct fme *fmep, struct event *ep, 4163 unsigned long long at_latest_by, unsigned long long *pdelay) 4164 { 4165 enum fme_state rtr, otr; 4166 unsigned long long my_delay; 4167 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 4168 4169 stats_counter_bump(fmep->Hcallcount); 4170 indent_push(" H"); 4171 indent(); 4172 out(O_ALTFP|O_VERB|O_NONL, "->"); 4173 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4174 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 4175 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 4176 out(O_ALTFP|O_VERB, NULL); 4177 4178 rtr = requirements_test(fmep, ep, at_latest_by, &my_delay); 4179 if ((rtr == FME_WAIT) && (my_delay < overall_delay)) 4180 overall_delay = my_delay; 4181 if (rtr != FME_DISPROVED) { 4182 if (is_problem(ep->t)) { 4183 otr = effects_test(fmep, ep, at_latest_by, &my_delay); 4184 if (otr != FME_DISPROVED) { 4185 if (fmep->peek == 0 && ep->is_suspect == 0) { 4186 ep->suspects = fmep->suspects; 4187 ep->is_suspect = 1; 4188 fmep->suspects = ep; 4189 fmep->nsuspects++; 4190 } 4191 } 4192 } else 4193 otr = causes_test(fmep, ep, at_latest_by, &my_delay); 4194 if ((otr == FME_WAIT) && (my_delay < overall_delay)) 4195 overall_delay = my_delay; 4196 if ((otr != FME_DISPROVED) && 4197 ((rtr == FME_WAIT) || (otr == FME_WAIT))) 4198 *pdelay = overall_delay; 4199 } 4200 if (rtr == FME_DISPROVED) { 4201 indent(); 4202 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4203 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4204 out(O_ALTFP|O_VERB, " (doesn't meet requirements)"); 4205 indent_pop(); 4206 return (FME_DISPROVED); 4207 } 4208 if ((otr == FME_DISPROVED) && is_problem(ep->t)) { 4209 indent(); 4210 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4211 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4212 out(O_ALTFP|O_VERB, " (doesn't explain all reports)"); 4213 indent_pop(); 4214 return (FME_DISPROVED); 4215 } 4216 if (otr == FME_DISPROVED) { 4217 indent(); 4218 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4219 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4220 out(O_ALTFP|O_VERB, " (causes are not credible)"); 4221 indent_pop(); 4222 return (FME_DISPROVED); 4223 } 4224 if ((rtr == FME_WAIT) || (otr == FME_WAIT)) { 4225 indent(); 4226 out(O_ALTFP|O_VERB|O_NONL, "<-WAIT "); 4227 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4228 out(O_ALTFP|O_VERB|O_NONL, " to "); 4229 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay); 4230 out(O_ALTFP|O_VERB, NULL); 4231 indent_pop(); 4232 return (FME_WAIT); 4233 } 4234 indent(); 4235 out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE "); 4236 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4237 out(O_ALTFP|O_VERB, NULL); 4238 indent_pop(); 4239 return (FME_CREDIBLE); 4240 } 4241 4242 /* 4243 * fme_istat_load -- reconstitute any persistent istats 4244 */ 4245 void 4246 fme_istat_load(fmd_hdl_t *hdl) 4247 { 4248 int sz; 4249 char *sbuf; 4250 char *ptr; 4251 4252 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) { 4253 out(O_ALTFP, "fme_istat_load: No stats"); 4254 return; 4255 } 4256 4257 sbuf = alloca(sz); 4258 4259 fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz); 4260 4261 /* 4262 * pick apart the serialized stats 4263 * 4264 * format is: 4265 * <class-name>, '@', <path>, '\0', <value>, '\0' 4266 * for example: 4267 * "stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0" 4268 * 4269 * since this is parsing our own serialized data, any parsing issues 4270 * are fatal, so we check for them all with ASSERT() below. 4271 */ 4272 ptr = sbuf; 4273 while (ptr < &sbuf[sz]) { 4274 char *sepptr; 4275 struct node *np; 4276 int val; 4277 4278 sepptr = strchr(ptr, '@'); 4279 ASSERT(sepptr != NULL); 4280 *sepptr = '\0'; 4281 4282 /* construct the event */ 4283 np = newnode(T_EVENT, NULL, 0); 4284 np->u.event.ename = newnode(T_NAME, NULL, 0); 4285 np->u.event.ename->u.name.t = N_STAT; 4286 np->u.event.ename->u.name.s = stable(ptr); 4287 np->u.event.ename->u.name.it = IT_ENAME; 4288 np->u.event.ename->u.name.last = np->u.event.ename; 4289 4290 ptr = sepptr + 1; 4291 ASSERT(ptr < &sbuf[sz]); 4292 ptr += strlen(ptr); 4293 ptr++; /* move past the '\0' separating path from value */ 4294 ASSERT(ptr < &sbuf[sz]); 4295 ASSERT(isdigit(*ptr)); 4296 val = atoi(ptr); 4297 ASSERT(val > 0); 4298 ptr += strlen(ptr); 4299 ptr++; /* move past the final '\0' for this entry */ 4300 4301 np->u.event.epname = pathstring2epnamenp(sepptr + 1); 4302 ASSERT(np->u.event.epname != NULL); 4303 4304 istat_bump(np, val); 4305 tree_free(np); 4306 } 4307 4308 istat_save(); 4309 } 4310