1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2012 Milan Jurik. All rights reserved. 25 * Copyright (c) 2018, Joyent, Inc. 26 * 27 * fme.c -- fault management exercise module 28 * 29 * this module provides the simulated fault management exercise. 30 */ 31 32 #include <stdio.h> 33 #include <stdlib.h> 34 #include <string.h> 35 #include <strings.h> 36 #include <ctype.h> 37 #include <alloca.h> 38 #include <libnvpair.h> 39 #include <sys/fm/protocol.h> 40 #include <fm/fmd_api.h> 41 #include "alloc.h" 42 #include "out.h" 43 #include "stats.h" 44 #include "stable.h" 45 #include "literals.h" 46 #include "lut.h" 47 #include "tree.h" 48 #include "ptree.h" 49 #include "itree.h" 50 #include "ipath.h" 51 #include "fme.h" 52 #include "evnv.h" 53 #include "eval.h" 54 #include "config.h" 55 #include "platform.h" 56 #include "esclex.h" 57 58 /* imported from eft.c... */ 59 extern hrtime_t Hesitate; 60 extern char *Serd_Override; 61 extern nv_alloc_t Eft_nv_hdl; 62 extern int Max_fme; 63 extern fmd_hdl_t *Hdl; 64 65 static int Istat_need_save; 66 static int Serd_need_save; 67 void istat_save(void); 68 void serd_save(void); 69 70 /* fme under construction is global so we can free it on module abort */ 71 static struct fme *Nfmep; 72 73 static int Undiag_reason = UD_VAL_UNKNOWN; 74 75 static int Nextid = 0; 76 77 static int Open_fme_count = 0; /* Count of open FMEs */ 78 79 /* list of fault management exercises underway */ 80 static struct fme { 81 struct fme *next; /* next exercise */ 82 unsigned long long ull; /* time when fme was created */ 83 int id; /* FME id */ 84 struct config *config; /* cooked configuration data */ 85 struct lut *eventtree; /* propagation tree for this FME */ 86 /* 87 * The initial error report that created this FME is kept in 88 * two forms. e0 points to the instance tree node and is used 89 * by fme_eval() as the starting point for the inference 90 * algorithm. e0r is the event handle FMD passed to us when 91 * the ereport first arrived and is used when setting timers, 92 * which are always relative to the time of this initial 93 * report. 94 */ 95 struct event *e0; 96 fmd_event_t *e0r; 97 98 id_t timer; /* for setting an fmd time-out */ 99 100 struct event *ecurrent; /* ereport under consideration */ 101 struct event *suspects; /* current suspect list */ 102 struct event *psuspects; /* previous suspect list */ 103 int nsuspects; /* count of suspects */ 104 int posted_suspects; /* true if we've posted a diagnosis */ 105 int uniqobs; /* number of unique events observed */ 106 int peek; /* just peeking, don't track suspects */ 107 int overflow; /* true if overflow FME */ 108 enum fme_state { 109 FME_NOTHING = 5000, /* not evaluated yet */ 110 FME_WAIT, /* need to wait for more info */ 111 FME_CREDIBLE, /* suspect list is credible */ 112 FME_DISPROVED, /* no valid suspects found */ 113 FME_DEFERRED /* don't know yet (k-count not met) */ 114 } state; 115 116 unsigned long long pull; /* time passed since created */ 117 unsigned long long wull; /* wait until this time for re-eval */ 118 struct event *observations; /* observation list */ 119 struct lut *globals; /* values of global variables */ 120 /* fmd interfacing */ 121 fmd_hdl_t *hdl; /* handle for talking with fmd */ 122 fmd_case_t *fmcase; /* what fmd 'case' we associate with */ 123 /* stats */ 124 struct stats *Rcount; 125 struct stats *Hcallcount; 126 struct stats *Rcallcount; 127 struct stats *Ccallcount; 128 struct stats *Ecallcount; 129 struct stats *Tcallcount; 130 struct stats *Marrowcount; 131 struct stats *diags; 132 } *FMElist, *EFMElist, *ClosedFMEs; 133 134 static struct case_list { 135 fmd_case_t *fmcase; 136 struct case_list *next; 137 } *Undiagablecaselist; 138 139 static void fme_eval(struct fme *fmep, fmd_event_t *ffep); 140 static enum fme_state hypothesise(struct fme *fmep, struct event *ep, 141 unsigned long long at_latest_by, unsigned long long *pdelay); 142 static struct node *eventprop_lookup(struct event *ep, const char *propname); 143 static struct node *pathstring2epnamenp(char *path); 144 static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, 145 fmd_case_t *fmcase, nvlist_t *detector, char *arg); 146 static char *undiag_2reason_str(int ud, char *arg); 147 static const char *undiag_2defect_str(int ud); 148 static void restore_suspects(struct fme *fmep); 149 static void save_suspects(struct fme *fmep); 150 static void destroy_fme(struct fme *f); 151 static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 152 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl); 153 static void istat_counter_reset_cb(struct istat_entry *entp, 154 struct stats *statp, const struct ipath *ipp); 155 static void istat_counter_topo_chg_cb(struct istat_entry *entp, 156 struct stats *statp, void *unused); 157 static void serd_reset_cb(struct serd_entry *entp, void *unused, 158 const struct ipath *ipp); 159 static void serd_topo_chg_cb(struct serd_entry *entp, void *unused, 160 void *unused2); 161 static void destroy_fme_bufs(struct fme *fp); 162 163 static struct fme * 164 alloc_fme(void) 165 { 166 struct fme *fmep; 167 168 fmep = MALLOC(sizeof (*fmep)); 169 bzero(fmep, sizeof (*fmep)); 170 return (fmep); 171 } 172 173 /* 174 * fme_ready -- called when all initialization of the FME (except for 175 * stats) has completed successfully. Adds the fme to global lists 176 * and establishes its stats. 177 */ 178 static struct fme * 179 fme_ready(struct fme *fmep) 180 { 181 char nbuf[100]; 182 183 Nfmep = NULL; /* don't need to free this on module abort now */ 184 185 if (EFMElist) { 186 EFMElist->next = fmep; 187 EFMElist = fmep; 188 } else 189 FMElist = EFMElist = fmep; 190 191 (void) sprintf(nbuf, "fme%d.Rcount", fmep->id); 192 fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0); 193 (void) sprintf(nbuf, "fme%d.Hcall", fmep->id); 194 fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1); 195 (void) sprintf(nbuf, "fme%d.Rcall", fmep->id); 196 fmep->Rcallcount = stats_new_counter(nbuf, 197 "calls to requirements_test()", 1); 198 (void) sprintf(nbuf, "fme%d.Ccall", fmep->id); 199 fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1); 200 (void) sprintf(nbuf, "fme%d.Ecall", fmep->id); 201 fmep->Ecallcount = 202 stats_new_counter(nbuf, "calls to effects_test()", 1); 203 (void) sprintf(nbuf, "fme%d.Tcall", fmep->id); 204 fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1); 205 (void) sprintf(nbuf, "fme%d.Marrow", fmep->id); 206 fmep->Marrowcount = stats_new_counter(nbuf, 207 "arrows marked by mark_arrows()", 1); 208 (void) sprintf(nbuf, "fme%d.diags", fmep->id); 209 fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0); 210 211 out(O_ALTFP|O_VERB2, "newfme: config snapshot contains..."); 212 config_print(O_ALTFP|O_VERB2, fmep->config); 213 214 return (fmep); 215 } 216 217 extern void ipath_dummy_lut(struct arrow *); 218 extern struct lut *itree_create_dummy(const char *, const struct ipath *); 219 220 /* ARGSUSED */ 221 static void 222 set_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 223 { 224 struct bubble *bp; 225 struct arrowlist *ap; 226 227 for (bp = itree_next_bubble(ep, NULL); bp; 228 bp = itree_next_bubble(ep, bp)) { 229 if (bp->t != B_FROM) 230 continue; 231 for (ap = itree_next_arrow(bp, NULL); ap; 232 ap = itree_next_arrow(bp, ap)) { 233 ap->arrowp->pnode->u.arrow.needed = 1; 234 ipath_dummy_lut(ap->arrowp); 235 } 236 } 237 } 238 239 /* ARGSUSED */ 240 static void 241 unset_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 242 { 243 struct bubble *bp; 244 struct arrowlist *ap; 245 246 for (bp = itree_next_bubble(ep, NULL); bp; 247 bp = itree_next_bubble(ep, bp)) { 248 if (bp->t != B_FROM) 249 continue; 250 for (ap = itree_next_arrow(bp, NULL); ap; 251 ap = itree_next_arrow(bp, ap)) 252 ap->arrowp->pnode->u.arrow.needed = 0; 253 } 254 } 255 256 static void globals_destructor(void *left, void *right, void *arg); 257 static void clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep); 258 259 static boolean_t 260 prune_propagations(const char *e0class, const struct ipath *e0ipp) 261 { 262 char nbuf[100]; 263 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 264 extern struct lut *Usednames; 265 266 Nfmep = alloc_fme(); 267 Nfmep->id = Nextid; 268 Nfmep->state = FME_NOTHING; 269 Nfmep->eventtree = itree_create_dummy(e0class, e0ipp); 270 if ((Nfmep->e0 = 271 itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) { 272 itree_free(Nfmep->eventtree); 273 FREE(Nfmep); 274 Nfmep = NULL; 275 return (B_FALSE); 276 } 277 Nfmep->ecurrent = Nfmep->observations = Nfmep->e0; 278 Nfmep->e0->count++; 279 280 (void) sprintf(nbuf, "fme%d.Rcount", Nfmep->id); 281 Nfmep->Rcount = stats_new_counter(nbuf, "ereports received", 0); 282 (void) sprintf(nbuf, "fme%d.Hcall", Nfmep->id); 283 Nfmep->Hcallcount = 284 stats_new_counter(nbuf, "calls to hypothesise()", 1); 285 (void) sprintf(nbuf, "fme%d.Rcall", Nfmep->id); 286 Nfmep->Rcallcount = stats_new_counter(nbuf, 287 "calls to requirements_test()", 1); 288 (void) sprintf(nbuf, "fme%d.Ccall", Nfmep->id); 289 Nfmep->Ccallcount = 290 stats_new_counter(nbuf, "calls to causes_test()", 1); 291 (void) sprintf(nbuf, "fme%d.Ecall", Nfmep->id); 292 Nfmep->Ecallcount = 293 stats_new_counter(nbuf, "calls to effects_test()", 1); 294 (void) sprintf(nbuf, "fme%d.Tcall", Nfmep->id); 295 Nfmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1); 296 (void) sprintf(nbuf, "fme%d.Marrow", Nfmep->id); 297 Nfmep->Marrowcount = stats_new_counter(nbuf, 298 "arrows marked by mark_arrows()", 1); 299 (void) sprintf(nbuf, "fme%d.diags", Nfmep->id); 300 Nfmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0); 301 302 Nfmep->peek = 1; 303 lut_walk(Nfmep->eventtree, (lut_cb)unset_needed_arrows, (void *)Nfmep); 304 lut_free(Usednames, NULL, NULL); 305 Usednames = NULL; 306 lut_walk(Nfmep->eventtree, (lut_cb)clear_arrows, (void *)Nfmep); 307 (void) hypothesise(Nfmep, Nfmep->e0, Nfmep->ull, &my_delay); 308 itree_prune(Nfmep->eventtree); 309 lut_walk(Nfmep->eventtree, (lut_cb)set_needed_arrows, (void *)Nfmep); 310 311 stats_delete(Nfmep->Rcount); 312 stats_delete(Nfmep->Hcallcount); 313 stats_delete(Nfmep->Rcallcount); 314 stats_delete(Nfmep->Ccallcount); 315 stats_delete(Nfmep->Ecallcount); 316 stats_delete(Nfmep->Tcallcount); 317 stats_delete(Nfmep->Marrowcount); 318 stats_delete(Nfmep->diags); 319 itree_free(Nfmep->eventtree); 320 lut_free(Nfmep->globals, globals_destructor, NULL); 321 FREE(Nfmep); 322 return (B_TRUE); 323 } 324 325 static struct fme * 326 newfme(const char *e0class, const struct ipath *e0ipp, fmd_hdl_t *hdl, 327 fmd_case_t *fmcase, fmd_event_t *ffep, nvlist_t *nvl) 328 { 329 struct cfgdata *cfgdata; 330 int init_size; 331 extern int alloc_total(); 332 nvlist_t *detector = NULL; 333 char *pathstr; 334 char *arg; 335 336 /* 337 * First check if e0ipp is actually in the topology so we can give a 338 * more useful error message. 339 */ 340 ipathlastcomp(e0ipp); 341 pathstr = ipath2str(NULL, e0ipp); 342 cfgdata = config_snapshot(); 343 platform_units_translate(0, cfgdata->cooked, NULL, NULL, 344 &detector, pathstr); 345 FREE(pathstr); 346 structconfig_free(cfgdata->cooked); 347 config_free(cfgdata); 348 if (detector == NULL) { 349 /* See if class permits silent discard on unknown component. */ 350 if (lut_lookup(Ereportenames_discard, (void *)e0class, NULL)) { 351 out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport " 352 "to component path, but silent discard allowed.", 353 e0class); 354 fmd_case_close(hdl, fmcase); 355 } else { 356 Undiag_reason = UD_VAL_BADEVENTPATH; 357 (void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, 358 &detector); 359 arg = ipath2str(e0class, e0ipp); 360 publish_undiagnosable(hdl, ffep, fmcase, detector, arg); 361 FREE(arg); 362 } 363 return (NULL); 364 } 365 366 /* 367 * Next run a quick first pass of the rules with a dummy config. This 368 * allows us to prune those rules which can't possibly cause this 369 * ereport. 370 */ 371 if (!prune_propagations(e0class, e0ipp)) { 372 /* 373 * The fault class must have been in the rules or we would 374 * not have registered for it (and got a "nosub"), and the 375 * pathname must be in the topology or we would have failed the 376 * previous test. So to get here means the combination of 377 * class and pathname in the ereport must be invalid. 378 */ 379 Undiag_reason = UD_VAL_BADEVENTCLASS; 380 arg = ipath2str(e0class, e0ipp); 381 publish_undiagnosable(hdl, ffep, fmcase, detector, arg); 382 nvlist_free(detector); 383 FREE(arg); 384 return (NULL); 385 } 386 387 /* 388 * Now go ahead and create the real fme using the pruned rules. 389 */ 390 init_size = alloc_total(); 391 out(O_ALTFP|O_STAMP, "start config_snapshot using %d bytes", init_size); 392 nvlist_free(detector); 393 pathstr = ipath2str(NULL, e0ipp); 394 cfgdata = config_snapshot(); 395 platform_units_translate(0, cfgdata->cooked, NULL, NULL, 396 &detector, pathstr); 397 FREE(pathstr); 398 platform_save_config(hdl, fmcase); 399 out(O_ALTFP|O_STAMP, "config_snapshot added %d bytes", 400 alloc_total() - init_size); 401 402 Nfmep = alloc_fme(); 403 404 Nfmep->id = Nextid++; 405 Nfmep->config = cfgdata->cooked; 406 config_free(cfgdata); 407 Nfmep->posted_suspects = 0; 408 Nfmep->uniqobs = 0; 409 Nfmep->state = FME_NOTHING; 410 Nfmep->pull = 0ULL; 411 Nfmep->overflow = 0; 412 413 Nfmep->fmcase = fmcase; 414 Nfmep->hdl = hdl; 415 416 if ((Nfmep->eventtree = itree_create(Nfmep->config)) == NULL) { 417 Undiag_reason = UD_VAL_INSTFAIL; 418 arg = ipath2str(e0class, e0ipp); 419 publish_undiagnosable(hdl, ffep, fmcase, detector, arg); 420 nvlist_free(detector); 421 FREE(arg); 422 structconfig_free(Nfmep->config); 423 destroy_fme_bufs(Nfmep); 424 FREE(Nfmep); 425 Nfmep = NULL; 426 return (NULL); 427 } 428 429 itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree); 430 431 if ((Nfmep->e0 = 432 itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) { 433 Undiag_reason = UD_VAL_BADEVENTI; 434 arg = ipath2str(e0class, e0ipp); 435 publish_undiagnosable(hdl, ffep, fmcase, detector, arg); 436 nvlist_free(detector); 437 FREE(arg); 438 itree_free(Nfmep->eventtree); 439 structconfig_free(Nfmep->config); 440 destroy_fme_bufs(Nfmep); 441 FREE(Nfmep); 442 Nfmep = NULL; 443 return (NULL); 444 } 445 446 nvlist_free(detector); 447 return (fme_ready(Nfmep)); 448 } 449 450 void 451 fme_fini(void) 452 { 453 struct fme *sfp, *fp; 454 struct case_list *ucasep, *nextcasep; 455 456 ucasep = Undiagablecaselist; 457 while (ucasep != NULL) { 458 nextcasep = ucasep->next; 459 FREE(ucasep); 460 ucasep = nextcasep; 461 } 462 Undiagablecaselist = NULL; 463 464 /* clean up closed fmes */ 465 fp = ClosedFMEs; 466 while (fp != NULL) { 467 sfp = fp->next; 468 destroy_fme(fp); 469 fp = sfp; 470 } 471 ClosedFMEs = NULL; 472 473 fp = FMElist; 474 while (fp != NULL) { 475 sfp = fp->next; 476 destroy_fme(fp); 477 fp = sfp; 478 } 479 FMElist = EFMElist = NULL; 480 481 /* if we were in the middle of creating an fme, free it now */ 482 if (Nfmep) { 483 destroy_fme(Nfmep); 484 Nfmep = NULL; 485 } 486 } 487 488 /* 489 * Allocated space for a buffer name. 20 bytes allows for 490 * a ridiculous 9,999,999 unique observations. 491 */ 492 #define OBBUFNMSZ 20 493 494 /* 495 * serialize_observation 496 * 497 * Create a recoverable version of the current observation 498 * (f->ecurrent). We keep a serialized version of each unique 499 * observation in order that we may resume correctly the fme in the 500 * correct state if eft or fmd crashes and we're restarted. 501 */ 502 static void 503 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp) 504 { 505 size_t pkdlen; 506 char tmpbuf[OBBUFNMSZ]; 507 char *pkd = NULL; 508 char *estr; 509 510 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs); 511 estr = ipath2str(cls, ipp); 512 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1); 513 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr, 514 strlen(estr) + 1); 515 FREE(estr); 516 517 if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) { 518 (void) snprintf(tmpbuf, 519 OBBUFNMSZ, "observed%d.nvp", fp->uniqobs); 520 if (nvlist_xpack(fp->ecurrent->nvp, 521 &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0) 522 out(O_DIE|O_SYS, "pack of observed nvl failed"); 523 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen); 524 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen); 525 FREE(pkd); 526 } 527 528 fp->uniqobs++; 529 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 530 sizeof (fp->uniqobs)); 531 } 532 533 /* 534 * init_fme_bufs -- We keep several bits of state about an fme for 535 * use if eft or fmd crashes and we're restarted. 536 */ 537 static void 538 init_fme_bufs(struct fme *fp) 539 { 540 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull)); 541 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull, 542 sizeof (fp->pull)); 543 544 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id)); 545 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id, 546 sizeof (fp->id)); 547 548 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs)); 549 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 550 sizeof (fp->uniqobs)); 551 552 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD, 553 sizeof (fp->posted_suspects)); 554 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD, 555 (void *)&fp->posted_suspects, sizeof (fp->posted_suspects)); 556 } 557 558 static void 559 destroy_fme_bufs(struct fme *fp) 560 { 561 char tmpbuf[OBBUFNMSZ]; 562 int o; 563 564 platform_restore_config(fp->hdl, fp->fmcase); 565 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN); 566 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG); 567 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL); 568 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID); 569 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD); 570 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS); 571 572 for (o = 0; o < fp->uniqobs; o++) { 573 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o); 574 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 575 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o); 576 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 577 } 578 } 579 580 /* 581 * reconstitute_observations -- convert a case's serialized observations 582 * back into struct events. Returns zero if all observations are 583 * successfully reconstituted. 584 */ 585 static int 586 reconstitute_observations(struct fme *fmep) 587 { 588 struct event *ep; 589 struct node *epnamenp = NULL; 590 size_t pkdlen; 591 char *pkd = NULL; 592 char *tmpbuf = alloca(OBBUFNMSZ); 593 char *sepptr; 594 char *estr; 595 int ocnt; 596 int elen; 597 598 for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) { 599 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt); 600 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 601 if (elen == 0) { 602 out(O_ALTFP, 603 "reconstitute_observation: no %s buffer found.", 604 tmpbuf); 605 Undiag_reason = UD_VAL_MISSINGOBS; 606 break; 607 } 608 609 estr = MALLOC(elen); 610 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 611 sepptr = strchr(estr, '@'); 612 if (sepptr == NULL) { 613 out(O_ALTFP, 614 "reconstitute_observation: %s: " 615 "missing @ separator in %s.", 616 tmpbuf, estr); 617 Undiag_reason = UD_VAL_MISSINGPATH; 618 FREE(estr); 619 break; 620 } 621 622 *sepptr = '\0'; 623 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 624 out(O_ALTFP, 625 "reconstitute_observation: %s: " 626 "trouble converting path string \"%s\" " 627 "to internal representation.", 628 tmpbuf, sepptr + 1); 629 Undiag_reason = UD_VAL_MISSINGPATH; 630 FREE(estr); 631 break; 632 } 633 634 /* construct the event */ 635 ep = itree_lookup(fmep->eventtree, 636 stable(estr), ipath(epnamenp)); 637 if (ep == NULL) { 638 out(O_ALTFP, 639 "reconstitute_observation: %s: " 640 "lookup of \"%s\" in itree failed.", 641 tmpbuf, ipath2str(estr, ipath(epnamenp))); 642 Undiag_reason = UD_VAL_BADOBS; 643 tree_free(epnamenp); 644 FREE(estr); 645 break; 646 } 647 tree_free(epnamenp); 648 649 /* 650 * We may or may not have a saved nvlist for the observation 651 */ 652 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt); 653 pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 654 if (pkdlen != 0) { 655 pkd = MALLOC(pkdlen); 656 fmd_buf_read(fmep->hdl, 657 fmep->fmcase, tmpbuf, pkd, pkdlen); 658 ASSERT(ep->nvp == NULL); 659 if (nvlist_xunpack(pkd, 660 pkdlen, &ep->nvp, &Eft_nv_hdl) != 0) 661 out(O_DIE|O_SYS, "pack of observed nvl failed"); 662 FREE(pkd); 663 } 664 665 if (ocnt == 0) 666 fmep->e0 = ep; 667 668 FREE(estr); 669 fmep->ecurrent = ep; 670 ep->count++; 671 672 /* link it into list of observations seen */ 673 ep->observations = fmep->observations; 674 fmep->observations = ep; 675 } 676 677 if (ocnt == fmep->uniqobs) { 678 (void) fme_ready(fmep); 679 return (0); 680 } 681 682 return (1); 683 } 684 685 /* 686 * restart_fme -- called during eft initialization. Reconstitutes 687 * an in-progress fme. 688 */ 689 void 690 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress) 691 { 692 nvlist_t *defect; 693 struct case_list *bad; 694 struct fme *fmep; 695 struct cfgdata *cfgdata; 696 size_t rawsz; 697 struct event *ep; 698 char *tmpbuf = alloca(OBBUFNMSZ); 699 char *sepptr; 700 char *estr; 701 int elen; 702 struct node *epnamenp = NULL; 703 int init_size; 704 extern int alloc_total(); 705 char *reason; 706 707 /* 708 * ignore solved or closed cases 709 */ 710 if (fmd_case_solved(hdl, inprogress) || 711 fmd_case_closed(hdl, inprogress)) 712 return; 713 714 fmep = alloc_fme(); 715 fmep->fmcase = inprogress; 716 fmep->hdl = hdl; 717 718 if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) { 719 out(O_ALTFP, "restart_fme: no saved posted status"); 720 Undiag_reason = UD_VAL_MISSINGINFO; 721 goto badcase; 722 } else { 723 fmd_buf_read(hdl, inprogress, WOBUF_POSTD, 724 (void *)&fmep->posted_suspects, 725 sizeof (fmep->posted_suspects)); 726 } 727 728 if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) { 729 out(O_ALTFP, "restart_fme: no saved id"); 730 Undiag_reason = UD_VAL_MISSINGINFO; 731 goto badcase; 732 } else { 733 fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id, 734 sizeof (fmep->id)); 735 } 736 if (Nextid <= fmep->id) 737 Nextid = fmep->id + 1; 738 739 out(O_ALTFP, "Replay FME %d", fmep->id); 740 741 if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) { 742 out(O_ALTFP, "restart_fme: No config data"); 743 Undiag_reason = UD_VAL_MISSINGINFO; 744 goto badcase; 745 } 746 fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz, 747 sizeof (size_t)); 748 749 if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) { 750 out(O_ALTFP, "restart_fme: No event zero"); 751 Undiag_reason = UD_VAL_MISSINGZERO; 752 goto badcase; 753 } 754 755 if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) { 756 out(O_ALTFP, "restart_fme: no saved wait time"); 757 Undiag_reason = UD_VAL_MISSINGINFO; 758 goto badcase; 759 } else { 760 fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull, 761 sizeof (fmep->pull)); 762 } 763 764 if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) { 765 out(O_ALTFP, "restart_fme: no count of observations"); 766 Undiag_reason = UD_VAL_MISSINGINFO; 767 goto badcase; 768 } else { 769 fmd_buf_read(hdl, inprogress, WOBUF_NOBS, 770 (void *)&fmep->uniqobs, sizeof (fmep->uniqobs)); 771 } 772 773 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed0"); 774 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 775 if (elen == 0) { 776 out(O_ALTFP, "reconstitute_observation: no %s buffer found.", 777 tmpbuf); 778 Undiag_reason = UD_VAL_MISSINGOBS; 779 goto badcase; 780 } 781 estr = MALLOC(elen); 782 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 783 sepptr = strchr(estr, '@'); 784 if (sepptr == NULL) { 785 out(O_ALTFP, "reconstitute_observation: %s: " 786 "missing @ separator in %s.", 787 tmpbuf, estr); 788 Undiag_reason = UD_VAL_MISSINGPATH; 789 FREE(estr); 790 goto badcase; 791 } 792 *sepptr = '\0'; 793 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 794 out(O_ALTFP, "reconstitute_observation: %s: " 795 "trouble converting path string \"%s\" " 796 "to internal representation.", tmpbuf, sepptr + 1); 797 Undiag_reason = UD_VAL_MISSINGPATH; 798 FREE(estr); 799 goto badcase; 800 } 801 (void) prune_propagations(stable(estr), ipath(epnamenp)); 802 tree_free(epnamenp); 803 FREE(estr); 804 805 init_size = alloc_total(); 806 out(O_ALTFP|O_STAMP, "start config_restore using %d bytes", init_size); 807 cfgdata = MALLOC(sizeof (struct cfgdata)); 808 cfgdata->cooked = NULL; 809 cfgdata->devcache = NULL; 810 cfgdata->devidcache = NULL; 811 cfgdata->tpcache = NULL; 812 cfgdata->cpucache = NULL; 813 cfgdata->raw_refcnt = 1; 814 815 if (rawsz > 0) { 816 if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) { 817 out(O_ALTFP, "restart_fme: Config data size mismatch"); 818 Undiag_reason = UD_VAL_CFGMISMATCH; 819 goto badcase; 820 } 821 cfgdata->begin = MALLOC(rawsz); 822 cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz; 823 fmd_buf_read(hdl, 824 inprogress, WOBUF_CFG, cfgdata->begin, rawsz); 825 } else { 826 cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL; 827 } 828 829 config_cook(cfgdata); 830 fmep->config = cfgdata->cooked; 831 config_free(cfgdata); 832 out(O_ALTFP|O_STAMP, "config_restore added %d bytes", 833 alloc_total() - init_size); 834 835 if ((fmep->eventtree = itree_create(fmep->config)) == NULL) { 836 /* case not properly saved or irretrievable */ 837 out(O_ALTFP, "restart_fme: NULL instance tree"); 838 Undiag_reason = UD_VAL_INSTFAIL; 839 goto badcase; 840 } 841 842 itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree); 843 844 if (reconstitute_observations(fmep) != 0) 845 goto badcase; 846 847 out(O_ALTFP|O_NONL, "FME %d replay observations: ", fmep->id); 848 for (ep = fmep->observations; ep; ep = ep->observations) { 849 out(O_ALTFP|O_NONL, " "); 850 itree_pevent_brief(O_ALTFP|O_NONL, ep); 851 } 852 out(O_ALTFP, NULL); 853 854 Open_fme_count++; 855 856 /* give the diagnosis algorithm a shot at the new FME state */ 857 fme_eval(fmep, fmep->e0r); 858 return; 859 860 badcase: 861 if (fmep->eventtree != NULL) 862 itree_free(fmep->eventtree); 863 if (fmep->config) 864 structconfig_free(fmep->config); 865 destroy_fme_bufs(fmep); 866 FREE(fmep); 867 868 /* 869 * Since we're unable to restart the case, add it to the undiagable 870 * list and solve and close it as appropriate. 871 */ 872 bad = MALLOC(sizeof (struct case_list)); 873 bad->next = NULL; 874 875 if (Undiagablecaselist != NULL) 876 bad->next = Undiagablecaselist; 877 Undiagablecaselist = bad; 878 bad->fmcase = inprogress; 879 880 out(O_ALTFP|O_NONL, "[case %s (unable to restart), ", 881 fmd_case_uuid(hdl, bad->fmcase)); 882 883 if (fmd_case_solved(hdl, bad->fmcase)) { 884 out(O_ALTFP|O_NONL, "already solved, "); 885 } else { 886 out(O_ALTFP|O_NONL, "solving, "); 887 defect = fmd_nvl_create_fault(hdl, 888 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 889 reason = undiag_2reason_str(Undiag_reason, NULL); 890 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 891 FREE(reason); 892 fmd_case_add_suspect(hdl, bad->fmcase, defect); 893 fmd_case_solve(hdl, bad->fmcase); 894 Undiag_reason = UD_VAL_UNKNOWN; 895 } 896 897 if (fmd_case_closed(hdl, bad->fmcase)) { 898 out(O_ALTFP, "already closed ]"); 899 } else { 900 out(O_ALTFP, "closing ]"); 901 fmd_case_close(hdl, bad->fmcase); 902 } 903 } 904 905 /*ARGSUSED*/ 906 static void 907 globals_destructor(void *left, void *right, void *arg) 908 { 909 struct evalue *evp = (struct evalue *)right; 910 if (evp->t == NODEPTR) 911 tree_free((struct node *)(uintptr_t)evp->v); 912 evp->v = (uintptr_t)NULL; 913 FREE(evp); 914 } 915 916 void 917 destroy_fme(struct fme *f) 918 { 919 stats_delete(f->Rcount); 920 stats_delete(f->Hcallcount); 921 stats_delete(f->Rcallcount); 922 stats_delete(f->Ccallcount); 923 stats_delete(f->Ecallcount); 924 stats_delete(f->Tcallcount); 925 stats_delete(f->Marrowcount); 926 stats_delete(f->diags); 927 928 if (f->eventtree != NULL) 929 itree_free(f->eventtree); 930 if (f->config) 931 structconfig_free(f->config); 932 lut_free(f->globals, globals_destructor, NULL); 933 FREE(f); 934 } 935 936 static const char * 937 fme_state2str(enum fme_state s) 938 { 939 switch (s) { 940 case FME_NOTHING: return ("NOTHING"); 941 case FME_WAIT: return ("WAIT"); 942 case FME_CREDIBLE: return ("CREDIBLE"); 943 case FME_DISPROVED: return ("DISPROVED"); 944 case FME_DEFERRED: return ("DEFERRED"); 945 default: return ("UNKNOWN"); 946 } 947 } 948 949 static int 950 is_problem(enum nametype t) 951 { 952 return (t == N_FAULT || t == N_DEFECT || t == N_UPSET); 953 } 954 955 static int 956 is_defect(enum nametype t) 957 { 958 return (t == N_DEFECT); 959 } 960 961 static int 962 is_upset(enum nametype t) 963 { 964 return (t == N_UPSET); 965 } 966 967 static void 968 fme_print(int flags, struct fme *fmep) 969 { 970 struct event *ep; 971 972 out(flags, "Fault Management Exercise %d", fmep->id); 973 out(flags, "\t State: %s", fme_state2str(fmep->state)); 974 out(flags|O_NONL, "\t Start time: "); 975 ptree_timeval(flags|O_NONL, &fmep->ull); 976 out(flags, NULL); 977 if (fmep->wull) { 978 out(flags|O_NONL, "\t Wait time: "); 979 ptree_timeval(flags|O_NONL, &fmep->wull); 980 out(flags, NULL); 981 } 982 out(flags|O_NONL, "\t E0: "); 983 if (fmep->e0) 984 itree_pevent_brief(flags|O_NONL, fmep->e0); 985 else 986 out(flags|O_NONL, "NULL"); 987 out(flags, NULL); 988 out(flags|O_NONL, "\tObservations:"); 989 for (ep = fmep->observations; ep; ep = ep->observations) { 990 out(flags|O_NONL, " "); 991 itree_pevent_brief(flags|O_NONL, ep); 992 } 993 out(flags, NULL); 994 out(flags|O_NONL, "\tSuspect list:"); 995 for (ep = fmep->suspects; ep; ep = ep->suspects) { 996 out(flags|O_NONL, " "); 997 itree_pevent_brief(flags|O_NONL, ep); 998 } 999 out(flags, NULL); 1000 if (fmep->eventtree != NULL) { 1001 out(flags|O_VERB2, "\t Tree:"); 1002 itree_ptree(flags|O_VERB2, fmep->eventtree); 1003 } 1004 } 1005 1006 static struct node * 1007 pathstring2epnamenp(char *path) 1008 { 1009 char *sep = "/"; 1010 struct node *ret; 1011 char *ptr; 1012 1013 if ((ptr = strtok(path, sep)) == NULL) 1014 out(O_DIE, "pathstring2epnamenp: invalid empty class"); 1015 1016 ret = tree_iname(stable(ptr), NULL, 0); 1017 1018 while ((ptr = strtok(NULL, sep)) != NULL) 1019 ret = tree_name_append(ret, 1020 tree_iname(stable(ptr), NULL, 0)); 1021 1022 return (ret); 1023 } 1024 1025 /* 1026 * for a given upset sp, increment the corresponding SERD engine. if the 1027 * SERD engine trips, return the ename and ipp of the resulting ereport. 1028 * returns true if engine tripped and *enamep and *ippp were filled in. 1029 */ 1030 static int 1031 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep, 1032 fmd_case_t *fmcase, struct event *sp, const char **enamep, 1033 const struct ipath **ippp) 1034 { 1035 struct node *serdinst; 1036 char *serdname; 1037 char *serdresource; 1038 char *serdclass; 1039 struct node *nid; 1040 struct serd_entry *newentp; 1041 int i, serdn = -1, serdincrement = 1, len = 0; 1042 char *serdsuffix = NULL, *serdt = NULL; 1043 struct evalue *ep; 1044 1045 ASSERT(sp->t == N_UPSET); 1046 ASSERT(ffep != NULL); 1047 1048 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1049 (void *)"n", (lut_cmp)strcmp)) != NULL) { 1050 ASSERT(ep->t == UINT64); 1051 serdn = (int)ep->v; 1052 } 1053 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1054 (void *)"t", (lut_cmp)strcmp)) != NULL) { 1055 ASSERT(ep->t == STRING); 1056 serdt = (char *)(uintptr_t)ep->v; 1057 } 1058 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1059 (void *)"suffix", (lut_cmp)strcmp)) != NULL) { 1060 ASSERT(ep->t == STRING); 1061 serdsuffix = (char *)(uintptr_t)ep->v; 1062 } 1063 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1064 (void *)"increment", (lut_cmp)strcmp)) != NULL) { 1065 ASSERT(ep->t == UINT64); 1066 serdincrement = (int)ep->v; 1067 } 1068 1069 /* 1070 * obtain instanced SERD engine from the upset sp. from this 1071 * derive serdname, the string used to identify the SERD engine. 1072 */ 1073 serdinst = eventprop_lookup(sp, L_engine); 1074 1075 if (serdinst == NULL) 1076 return (-1); 1077 1078 len = strlen(serdinst->u.stmt.np->u.event.ename->u.name.s) + 1; 1079 if (serdsuffix != NULL) 1080 len += strlen(serdsuffix); 1081 serdclass = MALLOC(len); 1082 if (serdsuffix != NULL) 1083 (void) snprintf(serdclass, len, "%s%s", 1084 serdinst->u.stmt.np->u.event.ename->u.name.s, serdsuffix); 1085 else 1086 (void) snprintf(serdclass, len, "%s", 1087 serdinst->u.stmt.np->u.event.ename->u.name.s); 1088 serdresource = ipath2str(NULL, 1089 ipath(serdinst->u.stmt.np->u.event.epname)); 1090 len += strlen(serdresource) + 1; 1091 serdname = MALLOC(len); 1092 (void) snprintf(serdname, len, "%s@%s", serdclass, serdresource); 1093 FREE(serdresource); 1094 1095 /* handle serd engine "id" property, if there is one */ 1096 if ((nid = 1097 lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) { 1098 struct evalue *gval; 1099 char suffixbuf[200]; 1100 char *suffix; 1101 char *nserdname; 1102 size_t nname; 1103 1104 out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname); 1105 ptree_name_iter(O_ALTFP|O_NONL, nid); 1106 1107 ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t)); 1108 1109 if ((gval = lut_lookup(fmep->globals, 1110 (void *)nid->u.globid.s, NULL)) == NULL) { 1111 out(O_ALTFP, " undefined"); 1112 } else if (gval->t == UINT64) { 1113 out(O_ALTFP, " %llu", gval->v); 1114 (void) sprintf(suffixbuf, "%llu", gval->v); 1115 suffix = suffixbuf; 1116 } else { 1117 out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v); 1118 suffix = (char *)(uintptr_t)gval->v; 1119 } 1120 1121 nname = strlen(serdname) + strlen(suffix) + 2; 1122 nserdname = MALLOC(nname); 1123 (void) snprintf(nserdname, nname, "%s:%s", serdname, suffix); 1124 FREE(serdname); 1125 serdname = nserdname; 1126 } 1127 1128 /* 1129 * if the engine is empty, and we have an override for n/t then 1130 * destroy and recreate it. 1131 */ 1132 if ((serdn != -1 || serdt != NULL) && fmd_serd_exists(hdl, serdname) && 1133 fmd_serd_empty(hdl, serdname)) 1134 fmd_serd_destroy(hdl, serdname); 1135 1136 if (!fmd_serd_exists(hdl, serdname)) { 1137 struct node *nN, *nT; 1138 const char *s; 1139 struct node *nodep; 1140 struct config *cp; 1141 char *path; 1142 uint_t nval; 1143 hrtime_t tval; 1144 int i; 1145 char *ptr; 1146 int got_n_override = 0, got_t_override = 0; 1147 1148 /* no SERD engine yet, so create it */ 1149 nodep = serdinst->u.stmt.np->u.event.epname; 1150 path = ipath2str(NULL, ipath(nodep)); 1151 cp = config_lookup(fmep->config, path, 0); 1152 FREE((void *)path); 1153 1154 /* 1155 * We allow serd paramaters to be overridden, either from 1156 * eft.conf file values (if Serd_Override is set) or from 1157 * driver properties (for "serd.io.device" engines). 1158 */ 1159 if (Serd_Override != NULL) { 1160 char *save_ptr, *ptr1, *ptr2, *ptr3; 1161 ptr3 = save_ptr = STRDUP(Serd_Override); 1162 while (*ptr3 != '\0') { 1163 ptr1 = strchr(ptr3, ','); 1164 *ptr1 = '\0'; 1165 if (strcmp(ptr3, serdclass) == 0) { 1166 ptr2 = strchr(ptr1 + 1, ','); 1167 *ptr2 = '\0'; 1168 nval = atoi(ptr1 + 1); 1169 out(O_ALTFP, "serd override %s_n %d", 1170 serdclass, nval); 1171 ptr3 = strchr(ptr2 + 1, ' '); 1172 if (ptr3) 1173 *ptr3 = '\0'; 1174 ptr = STRDUP(ptr2 + 1); 1175 out(O_ALTFP, "serd override %s_t %s", 1176 serdclass, ptr); 1177 got_n_override = 1; 1178 got_t_override = 1; 1179 break; 1180 } else { 1181 ptr2 = strchr(ptr1 + 1, ','); 1182 ptr3 = strchr(ptr2 + 1, ' '); 1183 if (ptr3 == NULL) 1184 break; 1185 } 1186 ptr3++; 1187 } 1188 FREE(save_ptr); 1189 } 1190 1191 if (cp && got_n_override == 0) { 1192 /* 1193 * convert serd engine class into property name 1194 */ 1195 char *prop_name = MALLOC(strlen(serdclass) + 3); 1196 for (i = 0; i < strlen(serdclass); i++) { 1197 if (serdclass[i] == '.') 1198 prop_name[i] = '_'; 1199 else 1200 prop_name[i] = serdclass[i]; 1201 } 1202 prop_name[i++] = '_'; 1203 prop_name[i++] = 'n'; 1204 prop_name[i] = '\0'; 1205 if (s = config_getprop(cp, prop_name)) { 1206 nval = atoi(s); 1207 out(O_ALTFP, "serd override %s_n %s", 1208 serdclass, s); 1209 got_n_override = 1; 1210 } 1211 prop_name[i - 1] = 't'; 1212 if (s = config_getprop(cp, prop_name)) { 1213 ptr = STRDUP(s); 1214 out(O_ALTFP, "serd override %s_t %s", 1215 serdclass, s); 1216 got_t_override = 1; 1217 } 1218 FREE(prop_name); 1219 } 1220 1221 if (serdn != -1 && got_n_override == 0) { 1222 nval = serdn; 1223 out(O_ALTFP, "serd override %s_n %d", serdclass, serdn); 1224 got_n_override = 1; 1225 } 1226 if (serdt != NULL && got_t_override == 0) { 1227 ptr = STRDUP(serdt); 1228 out(O_ALTFP, "serd override %s_t %s", serdclass, serdt); 1229 got_t_override = 1; 1230 } 1231 1232 if (!got_n_override) { 1233 nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N, 1234 NULL); 1235 ASSERT(nN->t == T_NUM); 1236 nval = (uint_t)nN->u.ull; 1237 } 1238 if (!got_t_override) { 1239 nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T, 1240 NULL); 1241 ASSERT(nT->t == T_TIMEVAL); 1242 tval = (hrtime_t)nT->u.ull; 1243 } else { 1244 const unsigned long long *ullp; 1245 const char *suffix; 1246 int len; 1247 1248 len = strspn(ptr, "0123456789"); 1249 suffix = stable(&ptr[len]); 1250 ullp = (unsigned long long *)lut_lookup(Timesuffixlut, 1251 (void *)suffix, NULL); 1252 ptr[len] = '\0'; 1253 tval = strtoull(ptr, NULL, 0) * (ullp ? *ullp : 1ll); 1254 FREE(ptr); 1255 } 1256 fmd_serd_create(hdl, serdname, nval, tval); 1257 } 1258 1259 newentp = MALLOC(sizeof (*newentp)); 1260 newentp->ename = stable(serdclass); 1261 FREE(serdclass); 1262 newentp->ipath = ipath(serdinst->u.stmt.np->u.event.epname); 1263 newentp->hdl = hdl; 1264 if (lut_lookup(SerdEngines, newentp, (lut_cmp)serd_cmp) == NULL) { 1265 SerdEngines = lut_add(SerdEngines, (void *)newentp, 1266 (void *)newentp, (lut_cmp)serd_cmp); 1267 Serd_need_save = 1; 1268 serd_save(); 1269 } else { 1270 FREE(newentp); 1271 } 1272 1273 1274 /* 1275 * increment SERD engine. if engine fires, reset serd 1276 * engine and return trip_strcode if required. 1277 */ 1278 for (i = 0; i < serdincrement; i++) { 1279 if (fmd_serd_record(hdl, serdname, ffep)) { 1280 fmd_case_add_serd(hdl, fmcase, serdname); 1281 fmd_serd_reset(hdl, serdname); 1282 1283 if (ippp) { 1284 struct node *tripinst = 1285 lut_lookup(serdinst->u.stmt.lutp, 1286 (void *)L_trip, NULL); 1287 ASSERT(tripinst != NULL); 1288 *enamep = tripinst->u.event.ename->u.name.s; 1289 *ippp = ipath(tripinst->u.event.epname); 1290 out(O_ALTFP|O_NONL, 1291 "[engine fired: %s, sending: ", serdname); 1292 ipath_print(O_ALTFP|O_NONL, *enamep, *ippp); 1293 out(O_ALTFP, "]"); 1294 } else { 1295 out(O_ALTFP, "[engine fired: %s, no trip]", 1296 serdname); 1297 } 1298 FREE(serdname); 1299 return (1); 1300 } 1301 } 1302 1303 FREE(serdname); 1304 return (0); 1305 } 1306 1307 /* 1308 * search a suspect list for upsets. feed each upset to serd_eval() and 1309 * build up tripped[], an array of ereports produced by the firing of 1310 * any SERD engines. then feed each ereport back into 1311 * fme_receive_report(). 1312 * 1313 * returns ntrip, the number of these ereports produced. 1314 */ 1315 static int 1316 upsets_eval(struct fme *fmep, fmd_event_t *ffep) 1317 { 1318 /* we build an array of tripped ereports that we send ourselves */ 1319 struct { 1320 const char *ename; 1321 const struct ipath *ipp; 1322 } *tripped; 1323 struct event *sp; 1324 int ntrip, nupset, i; 1325 1326 /* 1327 * count the number of upsets to determine the upper limit on 1328 * expected trip ereport strings. remember that one upset can 1329 * lead to at most one ereport. 1330 */ 1331 nupset = 0; 1332 for (sp = fmep->suspects; sp; sp = sp->suspects) { 1333 if (sp->t == N_UPSET) 1334 nupset++; 1335 } 1336 1337 if (nupset == 0) 1338 return (0); 1339 1340 /* 1341 * get to this point if we have upsets and expect some trip 1342 * ereports 1343 */ 1344 tripped = alloca(sizeof (*tripped) * nupset); 1345 bzero((void *)tripped, sizeof (*tripped) * nupset); 1346 1347 ntrip = 0; 1348 for (sp = fmep->suspects; sp; sp = sp->suspects) 1349 if (sp->t == N_UPSET && 1350 serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp, 1351 &tripped[ntrip].ename, &tripped[ntrip].ipp) == 1) 1352 ntrip++; 1353 1354 for (i = 0; i < ntrip; i++) { 1355 struct event *ep, *nep; 1356 struct fme *nfmep; 1357 fmd_case_t *fmcase; 1358 const struct ipath *ipp; 1359 const char *eventstring; 1360 int prev_verbose; 1361 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 1362 enum fme_state state; 1363 1364 /* 1365 * First try and evaluate a case with the trip ereport plus 1366 * all the other ereports that cause the trip. If that fails 1367 * to evaluate then try again with just this ereport on its own. 1368 */ 1369 out(O_ALTFP|O_NONL, "fme_receive_report_serd: "); 1370 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp); 1371 out(O_ALTFP|O_STAMP, NULL); 1372 ep = fmep->e0; 1373 eventstring = ep->enode->u.event.ename->u.name.s; 1374 ipp = ep->ipp; 1375 1376 /* 1377 * create a duplicate fme and case 1378 */ 1379 fmcase = fmd_case_open(fmep->hdl, NULL); 1380 out(O_ALTFP|O_NONL, "duplicate fme for event ["); 1381 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1382 out(O_ALTFP, " ]"); 1383 1384 if ((nfmep = newfme(eventstring, ipp, fmep->hdl, 1385 fmcase, ffep, ep->nvp)) == NULL) { 1386 out(O_ALTFP|O_NONL, "["); 1387 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1388 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1389 continue; 1390 } 1391 1392 Open_fme_count++; 1393 nfmep->pull = fmep->pull; 1394 init_fme_bufs(nfmep); 1395 out(O_ALTFP|O_NONL, "["); 1396 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1397 out(O_ALTFP, " created FME%d, case %s]", nfmep->id, 1398 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 1399 if (ffep) { 1400 fmd_case_setprincipal(nfmep->hdl, nfmep->fmcase, ffep); 1401 fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, ffep); 1402 nfmep->e0r = ffep; 1403 } 1404 1405 /* 1406 * add the original ereports 1407 */ 1408 for (ep = fmep->observations; ep; ep = ep->observations) { 1409 eventstring = ep->enode->u.event.ename->u.name.s; 1410 ipp = ep->ipp; 1411 out(O_ALTFP|O_NONL, "adding event ["); 1412 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1413 out(O_ALTFP, " ]"); 1414 nep = itree_lookup(nfmep->eventtree, eventstring, ipp); 1415 if (nep->count++ == 0) { 1416 nep->observations = nfmep->observations; 1417 nfmep->observations = nep; 1418 serialize_observation(nfmep, eventstring, ipp); 1419 nep->nvp = evnv_dupnvl(ep->nvp); 1420 } 1421 if (ep->ffep && ep->ffep != ffep) 1422 fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, 1423 ep->ffep); 1424 stats_counter_bump(nfmep->Rcount); 1425 } 1426 1427 /* 1428 * add the serd trigger ereport 1429 */ 1430 if ((ep = itree_lookup(nfmep->eventtree, tripped[i].ename, 1431 tripped[i].ipp)) == NULL) { 1432 /* 1433 * The trigger ereport is not in the instance tree. It 1434 * was presumably removed by prune_propagations() as 1435 * this combination of events is not present in the 1436 * rules. 1437 */ 1438 out(O_ALTFP, "upsets_eval: e0 not in instance tree"); 1439 Undiag_reason = UD_VAL_BADEVENTI; 1440 goto retry_lone_ereport; 1441 } 1442 out(O_ALTFP|O_NONL, "adding event ["); 1443 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp); 1444 out(O_ALTFP, " ]"); 1445 nfmep->ecurrent = ep; 1446 ep->nvp = NULL; 1447 ep->count = 1; 1448 ep->observations = nfmep->observations; 1449 nfmep->observations = ep; 1450 1451 /* 1452 * just peek first. 1453 */ 1454 nfmep->peek = 1; 1455 prev_verbose = Verbose; 1456 if (Debug == 0) 1457 Verbose = 0; 1458 lut_walk(nfmep->eventtree, (lut_cb)clear_arrows, (void *)nfmep); 1459 state = hypothesise(nfmep, nfmep->e0, nfmep->ull, &my_delay); 1460 nfmep->peek = 0; 1461 Verbose = prev_verbose; 1462 if (state == FME_DISPROVED) { 1463 out(O_ALTFP, "upsets_eval: hypothesis disproved"); 1464 Undiag_reason = UD_VAL_UNSOLVD; 1465 retry_lone_ereport: 1466 /* 1467 * However the trigger ereport on its own might be 1468 * diagnosable, so check for that. Undo the new fme 1469 * and case we just created and call fme_receive_report. 1470 */ 1471 out(O_ALTFP|O_NONL, "["); 1472 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, 1473 tripped[i].ipp); 1474 out(O_ALTFP, " retrying with just trigger ereport]"); 1475 itree_free(nfmep->eventtree); 1476 nfmep->eventtree = NULL; 1477 structconfig_free(nfmep->config); 1478 nfmep->config = NULL; 1479 destroy_fme_bufs(nfmep); 1480 fmd_case_close(nfmep->hdl, nfmep->fmcase); 1481 fme_receive_report(fmep->hdl, ffep, 1482 tripped[i].ename, tripped[i].ipp, NULL); 1483 continue; 1484 } 1485 1486 /* 1487 * and evaluate 1488 */ 1489 serialize_observation(nfmep, tripped[i].ename, tripped[i].ipp); 1490 fme_eval(nfmep, ffep); 1491 } 1492 1493 return (ntrip); 1494 } 1495 1496 /* 1497 * fme_receive_external_report -- call when an external ereport comes in 1498 * 1499 * this routine just converts the relevant information from the ereport 1500 * into a format used internally and passes it on to fme_receive_report(). 1501 */ 1502 void 1503 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 1504 const char *class) 1505 { 1506 struct node *epnamenp; 1507 fmd_case_t *fmcase; 1508 const struct ipath *ipp; 1509 nvlist_t *detector = NULL; 1510 1511 class = stable(class); 1512 1513 /* Get the component path from the ereport */ 1514 epnamenp = platform_getpath(nvl); 1515 1516 /* See if we ended up without a path. */ 1517 if (epnamenp == NULL) { 1518 /* See if class permits silent discard on unknown component. */ 1519 if (lut_lookup(Ereportenames_discard, (void *)class, NULL)) { 1520 out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport " 1521 "to component path, but silent discard allowed.", 1522 class); 1523 } else { 1524 /* 1525 * XFILE: Failure to find a component is bad unless 1526 * 'discard_if_config_unknown=1' was specified in the 1527 * ereport definition. Indicate undiagnosable. 1528 */ 1529 Undiag_reason = UD_VAL_NOPATH; 1530 fmcase = fmd_case_open(hdl, NULL); 1531 1532 /* 1533 * We don't have a component path here (which means that 1534 * the detector was not in hc-scheme and couldn't be 1535 * converted to hc-scheme. Report the raw detector as 1536 * the suspect resource if there is one. 1537 */ 1538 (void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, 1539 &detector); 1540 publish_undiagnosable(hdl, ffep, fmcase, detector, 1541 (char *)class); 1542 } 1543 return; 1544 } 1545 1546 ipp = ipath(epnamenp); 1547 tree_free(epnamenp); 1548 fme_receive_report(hdl, ffep, class, ipp, nvl); 1549 } 1550 1551 /*ARGSUSED*/ 1552 void 1553 fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 1554 const char *eventstring) 1555 { 1556 char *uuid; 1557 nvlist_t **nva; 1558 uint_t nvc; 1559 const struct ipath *ipp; 1560 1561 if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 || 1562 nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 1563 &nva, &nvc) != 0) { 1564 out(O_ALTFP, "No uuid or fault list for list.repaired event"); 1565 return; 1566 } 1567 1568 out(O_ALTFP, "Processing list.repaired from case %s", uuid); 1569 1570 while (nvc-- != 0) { 1571 /* 1572 * Reset any istat or serd engine associated with this path. 1573 */ 1574 char *path; 1575 1576 if ((ipp = platform_fault2ipath(*nva++)) == NULL) 1577 continue; 1578 1579 path = ipath2str(NULL, ipp); 1580 out(O_ALTFP, "fme_receive_repair_list: resetting state for %s", 1581 path); 1582 FREE(path); 1583 1584 lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp); 1585 istat_save(); 1586 1587 lut_walk(SerdEngines, (lut_cb)serd_reset_cb, (void *)ipp); 1588 serd_save(); 1589 } 1590 } 1591 1592 /*ARGSUSED*/ 1593 void 1594 fme_receive_topology_change(void) 1595 { 1596 lut_walk(Istats, (lut_cb)istat_counter_topo_chg_cb, NULL); 1597 istat_save(); 1598 1599 lut_walk(SerdEngines, (lut_cb)serd_topo_chg_cb, NULL); 1600 serd_save(); 1601 } 1602 1603 static int mark_arrows(struct fme *fmep, struct event *ep, int mark, 1604 unsigned long long at_latest_by, unsigned long long *pdelay, int keep); 1605 1606 /* ARGSUSED */ 1607 static void 1608 clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 1609 { 1610 struct bubble *bp; 1611 struct arrowlist *ap; 1612 1613 ep->cached_state = 0; 1614 ep->keep_in_tree = 0; 1615 for (bp = itree_next_bubble(ep, NULL); bp; 1616 bp = itree_next_bubble(ep, bp)) { 1617 if (bp->t != B_FROM) 1618 continue; 1619 bp->mark = 0; 1620 for (ap = itree_next_arrow(bp, NULL); ap; 1621 ap = itree_next_arrow(bp, ap)) 1622 ap->arrowp->mark = 0; 1623 } 1624 } 1625 1626 static void 1627 fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 1628 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl) 1629 { 1630 struct event *ep; 1631 struct fme *fmep = NULL; 1632 struct fme *ofmep = NULL; 1633 struct fme *cfmep, *svfmep; 1634 int matched = 0; 1635 nvlist_t *defect; 1636 fmd_case_t *fmcase; 1637 char *reason; 1638 1639 out(O_ALTFP|O_NONL, "fme_receive_report: "); 1640 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1641 out(O_ALTFP|O_STAMP, NULL); 1642 1643 /* decide which FME it goes to */ 1644 for (fmep = FMElist; fmep; fmep = fmep->next) { 1645 int prev_verbose; 1646 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 1647 enum fme_state state; 1648 nvlist_t *pre_peek_nvp = NULL; 1649 1650 if (fmep->overflow) { 1651 if (!(fmd_case_closed(fmep->hdl, fmep->fmcase))) 1652 ofmep = fmep; 1653 1654 continue; 1655 } 1656 1657 /* 1658 * ignore solved or closed cases 1659 */ 1660 if (fmep->posted_suspects || 1661 fmd_case_solved(fmep->hdl, fmep->fmcase) || 1662 fmd_case_closed(fmep->hdl, fmep->fmcase)) 1663 continue; 1664 1665 /* look up event in event tree for this FME */ 1666 if ((ep = itree_lookup(fmep->eventtree, 1667 eventstring, ipp)) == NULL) 1668 continue; 1669 1670 /* note observation */ 1671 fmep->ecurrent = ep; 1672 if (ep->count++ == 0) { 1673 /* link it into list of observations seen */ 1674 ep->observations = fmep->observations; 1675 fmep->observations = ep; 1676 ep->nvp = evnv_dupnvl(nvl); 1677 } else { 1678 /* use new payload values for peek */ 1679 pre_peek_nvp = ep->nvp; 1680 ep->nvp = evnv_dupnvl(nvl); 1681 } 1682 1683 /* tell hypothesise() not to mess with suspect list */ 1684 fmep->peek = 1; 1685 1686 /* don't want this to be verbose (unless Debug is set) */ 1687 prev_verbose = Verbose; 1688 if (Debug == 0) 1689 Verbose = 0; 1690 1691 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 1692 state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 1693 1694 fmep->peek = 0; 1695 1696 /* put verbose flag back */ 1697 Verbose = prev_verbose; 1698 1699 if (state != FME_DISPROVED) { 1700 /* found an FME that explains the ereport */ 1701 matched++; 1702 out(O_ALTFP|O_NONL, "["); 1703 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1704 out(O_ALTFP, " explained by FME%d]", fmep->id); 1705 1706 nvlist_free(pre_peek_nvp); 1707 1708 if (ep->count == 1) 1709 serialize_observation(fmep, eventstring, ipp); 1710 1711 if (ffep) { 1712 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1713 ep->ffep = ffep; 1714 } 1715 1716 stats_counter_bump(fmep->Rcount); 1717 1718 /* re-eval FME */ 1719 fme_eval(fmep, ffep); 1720 } else { 1721 1722 /* not a match, undo noting of observation */ 1723 fmep->ecurrent = NULL; 1724 if (--ep->count == 0) { 1725 /* unlink it from observations */ 1726 fmep->observations = ep->observations; 1727 ep->observations = NULL; 1728 nvlist_free(ep->nvp); 1729 ep->nvp = NULL; 1730 } else { 1731 nvlist_free(ep->nvp); 1732 ep->nvp = pre_peek_nvp; 1733 } 1734 } 1735 } 1736 1737 if (matched) 1738 return; /* explained by at least one existing FME */ 1739 1740 /* clean up closed fmes */ 1741 cfmep = ClosedFMEs; 1742 while (cfmep != NULL) { 1743 svfmep = cfmep->next; 1744 destroy_fme(cfmep); 1745 cfmep = svfmep; 1746 } 1747 ClosedFMEs = NULL; 1748 1749 if (ofmep) { 1750 out(O_ALTFP|O_NONL, "["); 1751 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1752 out(O_ALTFP, " ADDING TO OVERFLOW FME]"); 1753 if (ffep) 1754 fmd_case_add_ereport(hdl, ofmep->fmcase, ffep); 1755 1756 return; 1757 1758 } else if (Max_fme && (Open_fme_count >= Max_fme)) { 1759 out(O_ALTFP|O_NONL, "["); 1760 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1761 out(O_ALTFP, " MAX OPEN FME REACHED]"); 1762 1763 fmcase = fmd_case_open(hdl, NULL); 1764 1765 /* Create overflow fme */ 1766 if ((fmep = newfme(eventstring, ipp, hdl, fmcase, ffep, 1767 nvl)) == NULL) { 1768 out(O_ALTFP|O_NONL, "["); 1769 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1770 out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]"); 1771 return; 1772 } 1773 1774 Open_fme_count++; 1775 1776 init_fme_bufs(fmep); 1777 fmep->overflow = B_TRUE; 1778 1779 if (ffep) 1780 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1781 1782 Undiag_reason = UD_VAL_MAXFME; 1783 defect = fmd_nvl_create_fault(hdl, 1784 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 1785 reason = undiag_2reason_str(Undiag_reason, NULL); 1786 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 1787 FREE(reason); 1788 fmd_case_add_suspect(hdl, fmep->fmcase, defect); 1789 fmd_case_solve(hdl, fmep->fmcase); 1790 Undiag_reason = UD_VAL_UNKNOWN; 1791 return; 1792 } 1793 1794 /* open a case */ 1795 fmcase = fmd_case_open(hdl, NULL); 1796 1797 /* start a new FME */ 1798 if ((fmep = newfme(eventstring, ipp, hdl, fmcase, ffep, nvl)) == NULL) { 1799 out(O_ALTFP|O_NONL, "["); 1800 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1801 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1802 return; 1803 } 1804 1805 Open_fme_count++; 1806 1807 init_fme_bufs(fmep); 1808 1809 out(O_ALTFP|O_NONL, "["); 1810 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1811 out(O_ALTFP, " created FME%d, case %s]", fmep->id, 1812 fmd_case_uuid(hdl, fmep->fmcase)); 1813 1814 ep = fmep->e0; 1815 ASSERT(ep != NULL); 1816 1817 /* note observation */ 1818 fmep->ecurrent = ep; 1819 if (ep->count++ == 0) { 1820 /* link it into list of observations seen */ 1821 ep->observations = fmep->observations; 1822 fmep->observations = ep; 1823 ep->nvp = evnv_dupnvl(nvl); 1824 serialize_observation(fmep, eventstring, ipp); 1825 } else { 1826 /* new payload overrides any previous */ 1827 nvlist_free(ep->nvp); 1828 ep->nvp = evnv_dupnvl(nvl); 1829 } 1830 1831 stats_counter_bump(fmep->Rcount); 1832 1833 if (ffep) { 1834 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1835 fmd_case_setprincipal(hdl, fmep->fmcase, ffep); 1836 fmep->e0r = ffep; 1837 ep->ffep = ffep; 1838 } 1839 1840 /* give the diagnosis algorithm a shot at the new FME state */ 1841 fme_eval(fmep, ffep); 1842 } 1843 1844 void 1845 fme_status(int flags) 1846 { 1847 struct fme *fmep; 1848 1849 if (FMElist == NULL) { 1850 out(flags, "No fault management exercises underway."); 1851 return; 1852 } 1853 1854 for (fmep = FMElist; fmep; fmep = fmep->next) 1855 fme_print(flags, fmep); 1856 } 1857 1858 /* 1859 * "indent" routines used mostly for nicely formatted debug output, but also 1860 * for sanity checking for infinite recursion bugs. 1861 */ 1862 1863 #define MAX_INDENT 1024 1864 static const char *indent_s[MAX_INDENT]; 1865 static int current_indent; 1866 1867 static void 1868 indent_push(const char *s) 1869 { 1870 if (current_indent < MAX_INDENT) 1871 indent_s[current_indent++] = s; 1872 else 1873 out(O_DIE, "unexpected recursion depth (%d)", current_indent); 1874 } 1875 1876 static void 1877 indent_set(const char *s) 1878 { 1879 current_indent = 0; 1880 indent_push(s); 1881 } 1882 1883 static void 1884 indent_pop(void) 1885 { 1886 if (current_indent > 0) 1887 current_indent--; 1888 else 1889 out(O_DIE, "recursion underflow"); 1890 } 1891 1892 static void 1893 indent(void) 1894 { 1895 int i; 1896 if (!Verbose) 1897 return; 1898 for (i = 0; i < current_indent; i++) 1899 out(O_ALTFP|O_VERB|O_NONL, indent_s[i]); 1900 } 1901 1902 #define SLNEW 1 1903 #define SLCHANGED 2 1904 #define SLWAIT 3 1905 #define SLDISPROVED 4 1906 1907 static void 1908 print_suspects(int circumstance, struct fme *fmep) 1909 { 1910 struct event *ep; 1911 1912 out(O_ALTFP|O_NONL, "["); 1913 if (circumstance == SLCHANGED) { 1914 out(O_ALTFP|O_NONL, "FME%d diagnosis changed. state: %s, " 1915 "suspect list:", fmep->id, fme_state2str(fmep->state)); 1916 } else if (circumstance == SLWAIT) { 1917 out(O_ALTFP|O_NONL, "FME%d set wait timer %ld ", fmep->id, 1918 fmep->timer); 1919 ptree_timeval(O_ALTFP|O_NONL, &fmep->wull); 1920 } else if (circumstance == SLDISPROVED) { 1921 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id); 1922 } else { 1923 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id); 1924 } 1925 1926 if (circumstance == SLWAIT || circumstance == SLDISPROVED) { 1927 out(O_ALTFP, "]"); 1928 return; 1929 } 1930 1931 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1932 out(O_ALTFP|O_NONL, " "); 1933 itree_pevent_brief(O_ALTFP|O_NONL, ep); 1934 } 1935 out(O_ALTFP, "]"); 1936 } 1937 1938 static struct node * 1939 eventprop_lookup(struct event *ep, const char *propname) 1940 { 1941 return (lut_lookup(ep->props, (void *)propname, NULL)); 1942 } 1943 1944 #define MAXDIGITIDX 23 1945 static char numbuf[MAXDIGITIDX + 1]; 1946 1947 static int 1948 node2uint(struct node *n, uint_t *valp) 1949 { 1950 struct evalue value; 1951 struct lut *globals = NULL; 1952 1953 if (n == NULL) 1954 return (1); 1955 1956 /* 1957 * check value.v since we are being asked to convert an unsigned 1958 * long long int to an unsigned int 1959 */ 1960 if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) || 1961 value.t != UINT64 || value.v > (1ULL << 32)) 1962 return (1); 1963 1964 *valp = (uint_t)value.v; 1965 1966 return (0); 1967 } 1968 1969 static nvlist_t * 1970 node2fmri(struct node *n) 1971 { 1972 nvlist_t **pa, *f, *p; 1973 struct node *nc; 1974 uint_t depth = 0; 1975 char *numstr, *nullbyte; 1976 char *failure; 1977 int err, i; 1978 1979 /* XXX do we need to be able to handle a non-T_NAME node? */ 1980 if (n == NULL || n->t != T_NAME) 1981 return (NULL); 1982 1983 for (nc = n; nc != NULL; nc = nc->u.name.next) { 1984 if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM) 1985 break; 1986 depth++; 1987 } 1988 1989 if (nc != NULL) { 1990 /* We bailed early, something went wrong */ 1991 return (NULL); 1992 } 1993 1994 if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0) 1995 out(O_DIE|O_SYS, "alloc of fmri nvl failed"); 1996 pa = alloca(depth * sizeof (nvlist_t *)); 1997 for (i = 0; i < depth; i++) 1998 pa[i] = NULL; 1999 2000 err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC); 2001 err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION); 2002 err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, ""); 2003 err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth); 2004 if (err != 0) { 2005 failure = "basic construction of FMRI failed"; 2006 goto boom; 2007 } 2008 2009 numbuf[MAXDIGITIDX] = '\0'; 2010 nullbyte = &numbuf[MAXDIGITIDX]; 2011 i = 0; 2012 2013 for (nc = n; nc != NULL; nc = nc->u.name.next) { 2014 err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl); 2015 if (err != 0) { 2016 failure = "alloc of an hc-pair failed"; 2017 goto boom; 2018 } 2019 err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s); 2020 numstr = ulltostr(nc->u.name.child->u.ull, nullbyte); 2021 err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr); 2022 if (err != 0) { 2023 failure = "construction of an hc-pair failed"; 2024 goto boom; 2025 } 2026 pa[i++] = p; 2027 } 2028 2029 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 2030 if (err == 0) { 2031 for (i = 0; i < depth; i++) 2032 nvlist_free(pa[i]); 2033 return (f); 2034 } 2035 failure = "addition of hc-pair array to FMRI failed"; 2036 2037 boom: 2038 for (i = 0; i < depth; i++) 2039 nvlist_free(pa[i]); 2040 nvlist_free(f); 2041 out(O_DIE, "%s", failure); 2042 /*NOTREACHED*/ 2043 return (NULL); 2044 } 2045 2046 /* an ipath cache entry is an array of these, with s==NULL at the end */ 2047 struct ipath { 2048 const char *s; /* component name (in stable) */ 2049 int i; /* instance number */ 2050 }; 2051 2052 static nvlist_t * 2053 ipath2fmri(struct ipath *ipath) 2054 { 2055 nvlist_t **pa, *f, *p; 2056 uint_t depth = 0; 2057 char *numstr, *nullbyte; 2058 char *failure; 2059 int err, i; 2060 struct ipath *ipp; 2061 2062 for (ipp = ipath; ipp->s != NULL; ipp++) 2063 depth++; 2064 2065 if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0) 2066 out(O_DIE|O_SYS, "alloc of fmri nvl failed"); 2067 pa = alloca(depth * sizeof (nvlist_t *)); 2068 for (i = 0; i < depth; i++) 2069 pa[i] = NULL; 2070 2071 err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC); 2072 err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION); 2073 err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, ""); 2074 err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth); 2075 if (err != 0) { 2076 failure = "basic construction of FMRI failed"; 2077 goto boom; 2078 } 2079 2080 numbuf[MAXDIGITIDX] = '\0'; 2081 nullbyte = &numbuf[MAXDIGITIDX]; 2082 i = 0; 2083 2084 for (ipp = ipath; ipp->s != NULL; ipp++) { 2085 err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl); 2086 if (err != 0) { 2087 failure = "alloc of an hc-pair failed"; 2088 goto boom; 2089 } 2090 err = nvlist_add_string(p, FM_FMRI_HC_NAME, ipp->s); 2091 numstr = ulltostr(ipp->i, nullbyte); 2092 err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr); 2093 if (err != 0) { 2094 failure = "construction of an hc-pair failed"; 2095 goto boom; 2096 } 2097 pa[i++] = p; 2098 } 2099 2100 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 2101 if (err == 0) { 2102 for (i = 0; i < depth; i++) 2103 nvlist_free(pa[i]); 2104 return (f); 2105 } 2106 failure = "addition of hc-pair array to FMRI failed"; 2107 2108 boom: 2109 for (i = 0; i < depth; i++) 2110 nvlist_free(pa[i]); 2111 nvlist_free(f); 2112 out(O_DIE, "%s", failure); 2113 /*NOTREACHED*/ 2114 return (NULL); 2115 } 2116 2117 static uint8_t 2118 percentof(uint_t part, uint_t whole) 2119 { 2120 unsigned long long p = part * 1000; 2121 2122 return ((p / whole / 10) + (((p / whole % 10) >= 5) ? 1 : 0)); 2123 } 2124 2125 struct rsl { 2126 struct event *suspect; 2127 nvlist_t *asru; 2128 nvlist_t *fru; 2129 nvlist_t *rsrc; 2130 }; 2131 2132 static void publish_suspects(struct fme *fmep, struct rsl *srl); 2133 2134 /* 2135 * rslfree -- free internal members of struct rsl not expected to be 2136 * freed elsewhere. 2137 */ 2138 static void 2139 rslfree(struct rsl *freeme) 2140 { 2141 nvlist_free(freeme->asru); 2142 nvlist_free(freeme->fru); 2143 if (freeme->rsrc != freeme->asru) 2144 nvlist_free(freeme->rsrc); 2145 } 2146 2147 /* 2148 * rslcmp -- compare two rsl structures. Use the following 2149 * comparisons to establish cardinality: 2150 * 2151 * 1. Name of the suspect's class. (simple strcmp) 2152 * 2. Name of the suspect's ASRU. (trickier, since nvlist) 2153 * 2154 */ 2155 static int 2156 rslcmp(const void *a, const void *b) 2157 { 2158 struct rsl *r1 = (struct rsl *)a; 2159 struct rsl *r2 = (struct rsl *)b; 2160 int rv; 2161 2162 rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s, 2163 r2->suspect->enode->u.event.ename->u.name.s); 2164 if (rv != 0) 2165 return (rv); 2166 2167 if (r1->rsrc == NULL && r2->rsrc == NULL) 2168 return (0); 2169 if (r1->rsrc == NULL) 2170 return (-1); 2171 if (r2->rsrc == NULL) 2172 return (1); 2173 return (evnv_cmpnvl(r1->rsrc, r2->rsrc, 0)); 2174 } 2175 2176 /* 2177 * get_resources -- for a given suspect, determine what ASRU, FRU and 2178 * RSRC nvlists should be advertised in the final suspect list. 2179 */ 2180 void 2181 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot) 2182 { 2183 struct node *asrudef, *frudef; 2184 nvlist_t *asru, *fru; 2185 nvlist_t *rsrc = NULL; 2186 char *pathstr; 2187 2188 /* 2189 * First find any ASRU and/or FRU defined in the 2190 * initial fault tree. 2191 */ 2192 asrudef = eventprop_lookup(sp, L_ASRU); 2193 frudef = eventprop_lookup(sp, L_FRU); 2194 2195 /* 2196 * Create FMRIs based on those definitions 2197 */ 2198 asru = node2fmri(asrudef); 2199 fru = node2fmri(frudef); 2200 pathstr = ipath2str(NULL, sp->ipp); 2201 2202 /* 2203 * Allow for platform translations of the FMRIs 2204 */ 2205 platform_units_translate(is_defect(sp->t), croot, &asru, &fru, &rsrc, 2206 pathstr); 2207 2208 FREE(pathstr); 2209 rsrcs->suspect = sp; 2210 rsrcs->asru = asru; 2211 rsrcs->fru = fru; 2212 rsrcs->rsrc = rsrc; 2213 } 2214 2215 /* 2216 * trim_suspects -- prior to publishing, we may need to remove some 2217 * suspects from the list. If we're auto-closing upsets, we don't 2218 * want any of those in the published list. If the ASRUs for multiple 2219 * defects resolve to the same ASRU (driver) we only want to publish 2220 * that as a single suspect. 2221 */ 2222 static int 2223 trim_suspects(struct fme *fmep, struct rsl *begin, struct rsl *begin2, 2224 fmd_event_t *ffep) 2225 { 2226 struct event *ep; 2227 struct rsl *rp = begin; 2228 struct rsl *rp2 = begin2; 2229 int mess_zero_count = 0; 2230 int serd_rval; 2231 uint_t messval; 2232 2233 /* remove any unwanted upsets and populate our array */ 2234 for (ep = fmep->psuspects; ep; ep = ep->psuspects) { 2235 if (is_upset(ep->t)) 2236 continue; 2237 serd_rval = serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, ep, 2238 NULL, NULL); 2239 if (serd_rval == 0) 2240 continue; 2241 if (node2uint(eventprop_lookup(ep, L_message), 2242 &messval) == 0 && messval == 0) { 2243 get_resources(ep, rp2, fmep->config); 2244 rp2++; 2245 mess_zero_count++; 2246 } else { 2247 get_resources(ep, rp, fmep->config); 2248 rp++; 2249 fmep->nsuspects++; 2250 } 2251 } 2252 return (mess_zero_count); 2253 } 2254 2255 /* 2256 * addpayloadprop -- add a payload prop to a problem 2257 */ 2258 static void 2259 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault) 2260 { 2261 nvlist_t *rsrc, *hcs; 2262 2263 ASSERT(fault != NULL); 2264 ASSERT(lhs != NULL); 2265 ASSERT(rhs != NULL); 2266 2267 if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, &rsrc) != 0) 2268 out(O_DIE, "cannot add payloadprop \"%s\" to fault", lhs); 2269 2270 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) { 2271 out(O_ALTFP|O_VERB2, "addpayloadprop: create hc_specific"); 2272 if (nvlist_xalloc(&hcs, NV_UNIQUE_NAME, &Eft_nv_hdl) != 0) 2273 out(O_DIE, 2274 "cannot add payloadprop \"%s\" to fault", lhs); 2275 if (nvlist_add_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, hcs) != 0) 2276 out(O_DIE, 2277 "cannot add payloadprop \"%s\" to fault", lhs); 2278 nvlist_free(hcs); 2279 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) 2280 out(O_DIE, 2281 "cannot add payloadprop \"%s\" to fault", lhs); 2282 } else 2283 out(O_ALTFP|O_VERB2, "addpayloadprop: reuse hc_specific"); 2284 2285 if (rhs->t == UINT64) { 2286 out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v); 2287 2288 if (nvlist_add_uint64(hcs, lhs, rhs->v) != 0) 2289 out(O_DIE, 2290 "cannot add payloadprop \"%s\" to fault", lhs); 2291 } else { 2292 out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"", 2293 lhs, (char *)(uintptr_t)rhs->v); 2294 2295 if (nvlist_add_string(hcs, lhs, (char *)(uintptr_t)rhs->v) != 0) 2296 out(O_DIE, 2297 "cannot add payloadprop \"%s\" to fault", lhs); 2298 } 2299 } 2300 2301 static char *Istatbuf; 2302 static char *Istatbufptr; 2303 static int Istatsz; 2304 2305 /* 2306 * istataddsize -- calculate size of istat and add it to Istatsz 2307 */ 2308 /*ARGSUSED2*/ 2309 static void 2310 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2311 { 2312 int val; 2313 2314 ASSERT(lhs != NULL); 2315 ASSERT(rhs != NULL); 2316 2317 if ((val = stats_counter_value(rhs)) == 0) 2318 return; /* skip zero-valued stats */ 2319 2320 /* count up the size of the stat name */ 2321 Istatsz += ipath2strlen(lhs->ename, lhs->ipath); 2322 Istatsz++; /* for the trailing NULL byte */ 2323 2324 /* count up the size of the stat value */ 2325 Istatsz += snprintf(NULL, 0, "%d", val); 2326 Istatsz++; /* for the trailing NULL byte */ 2327 } 2328 2329 /* 2330 * istat2str -- serialize an istat, writing result to *Istatbufptr 2331 */ 2332 /*ARGSUSED2*/ 2333 static void 2334 istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2335 { 2336 char *str; 2337 int len; 2338 int val; 2339 2340 ASSERT(lhs != NULL); 2341 ASSERT(rhs != NULL); 2342 2343 if ((val = stats_counter_value(rhs)) == 0) 2344 return; /* skip zero-valued stats */ 2345 2346 /* serialize the stat name */ 2347 str = ipath2str(lhs->ename, lhs->ipath); 2348 len = strlen(str); 2349 2350 ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]); 2351 (void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr); 2352 Istatbufptr += len; 2353 FREE(str); 2354 *Istatbufptr++ = '\0'; 2355 2356 /* serialize the stat value */ 2357 Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr, 2358 "%d", val); 2359 *Istatbufptr++ = '\0'; 2360 2361 ASSERT(Istatbufptr <= &Istatbuf[Istatsz]); 2362 } 2363 2364 void 2365 istat_save() 2366 { 2367 if (Istat_need_save == 0) 2368 return; 2369 2370 /* figure out how big the serialzed info is */ 2371 Istatsz = 0; 2372 lut_walk(Istats, (lut_cb)istataddsize, NULL); 2373 2374 if (Istatsz == 0) { 2375 /* no stats to save */ 2376 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2377 return; 2378 } 2379 2380 /* create the serialized buffer */ 2381 Istatbufptr = Istatbuf = MALLOC(Istatsz); 2382 lut_walk(Istats, (lut_cb)istat2str, NULL); 2383 2384 /* clear out current saved stats */ 2385 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2386 2387 /* write out the new version */ 2388 fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz); 2389 FREE(Istatbuf); 2390 2391 Istat_need_save = 0; 2392 } 2393 2394 int 2395 istat_cmp(struct istat_entry *ent1, struct istat_entry *ent2) 2396 { 2397 if (ent1->ename != ent2->ename) 2398 return (ent2->ename - ent1->ename); 2399 if (ent1->ipath != ent2->ipath) 2400 return ((char *)ent2->ipath - (char *)ent1->ipath); 2401 2402 return (0); 2403 } 2404 2405 /* 2406 * istat-verify -- verify the component associated with a stat still exists 2407 * 2408 * if the component no longer exists, this routine resets the stat and 2409 * returns 0. if the component still exists, it returns 1. 2410 */ 2411 static int 2412 istat_verify(struct node *snp, struct istat_entry *entp) 2413 { 2414 struct stats *statp; 2415 nvlist_t *fmri; 2416 2417 fmri = node2fmri(snp->u.event.epname); 2418 if (platform_path_exists(fmri)) { 2419 nvlist_free(fmri); 2420 return (1); 2421 } 2422 nvlist_free(fmri); 2423 2424 /* component no longer in system. zero out the associated stats */ 2425 if ((statp = (struct stats *) 2426 lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL || 2427 stats_counter_value(statp) == 0) 2428 return (0); /* stat is already reset */ 2429 2430 Istat_need_save = 1; 2431 stats_counter_reset(statp); 2432 return (0); 2433 } 2434 2435 static void 2436 istat_bump(struct node *snp, int n) 2437 { 2438 struct stats *statp; 2439 struct istat_entry ent; 2440 2441 ASSERT(snp != NULL); 2442 ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t)); 2443 ASSERT(snp->u.event.epname != NULL); 2444 2445 /* class name should be hoisted into a single stable entry */ 2446 ASSERT(snp->u.event.ename->u.name.next == NULL); 2447 ent.ename = snp->u.event.ename->u.name.s; 2448 ent.ipath = ipath(snp->u.event.epname); 2449 2450 if (!istat_verify(snp, &ent)) { 2451 /* component no longer exists in system, nothing to do */ 2452 return; 2453 } 2454 2455 if ((statp = (struct stats *) 2456 lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) { 2457 /* need to create the counter */ 2458 int cnt = 0; 2459 struct node *np; 2460 char *sname; 2461 char *snamep; 2462 struct istat_entry *newentp; 2463 2464 /* count up the size of the stat name */ 2465 np = snp->u.event.ename; 2466 while (np != NULL) { 2467 cnt += strlen(np->u.name.s); 2468 cnt++; /* for the '.' or '@' */ 2469 np = np->u.name.next; 2470 } 2471 np = snp->u.event.epname; 2472 while (np != NULL) { 2473 cnt += snprintf(NULL, 0, "%s%llu", 2474 np->u.name.s, np->u.name.child->u.ull); 2475 cnt++; /* for the '/' or trailing NULL byte */ 2476 np = np->u.name.next; 2477 } 2478 2479 /* build the stat name */ 2480 snamep = sname = alloca(cnt); 2481 np = snp->u.event.ename; 2482 while (np != NULL) { 2483 snamep += snprintf(snamep, &sname[cnt] - snamep, 2484 "%s", np->u.name.s); 2485 np = np->u.name.next; 2486 if (np) 2487 *snamep++ = '.'; 2488 } 2489 *snamep++ = '@'; 2490 np = snp->u.event.epname; 2491 while (np != NULL) { 2492 snamep += snprintf(snamep, &sname[cnt] - snamep, 2493 "%s%llu", np->u.name.s, np->u.name.child->u.ull); 2494 np = np->u.name.next; 2495 if (np) 2496 *snamep++ = '/'; 2497 } 2498 *snamep++ = '\0'; 2499 2500 /* create the new stat & add it to our list */ 2501 newentp = MALLOC(sizeof (*newentp)); 2502 *newentp = ent; 2503 statp = stats_new_counter(NULL, sname, 0); 2504 Istats = lut_add(Istats, (void *)newentp, (void *)statp, 2505 (lut_cmp)istat_cmp); 2506 } 2507 2508 /* if n is non-zero, set that value instead of bumping */ 2509 if (n) { 2510 stats_counter_reset(statp); 2511 stats_counter_add(statp, n); 2512 } else 2513 stats_counter_bump(statp); 2514 Istat_need_save = 1; 2515 2516 ipath_print(O_ALTFP|O_VERB2, ent.ename, ent.ipath); 2517 out(O_ALTFP|O_VERB2, " %s to value %d", n ? "set" : "incremented", 2518 stats_counter_value(statp)); 2519 } 2520 2521 /*ARGSUSED*/ 2522 static void 2523 istat_destructor(void *left, void *right, void *arg) 2524 { 2525 struct istat_entry *entp = (struct istat_entry *)left; 2526 struct stats *statp = (struct stats *)right; 2527 FREE(entp); 2528 stats_delete(statp); 2529 } 2530 2531 /* 2532 * Callback used in a walk of the Istats to reset matching stat counters. 2533 */ 2534 static void 2535 istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp, 2536 const struct ipath *ipp) 2537 { 2538 char *path; 2539 2540 if (entp->ipath == ipp) { 2541 path = ipath2str(entp->ename, ipp); 2542 out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path); 2543 FREE(path); 2544 stats_counter_reset(statp); 2545 Istat_need_save = 1; 2546 } 2547 } 2548 2549 /*ARGSUSED*/ 2550 static void 2551 istat_counter_topo_chg_cb(struct istat_entry *entp, struct stats *statp, 2552 void *unused) 2553 { 2554 char *path; 2555 nvlist_t *fmri; 2556 2557 fmri = ipath2fmri((struct ipath *)(entp->ipath)); 2558 if (!platform_path_exists(fmri)) { 2559 path = ipath2str(entp->ename, entp->ipath); 2560 out(O_ALTFP, "istat_counter_topo_chg_cb: not present %s", path); 2561 FREE(path); 2562 stats_counter_reset(statp); 2563 Istat_need_save = 1; 2564 } 2565 nvlist_free(fmri); 2566 } 2567 2568 void 2569 istat_fini(void) 2570 { 2571 lut_free(Istats, istat_destructor, NULL); 2572 } 2573 2574 static char *Serdbuf; 2575 static char *Serdbufptr; 2576 static int Serdsz; 2577 2578 /* 2579 * serdaddsize -- calculate size of serd and add it to Serdsz 2580 */ 2581 /*ARGSUSED*/ 2582 static void 2583 serdaddsize(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2584 { 2585 ASSERT(lhs != NULL); 2586 2587 /* count up the size of the stat name */ 2588 Serdsz += ipath2strlen(lhs->ename, lhs->ipath); 2589 Serdsz++; /* for the trailing NULL byte */ 2590 } 2591 2592 /* 2593 * serd2str -- serialize a serd engine, writing result to *Serdbufptr 2594 */ 2595 /*ARGSUSED*/ 2596 static void 2597 serd2str(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2598 { 2599 char *str; 2600 int len; 2601 2602 ASSERT(lhs != NULL); 2603 2604 /* serialize the serd engine name */ 2605 str = ipath2str(lhs->ename, lhs->ipath); 2606 len = strlen(str); 2607 2608 ASSERT(Serdbufptr + len + 1 <= &Serdbuf[Serdsz]); 2609 (void) strlcpy(Serdbufptr, str, &Serdbuf[Serdsz] - Serdbufptr); 2610 Serdbufptr += len; 2611 FREE(str); 2612 *Serdbufptr++ = '\0'; 2613 ASSERT(Serdbufptr <= &Serdbuf[Serdsz]); 2614 } 2615 2616 void 2617 serd_save() 2618 { 2619 if (Serd_need_save == 0) 2620 return; 2621 2622 /* figure out how big the serialzed info is */ 2623 Serdsz = 0; 2624 lut_walk(SerdEngines, (lut_cb)serdaddsize, NULL); 2625 2626 if (Serdsz == 0) { 2627 /* no serd engines to save */ 2628 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2629 return; 2630 } 2631 2632 /* create the serialized buffer */ 2633 Serdbufptr = Serdbuf = MALLOC(Serdsz); 2634 lut_walk(SerdEngines, (lut_cb)serd2str, NULL); 2635 2636 /* clear out current saved stats */ 2637 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2638 2639 /* write out the new version */ 2640 fmd_buf_write(Hdl, NULL, WOBUF_SERDS, Serdbuf, Serdsz); 2641 FREE(Serdbuf); 2642 Serd_need_save = 0; 2643 } 2644 2645 int 2646 serd_cmp(struct serd_entry *ent1, struct serd_entry *ent2) 2647 { 2648 if (ent1->ename != ent2->ename) 2649 return (ent2->ename - ent1->ename); 2650 if (ent1->ipath != ent2->ipath) 2651 return ((char *)ent2->ipath - (char *)ent1->ipath); 2652 2653 return (0); 2654 } 2655 2656 void 2657 fme_serd_load(fmd_hdl_t *hdl) 2658 { 2659 int sz; 2660 char *sbuf; 2661 char *sepptr; 2662 char *ptr; 2663 struct serd_entry *newentp; 2664 struct node *epname; 2665 nvlist_t *fmri; 2666 char *namestring; 2667 2668 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_SERDS)) == 0) 2669 return; 2670 sbuf = alloca(sz); 2671 fmd_buf_read(hdl, NULL, WOBUF_SERDS, sbuf, sz); 2672 ptr = sbuf; 2673 while (ptr < &sbuf[sz]) { 2674 sepptr = strchr(ptr, '@'); 2675 *sepptr = '\0'; 2676 namestring = ptr; 2677 sepptr++; 2678 ptr = sepptr; 2679 ptr += strlen(ptr); 2680 ptr++; /* move past the '\0' separating paths */ 2681 epname = pathstring2epnamenp(sepptr); 2682 fmri = node2fmri(epname); 2683 if (platform_path_exists(fmri)) { 2684 newentp = MALLOC(sizeof (*newentp)); 2685 newentp->hdl = hdl; 2686 newentp->ipath = ipath(epname); 2687 newentp->ename = stable(namestring); 2688 SerdEngines = lut_add(SerdEngines, (void *)newentp, 2689 (void *)newentp, (lut_cmp)serd_cmp); 2690 } else 2691 Serd_need_save = 1; 2692 tree_free(epname); 2693 nvlist_free(fmri); 2694 } 2695 /* save it back again in case some of the paths no longer exist */ 2696 serd_save(); 2697 } 2698 2699 /*ARGSUSED*/ 2700 static void 2701 serd_destructor(void *left, void *right, void *arg) 2702 { 2703 struct serd_entry *entp = (struct serd_entry *)left; 2704 FREE(entp); 2705 } 2706 2707 /* 2708 * Callback used in a walk of the SerdEngines to reset matching serd engines. 2709 */ 2710 /*ARGSUSED*/ 2711 static void 2712 serd_reset_cb(struct serd_entry *entp, void *unused, const struct ipath *ipp) 2713 { 2714 char *path; 2715 2716 if (entp->ipath == ipp) { 2717 path = ipath2str(entp->ename, ipp); 2718 out(O_ALTFP, "serd_reset_cb: resetting %s", path); 2719 fmd_serd_reset(entp->hdl, path); 2720 FREE(path); 2721 Serd_need_save = 1; 2722 } 2723 } 2724 2725 /*ARGSUSED*/ 2726 static void 2727 serd_topo_chg_cb(struct serd_entry *entp, void *unused, void *unused2) 2728 { 2729 char *path; 2730 nvlist_t *fmri; 2731 2732 fmri = ipath2fmri((struct ipath *)(entp->ipath)); 2733 if (!platform_path_exists(fmri)) { 2734 path = ipath2str(entp->ename, entp->ipath); 2735 out(O_ALTFP, "serd_topo_chg_cb: not present %s", path); 2736 fmd_serd_reset(entp->hdl, path); 2737 FREE(path); 2738 Serd_need_save = 1; 2739 } 2740 nvlist_free(fmri); 2741 } 2742 2743 void 2744 serd_fini(void) 2745 { 2746 lut_free(SerdEngines, serd_destructor, NULL); 2747 } 2748 2749 static void 2750 publish_suspects(struct fme *fmep, struct rsl *srl) 2751 { 2752 struct rsl *rp; 2753 nvlist_t *fault; 2754 uint8_t cert; 2755 uint_t *frs; 2756 uint_t frsum, fr; 2757 uint_t messval; 2758 uint_t retireval; 2759 uint_t responseval; 2760 struct node *snp; 2761 int frcnt, fridx; 2762 boolean_t allfaulty = B_TRUE; 2763 struct rsl *erl = srl + fmep->nsuspects - 1; 2764 2765 /* 2766 * sort the array 2767 */ 2768 qsort(srl, fmep->nsuspects, sizeof (struct rsl), rslcmp); 2769 2770 /* sum the fitrates */ 2771 frs = alloca(fmep->nsuspects * sizeof (uint_t)); 2772 fridx = frcnt = frsum = 0; 2773 2774 for (rp = srl; rp <= erl; rp++) { 2775 struct node *n; 2776 2777 n = eventprop_lookup(rp->suspect, L_FITrate); 2778 if (node2uint(n, &fr) != 0) { 2779 out(O_DEBUG|O_NONL, "event "); 2780 ipath_print(O_DEBUG|O_NONL, 2781 rp->suspect->enode->u.event.ename->u.name.s, 2782 rp->suspect->ipp); 2783 out(O_VERB, " has no FITrate (using 1)"); 2784 fr = 1; 2785 } else if (fr == 0) { 2786 out(O_DEBUG|O_NONL, "event "); 2787 ipath_print(O_DEBUG|O_NONL, 2788 rp->suspect->enode->u.event.ename->u.name.s, 2789 rp->suspect->ipp); 2790 out(O_VERB, " has zero FITrate (using 1)"); 2791 fr = 1; 2792 } 2793 2794 frs[fridx++] = fr; 2795 frsum += fr; 2796 frcnt++; 2797 } 2798 2799 /* Add them in reverse order of our sort, as fmd reverses order */ 2800 for (rp = erl; rp >= srl; rp--) { 2801 cert = percentof(frs[--fridx], frsum); 2802 fault = fmd_nvl_create_fault(fmep->hdl, 2803 rp->suspect->enode->u.event.ename->u.name.s, 2804 cert, 2805 rp->asru, 2806 rp->fru, 2807 rp->rsrc); 2808 if (fault == NULL) 2809 out(O_DIE, "fault creation failed"); 2810 /* if "message" property exists, add it to the fault */ 2811 if (node2uint(eventprop_lookup(rp->suspect, L_message), 2812 &messval) == 0) { 2813 2814 out(O_ALTFP, 2815 "[FME%d, %s adds message=%d to suspect list]", 2816 fmep->id, 2817 rp->suspect->enode->u.event.ename->u.name.s, 2818 messval); 2819 if (nvlist_add_boolean_value(fault, 2820 FM_SUSPECT_MESSAGE, 2821 (messval) ? B_TRUE : B_FALSE) != 0) { 2822 out(O_DIE, "cannot add no-message to fault"); 2823 } 2824 } 2825 2826 /* if "retire" property exists, add it to the fault */ 2827 if (node2uint(eventprop_lookup(rp->suspect, L_retire), 2828 &retireval) == 0) { 2829 2830 out(O_ALTFP, 2831 "[FME%d, %s adds retire=%d to suspect list]", 2832 fmep->id, 2833 rp->suspect->enode->u.event.ename->u.name.s, 2834 retireval); 2835 if (nvlist_add_boolean_value(fault, 2836 FM_SUSPECT_RETIRE, 2837 (retireval) ? B_TRUE : B_FALSE) != 0) { 2838 out(O_DIE, "cannot add no-retire to fault"); 2839 } 2840 } 2841 2842 /* if "response" property exists, add it to the fault */ 2843 if (node2uint(eventprop_lookup(rp->suspect, L_response), 2844 &responseval) == 0) { 2845 2846 out(O_ALTFP, 2847 "[FME%d, %s adds response=%d to suspect list]", 2848 fmep->id, 2849 rp->suspect->enode->u.event.ename->u.name.s, 2850 responseval); 2851 if (nvlist_add_boolean_value(fault, 2852 FM_SUSPECT_RESPONSE, 2853 (responseval) ? B_TRUE : B_FALSE) != 0) { 2854 out(O_DIE, "cannot add no-response to fault"); 2855 } 2856 } 2857 2858 /* add any payload properties */ 2859 lut_walk(rp->suspect->payloadprops, 2860 (lut_cb)addpayloadprop, (void *)fault); 2861 rslfree(rp); 2862 2863 /* 2864 * If "action" property exists, evaluate it; this must be done 2865 * before the allfaulty check below since some actions may 2866 * modify the asru to be used in fmd_nvl_fmri_has_fault. This 2867 * needs to be restructured if any new actions are introduced 2868 * that have effects that we do not want to be visible if 2869 * we decide not to publish in the dupclose check below. 2870 */ 2871 if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) { 2872 struct evalue evalue; 2873 2874 out(O_ALTFP|O_NONL, 2875 "[FME%d, %s action ", fmep->id, 2876 rp->suspect->enode->u.event.ename->u.name.s); 2877 ptree_name_iter(O_ALTFP|O_NONL, snp); 2878 out(O_ALTFP, "]"); 2879 Action_nvl = fault; 2880 (void) eval_expr(snp, NULL, NULL, NULL, NULL, 2881 NULL, 0, &evalue); 2882 } 2883 2884 fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault); 2885 2886 /* 2887 * check if the asru is already marked as "faulty". 2888 */ 2889 if (allfaulty) { 2890 nvlist_t *asru; 2891 2892 out(O_ALTFP|O_VERB, "FME%d dup check ", fmep->id); 2893 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect); 2894 out(O_ALTFP|O_VERB|O_NONL, " "); 2895 if (nvlist_lookup_nvlist(fault, 2896 FM_FAULT_ASRU, &asru) != 0) { 2897 out(O_ALTFP|O_VERB, "NULL asru"); 2898 allfaulty = B_FALSE; 2899 } else if (fmd_nvl_fmri_has_fault(fmep->hdl, asru, 2900 FMD_HAS_FAULT_ASRU, NULL)) { 2901 out(O_ALTFP|O_VERB, "faulty"); 2902 } else { 2903 out(O_ALTFP|O_VERB, "not faulty"); 2904 allfaulty = B_FALSE; 2905 } 2906 } 2907 2908 } 2909 2910 if (!allfaulty) { 2911 /* 2912 * don't update the count stat if all asrus are already 2913 * present and unrepaired in the asru cache 2914 */ 2915 for (rp = erl; rp >= srl; rp--) { 2916 struct event *suspect = rp->suspect; 2917 2918 if (suspect == NULL) 2919 continue; 2920 2921 /* if "count" exists, increment the appropriate stat */ 2922 if ((snp = eventprop_lookup(suspect, 2923 L_count)) != NULL) { 2924 out(O_ALTFP|O_NONL, 2925 "[FME%d, %s count ", fmep->id, 2926 suspect->enode->u.event.ename->u.name.s); 2927 ptree_name_iter(O_ALTFP|O_NONL, snp); 2928 out(O_ALTFP, "]"); 2929 istat_bump(snp, 0); 2930 2931 } 2932 } 2933 istat_save(); /* write out any istat changes */ 2934 } 2935 } 2936 2937 static const char * 2938 undiag_2defect_str(int ud) 2939 { 2940 switch (ud) { 2941 case UD_VAL_MISSINGINFO: 2942 case UD_VAL_MISSINGOBS: 2943 case UD_VAL_MISSINGPATH: 2944 case UD_VAL_MISSINGZERO: 2945 case UD_VAL_BADOBS: 2946 case UD_VAL_CFGMISMATCH: 2947 return (UNDIAG_DEFECT_CHKPT); 2948 2949 case UD_VAL_BADEVENTI: 2950 case UD_VAL_BADEVENTPATH: 2951 case UD_VAL_BADEVENTCLASS: 2952 case UD_VAL_INSTFAIL: 2953 case UD_VAL_NOPATH: 2954 case UD_VAL_UNSOLVD: 2955 return (UNDIAG_DEFECT_FME); 2956 2957 case UD_VAL_MAXFME: 2958 return (UNDIAG_DEFECT_LIMIT); 2959 2960 case UD_VAL_UNKNOWN: 2961 default: 2962 return (UNDIAG_DEFECT_UNKNOWN); 2963 } 2964 } 2965 2966 static const char * 2967 undiag_2fault_str(int ud) 2968 { 2969 switch (ud) { 2970 case UD_VAL_BADEVENTI: 2971 case UD_VAL_BADEVENTPATH: 2972 case UD_VAL_BADEVENTCLASS: 2973 case UD_VAL_INSTFAIL: 2974 case UD_VAL_NOPATH: 2975 case UD_VAL_UNSOLVD: 2976 return (UNDIAG_FAULT_FME); 2977 default: 2978 return (NULL); 2979 } 2980 } 2981 2982 static char * 2983 undiag_2reason_str(int ud, char *arg) 2984 { 2985 const char *ptr; 2986 char *buf; 2987 int with_arg = 0; 2988 2989 switch (ud) { 2990 case UD_VAL_BADEVENTPATH: 2991 ptr = UD_STR_BADEVENTPATH; 2992 with_arg = 1; 2993 break; 2994 case UD_VAL_BADEVENTCLASS: 2995 ptr = UD_STR_BADEVENTCLASS; 2996 with_arg = 1; 2997 break; 2998 case UD_VAL_BADEVENTI: 2999 ptr = UD_STR_BADEVENTI; 3000 with_arg = 1; 3001 break; 3002 case UD_VAL_BADOBS: 3003 ptr = UD_STR_BADOBS; 3004 break; 3005 case UD_VAL_CFGMISMATCH: 3006 ptr = UD_STR_CFGMISMATCH; 3007 break; 3008 case UD_VAL_INSTFAIL: 3009 ptr = UD_STR_INSTFAIL; 3010 with_arg = 1; 3011 break; 3012 case UD_VAL_MAXFME: 3013 ptr = UD_STR_MAXFME; 3014 break; 3015 case UD_VAL_MISSINGINFO: 3016 ptr = UD_STR_MISSINGINFO; 3017 break; 3018 case UD_VAL_MISSINGOBS: 3019 ptr = UD_STR_MISSINGOBS; 3020 break; 3021 case UD_VAL_MISSINGPATH: 3022 ptr = UD_STR_MISSINGPATH; 3023 break; 3024 case UD_VAL_MISSINGZERO: 3025 ptr = UD_STR_MISSINGZERO; 3026 break; 3027 case UD_VAL_NOPATH: 3028 ptr = UD_STR_NOPATH; 3029 with_arg = 1; 3030 break; 3031 case UD_VAL_UNSOLVD: 3032 ptr = UD_STR_UNSOLVD; 3033 break; 3034 case UD_VAL_UNKNOWN: 3035 default: 3036 ptr = UD_STR_UNKNOWN; 3037 break; 3038 } 3039 if (with_arg) { 3040 buf = MALLOC(strlen(ptr) + strlen(arg) - 1); 3041 (void) sprintf(buf, ptr, arg); 3042 } else { 3043 buf = MALLOC(strlen(ptr) + 1); 3044 (void) sprintf(buf, ptr); 3045 } 3046 return (buf); 3047 } 3048 3049 static void 3050 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase, 3051 nvlist_t *detector, char *arg) 3052 { 3053 struct case_list *newcase; 3054 nvlist_t *defect, *fault; 3055 const char *faultstr; 3056 char *reason = undiag_2reason_str(Undiag_reason, arg); 3057 3058 out(O_ALTFP, 3059 "[undiagnosable ereport received, " 3060 "creating and closing a new case (%s)]", reason); 3061 3062 newcase = MALLOC(sizeof (struct case_list)); 3063 newcase->next = NULL; 3064 newcase->fmcase = fmcase; 3065 if (Undiagablecaselist != NULL) 3066 newcase->next = Undiagablecaselist; 3067 Undiagablecaselist = newcase; 3068 3069 if (ffep != NULL) 3070 fmd_case_add_ereport(hdl, newcase->fmcase, ffep); 3071 3072 /* add defect */ 3073 defect = fmd_nvl_create_fault(hdl, 3074 undiag_2defect_str(Undiag_reason), 50, NULL, NULL, detector); 3075 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 3076 (void) nvlist_add_boolean_value(defect, FM_SUSPECT_RETIRE, B_FALSE); 3077 (void) nvlist_add_boolean_value(defect, FM_SUSPECT_RESPONSE, B_FALSE); 3078 fmd_case_add_suspect(hdl, newcase->fmcase, defect); 3079 3080 /* add fault if appropriate */ 3081 faultstr = undiag_2fault_str(Undiag_reason); 3082 if (faultstr != NULL) { 3083 fault = fmd_nvl_create_fault(hdl, faultstr, 50, NULL, NULL, 3084 detector); 3085 (void) nvlist_add_string(fault, UNDIAG_REASON, reason); 3086 (void) nvlist_add_boolean_value(fault, FM_SUSPECT_RETIRE, 3087 B_FALSE); 3088 (void) nvlist_add_boolean_value(fault, FM_SUSPECT_RESPONSE, 3089 B_FALSE); 3090 fmd_case_add_suspect(hdl, newcase->fmcase, fault); 3091 } 3092 FREE(reason); 3093 3094 /* solve and close case */ 3095 fmd_case_solve(hdl, newcase->fmcase); 3096 fmd_case_close(hdl, newcase->fmcase); 3097 Undiag_reason = UD_VAL_UNKNOWN; 3098 } 3099 3100 static void 3101 fme_undiagnosable(struct fme *f) 3102 { 3103 nvlist_t *defect, *fault, *detector = NULL; 3104 struct event *ep; 3105 char *pathstr; 3106 const char *faultstr; 3107 char *reason = undiag_2reason_str(Undiag_reason, NULL); 3108 3109 out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]", 3110 f->id, fmd_case_uuid(f->hdl, f->fmcase), reason); 3111 3112 for (ep = f->observations; ep; ep = ep->observations) { 3113 3114 if (ep->ffep != f->e0r) 3115 fmd_case_add_ereport(f->hdl, f->fmcase, ep->ffep); 3116 3117 pathstr = ipath2str(NULL, ipath(platform_getpath(ep->nvp))); 3118 platform_units_translate(0, f->config, NULL, NULL, &detector, 3119 pathstr); 3120 FREE(pathstr); 3121 3122 /* add defect */ 3123 defect = fmd_nvl_create_fault(f->hdl, 3124 undiag_2defect_str(Undiag_reason), 50 / f->uniqobs, 3125 NULL, NULL, detector); 3126 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 3127 (void) nvlist_add_boolean_value(defect, FM_SUSPECT_RETIRE, 3128 B_FALSE); 3129 (void) nvlist_add_boolean_value(defect, FM_SUSPECT_RESPONSE, 3130 B_FALSE); 3131 fmd_case_add_suspect(f->hdl, f->fmcase, defect); 3132 3133 /* add fault if appropriate */ 3134 faultstr = undiag_2fault_str(Undiag_reason); 3135 if (faultstr == NULL) 3136 continue; 3137 fault = fmd_nvl_create_fault(f->hdl, faultstr, 50 / f->uniqobs, 3138 NULL, NULL, detector); 3139 (void) nvlist_add_string(fault, UNDIAG_REASON, reason); 3140 (void) nvlist_add_boolean_value(fault, FM_SUSPECT_RETIRE, 3141 B_FALSE); 3142 (void) nvlist_add_boolean_value(fault, FM_SUSPECT_RESPONSE, 3143 B_FALSE); 3144 fmd_case_add_suspect(f->hdl, f->fmcase, fault); 3145 nvlist_free(detector); 3146 } 3147 FREE(reason); 3148 fmd_case_solve(f->hdl, f->fmcase); 3149 fmd_case_close(f->hdl, f->fmcase); 3150 Undiag_reason = UD_VAL_UNKNOWN; 3151 } 3152 3153 /* 3154 * fme_close_case 3155 * 3156 * Find the requested case amongst our fmes and close it. Free up 3157 * the related fme. 3158 */ 3159 void 3160 fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase) 3161 { 3162 struct case_list *ucasep, *prevcasep = NULL; 3163 struct fme *prev = NULL; 3164 struct fme *fmep; 3165 3166 for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) { 3167 if (fmcase != ucasep->fmcase) { 3168 prevcasep = ucasep; 3169 continue; 3170 } 3171 3172 if (prevcasep == NULL) 3173 Undiagablecaselist = Undiagablecaselist->next; 3174 else 3175 prevcasep->next = ucasep->next; 3176 3177 FREE(ucasep); 3178 return; 3179 } 3180 3181 for (fmep = FMElist; fmep; fmep = fmep->next) { 3182 if (fmep->hdl == hdl && fmep->fmcase == fmcase) 3183 break; 3184 prev = fmep; 3185 } 3186 3187 if (fmep == NULL) { 3188 out(O_WARN, "Eft asked to close unrecognized case [%s].", 3189 fmd_case_uuid(hdl, fmcase)); 3190 return; 3191 } 3192 3193 if (EFMElist == fmep) 3194 EFMElist = prev; 3195 3196 if (prev == NULL) 3197 FMElist = FMElist->next; 3198 else 3199 prev->next = fmep->next; 3200 3201 fmep->next = NULL; 3202 3203 /* Get rid of any timer this fme has set */ 3204 if (fmep->wull != 0) 3205 fmd_timer_remove(fmep->hdl, fmep->timer); 3206 3207 if (ClosedFMEs == NULL) { 3208 ClosedFMEs = fmep; 3209 } else { 3210 fmep->next = ClosedFMEs; 3211 ClosedFMEs = fmep; 3212 } 3213 3214 Open_fme_count--; 3215 3216 /* See if we can close the overflow FME */ 3217 if (Open_fme_count <= Max_fme) { 3218 for (fmep = FMElist; fmep; fmep = fmep->next) { 3219 if (fmep->overflow && !(fmd_case_closed(fmep->hdl, 3220 fmep->fmcase))) 3221 break; 3222 } 3223 3224 if (fmep != NULL) 3225 fmd_case_close(fmep->hdl, fmep->fmcase); 3226 } 3227 } 3228 3229 /* 3230 * fme_set_timer() 3231 * If the time we need to wait for the given FME is less than the 3232 * current timer, kick that old timer out and establish a new one. 3233 */ 3234 static int 3235 fme_set_timer(struct fme *fmep, unsigned long long wull) 3236 { 3237 out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait "); 3238 ptree_timeval(O_ALTFP|O_VERB, &wull); 3239 3240 if (wull <= fmep->pull) { 3241 out(O_ALTFP|O_VERB|O_NONL, "already have waited at least "); 3242 ptree_timeval(O_ALTFP|O_VERB, &fmep->pull); 3243 out(O_ALTFP|O_VERB, NULL); 3244 /* we've waited at least wull already, don't need timer */ 3245 return (0); 3246 } 3247 3248 out(O_ALTFP|O_VERB|O_NONL, " currently "); 3249 if (fmep->wull != 0) { 3250 out(O_ALTFP|O_VERB|O_NONL, "waiting "); 3251 ptree_timeval(O_ALTFP|O_VERB, &fmep->wull); 3252 out(O_ALTFP|O_VERB, NULL); 3253 } else { 3254 out(O_ALTFP|O_VERB|O_NONL, "not waiting"); 3255 out(O_ALTFP|O_VERB, NULL); 3256 } 3257 3258 if (fmep->wull != 0) 3259 if (wull >= fmep->wull) 3260 /* New timer would fire later than established timer */ 3261 return (0); 3262 3263 if (fmep->wull != 0) { 3264 fmd_timer_remove(fmep->hdl, fmep->timer); 3265 } 3266 3267 fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep, 3268 fmep->e0r, wull); 3269 out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer); 3270 fmep->wull = wull; 3271 return (1); 3272 } 3273 3274 void 3275 fme_timer_fired(struct fme *fmep, id_t tid) 3276 { 3277 struct fme *ffmep = NULL; 3278 3279 for (ffmep = FMElist; ffmep; ffmep = ffmep->next) 3280 if (ffmep == fmep) 3281 break; 3282 3283 if (ffmep == NULL) { 3284 out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.", 3285 (void *)fmep); 3286 return; 3287 } 3288 3289 out(O_ALTFP|O_VERB, "Timer fired %lx", tid); 3290 fmep->pull = fmep->wull; 3291 fmep->wull = 0; 3292 fmd_buf_write(fmep->hdl, fmep->fmcase, 3293 WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull)); 3294 3295 fme_eval(fmep, fmep->e0r); 3296 } 3297 3298 /* 3299 * Preserve the fme's suspect list in its psuspects list, NULLing the 3300 * suspects list in the meantime. 3301 */ 3302 static void 3303 save_suspects(struct fme *fmep) 3304 { 3305 struct event *ep; 3306 struct event *nextep; 3307 3308 /* zero out the previous suspect list */ 3309 for (ep = fmep->psuspects; ep; ep = nextep) { 3310 nextep = ep->psuspects; 3311 ep->psuspects = NULL; 3312 } 3313 fmep->psuspects = NULL; 3314 3315 /* zero out the suspect list, copying it to previous suspect list */ 3316 fmep->psuspects = fmep->suspects; 3317 for (ep = fmep->suspects; ep; ep = nextep) { 3318 nextep = ep->suspects; 3319 ep->psuspects = ep->suspects; 3320 ep->suspects = NULL; 3321 ep->is_suspect = 0; 3322 } 3323 fmep->suspects = NULL; 3324 fmep->nsuspects = 0; 3325 } 3326 3327 /* 3328 * Retrieve the fme's suspect list from its psuspects list. 3329 */ 3330 static void 3331 restore_suspects(struct fme *fmep) 3332 { 3333 struct event *ep; 3334 struct event *nextep; 3335 3336 fmep->nsuspects = 0; 3337 fmep->suspects = fmep->psuspects; 3338 for (ep = fmep->psuspects; ep; ep = nextep) { 3339 fmep->nsuspects++; 3340 nextep = ep->psuspects; 3341 ep->suspects = ep->psuspects; 3342 } 3343 } 3344 3345 /* 3346 * this is what we use to call the Emrys prototype code instead of main() 3347 */ 3348 static void 3349 fme_eval(struct fme *fmep, fmd_event_t *ffep) 3350 { 3351 struct event *ep; 3352 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 3353 struct rsl *srl = NULL; 3354 struct rsl *srl2 = NULL; 3355 int mess_zero_count; 3356 int rpcnt; 3357 3358 save_suspects(fmep); 3359 3360 out(O_ALTFP, "Evaluate FME %d", fmep->id); 3361 indent_set(" "); 3362 3363 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 3364 fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 3365 3366 out(O_ALTFP|O_NONL, "FME%d state: %s, suspect list:", fmep->id, 3367 fme_state2str(fmep->state)); 3368 for (ep = fmep->suspects; ep; ep = ep->suspects) { 3369 out(O_ALTFP|O_NONL, " "); 3370 itree_pevent_brief(O_ALTFP|O_NONL, ep); 3371 } 3372 out(O_ALTFP, NULL); 3373 3374 switch (fmep->state) { 3375 case FME_CREDIBLE: 3376 print_suspects(SLNEW, fmep); 3377 (void) upsets_eval(fmep, ffep); 3378 3379 /* 3380 * we may have already posted suspects in upsets_eval() which 3381 * can recurse into fme_eval() again. If so then just return. 3382 */ 3383 if (fmep->posted_suspects) 3384 return; 3385 3386 stats_counter_bump(fmep->diags); 3387 rpcnt = fmep->nsuspects; 3388 save_suspects(fmep); 3389 3390 /* 3391 * create two lists, one for "message=1" faults and one for 3392 * "message=0" faults. If we have a mixture we will generate 3393 * two separate suspect lists. 3394 */ 3395 srl = MALLOC(rpcnt * sizeof (struct rsl)); 3396 bzero(srl, rpcnt * sizeof (struct rsl)); 3397 srl2 = MALLOC(rpcnt * sizeof (struct rsl)); 3398 bzero(srl2, rpcnt * sizeof (struct rsl)); 3399 mess_zero_count = trim_suspects(fmep, srl, srl2, ffep); 3400 3401 /* 3402 * If the resulting suspect list has no members, we're 3403 * done so simply close the case. Otherwise sort and publish. 3404 */ 3405 if (fmep->nsuspects == 0 && mess_zero_count == 0) { 3406 out(O_ALTFP, 3407 "[FME%d, case %s (all suspects are upsets)]", 3408 fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3409 fmd_case_close(fmep->hdl, fmep->fmcase); 3410 } else if (fmep->nsuspects != 0 && mess_zero_count == 0) { 3411 publish_suspects(fmep, srl); 3412 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3413 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3414 fmd_case_solve(fmep->hdl, fmep->fmcase); 3415 } else if (fmep->nsuspects == 0 && mess_zero_count != 0) { 3416 fmep->nsuspects = mess_zero_count; 3417 publish_suspects(fmep, srl2); 3418 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3419 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3420 fmd_case_solve(fmep->hdl, fmep->fmcase); 3421 } else { 3422 struct event *obsp; 3423 struct fme *nfmep; 3424 3425 publish_suspects(fmep, srl); 3426 out(O_ALTFP, "[solving FME%d, case %s]", fmep->id, 3427 fmd_case_uuid(fmep->hdl, fmep->fmcase)); 3428 fmd_case_solve(fmep->hdl, fmep->fmcase); 3429 3430 /* 3431 * Got both message=0 and message=1 so create a 3432 * duplicate case. Also need a temporary duplicate fme 3433 * structure for use by publish_suspects(). 3434 */ 3435 nfmep = alloc_fme(); 3436 nfmep->id = Nextid++; 3437 nfmep->hdl = fmep->hdl; 3438 nfmep->nsuspects = mess_zero_count; 3439 nfmep->fmcase = fmd_case_open(fmep->hdl, NULL); 3440 out(O_ALTFP|O_STAMP, 3441 "[creating parallel FME%d, case %s]", nfmep->id, 3442 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 3443 Open_fme_count++; 3444 if (ffep) { 3445 fmd_case_setprincipal(nfmep->hdl, 3446 nfmep->fmcase, ffep); 3447 fmd_case_add_ereport(nfmep->hdl, 3448 nfmep->fmcase, ffep); 3449 } 3450 for (obsp = fmep->observations; obsp; 3451 obsp = obsp->observations) 3452 if (obsp->ffep && obsp->ffep != ffep) 3453 fmd_case_add_ereport(nfmep->hdl, 3454 nfmep->fmcase, obsp->ffep); 3455 3456 publish_suspects(nfmep, srl2); 3457 out(O_ALTFP, "[solving FME%d, case %s]", nfmep->id, 3458 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 3459 fmd_case_solve(nfmep->hdl, nfmep->fmcase); 3460 FREE(nfmep); 3461 } 3462 FREE(srl); 3463 FREE(srl2); 3464 restore_suspects(fmep); 3465 3466 fmep->posted_suspects = 1; 3467 fmd_buf_write(fmep->hdl, fmep->fmcase, 3468 WOBUF_POSTD, 3469 (void *)&fmep->posted_suspects, 3470 sizeof (fmep->posted_suspects)); 3471 3472 /* 3473 * Now the suspects have been posted, we can clear up 3474 * the instance tree as we won't be looking at it again. 3475 * Also cancel the timer as the case is now solved. 3476 */ 3477 if (fmep->wull != 0) { 3478 fmd_timer_remove(fmep->hdl, fmep->timer); 3479 fmep->wull = 0; 3480 } 3481 break; 3482 3483 case FME_WAIT: 3484 ASSERT(my_delay > fmep->ull); 3485 (void) fme_set_timer(fmep, my_delay); 3486 print_suspects(SLWAIT, fmep); 3487 itree_prune(fmep->eventtree); 3488 return; 3489 3490 case FME_DISPROVED: 3491 print_suspects(SLDISPROVED, fmep); 3492 Undiag_reason = UD_VAL_UNSOLVD; 3493 fme_undiagnosable(fmep); 3494 break; 3495 } 3496 3497 itree_free(fmep->eventtree); 3498 fmep->eventtree = NULL; 3499 structconfig_free(fmep->config); 3500 fmep->config = NULL; 3501 destroy_fme_bufs(fmep); 3502 } 3503 3504 static void indent(void); 3505 static int triggered(struct fme *fmep, struct event *ep, int mark); 3506 static enum fme_state effects_test(struct fme *fmep, 3507 struct event *fault_event, unsigned long long at_latest_by, 3508 unsigned long long *pdelay); 3509 static enum fme_state requirements_test(struct fme *fmep, struct event *ep, 3510 unsigned long long at_latest_by, unsigned long long *pdelay); 3511 static enum fme_state causes_test(struct fme *fmep, struct event *ep, 3512 unsigned long long at_latest_by, unsigned long long *pdelay); 3513 3514 static int 3515 checkconstraints(struct fme *fmep, struct arrow *arrowp) 3516 { 3517 struct constraintlist *ctp; 3518 struct evalue value; 3519 char *sep = ""; 3520 3521 if (arrowp->forever_false) { 3522 indent(); 3523 out(O_ALTFP|O_VERB|O_NONL, " Forever false constraint: "); 3524 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3525 out(O_ALTFP|O_VERB|O_NONL, sep); 3526 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3527 sep = ", "; 3528 } 3529 out(O_ALTFP|O_VERB, NULL); 3530 return (0); 3531 } 3532 if (arrowp->forever_true) { 3533 indent(); 3534 out(O_ALTFP|O_VERB|O_NONL, " Forever true constraint: "); 3535 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3536 out(O_ALTFP|O_VERB|O_NONL, sep); 3537 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3538 sep = ", "; 3539 } 3540 out(O_ALTFP|O_VERB, NULL); 3541 return (1); 3542 } 3543 3544 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3545 if (eval_expr(ctp->cnode, NULL, NULL, 3546 &fmep->globals, fmep->config, 3547 arrowp, 0, &value)) { 3548 /* evaluation successful */ 3549 if (value.t == UNDEFINED || value.v == 0) { 3550 /* known false */ 3551 arrowp->forever_false = 1; 3552 indent(); 3553 out(O_ALTFP|O_VERB|O_NONL, 3554 " False constraint: "); 3555 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3556 out(O_ALTFP|O_VERB, NULL); 3557 return (0); 3558 } 3559 } else { 3560 /* evaluation unsuccessful -- unknown value */ 3561 indent(); 3562 out(O_ALTFP|O_VERB|O_NONL, 3563 " Deferred constraint: "); 3564 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3565 out(O_ALTFP|O_VERB, NULL); 3566 return (1); 3567 } 3568 } 3569 /* known true */ 3570 arrowp->forever_true = 1; 3571 indent(); 3572 out(O_ALTFP|O_VERB|O_NONL, " True constraint: "); 3573 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3574 out(O_ALTFP|O_VERB|O_NONL, sep); 3575 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3576 sep = ", "; 3577 } 3578 out(O_ALTFP|O_VERB, NULL); 3579 return (1); 3580 } 3581 3582 static int 3583 triggered(struct fme *fmep, struct event *ep, int mark) 3584 { 3585 struct bubble *bp; 3586 struct arrowlist *ap; 3587 int count = 0; 3588 3589 stats_counter_bump(fmep->Tcallcount); 3590 for (bp = itree_next_bubble(ep, NULL); bp; 3591 bp = itree_next_bubble(ep, bp)) { 3592 if (bp->t != B_TO) 3593 continue; 3594 for (ap = itree_next_arrow(bp, NULL); ap; 3595 ap = itree_next_arrow(bp, ap)) { 3596 /* check count of marks against K in the bubble */ 3597 if ((ap->arrowp->mark & mark) && 3598 ++count >= bp->nork) 3599 return (1); 3600 } 3601 } 3602 return (0); 3603 } 3604 3605 static int 3606 mark_arrows(struct fme *fmep, struct event *ep, int mark, 3607 unsigned long long at_latest_by, unsigned long long *pdelay, int keep) 3608 { 3609 struct bubble *bp; 3610 struct arrowlist *ap; 3611 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3612 unsigned long long my_delay; 3613 enum fme_state result; 3614 int retval = 0; 3615 3616 for (bp = itree_next_bubble(ep, NULL); bp; 3617 bp = itree_next_bubble(ep, bp)) { 3618 if (bp->t != B_FROM) 3619 continue; 3620 stats_counter_bump(fmep->Marrowcount); 3621 for (ap = itree_next_arrow(bp, NULL); ap; 3622 ap = itree_next_arrow(bp, ap)) { 3623 struct event *ep2 = ap->arrowp->head->myevent; 3624 /* 3625 * if we're clearing marks, we can avoid doing 3626 * all that work evaluating constraints. 3627 */ 3628 if (mark == 0) { 3629 if (ap->arrowp->arrow_marked == 0) 3630 continue; 3631 ap->arrowp->arrow_marked = 0; 3632 ap->arrowp->mark &= ~EFFECTS_COUNTER; 3633 if (keep && (ep2->cached_state & 3634 (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT))) 3635 ep2->keep_in_tree = 1; 3636 ep2->cached_state &= 3637 ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT); 3638 (void) mark_arrows(fmep, ep2, mark, 0, NULL, 3639 keep); 3640 continue; 3641 } 3642 ap->arrowp->arrow_marked = 1; 3643 if (ep2->cached_state & REQMNTS_DISPROVED) { 3644 indent(); 3645 out(O_ALTFP|O_VERB|O_NONL, 3646 " ALREADY DISPROVED "); 3647 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3648 out(O_ALTFP|O_VERB, NULL); 3649 continue; 3650 } 3651 if (ep2->cached_state & WAIT_EFFECT) { 3652 indent(); 3653 out(O_ALTFP|O_VERB|O_NONL, 3654 " ALREADY EFFECTS WAIT "); 3655 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3656 out(O_ALTFP|O_VERB, NULL); 3657 continue; 3658 } 3659 if (ep2->cached_state & CREDIBLE_EFFECT) { 3660 indent(); 3661 out(O_ALTFP|O_VERB|O_NONL, 3662 " ALREADY EFFECTS CREDIBLE "); 3663 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3664 out(O_ALTFP|O_VERB, NULL); 3665 continue; 3666 } 3667 if ((ep2->cached_state & PARENT_WAIT) && 3668 (mark & PARENT_WAIT)) { 3669 indent(); 3670 out(O_ALTFP|O_VERB|O_NONL, 3671 " ALREADY PARENT EFFECTS WAIT "); 3672 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3673 out(O_ALTFP|O_VERB, NULL); 3674 continue; 3675 } 3676 platform_set_payloadnvp(ep2->nvp); 3677 if (checkconstraints(fmep, ap->arrowp) == 0) { 3678 platform_set_payloadnvp(NULL); 3679 indent(); 3680 out(O_ALTFP|O_VERB|O_NONL, 3681 " CONSTRAINTS FAIL "); 3682 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3683 out(O_ALTFP|O_VERB, NULL); 3684 continue; 3685 } 3686 platform_set_payloadnvp(NULL); 3687 ap->arrowp->mark |= EFFECTS_COUNTER; 3688 if (!triggered(fmep, ep2, EFFECTS_COUNTER)) { 3689 indent(); 3690 out(O_ALTFP|O_VERB|O_NONL, 3691 " K-COUNT NOT YET MET "); 3692 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3693 out(O_ALTFP|O_VERB, NULL); 3694 continue; 3695 } 3696 ep2->cached_state &= ~PARENT_WAIT; 3697 /* 3698 * if we've reached an ereport and no propagation time 3699 * is specified, use the Hesitate value 3700 */ 3701 if (ep2->t == N_EREPORT && at_latest_by == 0ULL && 3702 ap->arrowp->maxdelay == 0ULL) { 3703 out(O_ALTFP|O_VERB|O_NONL, " default wait "); 3704 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3705 out(O_ALTFP|O_VERB, NULL); 3706 result = requirements_test(fmep, ep2, Hesitate, 3707 &my_delay); 3708 } else { 3709 result = requirements_test(fmep, ep2, 3710 at_latest_by + ap->arrowp->maxdelay, 3711 &my_delay); 3712 } 3713 if (result == FME_WAIT) { 3714 retval = WAIT_EFFECT; 3715 if (overall_delay > my_delay) 3716 overall_delay = my_delay; 3717 ep2->cached_state |= WAIT_EFFECT; 3718 indent(); 3719 out(O_ALTFP|O_VERB|O_NONL, " EFFECTS WAIT "); 3720 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3721 out(O_ALTFP|O_VERB, NULL); 3722 indent_push(" E"); 3723 if (mark_arrows(fmep, ep2, PARENT_WAIT, 3724 at_latest_by, &my_delay, 0) == 3725 WAIT_EFFECT) { 3726 retval = WAIT_EFFECT; 3727 if (overall_delay > my_delay) 3728 overall_delay = my_delay; 3729 } 3730 indent_pop(); 3731 } else if (result == FME_DISPROVED) { 3732 indent(); 3733 out(O_ALTFP|O_VERB|O_NONL, 3734 " EFFECTS DISPROVED "); 3735 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3736 out(O_ALTFP|O_VERB, NULL); 3737 } else { 3738 ep2->cached_state |= mark; 3739 indent(); 3740 if (mark == CREDIBLE_EFFECT) 3741 out(O_ALTFP|O_VERB|O_NONL, 3742 " EFFECTS CREDIBLE "); 3743 else 3744 out(O_ALTFP|O_VERB|O_NONL, 3745 " PARENT EFFECTS WAIT "); 3746 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3747 out(O_ALTFP|O_VERB, NULL); 3748 indent_push(" E"); 3749 if (mark_arrows(fmep, ep2, mark, at_latest_by, 3750 &my_delay, 0) == WAIT_EFFECT) { 3751 retval = WAIT_EFFECT; 3752 if (overall_delay > my_delay) 3753 overall_delay = my_delay; 3754 } 3755 indent_pop(); 3756 } 3757 } 3758 } 3759 if (retval == WAIT_EFFECT) 3760 *pdelay = overall_delay; 3761 return (retval); 3762 } 3763 3764 static enum fme_state 3765 effects_test(struct fme *fmep, struct event *fault_event, 3766 unsigned long long at_latest_by, unsigned long long *pdelay) 3767 { 3768 struct event *error_event; 3769 enum fme_state return_value = FME_CREDIBLE; 3770 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3771 unsigned long long my_delay; 3772 3773 stats_counter_bump(fmep->Ecallcount); 3774 indent_push(" E"); 3775 indent(); 3776 out(O_ALTFP|O_VERB|O_NONL, "->"); 3777 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3778 out(O_ALTFP|O_VERB, NULL); 3779 3780 if (mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by, 3781 &my_delay, 0) == WAIT_EFFECT) { 3782 return_value = FME_WAIT; 3783 if (overall_delay > my_delay) 3784 overall_delay = my_delay; 3785 } 3786 for (error_event = fmep->observations; 3787 error_event; error_event = error_event->observations) { 3788 indent(); 3789 out(O_ALTFP|O_VERB|O_NONL, " "); 3790 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event); 3791 if (!(error_event->cached_state & CREDIBLE_EFFECT)) { 3792 if (error_event->cached_state & 3793 (PARENT_WAIT|WAIT_EFFECT)) { 3794 out(O_ALTFP|O_VERB, " NOT YET triggered"); 3795 continue; 3796 } 3797 return_value = FME_DISPROVED; 3798 out(O_ALTFP|O_VERB, " NOT triggered"); 3799 break; 3800 } else { 3801 out(O_ALTFP|O_VERB, " triggered"); 3802 } 3803 } 3804 if (return_value == FME_DISPROVED) { 3805 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0); 3806 } else { 3807 fault_event->keep_in_tree = 1; 3808 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1); 3809 } 3810 3811 indent(); 3812 out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ", 3813 fme_state2str(return_value)); 3814 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3815 out(O_ALTFP|O_VERB, NULL); 3816 indent_pop(); 3817 if (return_value == FME_WAIT) 3818 *pdelay = overall_delay; 3819 return (return_value); 3820 } 3821 3822 static enum fme_state 3823 requirements_test(struct fme *fmep, struct event *ep, 3824 unsigned long long at_latest_by, unsigned long long *pdelay) 3825 { 3826 int waiting_events; 3827 int credible_events; 3828 int deferred_events; 3829 enum fme_state return_value = FME_CREDIBLE; 3830 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3831 unsigned long long arrow_delay; 3832 unsigned long long my_delay; 3833 struct event *ep2; 3834 struct bubble *bp; 3835 struct arrowlist *ap; 3836 3837 if (ep->cached_state & REQMNTS_CREDIBLE) { 3838 indent(); 3839 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY CREDIBLE "); 3840 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3841 out(O_ALTFP|O_VERB, NULL); 3842 return (FME_CREDIBLE); 3843 } 3844 if (ep->cached_state & REQMNTS_DISPROVED) { 3845 indent(); 3846 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY DISPROVED "); 3847 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3848 out(O_ALTFP|O_VERB, NULL); 3849 return (FME_DISPROVED); 3850 } 3851 if (ep->cached_state & REQMNTS_WAIT) { 3852 indent(); 3853 *pdelay = ep->cached_delay; 3854 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY WAIT "); 3855 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3856 out(O_ALTFP|O_VERB|O_NONL, ", wait for: "); 3857 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3858 out(O_ALTFP|O_VERB, NULL); 3859 return (FME_WAIT); 3860 } 3861 stats_counter_bump(fmep->Rcallcount); 3862 indent_push(" R"); 3863 indent(); 3864 out(O_ALTFP|O_VERB|O_NONL, "->"); 3865 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3866 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 3867 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3868 out(O_ALTFP|O_VERB, NULL); 3869 3870 if (ep->t == N_EREPORT) { 3871 if (ep->count == 0) { 3872 if (fmep->pull >= at_latest_by) { 3873 return_value = FME_DISPROVED; 3874 } else { 3875 ep->cached_delay = *pdelay = at_latest_by; 3876 return_value = FME_WAIT; 3877 } 3878 } 3879 3880 indent(); 3881 switch (return_value) { 3882 case FME_CREDIBLE: 3883 ep->cached_state |= REQMNTS_CREDIBLE; 3884 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE "); 3885 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3886 break; 3887 case FME_DISPROVED: 3888 ep->cached_state |= REQMNTS_DISPROVED; 3889 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 3890 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3891 break; 3892 case FME_WAIT: 3893 ep->cached_state |= REQMNTS_WAIT; 3894 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT "); 3895 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3896 out(O_ALTFP|O_VERB|O_NONL, " to "); 3897 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3898 break; 3899 default: 3900 out(O_DIE, "requirements_test: unexpected fme_state"); 3901 break; 3902 } 3903 out(O_ALTFP|O_VERB, NULL); 3904 indent_pop(); 3905 3906 return (return_value); 3907 } 3908 3909 /* this event is not a report, descend the tree */ 3910 for (bp = itree_next_bubble(ep, NULL); bp; 3911 bp = itree_next_bubble(ep, bp)) { 3912 int n; 3913 3914 if (bp->t != B_FROM) 3915 continue; 3916 3917 n = bp->nork; 3918 3919 credible_events = 0; 3920 waiting_events = 0; 3921 deferred_events = 0; 3922 arrow_delay = TIMEVAL_EVENTUALLY; 3923 /* 3924 * n is -1 for 'A' so adjust it. 3925 * XXX just count up the arrows for now. 3926 */ 3927 if (n < 0) { 3928 n = 0; 3929 for (ap = itree_next_arrow(bp, NULL); ap; 3930 ap = itree_next_arrow(bp, ap)) 3931 n++; 3932 indent(); 3933 out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n); 3934 } else { 3935 indent(); 3936 out(O_ALTFP|O_VERB, " Bubble N=%d", n); 3937 } 3938 3939 if (n == 0) 3940 continue; 3941 if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) { 3942 for (ap = itree_next_arrow(bp, NULL); ap; 3943 ap = itree_next_arrow(bp, ap)) { 3944 ep2 = ap->arrowp->head->myevent; 3945 platform_set_payloadnvp(ep2->nvp); 3946 (void) checkconstraints(fmep, ap->arrowp); 3947 if (!ap->arrowp->forever_false) { 3948 /* 3949 * if all arrows are invalidated by the 3950 * constraints, then we should elide the 3951 * whole bubble to be consistant with 3952 * the tree creation time behaviour 3953 */ 3954 bp->mark |= BUBBLE_OK; 3955 platform_set_payloadnvp(NULL); 3956 break; 3957 } 3958 platform_set_payloadnvp(NULL); 3959 } 3960 } 3961 for (ap = itree_next_arrow(bp, NULL); ap; 3962 ap = itree_next_arrow(bp, ap)) { 3963 ep2 = ap->arrowp->head->myevent; 3964 if (n <= credible_events) 3965 break; 3966 3967 ap->arrowp->mark |= REQMNTS_COUNTER; 3968 if (triggered(fmep, ep2, REQMNTS_COUNTER)) 3969 /* XXX adding max timevals! */ 3970 switch (requirements_test(fmep, ep2, 3971 at_latest_by + ap->arrowp->maxdelay, 3972 &my_delay)) { 3973 case FME_DEFERRED: 3974 deferred_events++; 3975 break; 3976 case FME_CREDIBLE: 3977 credible_events++; 3978 break; 3979 case FME_DISPROVED: 3980 break; 3981 case FME_WAIT: 3982 if (my_delay < arrow_delay) 3983 arrow_delay = my_delay; 3984 waiting_events++; 3985 break; 3986 default: 3987 out(O_DIE, 3988 "Bug in requirements_test."); 3989 } 3990 else 3991 deferred_events++; 3992 } 3993 if (!(bp->mark & BUBBLE_OK) && waiting_events == 0) { 3994 bp->mark |= BUBBLE_ELIDED; 3995 continue; 3996 } 3997 indent(); 3998 out(O_ALTFP|O_VERB, " Credible: %d Waiting %d", 3999 credible_events + deferred_events, waiting_events); 4000 if (credible_events + deferred_events + waiting_events < n) { 4001 /* Can never meet requirements */ 4002 ep->cached_state |= REQMNTS_DISPROVED; 4003 indent(); 4004 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 4005 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4006 out(O_ALTFP|O_VERB, NULL); 4007 indent_pop(); 4008 return (FME_DISPROVED); 4009 } 4010 if (credible_events + deferred_events < n) { 4011 /* will have to wait */ 4012 /* wait time is shortest known */ 4013 if (arrow_delay < overall_delay) 4014 overall_delay = arrow_delay; 4015 return_value = FME_WAIT; 4016 } else if (credible_events < n) { 4017 if (return_value != FME_WAIT) 4018 return_value = FME_DEFERRED; 4019 } 4020 } 4021 4022 /* 4023 * don't mark as FME_DEFERRED. If this event isn't reached by another 4024 * path, then this will be considered FME_CREDIBLE. But if it is 4025 * reached by a different path so the K-count is met, then might 4026 * get overridden by FME_WAIT or FME_DISPROVED. 4027 */ 4028 if (return_value == FME_WAIT) { 4029 ep->cached_state |= REQMNTS_WAIT; 4030 ep->cached_delay = *pdelay = overall_delay; 4031 } else if (return_value == FME_CREDIBLE) { 4032 ep->cached_state |= REQMNTS_CREDIBLE; 4033 } 4034 indent(); 4035 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ", 4036 fme_state2str(return_value)); 4037 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4038 out(O_ALTFP|O_VERB, NULL); 4039 indent_pop(); 4040 return (return_value); 4041 } 4042 4043 static enum fme_state 4044 causes_test(struct fme *fmep, struct event *ep, 4045 unsigned long long at_latest_by, unsigned long long *pdelay) 4046 { 4047 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 4048 unsigned long long my_delay; 4049 int credible_results = 0; 4050 int waiting_results = 0; 4051 enum fme_state fstate; 4052 struct event *tail_event; 4053 struct bubble *bp; 4054 struct arrowlist *ap; 4055 int k = 1; 4056 4057 stats_counter_bump(fmep->Ccallcount); 4058 indent_push(" C"); 4059 indent(); 4060 out(O_ALTFP|O_VERB|O_NONL, "->"); 4061 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4062 out(O_ALTFP|O_VERB, NULL); 4063 4064 for (bp = itree_next_bubble(ep, NULL); bp; 4065 bp = itree_next_bubble(ep, bp)) { 4066 if (bp->t != B_TO) 4067 continue; 4068 k = bp->nork; /* remember the K value */ 4069 for (ap = itree_next_arrow(bp, NULL); ap; 4070 ap = itree_next_arrow(bp, ap)) { 4071 int do_not_follow = 0; 4072 4073 /* 4074 * if we get to the same event multiple times 4075 * only worry about the first one. 4076 */ 4077 if (ap->arrowp->tail->myevent->cached_state & 4078 CAUSES_TESTED) { 4079 indent(); 4080 out(O_ALTFP|O_VERB|O_NONL, 4081 " causes test already run for "); 4082 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 4083 ap->arrowp->tail->myevent); 4084 out(O_ALTFP|O_VERB, NULL); 4085 continue; 4086 } 4087 4088 /* 4089 * see if false constraint prevents us 4090 * from traversing this arrow 4091 */ 4092 platform_set_payloadnvp(ep->nvp); 4093 if (checkconstraints(fmep, ap->arrowp) == 0) 4094 do_not_follow = 1; 4095 platform_set_payloadnvp(NULL); 4096 if (do_not_follow) { 4097 indent(); 4098 out(O_ALTFP|O_VERB|O_NONL, 4099 " False arrow from "); 4100 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 4101 ap->arrowp->tail->myevent); 4102 out(O_ALTFP|O_VERB, NULL); 4103 continue; 4104 } 4105 4106 ap->arrowp->tail->myevent->cached_state |= 4107 CAUSES_TESTED; 4108 tail_event = ap->arrowp->tail->myevent; 4109 fstate = hypothesise(fmep, tail_event, at_latest_by, 4110 &my_delay); 4111 4112 switch (fstate) { 4113 case FME_WAIT: 4114 if (my_delay < overall_delay) 4115 overall_delay = my_delay; 4116 waiting_results++; 4117 break; 4118 case FME_CREDIBLE: 4119 credible_results++; 4120 break; 4121 case FME_DISPROVED: 4122 break; 4123 default: 4124 out(O_DIE, "Bug in causes_test"); 4125 } 4126 } 4127 } 4128 /* compare against K */ 4129 if (credible_results + waiting_results < k) { 4130 indent(); 4131 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED "); 4132 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4133 out(O_ALTFP|O_VERB, NULL); 4134 indent_pop(); 4135 return (FME_DISPROVED); 4136 } 4137 if (waiting_results != 0) { 4138 *pdelay = overall_delay; 4139 indent(); 4140 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT "); 4141 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4142 out(O_ALTFP|O_VERB|O_NONL, " to "); 4143 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 4144 out(O_ALTFP|O_VERB, NULL); 4145 indent_pop(); 4146 return (FME_WAIT); 4147 } 4148 indent(); 4149 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES CREDIBLE "); 4150 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4151 out(O_ALTFP|O_VERB, NULL); 4152 indent_pop(); 4153 return (FME_CREDIBLE); 4154 } 4155 4156 static enum fme_state 4157 hypothesise(struct fme *fmep, struct event *ep, 4158 unsigned long long at_latest_by, unsigned long long *pdelay) 4159 { 4160 enum fme_state rtr, otr; 4161 unsigned long long my_delay; 4162 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 4163 4164 stats_counter_bump(fmep->Hcallcount); 4165 indent_push(" H"); 4166 indent(); 4167 out(O_ALTFP|O_VERB|O_NONL, "->"); 4168 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4169 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 4170 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 4171 out(O_ALTFP|O_VERB, NULL); 4172 4173 rtr = requirements_test(fmep, ep, at_latest_by, &my_delay); 4174 if ((rtr == FME_WAIT) && (my_delay < overall_delay)) 4175 overall_delay = my_delay; 4176 if (rtr != FME_DISPROVED) { 4177 if (is_problem(ep->t)) { 4178 otr = effects_test(fmep, ep, at_latest_by, &my_delay); 4179 if (otr != FME_DISPROVED) { 4180 if (fmep->peek == 0 && ep->is_suspect == 0) { 4181 ep->suspects = fmep->suspects; 4182 ep->is_suspect = 1; 4183 fmep->suspects = ep; 4184 fmep->nsuspects++; 4185 } 4186 } 4187 } else 4188 otr = causes_test(fmep, ep, at_latest_by, &my_delay); 4189 if ((otr == FME_WAIT) && (my_delay < overall_delay)) 4190 overall_delay = my_delay; 4191 if ((otr != FME_DISPROVED) && 4192 ((rtr == FME_WAIT) || (otr == FME_WAIT))) 4193 *pdelay = overall_delay; 4194 } 4195 if (rtr == FME_DISPROVED) { 4196 indent(); 4197 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4198 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4199 out(O_ALTFP|O_VERB, " (doesn't meet requirements)"); 4200 indent_pop(); 4201 return (FME_DISPROVED); 4202 } 4203 if ((otr == FME_DISPROVED) && is_problem(ep->t)) { 4204 indent(); 4205 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4206 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4207 out(O_ALTFP|O_VERB, " (doesn't explain all reports)"); 4208 indent_pop(); 4209 return (FME_DISPROVED); 4210 } 4211 if (otr == FME_DISPROVED) { 4212 indent(); 4213 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4214 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4215 out(O_ALTFP|O_VERB, " (causes are not credible)"); 4216 indent_pop(); 4217 return (FME_DISPROVED); 4218 } 4219 if ((rtr == FME_WAIT) || (otr == FME_WAIT)) { 4220 indent(); 4221 out(O_ALTFP|O_VERB|O_NONL, "<-WAIT "); 4222 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4223 out(O_ALTFP|O_VERB|O_NONL, " to "); 4224 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay); 4225 out(O_ALTFP|O_VERB, NULL); 4226 indent_pop(); 4227 return (FME_WAIT); 4228 } 4229 indent(); 4230 out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE "); 4231 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4232 out(O_ALTFP|O_VERB, NULL); 4233 indent_pop(); 4234 return (FME_CREDIBLE); 4235 } 4236 4237 /* 4238 * fme_istat_load -- reconstitute any persistent istats 4239 */ 4240 void 4241 fme_istat_load(fmd_hdl_t *hdl) 4242 { 4243 int sz; 4244 char *sbuf; 4245 char *ptr; 4246 4247 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) { 4248 out(O_ALTFP, "fme_istat_load: No stats"); 4249 return; 4250 } 4251 4252 sbuf = alloca(sz); 4253 4254 fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz); 4255 4256 /* 4257 * pick apart the serialized stats 4258 * 4259 * format is: 4260 * <class-name>, '@', <path>, '\0', <value>, '\0' 4261 * for example: 4262 * "stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0" 4263 * 4264 * since this is parsing our own serialized data, any parsing issues 4265 * are fatal, so we check for them all with ASSERT() below. 4266 */ 4267 ptr = sbuf; 4268 while (ptr < &sbuf[sz]) { 4269 char *sepptr; 4270 struct node *np; 4271 int val; 4272 4273 sepptr = strchr(ptr, '@'); 4274 ASSERT(sepptr != NULL); 4275 *sepptr = '\0'; 4276 4277 /* construct the event */ 4278 np = newnode(T_EVENT, NULL, 0); 4279 np->u.event.ename = newnode(T_NAME, NULL, 0); 4280 np->u.event.ename->u.name.t = N_STAT; 4281 np->u.event.ename->u.name.s = stable(ptr); 4282 np->u.event.ename->u.name.it = IT_ENAME; 4283 np->u.event.ename->u.name.last = np->u.event.ename; 4284 4285 ptr = sepptr + 1; 4286 ASSERT(ptr < &sbuf[sz]); 4287 ptr += strlen(ptr); 4288 ptr++; /* move past the '\0' separating path from value */ 4289 ASSERT(ptr < &sbuf[sz]); 4290 ASSERT(isdigit(*ptr)); 4291 val = atoi(ptr); 4292 ASSERT(val > 0); 4293 ptr += strlen(ptr); 4294 ptr++; /* move past the final '\0' for this entry */ 4295 4296 np->u.event.epname = pathstring2epnamenp(sepptr + 1); 4297 ASSERT(np->u.event.epname != NULL); 4298 4299 istat_bump(np, val); 4300 tree_free(np); 4301 } 4302 4303 istat_save(); 4304 } 4305