1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2012 Milan Jurik. All rights reserved. 25 * Copyright (c) 2018, Joyent, Inc. 26 * 27 * fme.c -- fault management exercise module 28 * 29 * this module provides the simulated fault management exercise. 30 */ 31 32 #include <stdio.h> 33 #include <stdlib.h> 34 #include <string.h> 35 #include <strings.h> 36 #include <ctype.h> 37 #include <alloca.h> 38 #include <libnvpair.h> 39 #include <sys/fm/protocol.h> 40 #include <fm/fmd_api.h> 41 #include <fm/libtopo.h> 42 #include "alloc.h" 43 #include "out.h" 44 #include "stats.h" 45 #include "stable.h" 46 #include "literals.h" 47 #include "lut.h" 48 #include "tree.h" 49 #include "ptree.h" 50 #include "itree.h" 51 #include "ipath.h" 52 #include "fme.h" 53 #include "evnv.h" 54 #include "eval.h" 55 #include "config.h" 56 #include "platform.h" 57 #include "esclex.h" 58 59 struct lut *Istats; 60 struct lut *SerdEngines; 61 nvlist_t *Action_nvl; 62 63 /* imported from eft.c... 
 */
extern hrtime_t Hesitate;
extern char *Serd_Override;
extern nv_alloc_t Eft_nv_hdl;
extern int Max_fme;
extern fmd_hdl_t *Hdl;

/* set when istat/serd state changes and must be re-checkpointed */
static int Istat_need_save;
static int Serd_need_save;
void istat_save(void);
void serd_save(void);

/* fme under construction is global so we can free it on module abort */
static struct fme *Nfmep;

/* why the most recent case could not be diagnosed (UD_VAL_* code) */
static int Undiag_reason = UD_VAL_UNKNOWN;

/* next FME id to hand out; bumped on creation and on case replay */
static int Nextid = 0;

static int Open_fme_count = 0;	/* Count of open FMEs */

/* list of fault management exercises underway */
static struct fme {
	struct fme *next;		/* next exercise */
	unsigned long long ull;		/* time when fme was created */
	int id;				/* FME id */
	struct config *config;		/* cooked configuration data */
	struct lut *eventtree;		/* propagation tree for this FME */
	/*
	 * The initial error report that created this FME is kept in
	 * two forms.  e0 points to the instance tree node and is used
	 * by fme_eval() as the starting point for the inference
	 * algorithm.  e0r is the event handle FMD passed to us when
	 * the ereport first arrived and is used when setting timers,
	 * which are always relative to the time of this initial
	 * report.
	 */
	struct event *e0;
	fmd_event_t *e0r;

	id_t timer;			/* for setting an fmd time-out */

	struct event *ecurrent;		/* ereport under consideration */
	struct event *suspects;		/* current suspect list */
	struct event *psuspects;	/* previous suspect list */
	int nsuspects;			/* count of suspects */
	int posted_suspects;		/* true if we've posted a diagnosis */
	int uniqobs;			/* number of unique events observed */
	int peek;			/* just peeking, don't track suspects */
	int overflow;			/* true if overflow FME */
	enum fme_state {
		FME_NOTHING = 5000,	/* not evaluated yet */
		FME_WAIT,		/* need to wait for more info */
		FME_CREDIBLE,		/* suspect list is credible */
		FME_DISPROVED,		/* no valid suspects found */
		FME_DEFERRED		/* don't know yet (k-count not met) */
	} state;

	unsigned long long pull;	/* time passed since created */
	unsigned long long wull;	/* wait until this time for re-eval */
	struct event *observations;	/* observation list */
	struct lut *globals;		/* values of global variables */
	/* fmd interfacing */
	fmd_hdl_t *hdl;			/* handle for talking with fmd */
	fmd_case_t *fmcase;		/* what fmd 'case' we associate with */
	/* stats */
	struct stats *Rcount;
	struct stats *Hcallcount;
	struct stats *Rcallcount;
	struct stats *Ccallcount;
	struct stats *Ecallcount;
	struct stats *Tcallcount;
	struct stats *Marrowcount;
	struct stats *diags;
} *FMElist, *EFMElist, *ClosedFMEs;

/* cases we could not restart at module load; kept so they can be closed */
static struct case_list {
	fmd_case_t *fmcase;
	struct case_list *next;
} *Undiagablecaselist;

static void fme_eval(struct fme *fmep, fmd_event_t *ffep);
static enum fme_state hypothesise(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay);
static struct node *eventprop_lookup(struct event *ep, const char *propname);
static struct node *pathstring2epnamenp(char *path);
static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep,
    fmd_case_t *fmcase, nvlist_t *detector, char *arg);
static char *undiag_2reason_str(int ud, char *arg);
static const char *undiag_2defect_str(int ud);
static void restore_suspects(struct fme *fmep);
static void save_suspects(struct fme *fmep);
static void destroy_fme(struct fme *f);
static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
    const char *eventstring, const struct ipath *ipp, nvlist_t *nvl);
static void istat_counter_reset_cb(struct istat_entry *entp,
    struct stats *statp, const struct ipath *ipp);
static void istat_counter_topo_chg_cb(struct istat_entry *entp,
    struct stats *statp, void *unused);
static void serd_reset_cb(struct serd_entry *entp, void *unused,
    const struct ipath *ipp);
static void serd_topo_chg_cb(struct serd_entry *entp, void *unused,
    void *unused2);
static void destroy_fme_bufs(struct fme *fp);

/*
 * alloc_fme -- allocate and zero-fill a new fme structure.  Caller owns
 * the returned memory and frees it with FREE() (or destroy_fme() once
 * the fme has stats and trees attached).
 */
static struct fme *
alloc_fme(void)
{
	struct fme *fmep;

	fmep = MALLOC(sizeof (*fmep));
	bzero(fmep, sizeof (*fmep));
	return (fmep);
}

/*
 * fme_ready -- called when all initialization of the FME (except for
 *	stats) has completed successfully.  Adds the fme to global lists
 *	and establishes its stats.
 */
static struct fme *
fme_ready(struct fme *fmep)
{
	char nbuf[100];

	Nfmep = NULL;	/* don't need to free this on module abort now */

	/* append to the global FME list (EFMElist tracks the tail) */
	if (EFMElist) {
		EFMElist->next = fmep;
		EFMElist = fmep;
	} else
		FMElist = EFMElist = fmep;

	(void) sprintf(nbuf, "fme%d.Rcount", fmep->id);
	fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
	(void) sprintf(nbuf, "fme%d.Hcall", fmep->id);
	fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1);
	(void) sprintf(nbuf, "fme%d.Rcall", fmep->id);
	fmep->Rcallcount = stats_new_counter(nbuf,
	    "calls to requirements_test()", 1);
	(void) sprintf(nbuf, "fme%d.Ccall", fmep->id);
	fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1);
	(void) sprintf(nbuf, "fme%d.Ecall", fmep->id);
	fmep->Ecallcount =
	    stats_new_counter(nbuf, "calls to effects_test()", 1);
	(void) sprintf(nbuf, "fme%d.Tcall", fmep->id);
	fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
	(void) sprintf(nbuf, "fme%d.Marrow", fmep->id);
	fmep->Marrowcount = stats_new_counter(nbuf,
	    "arrows marked by mark_arrows()", 1);
	(void) sprintf(nbuf, "fme%d.diags", fmep->id);
	fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);

	out(O_ALTFP|O_VERB2, "newfme: config snapshot contains...");
	config_print(O_ALTFP|O_VERB2, fmep->config);

	return (fmep);
}

extern void ipath_dummy_lut(struct arrow *);
extern struct lut *itree_create_dummy(const char *, const struct ipath *);

/*
 * set_needed_arrows -- lut_walk callback: mark every outgoing (B_FROM)
 * arrow of ep as "needed" and record its path in the dummy lut.
 */
/* ARGSUSED */
static void
set_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
{
	struct bubble *bp;
	struct arrowlist *ap;

	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_FROM)
			continue;
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap)) {
			ap->arrowp->pnode->u.arrow.needed = 1;
			ipath_dummy_lut(ap->arrowp);
		}
	}
}

/*
 * unset_needed_arrows -- lut_walk callback: clear the "needed" flag on
 * every outgoing (B_FROM) arrow of ep.
 */
/* ARGSUSED */
static void
unset_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
{
	struct bubble *bp;
	struct arrowlist *ap;

	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_FROM)
			continue;
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap))
			ap->arrowp->pnode->u.arrow.needed = 0;
	}
}

static void globals_destructor(void *left, void *right, void *arg);
static void clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep);

/*
 * prune_propagations -- run a throwaway inference pass over a dummy
 * instance tree for (e0class, e0ipp) so that itree_prune() can discard
 * rules that cannot possibly be involved with this ereport.  Returns
 * B_FALSE if e0 cannot even be found in the dummy tree (bad class/path
 * combination), B_TRUE otherwise.  The temporary fme and all its stats
 * are torn down before returning.
 */
static boolean_t
prune_propagations(const char *e0class, const struct ipath *e0ipp)
{
	char nbuf[100];
	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
	extern struct lut *Usednames;

	Nfmep = alloc_fme();
	Nfmep->id = Nextid;
	Nfmep->state = FME_NOTHING;
	Nfmep->eventtree = itree_create_dummy(e0class, e0ipp);
	if ((Nfmep->e0 =
	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
		itree_free(Nfmep->eventtree);
		FREE(Nfmep);
		Nfmep = NULL;
		return (B_FALSE);
	}
	Nfmep->ecurrent = Nfmep->observations = Nfmep->e0;
	Nfmep->e0->count++;

	(void) sprintf(nbuf, "fme%d.Rcount", Nfmep->id);
	Nfmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
	(void) sprintf(nbuf, "fme%d.Hcall", Nfmep->id);
	Nfmep->Hcallcount =
	    stats_new_counter(nbuf, "calls to hypothesise()", 1);
	(void) sprintf(nbuf, "fme%d.Rcall", Nfmep->id);
	Nfmep->Rcallcount = stats_new_counter(nbuf,
	    "calls to requirements_test()", 1);
	(void) sprintf(nbuf, "fme%d.Ccall", Nfmep->id);
	Nfmep->Ccallcount =
	    stats_new_counter(nbuf, "calls to causes_test()", 1);
	(void) sprintf(nbuf, "fme%d.Ecall", Nfmep->id);
	Nfmep->Ecallcount =
	    stats_new_counter(nbuf, "calls to effects_test()", 1);
	(void) sprintf(nbuf, "fme%d.Tcall", Nfmep->id);
	Nfmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
	(void) sprintf(nbuf, "fme%d.Marrow", Nfmep->id);
	Nfmep->Marrowcount = stats_new_counter(nbuf,
	    "arrows marked by mark_arrows()", 1);
	(void) sprintf(nbuf, "fme%d.diags", Nfmep->id);
	Nfmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);

	Nfmep->peek = 1;	/* peeking only: don't track suspects */
	lut_walk(Nfmep->eventtree, (lut_cb)unset_needed_arrows, (void *)Nfmep);
	lut_free(Usednames, NULL, NULL);
	Usednames = NULL;
	lut_walk(Nfmep->eventtree, (lut_cb)clear_arrows, (void *)Nfmep);
	(void) hypothesise(Nfmep, Nfmep->e0, Nfmep->ull, &my_delay);
	itree_prune(Nfmep->eventtree);
	lut_walk(Nfmep->eventtree, (lut_cb)set_needed_arrows, (void *)Nfmep);

	/* throw away the scratch fme and everything hanging off it */
	stats_delete(Nfmep->Rcount);
	stats_delete(Nfmep->Hcallcount);
	stats_delete(Nfmep->Rcallcount);
	stats_delete(Nfmep->Ccallcount);
	stats_delete(Nfmep->Ecallcount);
	stats_delete(Nfmep->Tcallcount);
	stats_delete(Nfmep->Marrowcount);
	stats_delete(Nfmep->diags);
	itree_free(Nfmep->eventtree);
	lut_free(Nfmep->globals, globals_destructor, NULL);
	FREE(Nfmep);
	return (B_TRUE);
}

/*
 * newfme -- create a new fme for the initial ereport (e0class@e0ipp).
 * Returns NULL (after publishing an undiagnosable defect or silently
 * closing the case, as appropriate) if the ereport cannot be mapped to
 * the topology or instanced against the rules; otherwise returns the
 * fully initialized fme via fme_ready().
 */
static struct fme *
newfme(const char *e0class, const struct ipath *e0ipp, fmd_hdl_t *hdl,
    fmd_case_t *fmcase, fmd_event_t *ffep, nvlist_t *nvl)
{
	struct cfgdata *cfgdata;
	int init_size;
	extern int alloc_total();
	nvlist_t *detector = NULL;
	char *pathstr;
	char *arg;

	/*
	 * First check if e0ipp is actually in the topology so we can give a
	 * more useful error message.
	 */
	ipathlastcomp(e0ipp);
	pathstr = ipath2str(NULL, e0ipp);
	cfgdata = config_snapshot();
	platform_unit_translate(0, cfgdata->cooked, TOPO_PROP_RESOURCE,
	    &detector, pathstr);
	FREE(pathstr);
	structconfig_free(cfgdata->cooked);
	config_free(cfgdata);
	if (detector == NULL) {
		/* See if class permits silent discard on unknown component. */
		if (lut_lookup(Ereportenames_discard, (void *)e0class, NULL)) {
			out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport "
			    "to component path, but silent discard allowed.",
			    e0class);
			fmd_case_close(hdl, fmcase);
		} else {
			Undiag_reason = UD_VAL_BADEVENTPATH;
			(void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR,
			    &detector);
			arg = ipath2str(e0class, e0ipp);
			publish_undiagnosable(hdl, ffep, fmcase, detector, arg);
			FREE(arg);
		}
		return (NULL);
	}

	/*
	 * Next run a quick first pass of the rules with a dummy config. This
	 * allows us to prune those rules which can't possibly cause this
	 * ereport.
	 */
	if (!prune_propagations(e0class, e0ipp)) {
		/*
		 * The fault class must have been in the rules or we would
		 * not have registered for it (and got a "nosub"), and the
		 * pathname must be in the topology or we would have failed the
		 * previous test. So to get here means the combination of
		 * class and pathname in the ereport must be invalid.
		 */
		Undiag_reason = UD_VAL_BADEVENTCLASS;
		arg = ipath2str(e0class, e0ipp);
		publish_undiagnosable(hdl, ffep, fmcase, detector, arg);
		nvlist_free(detector);
		FREE(arg);
		return (NULL);
	}

	/*
	 * Now go ahead and create the real fme using the pruned rules.
	 */
	init_size = alloc_total();
	out(O_ALTFP|O_STAMP, "start config_snapshot using %d bytes", init_size);
	nvlist_free(detector);
	pathstr = ipath2str(NULL, e0ipp);
	cfgdata = config_snapshot();
	platform_unit_translate(0, cfgdata->cooked, TOPO_PROP_RESOURCE,
	    &detector, pathstr);
	FREE(pathstr);
	platform_save_config(hdl, fmcase);
	out(O_ALTFP|O_STAMP, "config_snapshot added %d bytes",
	    alloc_total() - init_size);

	Nfmep = alloc_fme();

	Nfmep->id = Nextid++;
	Nfmep->config = cfgdata->cooked;
	config_free(cfgdata);
	Nfmep->posted_suspects = 0;
	Nfmep->uniqobs = 0;
	Nfmep->state = FME_NOTHING;
	Nfmep->pull = 0ULL;
	Nfmep->overflow = 0;

	Nfmep->fmcase = fmcase;
	Nfmep->hdl = hdl;

	if ((Nfmep->eventtree = itree_create(Nfmep->config)) == NULL) {
		Undiag_reason = UD_VAL_INSTFAIL;
		arg = ipath2str(e0class, e0ipp);
		publish_undiagnosable(hdl, ffep, fmcase, detector, arg);
		nvlist_free(detector);
		FREE(arg);
		structconfig_free(Nfmep->config);
		destroy_fme_bufs(Nfmep);
		FREE(Nfmep);
		Nfmep = NULL;
		return (NULL);
	}

	itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree);

	if ((Nfmep->e0 =
	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
		Undiag_reason = UD_VAL_BADEVENTI;
		arg = ipath2str(e0class, e0ipp);
		publish_undiagnosable(hdl, ffep, fmcase, detector, arg);
		nvlist_free(detector);
		FREE(arg);
		itree_free(Nfmep->eventtree);
		structconfig_free(Nfmep->config);
		destroy_fme_bufs(Nfmep);
		FREE(Nfmep);
		Nfmep = NULL;
		return (NULL);
	}

	nvlist_free(detector);
	return (fme_ready(Nfmep));
}

/*
 * fme_fini -- module teardown: free the undiagnosable-case list, all
 * closed and open fmes, and any fme that was mid-construction.
 */
void
fme_fini(void)
{
	struct fme *sfp, *fp;
	struct case_list *ucasep, *nextcasep;

	ucasep = Undiagablecaselist;
	while (ucasep != NULL) {
		nextcasep = ucasep->next;
		FREE(ucasep);
		ucasep = nextcasep;
	}
	Undiagablecaselist = NULL;

	/* clean up closed fmes */
	fp = ClosedFMEs;
	while (fp != NULL) {
		sfp = fp->next;
		destroy_fme(fp);
		fp = sfp;
	}
	ClosedFMEs = NULL;

	fp = FMElist;
	while (fp != NULL) {
		sfp = fp->next;
		destroy_fme(fp);
		fp = sfp;
	}
	FMElist = EFMElist = NULL;

	/* if we were in the middle of creating an fme, free it now */
	if (Nfmep) {
		destroy_fme(Nfmep);
		Nfmep = NULL;
	}
}

/*
 * Allocated space for a buffer name. 20 bytes allows for
 * a ridiculous 9,999,999 unique observations.
 */
#define	OBBUFNMSZ 20

/*
 * serialize_observation
 *
 * Create a recoverable version of the current observation
 * (f->ecurrent). We keep a serialized version of each unique
 * observation in order that we may resume correctly the fme in the
 * correct state if eft or fmd crashes and we're restarted.
 */
static void
serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp)
{
	size_t pkdlen;
	char tmpbuf[OBBUFNMSZ];
	char *pkd = NULL;
	char *estr;

	/* "observedN" buffer holds the "class@path" string */
	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs);
	estr = ipath2str(cls, ipp);
	fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1);
	fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr,
	    strlen(estr) + 1);
	FREE(estr);

	/* "observedN.nvp" buffer holds the packed ereport nvlist, if any */
	if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) {
		(void) snprintf(tmpbuf,
		    OBBUFNMSZ, "observed%d.nvp", fp->uniqobs);
		if (nvlist_xpack(fp->ecurrent->nvp,
		    &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0)
			out(O_DIE|O_SYS, "pack of observed nvl failed");
		fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen);
		fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen);
		FREE(pkd);
	}

	fp->uniqobs++;
	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
	    sizeof (fp->uniqobs));
}

/*
 * init_fme_bufs -- We keep several bits of state about an fme for
 *	use if eft or fmd crashes and we're restarted.
 */
static void
init_fme_bufs(struct fme *fp)
{
	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull));
	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull,
	    sizeof (fp->pull));

	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id));
	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id,
	    sizeof (fp->id));

	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs));
	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
	    sizeof (fp->uniqobs));

	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD,
	    sizeof (fp->posted_suspects));
	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD,
	    (void *)&fp->posted_suspects, sizeof (fp->posted_suspects));
}

/*
 * destroy_fme_bufs -- remove all the persistent fmd buffers backing an
 * fme, including the per-observation "observedN"/"observedN.nvp" pairs.
 */
static void
destroy_fme_bufs(struct fme *fp)
{
	char tmpbuf[OBBUFNMSZ];
	int o;

	platform_restore_config(fp->hdl, fp->fmcase);
	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN);
	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG);
	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL);
	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID);
	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD);
	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS);

	for (o = 0; o < fp->uniqobs; o++) {
		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o);
		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o);
		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
	}
}

/*
 * reconstitute_observations -- convert a case's serialized observations
 *	back into struct events. Returns zero if all observations are
 *	successfully reconstituted.
589 */ 590 static int 591 reconstitute_observations(struct fme *fmep) 592 { 593 struct event *ep; 594 struct node *epnamenp = NULL; 595 size_t pkdlen; 596 char *pkd = NULL; 597 char *tmpbuf = alloca(OBBUFNMSZ); 598 char *sepptr; 599 char *estr; 600 int ocnt; 601 int elen; 602 603 for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) { 604 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt); 605 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 606 if (elen == 0) { 607 out(O_ALTFP, 608 "reconstitute_observation: no %s buffer found.", 609 tmpbuf); 610 Undiag_reason = UD_VAL_MISSINGOBS; 611 break; 612 } 613 614 estr = MALLOC(elen); 615 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 616 sepptr = strchr(estr, '@'); 617 if (sepptr == NULL) { 618 out(O_ALTFP, 619 "reconstitute_observation: %s: " 620 "missing @ separator in %s.", 621 tmpbuf, estr); 622 Undiag_reason = UD_VAL_MISSINGPATH; 623 FREE(estr); 624 break; 625 } 626 627 *sepptr = '\0'; 628 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 629 out(O_ALTFP, 630 "reconstitute_observation: %s: " 631 "trouble converting path string \"%s\" " 632 "to internal representation.", 633 tmpbuf, sepptr + 1); 634 Undiag_reason = UD_VAL_MISSINGPATH; 635 FREE(estr); 636 break; 637 } 638 639 /* construct the event */ 640 ep = itree_lookup(fmep->eventtree, 641 stable(estr), ipath(epnamenp)); 642 if (ep == NULL) { 643 out(O_ALTFP, 644 "reconstitute_observation: %s: " 645 "lookup of \"%s\" in itree failed.", 646 tmpbuf, ipath2str(estr, ipath(epnamenp))); 647 Undiag_reason = UD_VAL_BADOBS; 648 tree_free(epnamenp); 649 FREE(estr); 650 break; 651 } 652 tree_free(epnamenp); 653 654 /* 655 * We may or may not have a saved nvlist for the observation 656 */ 657 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt); 658 pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 659 if (pkdlen != 0) { 660 pkd = MALLOC(pkdlen); 661 fmd_buf_read(fmep->hdl, 662 fmep->fmcase, tmpbuf, pkd, pkdlen); 663 ASSERT(ep->nvp == NULL); 
664 if (nvlist_xunpack(pkd, 665 pkdlen, &ep->nvp, &Eft_nv_hdl) != 0) 666 out(O_DIE|O_SYS, "pack of observed nvl failed"); 667 FREE(pkd); 668 } 669 670 if (ocnt == 0) 671 fmep->e0 = ep; 672 673 FREE(estr); 674 fmep->ecurrent = ep; 675 ep->count++; 676 677 /* link it into list of observations seen */ 678 ep->observations = fmep->observations; 679 fmep->observations = ep; 680 } 681 682 if (ocnt == fmep->uniqobs) { 683 (void) fme_ready(fmep); 684 return (0); 685 } 686 687 return (1); 688 } 689 690 /* 691 * restart_fme -- called during eft initialization. Reconstitutes 692 * an in-progress fme. 693 */ 694 void 695 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress) 696 { 697 nvlist_t *defect; 698 struct case_list *bad; 699 struct fme *fmep; 700 struct cfgdata *cfgdata; 701 size_t rawsz; 702 struct event *ep; 703 char *tmpbuf = alloca(OBBUFNMSZ); 704 char *sepptr; 705 char *estr; 706 int elen; 707 struct node *epnamenp = NULL; 708 int init_size; 709 extern int alloc_total(); 710 char *reason; 711 712 /* 713 * ignore solved or closed cases 714 */ 715 if (fmd_case_solved(hdl, inprogress) || 716 fmd_case_closed(hdl, inprogress)) 717 return; 718 719 fmep = alloc_fme(); 720 fmep->fmcase = inprogress; 721 fmep->hdl = hdl; 722 723 if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) { 724 out(O_ALTFP, "restart_fme: no saved posted status"); 725 Undiag_reason = UD_VAL_MISSINGINFO; 726 goto badcase; 727 } else { 728 fmd_buf_read(hdl, inprogress, WOBUF_POSTD, 729 (void *)&fmep->posted_suspects, 730 sizeof (fmep->posted_suspects)); 731 } 732 733 if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) { 734 out(O_ALTFP, "restart_fme: no saved id"); 735 Undiag_reason = UD_VAL_MISSINGINFO; 736 goto badcase; 737 } else { 738 fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id, 739 sizeof (fmep->id)); 740 } 741 if (Nextid <= fmep->id) 742 Nextid = fmep->id + 1; 743 744 out(O_ALTFP, "Replay FME %d", fmep->id); 745 746 if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) { 
747 out(O_ALTFP, "restart_fme: No config data"); 748 Undiag_reason = UD_VAL_MISSINGINFO; 749 goto badcase; 750 } 751 fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz, 752 sizeof (size_t)); 753 754 if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) { 755 out(O_ALTFP, "restart_fme: No event zero"); 756 Undiag_reason = UD_VAL_MISSINGZERO; 757 goto badcase; 758 } 759 760 if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) { 761 out(O_ALTFP, "restart_fme: no saved wait time"); 762 Undiag_reason = UD_VAL_MISSINGINFO; 763 goto badcase; 764 } else { 765 fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull, 766 sizeof (fmep->pull)); 767 } 768 769 if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) { 770 out(O_ALTFP, "restart_fme: no count of observations"); 771 Undiag_reason = UD_VAL_MISSINGINFO; 772 goto badcase; 773 } else { 774 fmd_buf_read(hdl, inprogress, WOBUF_NOBS, 775 (void *)&fmep->uniqobs, sizeof (fmep->uniqobs)); 776 } 777 778 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed0"); 779 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 780 if (elen == 0) { 781 out(O_ALTFP, "reconstitute_observation: no %s buffer found.", 782 tmpbuf); 783 Undiag_reason = UD_VAL_MISSINGOBS; 784 goto badcase; 785 } 786 estr = MALLOC(elen); 787 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 788 sepptr = strchr(estr, '@'); 789 if (sepptr == NULL) { 790 out(O_ALTFP, "reconstitute_observation: %s: " 791 "missing @ separator in %s.", 792 tmpbuf, estr); 793 Undiag_reason = UD_VAL_MISSINGPATH; 794 FREE(estr); 795 goto badcase; 796 } 797 *sepptr = '\0'; 798 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 799 out(O_ALTFP, "reconstitute_observation: %s: " 800 "trouble converting path string \"%s\" " 801 "to internal representation.", tmpbuf, sepptr + 1); 802 Undiag_reason = UD_VAL_MISSINGPATH; 803 FREE(estr); 804 goto badcase; 805 } 806 (void) prune_propagations(stable(estr), ipath(epnamenp)); 807 tree_free(epnamenp); 808 FREE(estr); 809 
810 init_size = alloc_total(); 811 out(O_ALTFP|O_STAMP, "start config_restore using %d bytes", init_size); 812 cfgdata = MALLOC(sizeof (struct cfgdata)); 813 cfgdata->cooked = NULL; 814 cfgdata->devcache = NULL; 815 cfgdata->devidcache = NULL; 816 cfgdata->tpcache = NULL; 817 cfgdata->cpucache = NULL; 818 cfgdata->raw_refcnt = 1; 819 820 if (rawsz > 0) { 821 if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) { 822 out(O_ALTFP, "restart_fme: Config data size mismatch"); 823 Undiag_reason = UD_VAL_CFGMISMATCH; 824 goto badcase; 825 } 826 cfgdata->begin = MALLOC(rawsz); 827 cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz; 828 fmd_buf_read(hdl, 829 inprogress, WOBUF_CFG, cfgdata->begin, rawsz); 830 } else { 831 cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL; 832 } 833 834 config_cook(cfgdata); 835 fmep->config = cfgdata->cooked; 836 config_free(cfgdata); 837 out(O_ALTFP|O_STAMP, "config_restore added %d bytes", 838 alloc_total() - init_size); 839 840 if ((fmep->eventtree = itree_create(fmep->config)) == NULL) { 841 /* case not properly saved or irretrievable */ 842 out(O_ALTFP, "restart_fme: NULL instance tree"); 843 Undiag_reason = UD_VAL_INSTFAIL; 844 goto badcase; 845 } 846 847 itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree); 848 849 if (reconstitute_observations(fmep) != 0) 850 goto badcase; 851 852 out(O_ALTFP|O_NONL, "FME %d replay observations: ", fmep->id); 853 for (ep = fmep->observations; ep; ep = ep->observations) { 854 out(O_ALTFP|O_NONL, " "); 855 itree_pevent_brief(O_ALTFP|O_NONL, ep); 856 } 857 out(O_ALTFP, NULL); 858 859 Open_fme_count++; 860 861 /* give the diagnosis algorithm a shot at the new FME state */ 862 fme_eval(fmep, fmep->e0r); 863 return; 864 865 badcase: 866 if (fmep->eventtree != NULL) 867 itree_free(fmep->eventtree); 868 if (fmep->config) 869 structconfig_free(fmep->config); 870 destroy_fme_bufs(fmep); 871 FREE(fmep); 872 873 /* 874 * Since we're unable to restart the case, add it to the undiagable 875 * list and 
solve and close it as appropriate. 876 */ 877 bad = MALLOC(sizeof (struct case_list)); 878 bad->next = NULL; 879 880 if (Undiagablecaselist != NULL) 881 bad->next = Undiagablecaselist; 882 Undiagablecaselist = bad; 883 bad->fmcase = inprogress; 884 885 out(O_ALTFP|O_NONL, "[case %s (unable to restart), ", 886 fmd_case_uuid(hdl, bad->fmcase)); 887 888 if (fmd_case_solved(hdl, bad->fmcase)) { 889 out(O_ALTFP|O_NONL, "already solved, "); 890 } else { 891 out(O_ALTFP|O_NONL, "solving, "); 892 defect = fmd_nvl_create_fault(hdl, 893 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 894 reason = undiag_2reason_str(Undiag_reason, NULL); 895 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 896 FREE(reason); 897 fmd_case_add_suspect(hdl, bad->fmcase, defect); 898 fmd_case_solve(hdl, bad->fmcase); 899 Undiag_reason = UD_VAL_UNKNOWN; 900 } 901 902 if (fmd_case_closed(hdl, bad->fmcase)) { 903 out(O_ALTFP, "already closed ]"); 904 } else { 905 out(O_ALTFP, "closing ]"); 906 fmd_case_close(hdl, bad->fmcase); 907 } 908 } 909 910 /*ARGSUSED*/ 911 static void 912 globals_destructor(void *left, void *right, void *arg) 913 { 914 struct evalue *evp = (struct evalue *)right; 915 if (evp->t == NODEPTR) 916 tree_free((struct node *)(uintptr_t)evp->v); 917 evp->v = (uintptr_t)NULL; 918 FREE(evp); 919 } 920 921 void 922 destroy_fme(struct fme *f) 923 { 924 stats_delete(f->Rcount); 925 stats_delete(f->Hcallcount); 926 stats_delete(f->Rcallcount); 927 stats_delete(f->Ccallcount); 928 stats_delete(f->Ecallcount); 929 stats_delete(f->Tcallcount); 930 stats_delete(f->Marrowcount); 931 stats_delete(f->diags); 932 933 if (f->eventtree != NULL) 934 itree_free(f->eventtree); 935 if (f->config) 936 structconfig_free(f->config); 937 lut_free(f->globals, globals_destructor, NULL); 938 FREE(f); 939 } 940 941 static const char * 942 fme_state2str(enum fme_state s) 943 { 944 switch (s) { 945 case FME_NOTHING: return ("NOTHING"); 946 case FME_WAIT: return ("WAIT"); 947 case FME_CREDIBLE: 
return ("CREDIBLE"); 948 case FME_DISPROVED: return ("DISPROVED"); 949 case FME_DEFERRED: return ("DEFERRED"); 950 default: return ("UNKNOWN"); 951 } 952 } 953 954 static int 955 is_problem(enum nametype t) 956 { 957 return (t == N_FAULT || t == N_DEFECT || t == N_UPSET); 958 } 959 960 static int 961 is_defect(enum nametype t) 962 { 963 return (t == N_DEFECT); 964 } 965 966 static int 967 is_upset(enum nametype t) 968 { 969 return (t == N_UPSET); 970 } 971 972 static void 973 fme_print(int flags, struct fme *fmep) 974 { 975 struct event *ep; 976 977 out(flags, "Fault Management Exercise %d", fmep->id); 978 out(flags, "\t State: %s", fme_state2str(fmep->state)); 979 out(flags|O_NONL, "\t Start time: "); 980 ptree_timeval(flags|O_NONL, &fmep->ull); 981 out(flags, NULL); 982 if (fmep->wull) { 983 out(flags|O_NONL, "\t Wait time: "); 984 ptree_timeval(flags|O_NONL, &fmep->wull); 985 out(flags, NULL); 986 } 987 out(flags|O_NONL, "\t E0: "); 988 if (fmep->e0) 989 itree_pevent_brief(flags|O_NONL, fmep->e0); 990 else 991 out(flags|O_NONL, "NULL"); 992 out(flags, NULL); 993 out(flags|O_NONL, "\tObservations:"); 994 for (ep = fmep->observations; ep; ep = ep->observations) { 995 out(flags|O_NONL, " "); 996 itree_pevent_brief(flags|O_NONL, ep); 997 } 998 out(flags, NULL); 999 out(flags|O_NONL, "\tSuspect list:"); 1000 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1001 out(flags|O_NONL, " "); 1002 itree_pevent_brief(flags|O_NONL, ep); 1003 } 1004 out(flags, NULL); 1005 if (fmep->eventtree != NULL) { 1006 out(flags|O_VERB2, "\t Tree:"); 1007 itree_ptree(flags|O_VERB2, fmep->eventtree); 1008 } 1009 } 1010 1011 static struct node * 1012 pathstring2epnamenp(char *path) 1013 { 1014 char *sep = "/"; 1015 struct node *ret; 1016 char *ptr; 1017 1018 if ((ptr = strtok(path, sep)) == NULL) 1019 out(O_DIE, "pathstring2epnamenp: invalid empty class"); 1020 1021 ret = tree_iname(stable(ptr), NULL, 0); 1022 1023 while ((ptr = strtok(NULL, sep)) != NULL) 1024 ret = tree_name_append(ret, 
1025 tree_iname(stable(ptr), NULL, 0)); 1026 1027 return (ret); 1028 } 1029 1030 /* 1031 * for a given upset sp, increment the corresponding SERD engine. if the 1032 * SERD engine trips, return the ename and ipp of the resulting ereport. 1033 * returns true if engine tripped and *enamep and *ippp were filled in. 1034 */ 1035 static int 1036 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep, 1037 fmd_case_t *fmcase, struct event *sp, const char **enamep, 1038 const struct ipath **ippp) 1039 { 1040 struct node *serdinst; 1041 char *serdname; 1042 char *serdresource; 1043 char *serdclass; 1044 struct node *nid; 1045 struct serd_entry *newentp; 1046 int i, serdn = -1, serdincrement = 1, len = 0; 1047 char *serdsuffix = NULL, *serdt = NULL; 1048 struct evalue *ep; 1049 1050 ASSERT(sp->t == N_UPSET); 1051 ASSERT(ffep != NULL); 1052 1053 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1054 (void *)"n", (lut_cmp)strcmp)) != NULL) { 1055 ASSERT(ep->t == UINT64); 1056 serdn = (int)ep->v; 1057 } 1058 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1059 (void *)"t", (lut_cmp)strcmp)) != NULL) { 1060 ASSERT(ep->t == STRING); 1061 serdt = (char *)(uintptr_t)ep->v; 1062 } 1063 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1064 (void *)"suffix", (lut_cmp)strcmp)) != NULL) { 1065 ASSERT(ep->t == STRING); 1066 serdsuffix = (char *)(uintptr_t)ep->v; 1067 } 1068 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1069 (void *)"increment", (lut_cmp)strcmp)) != NULL) { 1070 ASSERT(ep->t == UINT64); 1071 serdincrement = (int)ep->v; 1072 } 1073 1074 /* 1075 * obtain instanced SERD engine from the upset sp. from this 1076 * derive serdname, the string used to identify the SERD engine. 
1077 */ 1078 serdinst = eventprop_lookup(sp, L_engine); 1079 1080 if (serdinst == NULL) 1081 return (-1); 1082 1083 len = strlen(serdinst->u.stmt.np->u.event.ename->u.name.s) + 1; 1084 if (serdsuffix != NULL) 1085 len += strlen(serdsuffix); 1086 serdclass = MALLOC(len); 1087 if (serdsuffix != NULL) 1088 (void) snprintf(serdclass, len, "%s%s", 1089 serdinst->u.stmt.np->u.event.ename->u.name.s, serdsuffix); 1090 else 1091 (void) snprintf(serdclass, len, "%s", 1092 serdinst->u.stmt.np->u.event.ename->u.name.s); 1093 serdresource = ipath2str(NULL, 1094 ipath(serdinst->u.stmt.np->u.event.epname)); 1095 len += strlen(serdresource) + 1; 1096 serdname = MALLOC(len); 1097 (void) snprintf(serdname, len, "%s@%s", serdclass, serdresource); 1098 FREE(serdresource); 1099 1100 /* handle serd engine "id" property, if there is one */ 1101 if ((nid = 1102 lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) { 1103 struct evalue *gval; 1104 char suffixbuf[200]; 1105 char *suffix; 1106 char *nserdname; 1107 size_t nname; 1108 1109 out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname); 1110 ptree_name_iter(O_ALTFP|O_NONL, nid); 1111 1112 ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t)); 1113 1114 if ((gval = lut_lookup(fmep->globals, 1115 (void *)nid->u.globid.s, NULL)) == NULL) { 1116 out(O_ALTFP, " undefined"); 1117 } else if (gval->t == UINT64) { 1118 out(O_ALTFP, " %llu", gval->v); 1119 (void) sprintf(suffixbuf, "%llu", gval->v); 1120 suffix = suffixbuf; 1121 } else { 1122 out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v); 1123 suffix = (char *)(uintptr_t)gval->v; 1124 } 1125 1126 nname = strlen(serdname) + strlen(suffix) + 2; 1127 nserdname = MALLOC(nname); 1128 (void) snprintf(nserdname, nname, "%s:%s", serdname, suffix); 1129 FREE(serdname); 1130 serdname = nserdname; 1131 } 1132 1133 /* 1134 * if the engine is empty, and we have an override for n/t then 1135 * destroy and recreate it. 
1136 */ 1137 if ((serdn != -1 || serdt != NULL) && fmd_serd_exists(hdl, serdname) && 1138 fmd_serd_empty(hdl, serdname)) 1139 fmd_serd_destroy(hdl, serdname); 1140 1141 if (!fmd_serd_exists(hdl, serdname)) { 1142 struct node *nN, *nT; 1143 const char *s; 1144 struct node *nodep; 1145 struct config *cp; 1146 char *path; 1147 uint_t nval; 1148 hrtime_t tval; 1149 int i; 1150 char *ptr; 1151 int got_n_override = 0, got_t_override = 0; 1152 1153 /* no SERD engine yet, so create it */ 1154 nodep = serdinst->u.stmt.np->u.event.epname; 1155 path = ipath2str(NULL, ipath(nodep)); 1156 cp = config_lookup(fmep->config, path, 0); 1157 FREE((void *)path); 1158 1159 /* 1160 * We allow serd paramaters to be overridden, either from 1161 * eft.conf file values (if Serd_Override is set) or from 1162 * driver properties (for "serd.io.device" engines). 1163 */ 1164 if (Serd_Override != NULL) { 1165 char *save_ptr, *ptr1, *ptr2, *ptr3; 1166 ptr3 = save_ptr = STRDUP(Serd_Override); 1167 while (*ptr3 != '\0') { 1168 ptr1 = strchr(ptr3, ','); 1169 *ptr1 = '\0'; 1170 if (strcmp(ptr3, serdclass) == 0) { 1171 ptr2 = strchr(ptr1 + 1, ','); 1172 *ptr2 = '\0'; 1173 nval = atoi(ptr1 + 1); 1174 out(O_ALTFP, "serd override %s_n %d", 1175 serdclass, nval); 1176 ptr3 = strchr(ptr2 + 1, ' '); 1177 if (ptr3) 1178 *ptr3 = '\0'; 1179 ptr = STRDUP(ptr2 + 1); 1180 out(O_ALTFP, "serd override %s_t %s", 1181 serdclass, ptr); 1182 got_n_override = 1; 1183 got_t_override = 1; 1184 break; 1185 } else { 1186 ptr2 = strchr(ptr1 + 1, ','); 1187 ptr3 = strchr(ptr2 + 1, ' '); 1188 if (ptr3 == NULL) 1189 break; 1190 } 1191 ptr3++; 1192 } 1193 FREE(save_ptr); 1194 } 1195 1196 if (cp && got_n_override == 0) { 1197 /* 1198 * convert serd engine class into property name 1199 */ 1200 char *prop_name = MALLOC(strlen(serdclass) + 3); 1201 for (i = 0; i < strlen(serdclass); i++) { 1202 if (serdclass[i] == '.') 1203 prop_name[i] = '_'; 1204 else 1205 prop_name[i] = serdclass[i]; 1206 } 1207 prop_name[i++] = '_'; 1208 
prop_name[i++] = 'n'; 1209 prop_name[i] = '\0'; 1210 if (s = config_getprop(cp, prop_name)) { 1211 nval = atoi(s); 1212 out(O_ALTFP, "serd override %s_n %s", 1213 serdclass, s); 1214 got_n_override = 1; 1215 } 1216 prop_name[i - 1] = 't'; 1217 if (s = config_getprop(cp, prop_name)) { 1218 ptr = STRDUP(s); 1219 out(O_ALTFP, "serd override %s_t %s", 1220 serdclass, s); 1221 got_t_override = 1; 1222 } 1223 FREE(prop_name); 1224 } 1225 1226 if (serdn != -1 && got_n_override == 0) { 1227 nval = serdn; 1228 out(O_ALTFP, "serd override %s_n %d", serdclass, serdn); 1229 got_n_override = 1; 1230 } 1231 if (serdt != NULL && got_t_override == 0) { 1232 ptr = STRDUP(serdt); 1233 out(O_ALTFP, "serd override %s_t %s", serdclass, serdt); 1234 got_t_override = 1; 1235 } 1236 1237 if (!got_n_override) { 1238 nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N, 1239 NULL); 1240 ASSERT(nN->t == T_NUM); 1241 nval = (uint_t)nN->u.ull; 1242 } 1243 if (!got_t_override) { 1244 nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T, 1245 NULL); 1246 ASSERT(nT->t == T_TIMEVAL); 1247 tval = (hrtime_t)nT->u.ull; 1248 } else { 1249 const unsigned long long *ullp; 1250 const char *suffix; 1251 int len; 1252 1253 len = strspn(ptr, "0123456789"); 1254 suffix = stable(&ptr[len]); 1255 ullp = (unsigned long long *)lut_lookup(Timesuffixlut, 1256 (void *)suffix, NULL); 1257 ptr[len] = '\0'; 1258 tval = strtoull(ptr, NULL, 0) * (ullp ? 
*ullp : 1ll); 1259 FREE(ptr); 1260 } 1261 fmd_serd_create(hdl, serdname, nval, tval); 1262 } 1263 1264 newentp = MALLOC(sizeof (*newentp)); 1265 newentp->ename = stable(serdclass); 1266 FREE(serdclass); 1267 newentp->ipath = ipath(serdinst->u.stmt.np->u.event.epname); 1268 newentp->hdl = hdl; 1269 if (lut_lookup(SerdEngines, newentp, (lut_cmp)serd_cmp) == NULL) { 1270 SerdEngines = lut_add(SerdEngines, (void *)newentp, 1271 (void *)newentp, (lut_cmp)serd_cmp); 1272 Serd_need_save = 1; 1273 serd_save(); 1274 } else { 1275 FREE(newentp); 1276 } 1277 1278 1279 /* 1280 * increment SERD engine. if engine fires, reset serd 1281 * engine and return trip_strcode if required. 1282 */ 1283 for (i = 0; i < serdincrement; i++) { 1284 if (fmd_serd_record(hdl, serdname, ffep)) { 1285 fmd_case_add_serd(hdl, fmcase, serdname); 1286 fmd_serd_reset(hdl, serdname); 1287 1288 if (ippp) { 1289 struct node *tripinst = 1290 lut_lookup(serdinst->u.stmt.lutp, 1291 (void *)L_trip, NULL); 1292 ASSERT(tripinst != NULL); 1293 *enamep = tripinst->u.event.ename->u.name.s; 1294 *ippp = ipath(tripinst->u.event.epname); 1295 out(O_ALTFP|O_NONL, 1296 "[engine fired: %s, sending: ", serdname); 1297 ipath_print(O_ALTFP|O_NONL, *enamep, *ippp); 1298 out(O_ALTFP, "]"); 1299 } else { 1300 out(O_ALTFP, "[engine fired: %s, no trip]", 1301 serdname); 1302 } 1303 FREE(serdname); 1304 return (1); 1305 } 1306 } 1307 1308 FREE(serdname); 1309 return (0); 1310 } 1311 1312 /* 1313 * search a suspect list for upsets. feed each upset to serd_eval() and 1314 * build up tripped[], an array of ereports produced by the firing of 1315 * any SERD engines. then feed each ereport back into 1316 * fme_receive_report(). 1317 * 1318 * returns ntrip, the number of these ereports produced. 
 */
static int
upsets_eval(struct fme *fmep, fmd_event_t *ffep)
{
	/* we build an array of tripped ereports that we send ourselves */
	struct {
		const char *ename;
		const struct ipath *ipp;
	} *tripped;
	struct event *sp;
	int ntrip, nupset, i;

	/*
	 * count the number of upsets to determine the upper limit on
	 * expected trip ereport strings.  remember that one upset can
	 * lead to at most one ereport.
	 */
	nupset = 0;
	for (sp = fmep->suspects; sp; sp = sp->suspects) {
		if (sp->t == N_UPSET)
			nupset++;
	}

	if (nupset == 0)
		return (0);

	/*
	 * get to this point if we have upsets and expect some trip
	 * ereports
	 */
	tripped = alloca(sizeof (*tripped) * nupset);
	bzero((void *)tripped, sizeof (*tripped) * nupset);

	/* serd_eval() returns 1 only when an engine actually tripped */
	ntrip = 0;
	for (sp = fmep->suspects; sp; sp = sp->suspects)
		if (sp->t == N_UPSET &&
		    serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp,
		    &tripped[ntrip].ename, &tripped[ntrip].ipp) == 1)
			ntrip++;

	for (i = 0; i < ntrip; i++) {
		struct event *ep, *nep;
		struct fme *nfmep;
		fmd_case_t *fmcase;
		const struct ipath *ipp;
		const char *eventstring;
		int prev_verbose;
		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
		enum fme_state state;

		/*
		 * First try and evaluate a case with the trip ereport plus
		 * all the other ereports that cause the trip. If that fails
		 * to evaluate then try again with just this ereport on its own.
		 */
		out(O_ALTFP|O_NONL, "fme_receive_report_serd: ");
		ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp);
		out(O_ALTFP|O_STAMP, NULL);
		ep = fmep->e0;
		eventstring = ep->enode->u.event.ename->u.name.s;
		ipp = ep->ipp;

		/*
		 * create a duplicate fme and case
		 */
		fmcase = fmd_case_open(fmep->hdl, NULL);
		out(O_ALTFP|O_NONL, "duplicate fme for event [");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " ]");

		if ((nfmep = newfme(eventstring, ipp, fmep->hdl,
		    fmcase, ffep, ep->nvp)) == NULL) {
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " CANNOT DIAGNOSE]");
			continue;
		}

		Open_fme_count++;
		nfmep->pull = fmep->pull;
		init_fme_bufs(nfmep);
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " created FME%d, case %s]", nfmep->id,
		    fmd_case_uuid(nfmep->hdl, nfmep->fmcase));
		if (ffep) {
			fmd_case_setprincipal(nfmep->hdl, nfmep->fmcase, ffep);
			fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, ffep);
			nfmep->e0r = ffep;
		}

		/*
		 * add the original ereports
		 */
		for (ep = fmep->observations; ep; ep = ep->observations) {
			eventstring = ep->enode->u.event.ename->u.name.s;
			ipp = ep->ipp;
			out(O_ALTFP|O_NONL, "adding event [");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " ]");
			nep = itree_lookup(nfmep->eventtree, eventstring, ipp);
			if (nep->count++ == 0) {
				/* first observation: link into new FME */
				nep->observations = nfmep->observations;
				nfmep->observations = nep;
				serialize_observation(nfmep, eventstring, ipp);
				nep->nvp = evnv_dupnvl(ep->nvp);
			}
			if (ep->ffep && ep->ffep != ffep)
				fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase,
				    ep->ffep);
			stats_counter_bump(nfmep->Rcount);
		}

		/*
		 * add the serd trigger ereport
		 */
		if ((ep = itree_lookup(nfmep->eventtree, tripped[i].ename,
		    tripped[i].ipp)) == NULL) {
			/*
			 * The trigger ereport is not in the instance tree. It
			 * was presumably removed by prune_propagations() as
			 * this combination of events is not present in the
			 * rules.
			 */
			out(O_ALTFP, "upsets_eval: e0 not in instance tree");
			Undiag_reason = UD_VAL_BADEVENTI;
			goto retry_lone_ereport;
		}
		out(O_ALTFP|O_NONL, "adding event [");
		ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp);
		out(O_ALTFP, " ]");
		nfmep->ecurrent = ep;
		ep->nvp = NULL;
		ep->count = 1;
		ep->observations = nfmep->observations;
		nfmep->observations = ep;

		/*
		 * just peek first.
		 */
		nfmep->peek = 1;
		prev_verbose = Verbose;
		if (Debug == 0)
			Verbose = 0;
		lut_walk(nfmep->eventtree, (lut_cb)clear_arrows, (void *)nfmep);
		state = hypothesise(nfmep, nfmep->e0, nfmep->ull, &my_delay);
		nfmep->peek = 0;
		Verbose = prev_verbose;
		if (state == FME_DISPROVED) {
			out(O_ALTFP, "upsets_eval: hypothesis disproved");
			Undiag_reason = UD_VAL_UNSOLVD;
retry_lone_ereport:
			/*
			 * However the trigger ereport on its own might be
			 * diagnosable, so check for that. Undo the new fme
			 * and case we just created and call fme_receive_report.
			 */
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, tripped[i].ename,
			    tripped[i].ipp);
			out(O_ALTFP, " retrying with just trigger ereport]");
			itree_free(nfmep->eventtree);
			nfmep->eventtree = NULL;
			structconfig_free(nfmep->config);
			nfmep->config = NULL;
			destroy_fme_bufs(nfmep);
			fmd_case_close(nfmep->hdl, nfmep->fmcase);
			fme_receive_report(fmep->hdl, ffep,
			    tripped[i].ename, tripped[i].ipp, NULL);
			continue;
		}

		/*
		 * and evaluate
		 */
		serialize_observation(nfmep, tripped[i].ename, tripped[i].ipp);
		fme_eval(nfmep, ffep);
	}

	return (ntrip);
}

/*
 * fme_receive_external_report -- call when an external ereport comes in
 *
 * this routine just converts the relevant information from the ereport
 * into a format used internally and passes it on to fme_receive_report().
 */
void
fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
    const char *class)
{
	struct node *epnamenp;
	fmd_case_t *fmcase;
	const struct ipath *ipp;
	nvlist_t *detector = NULL;

	class = stable(class);

	/* Get the component path from the ereport */
	epnamenp = platform_getpath(nvl);

	/* See if we ended up without a path. */
	if (epnamenp == NULL) {
		/* See if class permits silent discard on unknown component. */
		if (lut_lookup(Ereportenames_discard, (void *)class, NULL)) {
			out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport "
			    "to component path, but silent discard allowed.",
			    class);
		} else {
			/*
			 * XFILE: Failure to find a component is bad unless
			 * 'discard_if_config_unknown=1' was specified in the
			 * ereport definition. Indicate undiagnosable.
			 */
			Undiag_reason = UD_VAL_NOPATH;
			fmcase = fmd_case_open(hdl, NULL);

			/*
			 * We don't have a component path here (which means that
			 * the detector was not in hc-scheme and couldn't be
			 * converted to hc-scheme. Report the raw detector as
			 * the suspect resource if there is one.
			 */
			(void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR,
			    &detector);
			publish_undiagnosable(hdl, ffep, fmcase, detector,
			    (char *)class);
		}
		return;
	}

	/* path found: intern it, free the parse tree, hand off */
	ipp = ipath(epnamenp);
	tree_free(epnamenp);
	fme_receive_report(hdl, ffep, class, ipp, nvl);
}

/*
 * fme_receive_repair_list -- handle a list.repaired event by resetting
 * any istat counters and SERD engines associated with each repaired
 * fault's component path.
 */
/*ARGSUSED*/
void
fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
    const char *eventstring)
{
	char *uuid;
	nvlist_t **nva;
	uint_t nvc;
	const struct ipath *ipp;

	if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 ||
	    nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
	    &nva, &nvc) != 0) {
		out(O_ALTFP, "No uuid or fault list for list.repaired event");
		return;
	}

	out(O_ALTFP, "Processing list.repaired from case %s", uuid);

	while (nvc-- != 0) {
		/*
		 * Reset any istat or serd engine associated with this path.
		 */
		char *path;

		if ((ipp = platform_fault2ipath(*nva++)) == NULL)
			continue;

		path = ipath2str(NULL, ipp);
		out(O_ALTFP, "fme_receive_repair_list: resetting state for %s",
		    path);
		FREE(path);

		lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp);
		istat_save();

		lut_walk(SerdEngines, (lut_cb)serd_reset_cb, (void *)ipp);
		serd_save();
	}
}

/*
 * fme_receive_topology_change -- reset istat counters and SERD engines
 * whose paths may have become stale after a topology change.
 */
/*ARGSUSED*/
void
fme_receive_topology_change(void)
{
	lut_walk(Istats, (lut_cb)istat_counter_topo_chg_cb, NULL);
	istat_save();

	lut_walk(SerdEngines, (lut_cb)serd_topo_chg_cb, NULL);
	serd_save();
}

static int mark_arrows(struct fme *fmep, struct event *ep, int mark,
    unsigned long long at_latest_by, unsigned long long *pdelay, int keep);

/*
 * clear_arrows -- lut_walk callback: clear cached hypothesise() state on
 * an event and the marks on all arrows leaving its B_FROM bubbles.
 */
/* ARGSUSED */
static void
clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
{
	struct bubble *bp;
	struct arrowlist *ap;

	ep->cached_state = 0;
	ep->keep_in_tree = 0;
	for (bp = itree_next_bubble(ep, NULL); bp;
	    bp = itree_next_bubble(ep, bp)) {
		if (bp->t != B_FROM)
			continue;
		bp->mark = 0;
		for (ap = itree_next_arrow(bp, NULL); ap;
		    ap = itree_next_arrow(bp, ap))
			ap->arrowp->mark = 0;
	}
}

/*
 * fme_receive_report -- core dispatch for an incoming (internal or
 * external) ereport: try each open FME, else create a new one.
 */
static void
fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
    const char *eventstring, const struct ipath *ipp, nvlist_t *nvl)
{
	struct event *ep;
	struct fme *fmep = NULL;
	struct fme *ofmep = NULL;
	struct fme *cfmep, *svfmep;
	int matched = 0;
	nvlist_t *defect;
	fmd_case_t *fmcase;
	char *reason;

	out(O_ALTFP|O_NONL, "fme_receive_report: ");
	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
	out(O_ALTFP|O_STAMP, NULL);

	/* decide which FME it goes to */
	for (fmep = FMElist; fmep; fmep = fmep->next) {
		int prev_verbose;
		unsigned long long my_delay =
		    TIMEVAL_EVENTUALLY;
		enum fme_state state;
		nvlist_t *pre_peek_nvp = NULL;

		/* remember the open overflow FME, if any, for later reuse */
		if (fmep->overflow) {
			if (!(fmd_case_closed(fmep->hdl, fmep->fmcase)))
				ofmep = fmep;

			continue;
		}

		/*
		 * ignore solved or closed cases
		 */
		if (fmep->posted_suspects ||
		    fmd_case_solved(fmep->hdl, fmep->fmcase) ||
		    fmd_case_closed(fmep->hdl, fmep->fmcase))
			continue;

		/* look up event in event tree for this FME */
		if ((ep = itree_lookup(fmep->eventtree,
		    eventstring, ipp)) == NULL)
			continue;

		/* note observation */
		fmep->ecurrent = ep;
		if (ep->count++ == 0) {
			/* link it into list of observations seen */
			ep->observations = fmep->observations;
			fmep->observations = ep;
			ep->nvp = evnv_dupnvl(nvl);
		} else {
			/* use new payload values for peek */
			pre_peek_nvp = ep->nvp;
			ep->nvp = evnv_dupnvl(nvl);
		}

		/* tell hypothesise() not to mess with suspect list */
		fmep->peek = 1;

		/* don't want this to be verbose (unless Debug is set) */
		prev_verbose = Verbose;
		if (Debug == 0)
			Verbose = 0;

		lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
		state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);

		fmep->peek = 0;

		/* put verbose flag back */
		Verbose = prev_verbose;

		if (state != FME_DISPROVED) {
			/* found an FME that explains the ereport */
			matched++;
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " explained by FME%d]", fmep->id);

			nvlist_free(pre_peek_nvp);

			if (ep->count == 1)
				serialize_observation(fmep, eventstring, ipp);

			if (ffep) {
				fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
				ep->ffep = ffep;
			}

			stats_counter_bump(fmep->Rcount);

			/* re-eval FME */
			fme_eval(fmep, ffep);
		} else {

			/* not a match, undo noting of observation */
			fmep->ecurrent = NULL;
			if (--ep->count == 0) {
				/* unlink it from observations */
				fmep->observations = ep->observations;
				ep->observations = NULL;
				nvlist_free(ep->nvp);
				ep->nvp = NULL;
			} else {
				/* restore payload saved before the peek */
				nvlist_free(ep->nvp);
				ep->nvp = pre_peek_nvp;
			}
		}
	}

	if (matched)
		return;	/* explained by at least one existing FME */

	/* clean up closed fmes */
	cfmep = ClosedFMEs;
	while (cfmep != NULL) {
		svfmep = cfmep->next;
		destroy_fme(cfmep);
		cfmep = svfmep;
	}
	ClosedFMEs = NULL;

	if (ofmep) {
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " ADDING TO OVERFLOW FME]");
		if (ffep)
			fmd_case_add_ereport(hdl, ofmep->fmcase, ffep);

		return;

	} else if (Max_fme && (Open_fme_count >= Max_fme)) {
		/* too many open FMEs: divert into a new "overflow" FME */
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " MAX OPEN FME REACHED]");

		fmcase = fmd_case_open(hdl, NULL);

		/* Create overflow fme */
		if ((fmep = newfme(eventstring, ipp, hdl, fmcase, ffep,
		    nvl)) == NULL) {
			out(O_ALTFP|O_NONL, "[");
			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
			out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]");
			return;
		}

		Open_fme_count++;

		init_fme_bufs(fmep);
		fmep->overflow = B_TRUE;

		if (ffep)
			fmd_case_add_ereport(hdl, fmep->fmcase, ffep);

		/* solve the overflow case immediately as an undiagnosable */
		Undiag_reason = UD_VAL_MAXFME;
		defect = fmd_nvl_create_fault(hdl,
		    undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL);
		reason = undiag_2reason_str(Undiag_reason, NULL);
		(void) nvlist_add_string(defect, UNDIAG_REASON, reason);
		FREE(reason);
		fmd_case_add_suspect(hdl, fmep->fmcase, defect);
		fmd_case_solve(hdl, fmep->fmcase);
		Undiag_reason = UD_VAL_UNKNOWN;
		return;
	}

	/* open a case */
	fmcase =
	    fmd_case_open(hdl, NULL);

	/* start a new FME */
	if ((fmep = newfme(eventstring, ipp, hdl, fmcase, ffep, nvl)) == NULL) {
		out(O_ALTFP|O_NONL, "[");
		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
		out(O_ALTFP, " CANNOT DIAGNOSE]");
		return;
	}

	Open_fme_count++;

	init_fme_bufs(fmep);

	out(O_ALTFP|O_NONL, "[");
	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
	out(O_ALTFP, " created FME%d, case %s]", fmep->id,
	    fmd_case_uuid(hdl, fmep->fmcase));

	ep = fmep->e0;
	ASSERT(ep != NULL);

	/* note observation */
	fmep->ecurrent = ep;
	if (ep->count++ == 0) {
		/* link it into list of observations seen */
		ep->observations = fmep->observations;
		fmep->observations = ep;
		ep->nvp = evnv_dupnvl(nvl);
		serialize_observation(fmep, eventstring, ipp);
	} else {
		/* new payload overrides any previous */
		nvlist_free(ep->nvp);
		ep->nvp = evnv_dupnvl(nvl);
	}

	stats_counter_bump(fmep->Rcount);

	if (ffep) {
		fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
		fmd_case_setprincipal(hdl, fmep->fmcase, ffep);
		fmep->e0r = ffep;
		ep->ffep = ffep;
	}

	/* give the diagnosis algorithm a shot at the new FME state */
	fme_eval(fmep, ffep);
}

/*
 * fme_status -- print all FMEs currently underway (debug aid).
 */
void
fme_status(int flags)
{
	struct fme *fmep;

	if (FMElist == NULL) {
		out(flags, "No fault management exercises underway.");
		return;
	}

	for (fmep = FMElist; fmep; fmep = fmep->next)
		fme_print(flags, fmep);
}

/*
 * "indent" routines used mostly for nicely formatted debug output, but also
 * for sanity checking for infinite recursion bugs.
 */

#define	MAX_INDENT	1024
static const char *indent_s[MAX_INDENT];
static int current_indent;

/* push one indent string; dies if recursion exceeds MAX_INDENT levels */
static void
indent_push(const char *s)
{
	if (current_indent < MAX_INDENT)
		indent_s[current_indent++] = s;
	else
		out(O_DIE, "unexpected recursion depth (%d)", current_indent);
}

/* reset the indent stack to a single entry */
static void
indent_set(const char *s)
{
	current_indent = 0;
	indent_push(s);
}

/* pop one indent level; dies on underflow */
static void
indent_pop(void)
{
	if (current_indent > 0)
		current_indent--;
	else
		out(O_DIE, "recursion underflow");
}

/* emit the current indent prefix (only when Verbose) */
static void
indent(void)
{
	int i;
	if (!Verbose)
		return;
	for (i = 0; i < current_indent; i++)
		out(O_ALTFP|O_VERB|O_NONL, indent_s[i]);
}

/* reasons print_suspects() is being called */
#define	SLNEW		1
#define	SLCHANGED	2
#define	SLWAIT		3
#define	SLDISPROVED	4

/*
 * print_suspects -- log the current suspect list (or wait/disproved
 * status) for an FME, formatted according to "circumstance".
 */
static void
print_suspects(int circumstance, struct fme *fmep)
{
	struct event *ep;

	out(O_ALTFP|O_NONL, "[");
	if (circumstance == SLCHANGED) {
		out(O_ALTFP|O_NONL, "FME%d diagnosis changed. 
state: %s, " 1920 "suspect list:", fmep->id, fme_state2str(fmep->state)); 1921 } else if (circumstance == SLWAIT) { 1922 out(O_ALTFP|O_NONL, "FME%d set wait timer %ld ", fmep->id, 1923 fmep->timer); 1924 ptree_timeval(O_ALTFP|O_NONL, &fmep->wull); 1925 } else if (circumstance == SLDISPROVED) { 1926 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id); 1927 } else { 1928 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id); 1929 } 1930 1931 if (circumstance == SLWAIT || circumstance == SLDISPROVED) { 1932 out(O_ALTFP, "]"); 1933 return; 1934 } 1935 1936 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1937 out(O_ALTFP|O_NONL, " "); 1938 itree_pevent_brief(O_ALTFP|O_NONL, ep); 1939 } 1940 out(O_ALTFP, "]"); 1941 } 1942 1943 static struct node * 1944 eventprop_lookup(struct event *ep, const char *propname) 1945 { 1946 return (lut_lookup(ep->props, (void *)propname, NULL)); 1947 } 1948 1949 #define MAXDIGITIDX 23 1950 static char numbuf[MAXDIGITIDX + 1]; 1951 1952 static int 1953 node2uint(struct node *n, uint_t *valp) 1954 { 1955 struct evalue value; 1956 struct lut *globals = NULL; 1957 1958 if (n == NULL) 1959 return (1); 1960 1961 /* 1962 * check value.v since we are being asked to convert an unsigned 1963 * long long int to an unsigned int 1964 */ 1965 if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) || 1966 value.t != UINT64 || value.v > (1ULL << 32)) 1967 return (1); 1968 1969 *valp = (uint_t)value.v; 1970 1971 return (0); 1972 } 1973 1974 static nvlist_t * 1975 node2fmri(struct node *n) 1976 { 1977 nvlist_t **pa, *f, *p; 1978 struct node *nc; 1979 uint_t depth = 0; 1980 char *numstr, *nullbyte; 1981 char *failure; 1982 int err, i; 1983 1984 /* XXX do we need to be able to handle a non-T_NAME node? 
	 */
	if (n == NULL || n->t != T_NAME)
		return (NULL);

	/* every path component must carry a numeric instance */
	for (nc = n; nc != NULL; nc = nc->u.name.next) {
		if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM)
			break;
		depth++;
	}

	if (nc != NULL) {
		/* We bailed early, something went wrong */
		return (NULL);
	}

	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
	pa = alloca(depth * sizeof (nvlist_t *));
	for (i = 0; i < depth; i++)
		pa[i] = NULL;

	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
	if (err != 0) {
		failure = "basic construction of FMRI failed";
		goto boom;
	}

	numbuf[MAXDIGITIDX] = '\0';
	nullbyte = &numbuf[MAXDIGITIDX];
	i = 0;

	/* one hc-pair nvlist (name, id) per path component */
	for (nc = n; nc != NULL; nc = nc->u.name.next) {
		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
		if (err != 0) {
			failure = "alloc of an hc-pair failed";
			goto boom;
		}
		err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s);
		numstr = ulltostr(nc->u.name.child->u.ull, nullbyte);
		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
		if (err != 0) {
			failure = "construction of an hc-pair failed";
			goto boom;
		}
		pa[i++] = p;
	}

	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
	if (err == 0) {
		/* nvlist_add_nvlist_array copies; free our staging pairs */
		for (i = 0; i < depth; i++)
			nvlist_free(pa[i]);
		return (f);
	}
	failure = "addition of hc-pair array to FMRI failed";

boom:
	for (i = 0; i < depth; i++)
		nvlist_free(pa[i]);
	nvlist_free(f);
	out(O_DIE, "%s", failure);
	/*NOTREACHED*/
	return (NULL);
}

/* an ipath cache entry is an array of these, with s==NULL at the end */
struct
ipath {
	const char *s;			/* component name (in stable) */
	int i;				/* instance number */
};

/*
 * ipath2fmri -- build an hc-scheme FMRI nvlist from an ipath array.
 * Same construction as node2fmri(); dies on allocation failure.
 */
static nvlist_t *
ipath2fmri(struct ipath *ipath)
{
	nvlist_t **pa, *f, *p;
	uint_t depth = 0;
	char *numstr, *nullbyte;
	char *failure;
	int err, i;
	struct ipath *ipp;

	/* ipath arrays are terminated by an entry with s == NULL */
	for (ipp = ipath; ipp->s != NULL; ipp++)
		depth++;

	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
	pa = alloca(depth * sizeof (nvlist_t *));
	for (i = 0; i < depth; i++)
		pa[i] = NULL;

	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
	if (err != 0) {
		failure = "basic construction of FMRI failed";
		goto boom;
	}

	numbuf[MAXDIGITIDX] = '\0';
	nullbyte = &numbuf[MAXDIGITIDX];
	i = 0;

	/* one hc-pair nvlist (name, id) per ipath component */
	for (ipp = ipath; ipp->s != NULL; ipp++) {
		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
		if (err != 0) {
			failure = "alloc of an hc-pair failed";
			goto boom;
		}
		err = nvlist_add_string(p, FM_FMRI_HC_NAME, ipp->s);
		numstr = ulltostr(ipp->i, nullbyte);
		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
		if (err != 0) {
			failure = "construction of an hc-pair failed";
			goto boom;
		}
		pa[i++] = p;
	}

	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
	if (err == 0) {
		/* nvlist_add_nvlist_array copies; free our staging pairs */
		for (i = 0; i < depth; i++)
			nvlist_free(pa[i]);
		return (f);
	}
	failure = "addition of hc-pair array to FMRI failed";

boom:
	for (i = 0; i < depth; i++)
		nvlist_free(pa[i]);
	nvlist_free(f);
	out(O_DIE, "%s", failure);
	/*NOTREACHED*/
	return (NULL);
}

/*
 * percentof -- part as a rounded percentage of whole.
 * NOTE(review): divides by "whole" with no zero check -- callers are
 * presumably guaranteed to pass whole > 0; confirm before reuse.
 */
static uint8_t
percentof(uint_t part, uint_t
    whole)
{
	unsigned long long p = part * 1000;

	/* divide by 10 with round-half-up on the discarded digit */
	return ((p / whole / 10) + (((p / whole % 10) >= 5) ? 1 : 0));
}

/* one entry of the suspect-resource list built before publishing */
struct rsl {
	struct event *suspect;
	nvlist_t *asru;
	nvlist_t *fru;
	nvlist_t *rsrc;
};

static void publish_suspects(struct fme *fmep, struct rsl *srl);

/*
 * rslfree -- free internal members of struct rsl not expected to be
 * freed elsewhere.
 */
static void
rslfree(struct rsl *freeme)
{
	nvlist_free(freeme->asru);
	nvlist_free(freeme->fru);
	if (freeme->rsrc != freeme->asru)
		nvlist_free(freeme->rsrc);
}

/*
 * rslcmp -- compare two rsl structures.  Use the following
 * comparisons to establish cardinality:
 *
 * 1. Name of the suspect's class. (simple strcmp)
 * 2. Name of the suspect's ASRU. (trickier, since nvlist)
 *
 */
static int
rslcmp(const void *a, const void *b)
{
	struct rsl *r1 = (struct rsl *)a;
	struct rsl *r2 = (struct rsl *)b;
	int rv;

	rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s,
	    r2->suspect->enode->u.event.ename->u.name.s);
	if (rv != 0)
		return (rv);

	/* class names equal: order by resource nvlist (NULL sorts first) */
	if (r1->rsrc == NULL && r2->rsrc == NULL)
		return (0);
	if (r1->rsrc == NULL)
		return (-1);
	if (r2->rsrc == NULL)
		return (1);
	return (evnv_cmpnvl(r1->rsrc, r2->rsrc, 0));
}

/*
 * get_resources -- for a given suspect, determine what ASRU, FRU and
 * RSRC nvlists should be advertised in the final suspect list.
 */
void
get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot)
{
	struct node *asrudef, *frudef;
	const struct ipath *asrupath, *frupath;
	nvlist_t *asru = NULL, *fru = NULL;
	nvlist_t *rsrc = NULL;
	char *pathstr;

	/*
	 * First find any ASRU and/or FRU defined in the
	 * initial fault tree.
	 */
	asrudef = eventprop_lookup(sp, L_ASRU);
	frudef = eventprop_lookup(sp, L_FRU);

	/*
	 * Create ipaths based on those definitions
	 */
	asrupath = ipath(asrudef);
	frupath = ipath(frudef);

	/*
	 * Allow for platform translations of the FMRIs
	 */
	pathstr = ipath2str(NULL, sp->ipp);
	platform_unit_translate(is_defect(sp->t), croot, TOPO_PROP_RESOURCE,
	    &rsrc, pathstr);
	FREE(pathstr);

	pathstr = ipath2str(NULL, asrupath);
	platform_unit_translate(is_defect(sp->t), croot, TOPO_PROP_ASRU,
	    &asru, pathstr);
	FREE(pathstr);

	pathstr = ipath2str(NULL, frupath);
	platform_unit_translate(is_defect(sp->t), croot, TOPO_PROP_FRU,
	    &fru, pathstr);
	FREE(pathstr);

	/* hand ownership of the three nvlists to the caller's rsl entry */
	rsrcs->suspect = sp;
	rsrcs->asru = asru;
	rsrcs->fru = fru;
	rsrcs->rsrc = rsrc;
}

/*
 * trim_suspects -- prior to publishing, we may need to remove some
 * suspects from the list. If we're auto-closing upsets, we don't
 * want any of those in the published list. If the ASRUs for multiple
 * defects resolve to the same ASRU (driver) we only want to publish
 * that as a single suspect.
2237 */ 2238 static int 2239 trim_suspects(struct fme *fmep, struct rsl *begin, struct rsl *begin2, 2240 fmd_event_t *ffep) 2241 { 2242 struct event *ep; 2243 struct rsl *rp = begin; 2244 struct rsl *rp2 = begin2; 2245 int mess_zero_count = 0; 2246 int serd_rval; 2247 uint_t messval; 2248 2249 /* remove any unwanted upsets and populate our array */ 2250 for (ep = fmep->psuspects; ep; ep = ep->psuspects) { 2251 if (is_upset(ep->t)) 2252 continue; 2253 serd_rval = serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, ep, 2254 NULL, NULL); 2255 if (serd_rval == 0) 2256 continue; 2257 if (node2uint(eventprop_lookup(ep, L_message), 2258 &messval) == 0 && messval == 0) { 2259 get_resources(ep, rp2, fmep->config); 2260 rp2++; 2261 mess_zero_count++; 2262 } else { 2263 get_resources(ep, rp, fmep->config); 2264 rp++; 2265 fmep->nsuspects++; 2266 } 2267 } 2268 return (mess_zero_count); 2269 } 2270 2271 /* 2272 * addpayloadprop -- add a payload prop to a problem 2273 */ 2274 static void 2275 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault) 2276 { 2277 nvlist_t *rsrc, *hcs; 2278 2279 ASSERT(fault != NULL); 2280 ASSERT(lhs != NULL); 2281 ASSERT(rhs != NULL); 2282 2283 if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, &rsrc) != 0) 2284 out(O_DIE, "cannot add payloadprop \"%s\" to fault", lhs); 2285 2286 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) { 2287 out(O_ALTFP|O_VERB2, "addpayloadprop: create hc_specific"); 2288 if (nvlist_xalloc(&hcs, NV_UNIQUE_NAME, &Eft_nv_hdl) != 0) 2289 out(O_DIE, 2290 "cannot add payloadprop \"%s\" to fault", lhs); 2291 if (nvlist_add_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, hcs) != 0) 2292 out(O_DIE, 2293 "cannot add payloadprop \"%s\" to fault", lhs); 2294 nvlist_free(hcs); 2295 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) 2296 out(O_DIE, 2297 "cannot add payloadprop \"%s\" to fault", lhs); 2298 } else 2299 out(O_ALTFP|O_VERB2, "addpayloadprop: reuse hc_specific"); 2300 2301 if (rhs->t == UINT64) { 2302 
out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v); 2303 2304 if (nvlist_add_uint64(hcs, lhs, rhs->v) != 0) 2305 out(O_DIE, 2306 "cannot add payloadprop \"%s\" to fault", lhs); 2307 } else { 2308 out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"", 2309 lhs, (char *)(uintptr_t)rhs->v); 2310 2311 if (nvlist_add_string(hcs, lhs, (char *)(uintptr_t)rhs->v) != 0) 2312 out(O_DIE, 2313 "cannot add payloadprop \"%s\" to fault", lhs); 2314 } 2315 } 2316 2317 static char *Istatbuf; 2318 static char *Istatbufptr; 2319 static int Istatsz; 2320 2321 /* 2322 * istataddsize -- calculate size of istat and add it to Istatsz 2323 */ 2324 /*ARGSUSED2*/ 2325 static void 2326 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2327 { 2328 int val; 2329 2330 ASSERT(lhs != NULL); 2331 ASSERT(rhs != NULL); 2332 2333 if ((val = stats_counter_value(rhs)) == 0) 2334 return; /* skip zero-valued stats */ 2335 2336 /* count up the size of the stat name */ 2337 Istatsz += ipath2strlen(lhs->ename, lhs->ipath); 2338 Istatsz++; /* for the trailing NULL byte */ 2339 2340 /* count up the size of the stat value */ 2341 Istatsz += snprintf(NULL, 0, "%d", val); 2342 Istatsz++; /* for the trailing NULL byte */ 2343 } 2344 2345 /* 2346 * istat2str -- serialize an istat, writing result to *Istatbufptr 2347 */ 2348 /*ARGSUSED2*/ 2349 static void 2350 istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2351 { 2352 char *str; 2353 int len; 2354 int val; 2355 2356 ASSERT(lhs != NULL); 2357 ASSERT(rhs != NULL); 2358 2359 if ((val = stats_counter_value(rhs)) == 0) 2360 return; /* skip zero-valued stats */ 2361 2362 /* serialize the stat name */ 2363 str = ipath2str(lhs->ename, lhs->ipath); 2364 len = strlen(str); 2365 2366 ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]); 2367 (void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr); 2368 Istatbufptr += len; 2369 FREE(str); 2370 *Istatbufptr++ = '\0'; 2371 2372 /* serialize the stat value */ 2373 
Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr, 2374 "%d", val); 2375 *Istatbufptr++ = '\0'; 2376 2377 ASSERT(Istatbufptr <= &Istatbuf[Istatsz]); 2378 } 2379 2380 void 2381 istat_save() 2382 { 2383 if (Istat_need_save == 0) 2384 return; 2385 2386 /* figure out how big the serialzed info is */ 2387 Istatsz = 0; 2388 lut_walk(Istats, (lut_cb)istataddsize, NULL); 2389 2390 if (Istatsz == 0) { 2391 /* no stats to save */ 2392 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2393 return; 2394 } 2395 2396 /* create the serialized buffer */ 2397 Istatbufptr = Istatbuf = MALLOC(Istatsz); 2398 lut_walk(Istats, (lut_cb)istat2str, NULL); 2399 2400 /* clear out current saved stats */ 2401 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2402 2403 /* write out the new version */ 2404 fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz); 2405 FREE(Istatbuf); 2406 2407 Istat_need_save = 0; 2408 } 2409 2410 int 2411 istat_cmp(struct istat_entry *ent1, struct istat_entry *ent2) 2412 { 2413 if (ent1->ename != ent2->ename) 2414 return (ent2->ename - ent1->ename); 2415 if (ent1->ipath != ent2->ipath) 2416 return ((char *)ent2->ipath - (char *)ent1->ipath); 2417 2418 return (0); 2419 } 2420 2421 /* 2422 * istat-verify -- verify the component associated with a stat still exists 2423 * 2424 * if the component no longer exists, this routine resets the stat and 2425 * returns 0. if the component still exists, it returns 1. 2426 */ 2427 static int 2428 istat_verify(struct node *snp, struct istat_entry *entp) 2429 { 2430 struct stats *statp; 2431 nvlist_t *fmri; 2432 2433 fmri = node2fmri(snp->u.event.epname); 2434 if (platform_path_exists(fmri)) { 2435 nvlist_free(fmri); 2436 return (1); 2437 } 2438 nvlist_free(fmri); 2439 2440 /* component no longer in system. 
zero out the associated stats */ 2441 if ((statp = (struct stats *) 2442 lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL || 2443 stats_counter_value(statp) == 0) 2444 return (0); /* stat is already reset */ 2445 2446 Istat_need_save = 1; 2447 stats_counter_reset(statp); 2448 return (0); 2449 } 2450 2451 static void 2452 istat_bump(struct node *snp, int n) 2453 { 2454 struct stats *statp; 2455 struct istat_entry ent; 2456 2457 ASSERT(snp != NULL); 2458 ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t)); 2459 ASSERT(snp->u.event.epname != NULL); 2460 2461 /* class name should be hoisted into a single stable entry */ 2462 ASSERT(snp->u.event.ename->u.name.next == NULL); 2463 ent.ename = snp->u.event.ename->u.name.s; 2464 ent.ipath = ipath(snp->u.event.epname); 2465 2466 if (!istat_verify(snp, &ent)) { 2467 /* component no longer exists in system, nothing to do */ 2468 return; 2469 } 2470 2471 if ((statp = (struct stats *) 2472 lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) { 2473 /* need to create the counter */ 2474 int cnt = 0; 2475 struct node *np; 2476 char *sname; 2477 char *snamep; 2478 struct istat_entry *newentp; 2479 2480 /* count up the size of the stat name */ 2481 np = snp->u.event.ename; 2482 while (np != NULL) { 2483 cnt += strlen(np->u.name.s); 2484 cnt++; /* for the '.' 
or '@' */ 2485 np = np->u.name.next; 2486 } 2487 np = snp->u.event.epname; 2488 while (np != NULL) { 2489 cnt += snprintf(NULL, 0, "%s%llu", 2490 np->u.name.s, np->u.name.child->u.ull); 2491 cnt++; /* for the '/' or trailing NULL byte */ 2492 np = np->u.name.next; 2493 } 2494 2495 /* build the stat name */ 2496 snamep = sname = alloca(cnt); 2497 np = snp->u.event.ename; 2498 while (np != NULL) { 2499 snamep += snprintf(snamep, &sname[cnt] - snamep, 2500 "%s", np->u.name.s); 2501 np = np->u.name.next; 2502 if (np) 2503 *snamep++ = '.'; 2504 } 2505 *snamep++ = '@'; 2506 np = snp->u.event.epname; 2507 while (np != NULL) { 2508 snamep += snprintf(snamep, &sname[cnt] - snamep, 2509 "%s%llu", np->u.name.s, np->u.name.child->u.ull); 2510 np = np->u.name.next; 2511 if (np) 2512 *snamep++ = '/'; 2513 } 2514 *snamep++ = '\0'; 2515 2516 /* create the new stat & add it to our list */ 2517 newentp = MALLOC(sizeof (*newentp)); 2518 *newentp = ent; 2519 statp = stats_new_counter(NULL, sname, 0); 2520 Istats = lut_add(Istats, (void *)newentp, (void *)statp, 2521 (lut_cmp)istat_cmp); 2522 } 2523 2524 /* if n is non-zero, set that value instead of bumping */ 2525 if (n) { 2526 stats_counter_reset(statp); 2527 stats_counter_add(statp, n); 2528 } else 2529 stats_counter_bump(statp); 2530 Istat_need_save = 1; 2531 2532 ipath_print(O_ALTFP|O_VERB2, ent.ename, ent.ipath); 2533 out(O_ALTFP|O_VERB2, " %s to value %d", n ? "set" : "incremented", 2534 stats_counter_value(statp)); 2535 } 2536 2537 /*ARGSUSED*/ 2538 static void 2539 istat_destructor(void *left, void *right, void *arg) 2540 { 2541 struct istat_entry *entp = (struct istat_entry *)left; 2542 struct stats *statp = (struct stats *)right; 2543 FREE(entp); 2544 stats_delete(statp); 2545 } 2546 2547 /* 2548 * Callback used in a walk of the Istats to reset matching stat counters. 
 */
static void
istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp,
    const struct ipath *ipp)
{
	char *path;

	/* ipaths are interned, so pointer equality identifies the component */
	if (entp->ipath == ipp) {
		path = ipath2str(entp->ename, ipp);
		out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path);
		FREE(path);
		stats_counter_reset(statp);
		Istat_need_save = 1;
	}
}

/*
 * Istats walk callback: on topology change, reset counters for
 * components that are no longer present in the system.
 */
/*ARGSUSED*/
static void
istat_counter_topo_chg_cb(struct istat_entry *entp, struct stats *statp,
    void *unused)
{
	char *path;
	nvlist_t *fmri;

	fmri = ipath2fmri((struct ipath *)(entp->ipath));
	if (!platform_path_exists(fmri)) {
		path = ipath2str(entp->ename, entp->ipath);
		out(O_ALTFP, "istat_counter_topo_chg_cb: not present %s", path);
		FREE(path);
		stats_counter_reset(statp);
		Istat_need_save = 1;
	}
	nvlist_free(fmri);
}

/* free the entire Istats lut at module unload */
void
istat_fini(void)
{
	lut_free(Istats, istat_destructor, NULL);
}

/* scratch state shared by serdaddsize()/serd2str() during serd_save() */
static char *Serdbuf;
static char *Serdbufptr;
static int Serdsz;

/*
 * serdaddsize -- calculate size of serd and add it to Serdsz
 */
/*ARGSUSED*/
static void
serdaddsize(const struct serd_entry *lhs, struct stats *rhs, void *arg)
{
	ASSERT(lhs != NULL);

	/* count up the size of the stat name */
	Serdsz += ipath2strlen(lhs->ename, lhs->ipath);
	Serdsz++;	/* for the trailing NULL byte */
}

/*
 * serd2str -- serialize a serd engine, writing result to *Serdbufptr
 */
/*ARGSUSED*/
static void
serd2str(const struct serd_entry *lhs, struct stats *rhs, void *arg)
{
	char *str;
	int len;

	ASSERT(lhs != NULL);

	/* serialize the serd engine name */
	str = ipath2str(lhs->ename, lhs->ipath);
	len = strlen(str);

	ASSERT(Serdbufptr + len + 1 <= &Serdbuf[Serdsz]);
	(void) strlcpy(Serdbufptr, str, &Serdbuf[Serdsz] - Serdbufptr);
	Serdbufptr += len;
	FREE(str);
	*Serdbufptr++ = '\0';
	ASSERT(Serdbufptr <= &Serdbuf[Serdsz]);
}

/*
 * serd_save -- serialize all serd engine names into the WOBUF_SERDS fmd
 * buffer.  No-op unless a serd engine changed since the last save.
 */
void
serd_save()
{
	if (Serd_need_save == 0)
		return;

	/* figure out how big the serialzed info is */
	Serdsz = 0;
	lut_walk(SerdEngines, (lut_cb)serdaddsize, NULL);

	if (Serdsz == 0) {
		/* no serd engines to save */
		fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS);
		return;
	}

	/* create the serialized buffer */
	Serdbufptr = Serdbuf = MALLOC(Serdsz);
	lut_walk(SerdEngines, (lut_cb)serd2str, NULL);

	/* clear out current saved stats */
	fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS);

	/* write out the new version */
	fmd_buf_write(Hdl, NULL, WOBUF_SERDS, Serdbuf, Serdsz);
	FREE(Serdbuf);
	Serd_need_save = 0;
}

/*
 * serd_cmp -- lut comparison function for SerdEngines; same interned
 * pointer comparison as istat_cmp (see NOTE there about int truncation).
 */
int
serd_cmp(struct serd_entry *ent1, struct serd_entry *ent2)
{
	if (ent1->ename != ent2->ename)
		return (ent2->ename - ent1->ename);
	if (ent1->ipath != ent2->ipath)
		return ((char *)ent2->ipath - (char *)ent1->ipath);

	return (0);
}

/*
 * fme_serd_load -- reconstitute the SerdEngines lut from the WOBUF_SERDS
 * fmd buffer, dropping entries whose component path no longer exists.
 * Buffer format: a sequence of NUL-terminated "ename@component/path"
 * strings written by serd_save().
 */
void
fme_serd_load(fmd_hdl_t *hdl)
{
	int sz;
	char *sbuf;
	char *sepptr;
	char *ptr;
	struct serd_entry *newentp;
	struct node *epname;
	nvlist_t *fmri;
	char *namestring;

	if ((sz = fmd_buf_size(hdl, NULL, WOBUF_SERDS)) == 0)
		return;
	sbuf = alloca(sz);
	fmd_buf_read(hdl, NULL, WOBUF_SERDS, sbuf, sz);
	ptr = sbuf;
	while (ptr < &sbuf[sz]) {
		/*
		 * NOTE(review): strchr() result is not checked for NULL;
		 * this relies on every saved entry containing '@' as
		 * written by serd2str() -- confirm buffer cannot be
		 * truncated/corrupted across upgrades.
		 */
		sepptr = strchr(ptr, '@');
		*sepptr = '\0';
		namestring = ptr;
		sepptr++;
		ptr = sepptr;
		ptr += strlen(ptr);
		ptr++;	/* move past the '\0' separating paths */
		epname = pathstring2epnamenp(sepptr);
		fmri = node2fmri(epname);
		if (platform_path_exists(fmri)) {
			newentp = MALLOC(sizeof (*newentp));
			newentp->hdl = hdl;
			newentp->ipath = ipath(epname);
			newentp->ename = stable(namestring);
			SerdEngines = lut_add(SerdEngines, (void *)newentp,
			    (void *)newentp, (lut_cmp)serd_cmp);
		} else
			Serd_need_save = 1;
		tree_free(epname);
		nvlist_free(fmri);
	}
	/* save it back again in case some of the paths no longer exist */
	serd_save();
}

/* lut_free destructor for SerdEngines (key and value are the same entry) */
/*ARGSUSED*/
static void
serd_destructor(void *left, void *right, void *arg)
{
	struct serd_entry *entp = (struct serd_entry *)left;
	FREE(entp);
}

/*
 * Callback used in a walk of the SerdEngines to reset matching serd engines.
 */
/*ARGSUSED*/
static void
serd_reset_cb(struct serd_entry *entp, void *unused, const struct ipath *ipp)
{
	char *path;

	if (entp->ipath == ipp) {
		path = ipath2str(entp->ename, ipp);
		out(O_ALTFP, "serd_reset_cb: resetting %s", path);
		fmd_serd_reset(entp->hdl, path);
		FREE(path);
		Serd_need_save = 1;
	}
}

/*
 * SerdEngines walk callback: on topology change, reset engines whose
 * component is no longer present in the system.
 */
/*ARGSUSED*/
static void
serd_topo_chg_cb(struct serd_entry *entp, void *unused, void *unused2)
{
	char *path;
	nvlist_t *fmri;

	fmri = ipath2fmri((struct ipath *)(entp->ipath));
	if (!platform_path_exists(fmri)) {
		path = ipath2str(entp->ename, entp->ipath);
		out(O_ALTFP, "serd_topo_chg_cb: not present %s", path);
		fmd_serd_reset(entp->hdl, path);
		FREE(path);
		Serd_need_save = 1;
	}
	nvlist_free(fmri);
}

/* free the entire SerdEngines lut at module unload */
void
serd_fini(void)
{
	lut_free(SerdEngines, serd_destructor, NULL);
}

/*
 * publish_suspects -- sort the suspect array, compute each suspect's
 * certainty from its share of the summed FITrates, attach the optional
 * message/retire/response/payload properties, evaluate any "action",
 * and add each resulting fault to the FME's case.  Also bumps "count"
 * istats unless every ASRU is already marked faulty in the asru cache.
 */
static void
publish_suspects(struct fme *fmep, struct rsl *srl)
{
	struct rsl *rp;
	nvlist_t *fault;
	uint8_t cert;
	uint_t *frs;
	uint_t frsum, fr;
	uint_t messval;
	uint_t retireval;
	uint_t responseval;
	struct node *snp;
	int frcnt, fridx;
	boolean_t allfaulty = B_TRUE;
	struct rsl *erl = srl + fmep->nsuspects - 1;

	/*
	 * sort the array
	 */
	qsort(srl, fmep->nsuspects, sizeof (struct rsl), rslcmp);

	/* sum the fitrates */
	frs = alloca(fmep->nsuspects * sizeof (uint_t));
	fridx = frcnt = frsum = 0;

	for (rp = srl; rp <= erl; rp++) {
		struct node *n;

		n = eventprop_lookup(rp->suspect, L_FITrate);
		if (node2uint(n, &fr) != 0) {
			out(O_DEBUG|O_NONL, "event ");
			ipath_print(O_DEBUG|O_NONL,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    rp->suspect->ipp);
			out(O_VERB, " has no FITrate (using 1)");
			fr = 1;
		} else if (fr == 0) {
			out(O_DEBUG|O_NONL, "event ");
			ipath_print(O_DEBUG|O_NONL,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    rp->suspect->ipp);
			out(O_VERB, " has zero FITrate (using 1)");
			fr = 1;
		}

		frs[fridx++] = fr;
		frsum += fr;
		frcnt++;
	}

	/* Add them in reverse order of our sort, as fmd reverses order */
	for (rp = erl; rp >= srl; rp--) {
		cert = percentof(frs[--fridx], frsum);
		fault = fmd_nvl_create_fault(fmep->hdl,
		    rp->suspect->enode->u.event.ename->u.name.s,
		    cert,
		    rp->asru,
		    rp->fru,
		    rp->rsrc);
		if (fault == NULL)
			out(O_DIE, "fault creation failed");
		/* if "message" property exists, add it to the fault */
		if (node2uint(eventprop_lookup(rp->suspect, L_message),
		    &messval) == 0) {

			out(O_ALTFP,
			    "[FME%d, %s adds message=%d to suspect list]",
			    fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    messval);
			if (nvlist_add_boolean_value(fault,
			    FM_SUSPECT_MESSAGE,
			    (messval) ? B_TRUE : B_FALSE) != 0) {
				out(O_DIE, "cannot add no-message to fault");
			}
		}

		/* if "retire" property exists, add it to the fault */
		if (node2uint(eventprop_lookup(rp->suspect, L_retire),
		    &retireval) == 0) {

			out(O_ALTFP,
			    "[FME%d, %s adds retire=%d to suspect list]",
			    fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    retireval);
			if (nvlist_add_boolean_value(fault,
			    FM_SUSPECT_RETIRE,
			    (retireval) ? B_TRUE : B_FALSE) != 0) {
				out(O_DIE, "cannot add no-retire to fault");
			}
		}

		/* if "response" property exists, add it to the fault */
		if (node2uint(eventprop_lookup(rp->suspect, L_response),
		    &responseval) == 0) {

			out(O_ALTFP,
			    "[FME%d, %s adds response=%d to suspect list]",
			    fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    responseval);
			if (nvlist_add_boolean_value(fault,
			    FM_SUSPECT_RESPONSE,
			    (responseval) ? B_TRUE : B_FALSE) != 0) {
				out(O_DIE, "cannot add no-response to fault");
			}
		}

		/* add any payload properties */
		lut_walk(rp->suspect->payloadprops,
		    (lut_cb)addpayloadprop, (void *)fault);
		rslfree(rp);

		/*
		 * If "action" property exists, evaluate it; this must be done
		 * before the allfaulty check below since some actions may
		 * modify the asru to be used in fmd_nvl_fmri_has_fault.  This
		 * needs to be restructured if any new actions are introduced
		 * that have effects that we do not want to be visible if
		 * we decide not to publish in the dupclose check below.
		 */
		if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) {
			struct evalue evalue;

			out(O_ALTFP|O_NONL,
			    "[FME%d, %s action ", fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s);
			ptree_name_iter(O_ALTFP|O_NONL, snp);
			out(O_ALTFP, "]");
			Action_nvl = fault;
			(void) eval_expr(snp, NULL, NULL, NULL, NULL,
			    NULL, 0, &evalue);
		}

		fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault);

		/*
		 * check if the asru is already marked as "faulty".
		 */
		if (allfaulty) {
			nvlist_t *asru;

			out(O_ALTFP|O_VERB, "FME%d dup check ", fmep->id);
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect);
			out(O_ALTFP|O_VERB|O_NONL, " ");
			if (nvlist_lookup_nvlist(fault,
			    FM_FAULT_ASRU, &asru) != 0) {
				out(O_ALTFP|O_VERB, "NULL asru");
				allfaulty = B_FALSE;
			} else if (fmd_nvl_fmri_has_fault(fmep->hdl, asru,
			    FMD_HAS_FAULT_ASRU, NULL)) {
				out(O_ALTFP|O_VERB, "faulty");
			} else {
				out(O_ALTFP|O_VERB, "not faulty");
				allfaulty = B_FALSE;
			}
		}

	}

	if (!allfaulty) {
		/*
		 * don't update the count stat if all asrus are already
		 * present and unrepaired in the asru cache
		 */
		for (rp = erl; rp >= srl; rp--) {
			struct event *suspect = rp->suspect;

			if (suspect == NULL)
				continue;

			/* if "count" exists, increment the appropriate stat */
			if ((snp = eventprop_lookup(suspect,
			    L_count)) != NULL) {
				out(O_ALTFP|O_NONL,
				    "[FME%d, %s count ", fmep->id,
				    suspect->enode->u.event.ename->u.name.s);
				ptree_name_iter(O_ALTFP|O_NONL, snp);
				out(O_ALTFP, "]");
				istat_bump(snp, 0);

			}
		}
		istat_save();	/* write out any istat changes */
	}
}

/* map an undiagnosable-reason code to the defect class to publish */
static const char *
undiag_2defect_str(int ud)
{
	switch (ud) {
	case UD_VAL_MISSINGINFO:
	case UD_VAL_MISSINGOBS:
	case UD_VAL_MISSINGPATH:
	case UD_VAL_MISSINGZERO:
	case UD_VAL_BADOBS:
	case UD_VAL_CFGMISMATCH:
		return (UNDIAG_DEFECT_CHKPT);

	case UD_VAL_BADEVENTI:
	case UD_VAL_BADEVENTPATH:
	case UD_VAL_BADEVENTCLASS:
	case UD_VAL_INSTFAIL:
	case UD_VAL_NOPATH:
	case UD_VAL_UNSOLVD:
		return (UNDIAG_DEFECT_FME);

	case UD_VAL_MAXFME:
		return (UNDIAG_DEFECT_LIMIT);

	case UD_VAL_UNKNOWN:
	default:
		return (UNDIAG_DEFECT_UNKNOWN);
	}
}

/*
 * map an undiagnosable-reason code to an additional fault class, or
 * NULL when no companion fault should be published.
 */
static const char *
undiag_2fault_str(int ud)
{
	switch (ud) {
	case UD_VAL_BADEVENTI:
	case UD_VAL_BADEVENTPATH:
	case UD_VAL_BADEVENTCLASS:
	case UD_VAL_INSTFAIL:
	case UD_VAL_NOPATH:
	case UD_VAL_UNSOLVD:
		return (UNDIAG_FAULT_FME);
	default:
		return (NULL);
	}
}

/*
 * undiag_2reason_str -- build a freshly-MALLOCed human-readable reason
 * string for an undiagnosable-reason code; some reasons embed arg via
 * a "%s" in the UD_STR_* template.  Caller must FREE the result.
 */
static char *
undiag_2reason_str(int ud, char *arg)
{
	const char *ptr;
	char *buf;
	int with_arg = 0;

	switch (ud) {
	case UD_VAL_BADEVENTPATH:
		ptr = UD_STR_BADEVENTPATH;
		with_arg = 1;
		break;
	case UD_VAL_BADEVENTCLASS:
		ptr = UD_STR_BADEVENTCLASS;
		with_arg = 1;
		break;
	case UD_VAL_BADEVENTI:
		ptr = UD_STR_BADEVENTI;
		with_arg = 1;
		break;
	case UD_VAL_BADOBS:
		ptr = UD_STR_BADOBS;
		break;
	case UD_VAL_CFGMISMATCH:
		ptr = UD_STR_CFGMISMATCH;
		break;
	case UD_VAL_INSTFAIL:
		ptr = UD_STR_INSTFAIL;
		with_arg = 1;
		break;
	case UD_VAL_MAXFME:
		ptr = UD_STR_MAXFME;
		break;
	case UD_VAL_MISSINGINFO:
		ptr = UD_STR_MISSINGINFO;
		break;
	case UD_VAL_MISSINGOBS:
		ptr = UD_STR_MISSINGOBS;
		break;
	case UD_VAL_MISSINGPATH:
		ptr = UD_STR_MISSINGPATH;
		break;
	case UD_VAL_MISSINGZERO:
		ptr = UD_STR_MISSINGZERO;
		break;
	case UD_VAL_NOPATH:
		ptr = UD_STR_NOPATH;
		with_arg = 1;
		break;
	case UD_VAL_UNSOLVD:
		ptr = UD_STR_UNSOLVD;
		break;
	case UD_VAL_UNKNOWN:
	default:
		ptr = UD_STR_UNKNOWN;
		break;
	}
	if (with_arg) {
		/* "%s" (2 chars) is replaced by arg: len - 2 + arglen + 1 */
		buf = MALLOC(strlen(ptr) + strlen(arg) - 1);
		/*
		 * NOTE(review): non-literal format string; safe only
		 * because ptr is always one of the internal UD_STR_*
		 * literals above -- never pass external data here.
		 */
		(void) sprintf(buf, ptr, arg);
	} else {
		buf = MALLOC(strlen(ptr) + 1);
		(void) sprintf(buf, ptr);
	}
	return (buf);
}

/*
 * publish_undiagnosable -- create, solve and close a new case for an
 * ereport we cannot diagnose, attaching a defect (and possibly a fault)
 * recording the Undiag_reason.  The case is remembered on
 * Undiagablecaselist; Undiag_reason is reset to UD_VAL_UNKNOWN.
 */
static void
publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase,
    nvlist_t *detector, char *arg)
{
	struct case_list *newcase;
	nvlist_t *defect, *fault;
	const char *faultstr;
	char *reason = undiag_2reason_str(Undiag_reason, arg);

	out(O_ALTFP,
	    "[undiagnosable ereport received, "
	    "creating and closing a new case (%s)]", reason);

	newcase = MALLOC(sizeof (struct case_list));
	newcase->next = NULL;
	newcase->fmcase = fmcase;
	if (Undiagablecaselist != NULL)
		newcase->next = Undiagablecaselist;
	Undiagablecaselist = newcase;

	if (ffep != NULL)
		fmd_case_add_ereport(hdl, newcase->fmcase, ffep);

	/* add defect */
	defect = fmd_nvl_create_fault(hdl,
	    undiag_2defect_str(Undiag_reason), 50, NULL, NULL, detector);
	(void) nvlist_add_string(defect, UNDIAG_REASON, reason);
	(void) nvlist_add_boolean_value(defect, FM_SUSPECT_RETIRE, B_FALSE);
	(void) nvlist_add_boolean_value(defect, FM_SUSPECT_RESPONSE, B_FALSE);
	fmd_case_add_suspect(hdl, newcase->fmcase, defect);

	/* add fault if appropriate */
	faultstr = undiag_2fault_str(Undiag_reason);
	if (faultstr != NULL) {
		fault = fmd_nvl_create_fault(hdl, faultstr, 50, NULL, NULL,
		    detector);
		(void) nvlist_add_string(fault, UNDIAG_REASON, reason);
		(void) nvlist_add_boolean_value(fault, FM_SUSPECT_RETIRE,
		    B_FALSE);
		(void) nvlist_add_boolean_value(fault, FM_SUSPECT_RESPONSE,
		    B_FALSE);
		fmd_case_add_suspect(hdl, newcase->fmcase, fault);
	}
	FREE(reason);

	/* solve and close case */
	fmd_case_solve(hdl, newcase->fmcase);
	fmd_case_close(hdl, newcase->fmcase);
	Undiag_reason = UD_VAL_UNKNOWN;
}

/*
 * fme_undiagnosable -- solve and close an existing FME's case as
 * undiagnosable, publishing one defect (and possibly one fault) per
 * unique observation, certainty split evenly across them.
 */
static void
fme_undiagnosable(struct fme *f)
{
	nvlist_t *defect, *fault, *detector = NULL;
	struct event *ep;
	char *pathstr;
	const char *faultstr;
	char *reason = undiag_2reason_str(Undiag_reason, NULL);

	out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]",
	    f->id, fmd_case_uuid(f->hdl, f->fmcase), reason);

	for (ep = f->observations; ep; ep = ep->observations) {

		if (ep->ffep != f->e0r)
			fmd_case_add_ereport(f->hdl, f->fmcase, ep->ffep);

		pathstr = ipath2str(NULL, ipath(platform_getpath(ep->nvp)));
		platform_unit_translate(0, f->config, TOPO_PROP_RESOURCE,
		    &detector, pathstr);
		FREE(pathstr);

		/* add defect */
		defect = fmd_nvl_create_fault(f->hdl,
		    undiag_2defect_str(Undiag_reason), 50 / f->uniqobs,
		    NULL, NULL, detector);
		(void) nvlist_add_string(defect, UNDIAG_REASON, reason);
		(void) nvlist_add_boolean_value(defect, FM_SUSPECT_RETIRE,
		    B_FALSE);
		(void) nvlist_add_boolean_value(defect, FM_SUSPECT_RESPONSE,
		    B_FALSE);
		fmd_case_add_suspect(f->hdl, f->fmcase, defect);

		/* add fault if appropriate */
		faultstr = undiag_2fault_str(Undiag_reason);
		if (faultstr == NULL)
			continue;
			/*
			 * NOTE(review): this continue skips the
			 * nvlist_free(detector) below; detector appears
			 * to leak on this path -- confirm.
			 */
		fault = fmd_nvl_create_fault(f->hdl, faultstr, 50 / f->uniqobs,
		    NULL, NULL, detector);
		(void) nvlist_add_string(fault, UNDIAG_REASON, reason);
		(void) nvlist_add_boolean_value(fault, FM_SUSPECT_RETIRE,
		    B_FALSE);
		(void) nvlist_add_boolean_value(fault, FM_SUSPECT_RESPONSE,
		    B_FALSE);
		fmd_case_add_suspect(f->hdl, f->fmcase, fault);
		nvlist_free(detector);
	}
	FREE(reason);
	fmd_case_solve(f->hdl, f->fmcase);
	fmd_case_close(f->hdl, f->fmcase);
	Undiag_reason = UD_VAL_UNKNOWN;
}

/*
 * fme_close_case
 *
 *	Find the requested case amongst our fmes and close it.
Free up
 *	the related fme.
 */
void
fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase)
{
	struct case_list *ucasep, *prevcasep = NULL;
	struct fme *prev = NULL;
	struct fme *fmep;

	/* first check the cases that were published as undiagnosable */
	for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) {
		if (fmcase != ucasep->fmcase) {
			prevcasep = ucasep;
			continue;
		}

		if (prevcasep == NULL)
			Undiagablecaselist = Undiagablecaselist->next;
		else
			prevcasep->next = ucasep->next;

		FREE(ucasep);
		return;
	}

	/* then look for a matching open FME */
	for (fmep = FMElist; fmep; fmep = fmep->next) {
		if (fmep->hdl == hdl && fmep->fmcase == fmcase)
			break;
		prev = fmep;
	}

	if (fmep == NULL) {
		out(O_WARN, "Eft asked to close unrecognized case [%s].",
		    fmd_case_uuid(hdl, fmcase));
		return;
	}

	if (EFMElist == fmep)
		EFMElist = prev;

	/* unlink the fme from FMElist */
	if (prev == NULL)
		FMElist = FMElist->next;
	else
		prev->next = fmep->next;

	fmep->next = NULL;

	/* Get rid of any timer this fme has set */
	if (fmep->wull != 0)
		fmd_timer_remove(fmep->hdl, fmep->timer);

	/* move the fme onto the ClosedFMEs list for later reaping */
	if (ClosedFMEs == NULL) {
		ClosedFMEs = fmep;
	} else {
		fmep->next = ClosedFMEs;
		ClosedFMEs = fmep;
	}

	Open_fme_count--;

	/* See if we can close the overflow FME */
	if (Open_fme_count <= Max_fme) {
		for (fmep = FMElist; fmep; fmep = fmep->next) {
			if (fmep->overflow && !(fmd_case_closed(fmep->hdl,
			    fmep->fmcase)))
				break;
		}

		if (fmep != NULL)
			fmd_case_close(fmep->hdl, fmep->fmcase);
	}
}

/*
 * fme_set_timer()
 *	If the time we need to wait for the given FME is less than the
 *	current timer, kick that old timer out and establish a new one.
 */
static int
fme_set_timer(struct fme *fmep, unsigned long long wull)
{
	out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait ");
	ptree_timeval(O_ALTFP|O_VERB, &wull);

	if (wull <= fmep->pull) {
		out(O_ALTFP|O_VERB|O_NONL, "already have waited at least ");
		ptree_timeval(O_ALTFP|O_VERB, &fmep->pull);
		out(O_ALTFP|O_VERB, NULL);
		/* we've waited at least wull already, don't need timer */
		return (0);
	}

	out(O_ALTFP|O_VERB|O_NONL, " currently ");
	if (fmep->wull != 0) {
		out(O_ALTFP|O_VERB|O_NONL, "waiting ");
		ptree_timeval(O_ALTFP|O_VERB, &fmep->wull);
		out(O_ALTFP|O_VERB, NULL);
	} else {
		out(O_ALTFP|O_VERB|O_NONL, "not waiting");
		out(O_ALTFP|O_VERB, NULL);
	}

	if (fmep->wull != 0)
		if (wull >= fmep->wull)
			/* New timer would fire later than established timer */
			return (0);

	/* replace the established timer with the earlier one */
	if (fmep->wull != 0) {
		fmd_timer_remove(fmep->hdl, fmep->timer);
	}

	fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep,
	    fmep->e0r, wull);
	out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer);
	fmep->wull = wull;
	return (1);
}

/*
 * fme_timer_fired -- fmd timer callback; record that the wait elapsed
 * (persisting the new pull value) and re-evaluate the FME.
 */
void
fme_timer_fired(struct fme *fmep, id_t tid)
{
	struct fme *ffmep = NULL;

	/* ignore timers for FMEs that are no longer on our list */
	for (ffmep = FMElist; ffmep; ffmep = ffmep->next)
		if (ffmep == fmep)
			break;

	if (ffmep == NULL) {
		out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.",
		    (void *)fmep);
		return;
	}

	out(O_ALTFP|O_VERB, "Timer fired %lx", tid);
	fmep->pull = fmep->wull;
	fmep->wull = 0;
	fmd_buf_write(fmep->hdl, fmep->fmcase,
	    WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull));

	fme_eval(fmep, fmep->e0r);
}

/*
 * Preserve the fme's suspect list in its psuspects list, NULLing the
 * suspects list in the meantime.
 */
static void
save_suspects(struct fme *fmep)
{
	struct event *ep;
	struct event *nextep;

	/* zero out the previous suspect list */
	for (ep = fmep->psuspects; ep; ep = nextep) {
		nextep = ep->psuspects;
		ep->psuspects = NULL;
	}
	fmep->psuspects = NULL;

	/* zero out the suspect list, copying it to previous suspect list */
	fmep->psuspects = fmep->suspects;
	for (ep = fmep->suspects; ep; ep = nextep) {
		nextep = ep->suspects;
		ep->psuspects = ep->suspects;
		ep->suspects = NULL;
		ep->is_suspect = 0;
	}
	fmep->suspects = NULL;
	fmep->nsuspects = 0;
}

/*
 * Retrieve the fme's suspect list from its psuspects list.
 */
static void
restore_suspects(struct fme *fmep)
{
	struct event *ep;
	struct event *nextep;

	fmep->nsuspects = 0;
	fmep->suspects = fmep->psuspects;
	for (ep = fmep->psuspects; ep; ep = nextep) {
		fmep->nsuspects++;
		nextep = ep->psuspects;
		ep->suspects = ep->psuspects;
	}
}

/*
 * this is what we use to call the Emrys prototype code instead of main()
 *
 * Run the inference algorithm from the FME's initial ereport and act on
 * the resulting state: publish and solve (FME_CREDIBLE), arm a timer and
 * wait (FME_WAIT), or declare the case undiagnosable (FME_DISPROVED).
 */
static void
fme_eval(struct fme *fmep, fmd_event_t *ffep)
{
	struct event *ep;
	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
	struct rsl *srl = NULL;
	struct rsl *srl2 = NULL;
	int mess_zero_count;
	int rpcnt;

	save_suspects(fmep);

	out(O_ALTFP, "Evaluate FME %d", fmep->id);
	indent_set("  ");

	lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
	fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);

	out(O_ALTFP|O_NONL, "FME%d state: %s, suspect list:", fmep->id,
	    fme_state2str(fmep->state));
	for (ep = fmep->suspects; ep; ep = ep->suspects) {
		out(O_ALTFP|O_NONL, " ");
		itree_pevent_brief(O_ALTFP|O_NONL, ep);
	}
	out(O_ALTFP, NULL);

	switch (fmep->state) {
	case FME_CREDIBLE:
		print_suspects(SLNEW, fmep);
		(void) upsets_eval(fmep, ffep);

		/*
		 * we may have already posted suspects in upsets_eval() which
		 * can recurse into fme_eval() again. If so then just return.
		 */
		if (fmep->posted_suspects)
			return;

		stats_counter_bump(fmep->diags);
		rpcnt = fmep->nsuspects;
		save_suspects(fmep);

		/*
		 * create two lists, one for "message=1" faults and one for
		 * "message=0" faults. If we have a mixture we will generate
		 * two separate suspect lists.
		 */
		srl = MALLOC(rpcnt * sizeof (struct rsl));
		bzero(srl, rpcnt * sizeof (struct rsl));
		srl2 = MALLOC(rpcnt * sizeof (struct rsl));
		bzero(srl2, rpcnt * sizeof (struct rsl));
		mess_zero_count = trim_suspects(fmep, srl, srl2, ffep);

		/*
		 * If the resulting suspect list has no members, we're
		 * done so simply close the case. Otherwise sort and publish.
		 */
		if (fmep->nsuspects == 0 && mess_zero_count == 0) {
			out(O_ALTFP,
			    "[FME%d, case %s (all suspects are upsets)]",
			    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
			fmd_case_close(fmep->hdl, fmep->fmcase);
		} else if (fmep->nsuspects != 0 && mess_zero_count == 0) {
			publish_suspects(fmep, srl);
			out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
			    fmd_case_uuid(fmep->hdl, fmep->fmcase));
			fmd_case_solve(fmep->hdl, fmep->fmcase);
		} else if (fmep->nsuspects == 0 && mess_zero_count != 0) {
			fmep->nsuspects = mess_zero_count;
			publish_suspects(fmep, srl2);
			out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
			    fmd_case_uuid(fmep->hdl, fmep->fmcase));
			fmd_case_solve(fmep->hdl, fmep->fmcase);
		} else {
			struct event *obsp;
			struct fme *nfmep;

			publish_suspects(fmep, srl);
			out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
			    fmd_case_uuid(fmep->hdl, fmep->fmcase));
			fmd_case_solve(fmep->hdl, fmep->fmcase);

			/*
			 * Got both message=0 and message=1 so create a
			 * duplicate case. Also need a temporary duplicate fme
			 * structure for use by publish_suspects().
			 */
			nfmep = alloc_fme();
			nfmep->id = Nextid++;
			nfmep->hdl = fmep->hdl;
			nfmep->nsuspects = mess_zero_count;
			nfmep->fmcase = fmd_case_open(fmep->hdl, NULL);
			out(O_ALTFP|O_STAMP,
			    "[creating parallel FME%d, case %s]", nfmep->id,
			    fmd_case_uuid(nfmep->hdl, nfmep->fmcase));
			Open_fme_count++;
			if (ffep) {
				fmd_case_setprincipal(nfmep->hdl,
				    nfmep->fmcase, ffep);
				fmd_case_add_ereport(nfmep->hdl,
				    nfmep->fmcase, ffep);
			}
			for (obsp = fmep->observations; obsp;
			    obsp = obsp->observations)
				if (obsp->ffep && obsp->ffep != ffep)
					fmd_case_add_ereport(nfmep->hdl,
					    nfmep->fmcase, obsp->ffep);

			publish_suspects(nfmep, srl2);
			out(O_ALTFP, "[solving FME%d, case %s]", nfmep->id,
			    fmd_case_uuid(nfmep->hdl, nfmep->fmcase));
			fmd_case_solve(nfmep->hdl, nfmep->fmcase);
			FREE(nfmep);
		}
		FREE(srl);
		FREE(srl2);
		restore_suspects(fmep);

		fmep->posted_suspects = 1;
		fmd_buf_write(fmep->hdl, fmep->fmcase,
		    WOBUF_POSTD,
		    (void *)&fmep->posted_suspects,
		    sizeof (fmep->posted_suspects));

		/*
		 * Now the suspects have been posted, we can clear up
		 * the instance tree as we won't be looking at it again.
		 * Also cancel the timer as the case is now solved.
		 */
		if (fmep->wull != 0) {
			fmd_timer_remove(fmep->hdl, fmep->timer);
			fmep->wull = 0;
		}
		break;

	case FME_WAIT:
		ASSERT(my_delay > fmep->ull);
		(void) fme_set_timer(fmep, my_delay);
		print_suspects(SLWAIT, fmep);
		itree_prune(fmep->eventtree);
		return;

	case FME_DISPROVED:
		print_suspects(SLDISPROVED, fmep);
		Undiag_reason = UD_VAL_UNSOLVD;
		fme_undiagnosable(fmep);
		break;
	}

	/* tear down per-FME state now that the case is resolved */
	itree_free(fmep->eventtree);
	fmep->eventtree = NULL;
	structconfig_free(fmep->config);
	fmep->config = NULL;
	destroy_fme_bufs(fmep);
}

static void indent(void);
static int triggered(struct fme *fmep, struct event *ep, int mark);
static enum fme_state effects_test(struct fme *fmep,
    struct event *fault_event, unsigned long long at_latest_by,
    unsigned long long *pdelay);
static enum fme_state requirements_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay);
static enum fme_state causes_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay);

/*
 * checkconstraints -- evaluate an arrow's constraints; returns 0 when
 * any constraint fails (see continuation below this chunk).
 */
static int
checkconstraints(struct fme *fmep, struct arrow *arrowp)
{
	struct constraintlist *ctp;
	struct evalue value;
	char *sep = "";

	if (arrowp->forever_false) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, " Forever false constraint: ");
		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
			out(O_ALTFP|O_VERB|O_NONL, sep);
			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
			sep = ", ";
		}
		out(O_ALTFP|O_VERB, NULL);
		return (0);
	}
	if (arrowp->forever_true) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, " Forever true constraint: ");
		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
			out(O_ALTFP|O_VERB|O_NONL, sep);
			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
			sep =
", "; 3555 } 3556 out(O_ALTFP|O_VERB, NULL); 3557 return (1); 3558 } 3559 3560 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3561 if (eval_expr(ctp->cnode, NULL, NULL, 3562 &fmep->globals, fmep->config, 3563 arrowp, 0, &value)) { 3564 /* evaluation successful */ 3565 if (value.t == UNDEFINED || value.v == 0) { 3566 /* known false */ 3567 arrowp->forever_false = 1; 3568 indent(); 3569 out(O_ALTFP|O_VERB|O_NONL, 3570 " False constraint: "); 3571 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3572 out(O_ALTFP|O_VERB, NULL); 3573 return (0); 3574 } 3575 } else { 3576 /* evaluation unsuccessful -- unknown value */ 3577 indent(); 3578 out(O_ALTFP|O_VERB|O_NONL, 3579 " Deferred constraint: "); 3580 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3581 out(O_ALTFP|O_VERB, NULL); 3582 return (1); 3583 } 3584 } 3585 /* known true */ 3586 arrowp->forever_true = 1; 3587 indent(); 3588 out(O_ALTFP|O_VERB|O_NONL, " True constraint: "); 3589 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3590 out(O_ALTFP|O_VERB|O_NONL, sep); 3591 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3592 sep = ", "; 3593 } 3594 out(O_ALTFP|O_VERB, NULL); 3595 return (1); 3596 } 3597 3598 static int 3599 triggered(struct fme *fmep, struct event *ep, int mark) 3600 { 3601 struct bubble *bp; 3602 struct arrowlist *ap; 3603 int count = 0; 3604 3605 stats_counter_bump(fmep->Tcallcount); 3606 for (bp = itree_next_bubble(ep, NULL); bp; 3607 bp = itree_next_bubble(ep, bp)) { 3608 if (bp->t != B_TO) 3609 continue; 3610 for (ap = itree_next_arrow(bp, NULL); ap; 3611 ap = itree_next_arrow(bp, ap)) { 3612 /* check count of marks against K in the bubble */ 3613 if ((ap->arrowp->mark & mark) && 3614 ++count >= bp->nork) 3615 return (1); 3616 } 3617 } 3618 return (0); 3619 } 3620 3621 static int 3622 mark_arrows(struct fme *fmep, struct event *ep, int mark, 3623 unsigned long long at_latest_by, unsigned long long *pdelay, int keep) 3624 { 3625 struct bubble *bp; 3626 struct 
arrowlist *ap; 3627 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3628 unsigned long long my_delay; 3629 enum fme_state result; 3630 int retval = 0; 3631 3632 for (bp = itree_next_bubble(ep, NULL); bp; 3633 bp = itree_next_bubble(ep, bp)) { 3634 if (bp->t != B_FROM) 3635 continue; 3636 stats_counter_bump(fmep->Marrowcount); 3637 for (ap = itree_next_arrow(bp, NULL); ap; 3638 ap = itree_next_arrow(bp, ap)) { 3639 struct event *ep2 = ap->arrowp->head->myevent; 3640 /* 3641 * if we're clearing marks, we can avoid doing 3642 * all that work evaluating constraints. 3643 */ 3644 if (mark == 0) { 3645 if (ap->arrowp->arrow_marked == 0) 3646 continue; 3647 ap->arrowp->arrow_marked = 0; 3648 ap->arrowp->mark &= ~EFFECTS_COUNTER; 3649 if (keep && (ep2->cached_state & 3650 (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT))) 3651 ep2->keep_in_tree = 1; 3652 ep2->cached_state &= 3653 ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT); 3654 (void) mark_arrows(fmep, ep2, mark, 0, NULL, 3655 keep); 3656 continue; 3657 } 3658 ap->arrowp->arrow_marked = 1; 3659 if (ep2->cached_state & REQMNTS_DISPROVED) { 3660 indent(); 3661 out(O_ALTFP|O_VERB|O_NONL, 3662 " ALREADY DISPROVED "); 3663 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3664 out(O_ALTFP|O_VERB, NULL); 3665 continue; 3666 } 3667 if (ep2->cached_state & WAIT_EFFECT) { 3668 indent(); 3669 out(O_ALTFP|O_VERB|O_NONL, 3670 " ALREADY EFFECTS WAIT "); 3671 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3672 out(O_ALTFP|O_VERB, NULL); 3673 continue; 3674 } 3675 if (ep2->cached_state & CREDIBLE_EFFECT) { 3676 indent(); 3677 out(O_ALTFP|O_VERB|O_NONL, 3678 " ALREADY EFFECTS CREDIBLE "); 3679 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3680 out(O_ALTFP|O_VERB, NULL); 3681 continue; 3682 } 3683 if ((ep2->cached_state & PARENT_WAIT) && 3684 (mark & PARENT_WAIT)) { 3685 indent(); 3686 out(O_ALTFP|O_VERB|O_NONL, 3687 " ALREADY PARENT EFFECTS WAIT "); 3688 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3689 out(O_ALTFP|O_VERB, NULL); 3690 
continue; 3691 } 3692 platform_set_payloadnvp(ep2->nvp); 3693 if (checkconstraints(fmep, ap->arrowp) == 0) { 3694 platform_set_payloadnvp(NULL); 3695 indent(); 3696 out(O_ALTFP|O_VERB|O_NONL, 3697 " CONSTRAINTS FAIL "); 3698 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3699 out(O_ALTFP|O_VERB, NULL); 3700 continue; 3701 } 3702 platform_set_payloadnvp(NULL); 3703 ap->arrowp->mark |= EFFECTS_COUNTER; 3704 if (!triggered(fmep, ep2, EFFECTS_COUNTER)) { 3705 indent(); 3706 out(O_ALTFP|O_VERB|O_NONL, 3707 " K-COUNT NOT YET MET "); 3708 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3709 out(O_ALTFP|O_VERB, NULL); 3710 continue; 3711 } 3712 ep2->cached_state &= ~PARENT_WAIT; 3713 /* 3714 * if we've reached an ereport and no propagation time 3715 * is specified, use the Hesitate value 3716 */ 3717 if (ep2->t == N_EREPORT && at_latest_by == 0ULL && 3718 ap->arrowp->maxdelay == 0ULL) { 3719 out(O_ALTFP|O_VERB|O_NONL, " default wait "); 3720 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3721 out(O_ALTFP|O_VERB, NULL); 3722 result = requirements_test(fmep, ep2, Hesitate, 3723 &my_delay); 3724 } else { 3725 result = requirements_test(fmep, ep2, 3726 at_latest_by + ap->arrowp->maxdelay, 3727 &my_delay); 3728 } 3729 if (result == FME_WAIT) { 3730 retval = WAIT_EFFECT; 3731 if (overall_delay > my_delay) 3732 overall_delay = my_delay; 3733 ep2->cached_state |= WAIT_EFFECT; 3734 indent(); 3735 out(O_ALTFP|O_VERB|O_NONL, " EFFECTS WAIT "); 3736 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3737 out(O_ALTFP|O_VERB, NULL); 3738 indent_push(" E"); 3739 if (mark_arrows(fmep, ep2, PARENT_WAIT, 3740 at_latest_by, &my_delay, 0) == 3741 WAIT_EFFECT) { 3742 retval = WAIT_EFFECT; 3743 if (overall_delay > my_delay) 3744 overall_delay = my_delay; 3745 } 3746 indent_pop(); 3747 } else if (result == FME_DISPROVED) { 3748 indent(); 3749 out(O_ALTFP|O_VERB|O_NONL, 3750 " EFFECTS DISPROVED "); 3751 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3752 out(O_ALTFP|O_VERB, NULL); 3753 } else { 
3754 ep2->cached_state |= mark; 3755 indent(); 3756 if (mark == CREDIBLE_EFFECT) 3757 out(O_ALTFP|O_VERB|O_NONL, 3758 " EFFECTS CREDIBLE "); 3759 else 3760 out(O_ALTFP|O_VERB|O_NONL, 3761 " PARENT EFFECTS WAIT "); 3762 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3763 out(O_ALTFP|O_VERB, NULL); 3764 indent_push(" E"); 3765 if (mark_arrows(fmep, ep2, mark, at_latest_by, 3766 &my_delay, 0) == WAIT_EFFECT) { 3767 retval = WAIT_EFFECT; 3768 if (overall_delay > my_delay) 3769 overall_delay = my_delay; 3770 } 3771 indent_pop(); 3772 } 3773 } 3774 } 3775 if (retval == WAIT_EFFECT) 3776 *pdelay = overall_delay; 3777 return (retval); 3778 } 3779 3780 static enum fme_state 3781 effects_test(struct fme *fmep, struct event *fault_event, 3782 unsigned long long at_latest_by, unsigned long long *pdelay) 3783 { 3784 struct event *error_event; 3785 enum fme_state return_value = FME_CREDIBLE; 3786 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3787 unsigned long long my_delay; 3788 3789 stats_counter_bump(fmep->Ecallcount); 3790 indent_push(" E"); 3791 indent(); 3792 out(O_ALTFP|O_VERB|O_NONL, "->"); 3793 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3794 out(O_ALTFP|O_VERB, NULL); 3795 3796 if (mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by, 3797 &my_delay, 0) == WAIT_EFFECT) { 3798 return_value = FME_WAIT; 3799 if (overall_delay > my_delay) 3800 overall_delay = my_delay; 3801 } 3802 for (error_event = fmep->observations; 3803 error_event; error_event = error_event->observations) { 3804 indent(); 3805 out(O_ALTFP|O_VERB|O_NONL, " "); 3806 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event); 3807 if (!(error_event->cached_state & CREDIBLE_EFFECT)) { 3808 if (error_event->cached_state & 3809 (PARENT_WAIT|WAIT_EFFECT)) { 3810 out(O_ALTFP|O_VERB, " NOT YET triggered"); 3811 continue; 3812 } 3813 return_value = FME_DISPROVED; 3814 out(O_ALTFP|O_VERB, " NOT triggered"); 3815 break; 3816 } else { 3817 out(O_ALTFP|O_VERB, " triggered"); 3818 } 3819 } 
3820 if (return_value == FME_DISPROVED) { 3821 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0); 3822 } else { 3823 fault_event->keep_in_tree = 1; 3824 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1); 3825 } 3826 3827 indent(); 3828 out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ", 3829 fme_state2str(return_value)); 3830 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3831 out(O_ALTFP|O_VERB, NULL); 3832 indent_pop(); 3833 if (return_value == FME_WAIT) 3834 *pdelay = overall_delay; 3835 return (return_value); 3836 } 3837 3838 static enum fme_state 3839 requirements_test(struct fme *fmep, struct event *ep, 3840 unsigned long long at_latest_by, unsigned long long *pdelay) 3841 { 3842 int waiting_events; 3843 int credible_events; 3844 int deferred_events; 3845 enum fme_state return_value = FME_CREDIBLE; 3846 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3847 unsigned long long arrow_delay; 3848 unsigned long long my_delay; 3849 struct event *ep2; 3850 struct bubble *bp; 3851 struct arrowlist *ap; 3852 3853 if (ep->cached_state & REQMNTS_CREDIBLE) { 3854 indent(); 3855 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY CREDIBLE "); 3856 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3857 out(O_ALTFP|O_VERB, NULL); 3858 return (FME_CREDIBLE); 3859 } 3860 if (ep->cached_state & REQMNTS_DISPROVED) { 3861 indent(); 3862 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY DISPROVED "); 3863 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3864 out(O_ALTFP|O_VERB, NULL); 3865 return (FME_DISPROVED); 3866 } 3867 if (ep->cached_state & REQMNTS_WAIT) { 3868 indent(); 3869 *pdelay = ep->cached_delay; 3870 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY WAIT "); 3871 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3872 out(O_ALTFP|O_VERB|O_NONL, ", wait for: "); 3873 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3874 out(O_ALTFP|O_VERB, NULL); 3875 return (FME_WAIT); 3876 } 3877 stats_counter_bump(fmep->Rcallcount); 3878 indent_push(" R"); 3879 indent(); 3880 
out(O_ALTFP|O_VERB|O_NONL, "->"); 3881 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3882 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 3883 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3884 out(O_ALTFP|O_VERB, NULL); 3885 3886 if (ep->t == N_EREPORT) { 3887 if (ep->count == 0) { 3888 if (fmep->pull >= at_latest_by) { 3889 return_value = FME_DISPROVED; 3890 } else { 3891 ep->cached_delay = *pdelay = at_latest_by; 3892 return_value = FME_WAIT; 3893 } 3894 } 3895 3896 indent(); 3897 switch (return_value) { 3898 case FME_CREDIBLE: 3899 ep->cached_state |= REQMNTS_CREDIBLE; 3900 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE "); 3901 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3902 break; 3903 case FME_DISPROVED: 3904 ep->cached_state |= REQMNTS_DISPROVED; 3905 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 3906 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3907 break; 3908 case FME_WAIT: 3909 ep->cached_state |= REQMNTS_WAIT; 3910 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT "); 3911 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3912 out(O_ALTFP|O_VERB|O_NONL, " to "); 3913 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3914 break; 3915 default: 3916 out(O_DIE, "requirements_test: unexpected fme_state"); 3917 break; 3918 } 3919 out(O_ALTFP|O_VERB, NULL); 3920 indent_pop(); 3921 3922 return (return_value); 3923 } 3924 3925 /* this event is not a report, descend the tree */ 3926 for (bp = itree_next_bubble(ep, NULL); bp; 3927 bp = itree_next_bubble(ep, bp)) { 3928 int n; 3929 3930 if (bp->t != B_FROM) 3931 continue; 3932 3933 n = bp->nork; 3934 3935 credible_events = 0; 3936 waiting_events = 0; 3937 deferred_events = 0; 3938 arrow_delay = TIMEVAL_EVENTUALLY; 3939 /* 3940 * n is -1 for 'A' so adjust it. 3941 * XXX just count up the arrows for now. 
3942 */ 3943 if (n < 0) { 3944 n = 0; 3945 for (ap = itree_next_arrow(bp, NULL); ap; 3946 ap = itree_next_arrow(bp, ap)) 3947 n++; 3948 indent(); 3949 out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n); 3950 } else { 3951 indent(); 3952 out(O_ALTFP|O_VERB, " Bubble N=%d", n); 3953 } 3954 3955 if (n == 0) 3956 continue; 3957 if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) { 3958 for (ap = itree_next_arrow(bp, NULL); ap; 3959 ap = itree_next_arrow(bp, ap)) { 3960 ep2 = ap->arrowp->head->myevent; 3961 platform_set_payloadnvp(ep2->nvp); 3962 (void) checkconstraints(fmep, ap->arrowp); 3963 if (!ap->arrowp->forever_false) { 3964 /* 3965 * if all arrows are invalidated by the 3966 * constraints, then we should elide the 3967 * whole bubble to be consistant with 3968 * the tree creation time behaviour 3969 */ 3970 bp->mark |= BUBBLE_OK; 3971 platform_set_payloadnvp(NULL); 3972 break; 3973 } 3974 platform_set_payloadnvp(NULL); 3975 } 3976 } 3977 for (ap = itree_next_arrow(bp, NULL); ap; 3978 ap = itree_next_arrow(bp, ap)) { 3979 ep2 = ap->arrowp->head->myevent; 3980 if (n <= credible_events) 3981 break; 3982 3983 ap->arrowp->mark |= REQMNTS_COUNTER; 3984 if (triggered(fmep, ep2, REQMNTS_COUNTER)) 3985 /* XXX adding max timevals! 
*/ 3986 switch (requirements_test(fmep, ep2, 3987 at_latest_by + ap->arrowp->maxdelay, 3988 &my_delay)) { 3989 case FME_DEFERRED: 3990 deferred_events++; 3991 break; 3992 case FME_CREDIBLE: 3993 credible_events++; 3994 break; 3995 case FME_DISPROVED: 3996 break; 3997 case FME_WAIT: 3998 if (my_delay < arrow_delay) 3999 arrow_delay = my_delay; 4000 waiting_events++; 4001 break; 4002 default: 4003 out(O_DIE, 4004 "Bug in requirements_test."); 4005 } 4006 else 4007 deferred_events++; 4008 } 4009 if (!(bp->mark & BUBBLE_OK) && waiting_events == 0) { 4010 bp->mark |= BUBBLE_ELIDED; 4011 continue; 4012 } 4013 indent(); 4014 out(O_ALTFP|O_VERB, " Credible: %d Waiting %d", 4015 credible_events + deferred_events, waiting_events); 4016 if (credible_events + deferred_events + waiting_events < n) { 4017 /* Can never meet requirements */ 4018 ep->cached_state |= REQMNTS_DISPROVED; 4019 indent(); 4020 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 4021 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4022 out(O_ALTFP|O_VERB, NULL); 4023 indent_pop(); 4024 return (FME_DISPROVED); 4025 } 4026 if (credible_events + deferred_events < n) { 4027 /* will have to wait */ 4028 /* wait time is shortest known */ 4029 if (arrow_delay < overall_delay) 4030 overall_delay = arrow_delay; 4031 return_value = FME_WAIT; 4032 } else if (credible_events < n) { 4033 if (return_value != FME_WAIT) 4034 return_value = FME_DEFERRED; 4035 } 4036 } 4037 4038 /* 4039 * don't mark as FME_DEFERRED. If this event isn't reached by another 4040 * path, then this will be considered FME_CREDIBLE. But if it is 4041 * reached by a different path so the K-count is met, then might 4042 * get overridden by FME_WAIT or FME_DISPROVED. 
4043 */ 4044 if (return_value == FME_WAIT) { 4045 ep->cached_state |= REQMNTS_WAIT; 4046 ep->cached_delay = *pdelay = overall_delay; 4047 } else if (return_value == FME_CREDIBLE) { 4048 ep->cached_state |= REQMNTS_CREDIBLE; 4049 } 4050 indent(); 4051 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ", 4052 fme_state2str(return_value)); 4053 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4054 out(O_ALTFP|O_VERB, NULL); 4055 indent_pop(); 4056 return (return_value); 4057 } 4058 4059 static enum fme_state 4060 causes_test(struct fme *fmep, struct event *ep, 4061 unsigned long long at_latest_by, unsigned long long *pdelay) 4062 { 4063 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 4064 unsigned long long my_delay; 4065 int credible_results = 0; 4066 int waiting_results = 0; 4067 enum fme_state fstate; 4068 struct event *tail_event; 4069 struct bubble *bp; 4070 struct arrowlist *ap; 4071 int k = 1; 4072 4073 stats_counter_bump(fmep->Ccallcount); 4074 indent_push(" C"); 4075 indent(); 4076 out(O_ALTFP|O_VERB|O_NONL, "->"); 4077 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4078 out(O_ALTFP|O_VERB, NULL); 4079 4080 for (bp = itree_next_bubble(ep, NULL); bp; 4081 bp = itree_next_bubble(ep, bp)) { 4082 if (bp->t != B_TO) 4083 continue; 4084 k = bp->nork; /* remember the K value */ 4085 for (ap = itree_next_arrow(bp, NULL); ap; 4086 ap = itree_next_arrow(bp, ap)) { 4087 int do_not_follow = 0; 4088 4089 /* 4090 * if we get to the same event multiple times 4091 * only worry about the first one. 
4092 */ 4093 if (ap->arrowp->tail->myevent->cached_state & 4094 CAUSES_TESTED) { 4095 indent(); 4096 out(O_ALTFP|O_VERB|O_NONL, 4097 " causes test already run for "); 4098 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 4099 ap->arrowp->tail->myevent); 4100 out(O_ALTFP|O_VERB, NULL); 4101 continue; 4102 } 4103 4104 /* 4105 * see if false constraint prevents us 4106 * from traversing this arrow 4107 */ 4108 platform_set_payloadnvp(ep->nvp); 4109 if (checkconstraints(fmep, ap->arrowp) == 0) 4110 do_not_follow = 1; 4111 platform_set_payloadnvp(NULL); 4112 if (do_not_follow) { 4113 indent(); 4114 out(O_ALTFP|O_VERB|O_NONL, 4115 " False arrow from "); 4116 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 4117 ap->arrowp->tail->myevent); 4118 out(O_ALTFP|O_VERB, NULL); 4119 continue; 4120 } 4121 4122 ap->arrowp->tail->myevent->cached_state |= 4123 CAUSES_TESTED; 4124 tail_event = ap->arrowp->tail->myevent; 4125 fstate = hypothesise(fmep, tail_event, at_latest_by, 4126 &my_delay); 4127 4128 switch (fstate) { 4129 case FME_WAIT: 4130 if (my_delay < overall_delay) 4131 overall_delay = my_delay; 4132 waiting_results++; 4133 break; 4134 case FME_CREDIBLE: 4135 credible_results++; 4136 break; 4137 case FME_DISPROVED: 4138 break; 4139 default: 4140 out(O_DIE, "Bug in causes_test"); 4141 } 4142 } 4143 } 4144 /* compare against K */ 4145 if (credible_results + waiting_results < k) { 4146 indent(); 4147 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED "); 4148 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4149 out(O_ALTFP|O_VERB, NULL); 4150 indent_pop(); 4151 return (FME_DISPROVED); 4152 } 4153 if (waiting_results != 0) { 4154 *pdelay = overall_delay; 4155 indent(); 4156 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT "); 4157 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4158 out(O_ALTFP|O_VERB|O_NONL, " to "); 4159 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 4160 out(O_ALTFP|O_VERB, NULL); 4161 indent_pop(); 4162 return (FME_WAIT); 4163 } 4164 indent(); 4165 out(O_ALTFP|O_VERB|O_NONL, 
"<-CAUSES CREDIBLE "); 4166 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4167 out(O_ALTFP|O_VERB, NULL); 4168 indent_pop(); 4169 return (FME_CREDIBLE); 4170 } 4171 4172 static enum fme_state 4173 hypothesise(struct fme *fmep, struct event *ep, 4174 unsigned long long at_latest_by, unsigned long long *pdelay) 4175 { 4176 enum fme_state rtr, otr; 4177 unsigned long long my_delay; 4178 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 4179 4180 stats_counter_bump(fmep->Hcallcount); 4181 indent_push(" H"); 4182 indent(); 4183 out(O_ALTFP|O_VERB|O_NONL, "->"); 4184 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4185 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 4186 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 4187 out(O_ALTFP|O_VERB, NULL); 4188 4189 rtr = requirements_test(fmep, ep, at_latest_by, &my_delay); 4190 if ((rtr == FME_WAIT) && (my_delay < overall_delay)) 4191 overall_delay = my_delay; 4192 if (rtr != FME_DISPROVED) { 4193 if (is_problem(ep->t)) { 4194 otr = effects_test(fmep, ep, at_latest_by, &my_delay); 4195 if (otr != FME_DISPROVED) { 4196 if (fmep->peek == 0 && ep->is_suspect == 0) { 4197 ep->suspects = fmep->suspects; 4198 ep->is_suspect = 1; 4199 fmep->suspects = ep; 4200 fmep->nsuspects++; 4201 } 4202 } 4203 } else 4204 otr = causes_test(fmep, ep, at_latest_by, &my_delay); 4205 if ((otr == FME_WAIT) && (my_delay < overall_delay)) 4206 overall_delay = my_delay; 4207 if ((otr != FME_DISPROVED) && 4208 ((rtr == FME_WAIT) || (otr == FME_WAIT))) 4209 *pdelay = overall_delay; 4210 } 4211 if (rtr == FME_DISPROVED) { 4212 indent(); 4213 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4214 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4215 out(O_ALTFP|O_VERB, " (doesn't meet requirements)"); 4216 indent_pop(); 4217 return (FME_DISPROVED); 4218 } 4219 if ((otr == FME_DISPROVED) && is_problem(ep->t)) { 4220 indent(); 4221 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4222 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4223 out(O_ALTFP|O_VERB, " 
(doesn't explain all reports)"); 4224 indent_pop(); 4225 return (FME_DISPROVED); 4226 } 4227 if (otr == FME_DISPROVED) { 4228 indent(); 4229 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4230 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4231 out(O_ALTFP|O_VERB, " (causes are not credible)"); 4232 indent_pop(); 4233 return (FME_DISPROVED); 4234 } 4235 if ((rtr == FME_WAIT) || (otr == FME_WAIT)) { 4236 indent(); 4237 out(O_ALTFP|O_VERB|O_NONL, "<-WAIT "); 4238 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4239 out(O_ALTFP|O_VERB|O_NONL, " to "); 4240 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay); 4241 out(O_ALTFP|O_VERB, NULL); 4242 indent_pop(); 4243 return (FME_WAIT); 4244 } 4245 indent(); 4246 out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE "); 4247 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4248 out(O_ALTFP|O_VERB, NULL); 4249 indent_pop(); 4250 return (FME_CREDIBLE); 4251 } 4252 4253 /* 4254 * fme_istat_load -- reconstitute any persistent istats 4255 */ 4256 void 4257 fme_istat_load(fmd_hdl_t *hdl) 4258 { 4259 int sz; 4260 char *sbuf; 4261 char *ptr; 4262 4263 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) { 4264 out(O_ALTFP, "fme_istat_load: No stats"); 4265 return; 4266 } 4267 4268 sbuf = alloca(sz); 4269 4270 fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz); 4271 4272 /* 4273 * pick apart the serialized stats 4274 * 4275 * format is: 4276 * <class-name>, '@', <path>, '\0', <value>, '\0' 4277 * for example: 4278 * "stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0" 4279 * 4280 * since this is parsing our own serialized data, any parsing issues 4281 * are fatal, so we check for them all with ASSERT() below. 
4282 */ 4283 ptr = sbuf; 4284 while (ptr < &sbuf[sz]) { 4285 char *sepptr; 4286 struct node *np; 4287 int val; 4288 4289 sepptr = strchr(ptr, '@'); 4290 ASSERT(sepptr != NULL); 4291 *sepptr = '\0'; 4292 4293 /* construct the event */ 4294 np = newnode(T_EVENT, NULL, 0); 4295 np->u.event.ename = newnode(T_NAME, NULL, 0); 4296 np->u.event.ename->u.name.t = N_STAT; 4297 np->u.event.ename->u.name.s = stable(ptr); 4298 np->u.event.ename->u.name.it = IT_ENAME; 4299 np->u.event.ename->u.name.last = np->u.event.ename; 4300 4301 ptr = sepptr + 1; 4302 ASSERT(ptr < &sbuf[sz]); 4303 ptr += strlen(ptr); 4304 ptr++; /* move past the '\0' separating path from value */ 4305 ASSERT(ptr < &sbuf[sz]); 4306 ASSERT(isdigit(*ptr)); 4307 val = atoi(ptr); 4308 ASSERT(val > 0); 4309 ptr += strlen(ptr); 4310 ptr++; /* move past the final '\0' for this entry */ 4311 4312 np->u.event.epname = pathstring2epnamenp(sepptr + 1); 4313 ASSERT(np->u.event.epname != NULL); 4314 4315 istat_bump(np, val); 4316 tree_free(np); 4317 } 4318 4319 istat_save(); 4320 } 4321