1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2012 Milan Jurik. All rights reserved. 25 * Copyright (c) 2018, Joyent, Inc. 26 * 27 * fme.c -- fault management exercise module 28 * 29 * this module provides the simulated fault management exercise. 30 */ 31 32 #include <stdio.h> 33 #include <stdlib.h> 34 #include <string.h> 35 #include <strings.h> 36 #include <ctype.h> 37 #include <alloca.h> 38 #include <libnvpair.h> 39 #include <sys/fm/protocol.h> 40 #include <fm/fmd_api.h> 41 #include <fm/libtopo.h> 42 #include "alloc.h" 43 #include "out.h" 44 #include "stats.h" 45 #include "stable.h" 46 #include "literals.h" 47 #include "lut.h" 48 #include "tree.h" 49 #include "ptree.h" 50 #include "itree.h" 51 #include "ipath.h" 52 #include "fme.h" 53 #include "evnv.h" 54 #include "eval.h" 55 #include "config.h" 56 #include "platform.h" 57 #include "esclex.h" 58 59 /* imported from eft.c... 
*/ 60 extern hrtime_t Hesitate; 61 extern char *Serd_Override; 62 extern nv_alloc_t Eft_nv_hdl; 63 extern int Max_fme; 64 extern fmd_hdl_t *Hdl; 65 66 static int Istat_need_save; 67 static int Serd_need_save; 68 void istat_save(void); 69 void serd_save(void); 70 71 /* fme under construction is global so we can free it on module abort */ 72 static struct fme *Nfmep; 73 74 static int Undiag_reason = UD_VAL_UNKNOWN; 75 76 static int Nextid = 0; 77 78 static int Open_fme_count = 0; /* Count of open FMEs */ 79 80 /* list of fault management exercises underway */ 81 static struct fme { 82 struct fme *next; /* next exercise */ 83 unsigned long long ull; /* time when fme was created */ 84 int id; /* FME id */ 85 struct config *config; /* cooked configuration data */ 86 struct lut *eventtree; /* propagation tree for this FME */ 87 /* 88 * The initial error report that created this FME is kept in 89 * two forms. e0 points to the instance tree node and is used 90 * by fme_eval() as the starting point for the inference 91 * algorithm. e0r is the event handle FMD passed to us when 92 * the ereport first arrived and is used when setting timers, 93 * which are always relative to the time of this initial 94 * report. 
95 */ 96 struct event *e0; 97 fmd_event_t *e0r; 98 99 id_t timer; /* for setting an fmd time-out */ 100 101 struct event *ecurrent; /* ereport under consideration */ 102 struct event *suspects; /* current suspect list */ 103 struct event *psuspects; /* previous suspect list */ 104 int nsuspects; /* count of suspects */ 105 int posted_suspects; /* true if we've posted a diagnosis */ 106 int uniqobs; /* number of unique events observed */ 107 int peek; /* just peeking, don't track suspects */ 108 int overflow; /* true if overflow FME */ 109 enum fme_state { 110 FME_NOTHING = 5000, /* not evaluated yet */ 111 FME_WAIT, /* need to wait for more info */ 112 FME_CREDIBLE, /* suspect list is credible */ 113 FME_DISPROVED, /* no valid suspects found */ 114 FME_DEFERRED /* don't know yet (k-count not met) */ 115 } state; 116 117 unsigned long long pull; /* time passed since created */ 118 unsigned long long wull; /* wait until this time for re-eval */ 119 struct event *observations; /* observation list */ 120 struct lut *globals; /* values of global variables */ 121 /* fmd interfacing */ 122 fmd_hdl_t *hdl; /* handle for talking with fmd */ 123 fmd_case_t *fmcase; /* what fmd 'case' we associate with */ 124 /* stats */ 125 struct stats *Rcount; 126 struct stats *Hcallcount; 127 struct stats *Rcallcount; 128 struct stats *Ccallcount; 129 struct stats *Ecallcount; 130 struct stats *Tcallcount; 131 struct stats *Marrowcount; 132 struct stats *diags; 133 } *FMElist, *EFMElist, *ClosedFMEs; 134 135 static struct case_list { 136 fmd_case_t *fmcase; 137 struct case_list *next; 138 } *Undiagablecaselist; 139 140 static void fme_eval(struct fme *fmep, fmd_event_t *ffep); 141 static enum fme_state hypothesise(struct fme *fmep, struct event *ep, 142 unsigned long long at_latest_by, unsigned long long *pdelay); 143 static struct node *eventprop_lookup(struct event *ep, const char *propname); 144 static struct node *pathstring2epnamenp(char *path); 145 static void 
publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, 146 fmd_case_t *fmcase, nvlist_t *detector, char *arg); 147 static char *undiag_2reason_str(int ud, char *arg); 148 static const char *undiag_2defect_str(int ud); 149 static void restore_suspects(struct fme *fmep); 150 static void save_suspects(struct fme *fmep); 151 static void destroy_fme(struct fme *f); 152 static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 153 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl); 154 static void istat_counter_reset_cb(struct istat_entry *entp, 155 struct stats *statp, const struct ipath *ipp); 156 static void istat_counter_topo_chg_cb(struct istat_entry *entp, 157 struct stats *statp, void *unused); 158 static void serd_reset_cb(struct serd_entry *entp, void *unused, 159 const struct ipath *ipp); 160 static void serd_topo_chg_cb(struct serd_entry *entp, void *unused, 161 void *unused2); 162 static void destroy_fme_bufs(struct fme *fp); 163 164 static struct fme * 165 alloc_fme(void) 166 { 167 struct fme *fmep; 168 169 fmep = MALLOC(sizeof (*fmep)); 170 bzero(fmep, sizeof (*fmep)); 171 return (fmep); 172 } 173 174 /* 175 * fme_ready -- called when all initialization of the FME (except for 176 * stats) has completed successfully. Adds the fme to global lists 177 * and establishes its stats. 
178 */ 179 static struct fme * 180 fme_ready(struct fme *fmep) 181 { 182 char nbuf[100]; 183 184 Nfmep = NULL; /* don't need to free this on module abort now */ 185 186 if (EFMElist) { 187 EFMElist->next = fmep; 188 EFMElist = fmep; 189 } else 190 FMElist = EFMElist = fmep; 191 192 (void) sprintf(nbuf, "fme%d.Rcount", fmep->id); 193 fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0); 194 (void) sprintf(nbuf, "fme%d.Hcall", fmep->id); 195 fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1); 196 (void) sprintf(nbuf, "fme%d.Rcall", fmep->id); 197 fmep->Rcallcount = stats_new_counter(nbuf, 198 "calls to requirements_test()", 1); 199 (void) sprintf(nbuf, "fme%d.Ccall", fmep->id); 200 fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1); 201 (void) sprintf(nbuf, "fme%d.Ecall", fmep->id); 202 fmep->Ecallcount = 203 stats_new_counter(nbuf, "calls to effects_test()", 1); 204 (void) sprintf(nbuf, "fme%d.Tcall", fmep->id); 205 fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1); 206 (void) sprintf(nbuf, "fme%d.Marrow", fmep->id); 207 fmep->Marrowcount = stats_new_counter(nbuf, 208 "arrows marked by mark_arrows()", 1); 209 (void) sprintf(nbuf, "fme%d.diags", fmep->id); 210 fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0); 211 212 out(O_ALTFP|O_VERB2, "newfme: config snapshot contains..."); 213 config_print(O_ALTFP|O_VERB2, fmep->config); 214 215 return (fmep); 216 } 217 218 extern void ipath_dummy_lut(struct arrow *); 219 extern struct lut *itree_create_dummy(const char *, const struct ipath *); 220 221 /* ARGSUSED */ 222 static void 223 set_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 224 { 225 struct bubble *bp; 226 struct arrowlist *ap; 227 228 for (bp = itree_next_bubble(ep, NULL); bp; 229 bp = itree_next_bubble(ep, bp)) { 230 if (bp->t != B_FROM) 231 continue; 232 for (ap = itree_next_arrow(bp, NULL); ap; 233 ap = itree_next_arrow(bp, ap)) { 234 
ap->arrowp->pnode->u.arrow.needed = 1; 235 ipath_dummy_lut(ap->arrowp); 236 } 237 } 238 } 239 240 /* ARGSUSED */ 241 static void 242 unset_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 243 { 244 struct bubble *bp; 245 struct arrowlist *ap; 246 247 for (bp = itree_next_bubble(ep, NULL); bp; 248 bp = itree_next_bubble(ep, bp)) { 249 if (bp->t != B_FROM) 250 continue; 251 for (ap = itree_next_arrow(bp, NULL); ap; 252 ap = itree_next_arrow(bp, ap)) 253 ap->arrowp->pnode->u.arrow.needed = 0; 254 } 255 } 256 257 static void globals_destructor(void *left, void *right, void *arg); 258 static void clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep); 259 260 static boolean_t 261 prune_propagations(const char *e0class, const struct ipath *e0ipp) 262 { 263 char nbuf[100]; 264 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 265 extern struct lut *Usednames; 266 267 Nfmep = alloc_fme(); 268 Nfmep->id = Nextid; 269 Nfmep->state = FME_NOTHING; 270 Nfmep->eventtree = itree_create_dummy(e0class, e0ipp); 271 if ((Nfmep->e0 = 272 itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) { 273 itree_free(Nfmep->eventtree); 274 FREE(Nfmep); 275 Nfmep = NULL; 276 return (B_FALSE); 277 } 278 Nfmep->ecurrent = Nfmep->observations = Nfmep->e0; 279 Nfmep->e0->count++; 280 281 (void) sprintf(nbuf, "fme%d.Rcount", Nfmep->id); 282 Nfmep->Rcount = stats_new_counter(nbuf, "ereports received", 0); 283 (void) sprintf(nbuf, "fme%d.Hcall", Nfmep->id); 284 Nfmep->Hcallcount = 285 stats_new_counter(nbuf, "calls to hypothesise()", 1); 286 (void) sprintf(nbuf, "fme%d.Rcall", Nfmep->id); 287 Nfmep->Rcallcount = stats_new_counter(nbuf, 288 "calls to requirements_test()", 1); 289 (void) sprintf(nbuf, "fme%d.Ccall", Nfmep->id); 290 Nfmep->Ccallcount = 291 stats_new_counter(nbuf, "calls to causes_test()", 1); 292 (void) sprintf(nbuf, "fme%d.Ecall", Nfmep->id); 293 Nfmep->Ecallcount = 294 stats_new_counter(nbuf, "calls to effects_test()", 1); 295 (void) sprintf(nbuf, 
"fme%d.Tcall", Nfmep->id); 296 Nfmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1); 297 (void) sprintf(nbuf, "fme%d.Marrow", Nfmep->id); 298 Nfmep->Marrowcount = stats_new_counter(nbuf, 299 "arrows marked by mark_arrows()", 1); 300 (void) sprintf(nbuf, "fme%d.diags", Nfmep->id); 301 Nfmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0); 302 303 Nfmep->peek = 1; 304 lut_walk(Nfmep->eventtree, (lut_cb)unset_needed_arrows, (void *)Nfmep); 305 lut_free(Usednames, NULL, NULL); 306 Usednames = NULL; 307 lut_walk(Nfmep->eventtree, (lut_cb)clear_arrows, (void *)Nfmep); 308 (void) hypothesise(Nfmep, Nfmep->e0, Nfmep->ull, &my_delay); 309 itree_prune(Nfmep->eventtree); 310 lut_walk(Nfmep->eventtree, (lut_cb)set_needed_arrows, (void *)Nfmep); 311 312 stats_delete(Nfmep->Rcount); 313 stats_delete(Nfmep->Hcallcount); 314 stats_delete(Nfmep->Rcallcount); 315 stats_delete(Nfmep->Ccallcount); 316 stats_delete(Nfmep->Ecallcount); 317 stats_delete(Nfmep->Tcallcount); 318 stats_delete(Nfmep->Marrowcount); 319 stats_delete(Nfmep->diags); 320 itree_free(Nfmep->eventtree); 321 lut_free(Nfmep->globals, globals_destructor, NULL); 322 FREE(Nfmep); 323 return (B_TRUE); 324 } 325 326 static struct fme * 327 newfme(const char *e0class, const struct ipath *e0ipp, fmd_hdl_t *hdl, 328 fmd_case_t *fmcase, fmd_event_t *ffep, nvlist_t *nvl) 329 { 330 struct cfgdata *cfgdata; 331 int init_size; 332 extern int alloc_total(); 333 nvlist_t *detector = NULL; 334 char *pathstr; 335 char *arg; 336 337 /* 338 * First check if e0ipp is actually in the topology so we can give a 339 * more useful error message. 
340 */ 341 ipathlastcomp(e0ipp); 342 pathstr = ipath2str(NULL, e0ipp); 343 cfgdata = config_snapshot(); 344 platform_unit_translate(0, cfgdata->cooked, TOPO_PROP_RESOURCE, 345 &detector, pathstr); 346 FREE(pathstr); 347 structconfig_free(cfgdata->cooked); 348 config_free(cfgdata); 349 if (detector == NULL) { 350 /* See if class permits silent discard on unknown component. */ 351 if (lut_lookup(Ereportenames_discard, (void *)e0class, NULL)) { 352 out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport " 353 "to component path, but silent discard allowed.", 354 e0class); 355 fmd_case_close(hdl, fmcase); 356 } else { 357 Undiag_reason = UD_VAL_BADEVENTPATH; 358 (void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, 359 &detector); 360 arg = ipath2str(e0class, e0ipp); 361 publish_undiagnosable(hdl, ffep, fmcase, detector, arg); 362 FREE(arg); 363 } 364 return (NULL); 365 } 366 367 /* 368 * Next run a quick first pass of the rules with a dummy config. This 369 * allows us to prune those rules which can't possibly cause this 370 * ereport. 371 */ 372 if (!prune_propagations(e0class, e0ipp)) { 373 /* 374 * The fault class must have been in the rules or we would 375 * not have registered for it (and got a "nosub"), and the 376 * pathname must be in the topology or we would have failed the 377 * previous test. So to get here means the combination of 378 * class and pathname in the ereport must be invalid. 379 */ 380 Undiag_reason = UD_VAL_BADEVENTCLASS; 381 arg = ipath2str(e0class, e0ipp); 382 publish_undiagnosable(hdl, ffep, fmcase, detector, arg); 383 nvlist_free(detector); 384 FREE(arg); 385 return (NULL); 386 } 387 388 /* 389 * Now go ahead and create the real fme using the pruned rules. 
390 */ 391 init_size = alloc_total(); 392 out(O_ALTFP|O_STAMP, "start config_snapshot using %d bytes", init_size); 393 nvlist_free(detector); 394 pathstr = ipath2str(NULL, e0ipp); 395 cfgdata = config_snapshot(); 396 platform_unit_translate(0, cfgdata->cooked, TOPO_PROP_RESOURCE, 397 &detector, pathstr); 398 FREE(pathstr); 399 platform_save_config(hdl, fmcase); 400 out(O_ALTFP|O_STAMP, "config_snapshot added %d bytes", 401 alloc_total() - init_size); 402 403 Nfmep = alloc_fme(); 404 405 Nfmep->id = Nextid++; 406 Nfmep->config = cfgdata->cooked; 407 config_free(cfgdata); 408 Nfmep->posted_suspects = 0; 409 Nfmep->uniqobs = 0; 410 Nfmep->state = FME_NOTHING; 411 Nfmep->pull = 0ULL; 412 Nfmep->overflow = 0; 413 414 Nfmep->fmcase = fmcase; 415 Nfmep->hdl = hdl; 416 417 if ((Nfmep->eventtree = itree_create(Nfmep->config)) == NULL) { 418 Undiag_reason = UD_VAL_INSTFAIL; 419 arg = ipath2str(e0class, e0ipp); 420 publish_undiagnosable(hdl, ffep, fmcase, detector, arg); 421 nvlist_free(detector); 422 FREE(arg); 423 structconfig_free(Nfmep->config); 424 destroy_fme_bufs(Nfmep); 425 FREE(Nfmep); 426 Nfmep = NULL; 427 return (NULL); 428 } 429 430 itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree); 431 432 if ((Nfmep->e0 = 433 itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) { 434 Undiag_reason = UD_VAL_BADEVENTI; 435 arg = ipath2str(e0class, e0ipp); 436 publish_undiagnosable(hdl, ffep, fmcase, detector, arg); 437 nvlist_free(detector); 438 FREE(arg); 439 itree_free(Nfmep->eventtree); 440 structconfig_free(Nfmep->config); 441 destroy_fme_bufs(Nfmep); 442 FREE(Nfmep); 443 Nfmep = NULL; 444 return (NULL); 445 } 446 447 nvlist_free(detector); 448 return (fme_ready(Nfmep)); 449 } 450 451 void 452 fme_fini(void) 453 { 454 struct fme *sfp, *fp; 455 struct case_list *ucasep, *nextcasep; 456 457 ucasep = Undiagablecaselist; 458 while (ucasep != NULL) { 459 nextcasep = ucasep->next; 460 FREE(ucasep); 461 ucasep = nextcasep; 462 } 463 Undiagablecaselist = NULL; 464 465 /* clean up 
closed fmes */ 466 fp = ClosedFMEs; 467 while (fp != NULL) { 468 sfp = fp->next; 469 destroy_fme(fp); 470 fp = sfp; 471 } 472 ClosedFMEs = NULL; 473 474 fp = FMElist; 475 while (fp != NULL) { 476 sfp = fp->next; 477 destroy_fme(fp); 478 fp = sfp; 479 } 480 FMElist = EFMElist = NULL; 481 482 /* if we were in the middle of creating an fme, free it now */ 483 if (Nfmep) { 484 destroy_fme(Nfmep); 485 Nfmep = NULL; 486 } 487 } 488 489 /* 490 * Allocated space for a buffer name. 20 bytes allows for 491 * a ridiculous 9,999,999 unique observations. 492 */ 493 #define OBBUFNMSZ 20 494 495 /* 496 * serialize_observation 497 * 498 * Create a recoverable version of the current observation 499 * (f->ecurrent). We keep a serialized version of each unique 500 * observation in order that we may resume correctly the fme in the 501 * correct state if eft or fmd crashes and we're restarted. 502 */ 503 static void 504 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp) 505 { 506 size_t pkdlen; 507 char tmpbuf[OBBUFNMSZ]; 508 char *pkd = NULL; 509 char *estr; 510 511 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs); 512 estr = ipath2str(cls, ipp); 513 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1); 514 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr, 515 strlen(estr) + 1); 516 FREE(estr); 517 518 if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) { 519 (void) snprintf(tmpbuf, 520 OBBUFNMSZ, "observed%d.nvp", fp->uniqobs); 521 if (nvlist_xpack(fp->ecurrent->nvp, 522 &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0) 523 out(O_DIE|O_SYS, "pack of observed nvl failed"); 524 fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen); 525 fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen); 526 FREE(pkd); 527 } 528 529 fp->uniqobs++; 530 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 531 sizeof (fp->uniqobs)); 532 } 533 534 /* 535 * init_fme_bufs -- We keep several bits of state about an fme for 
536 * use if eft or fmd crashes and we're restarted. 537 */ 538 static void 539 init_fme_bufs(struct fme *fp) 540 { 541 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull)); 542 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull, 543 sizeof (fp->pull)); 544 545 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id)); 546 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id, 547 sizeof (fp->id)); 548 549 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs)); 550 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs, 551 sizeof (fp->uniqobs)); 552 553 fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD, 554 sizeof (fp->posted_suspects)); 555 fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD, 556 (void *)&fp->posted_suspects, sizeof (fp->posted_suspects)); 557 } 558 559 static void 560 destroy_fme_bufs(struct fme *fp) 561 { 562 char tmpbuf[OBBUFNMSZ]; 563 int o; 564 565 platform_restore_config(fp->hdl, fp->fmcase); 566 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN); 567 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG); 568 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL); 569 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID); 570 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD); 571 fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS); 572 573 for (o = 0; o < fp->uniqobs; o++) { 574 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o); 575 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 576 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o); 577 fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf); 578 } 579 } 580 581 /* 582 * reconstitute_observations -- convert a case's serialized observations 583 * back into struct events. Returns zero if all observations are 584 * successfully reconstituted. 
585 */ 586 static int 587 reconstitute_observations(struct fme *fmep) 588 { 589 struct event *ep; 590 struct node *epnamenp = NULL; 591 size_t pkdlen; 592 char *pkd = NULL; 593 char *tmpbuf = alloca(OBBUFNMSZ); 594 char *sepptr; 595 char *estr; 596 int ocnt; 597 int elen; 598 599 for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) { 600 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt); 601 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 602 if (elen == 0) { 603 out(O_ALTFP, 604 "reconstitute_observation: no %s buffer found.", 605 tmpbuf); 606 Undiag_reason = UD_VAL_MISSINGOBS; 607 break; 608 } 609 610 estr = MALLOC(elen); 611 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 612 sepptr = strchr(estr, '@'); 613 if (sepptr == NULL) { 614 out(O_ALTFP, 615 "reconstitute_observation: %s: " 616 "missing @ separator in %s.", 617 tmpbuf, estr); 618 Undiag_reason = UD_VAL_MISSINGPATH; 619 FREE(estr); 620 break; 621 } 622 623 *sepptr = '\0'; 624 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 625 out(O_ALTFP, 626 "reconstitute_observation: %s: " 627 "trouble converting path string \"%s\" " 628 "to internal representation.", 629 tmpbuf, sepptr + 1); 630 Undiag_reason = UD_VAL_MISSINGPATH; 631 FREE(estr); 632 break; 633 } 634 635 /* construct the event */ 636 ep = itree_lookup(fmep->eventtree, 637 stable(estr), ipath(epnamenp)); 638 if (ep == NULL) { 639 out(O_ALTFP, 640 "reconstitute_observation: %s: " 641 "lookup of \"%s\" in itree failed.", 642 tmpbuf, ipath2str(estr, ipath(epnamenp))); 643 Undiag_reason = UD_VAL_BADOBS; 644 tree_free(epnamenp); 645 FREE(estr); 646 break; 647 } 648 tree_free(epnamenp); 649 650 /* 651 * We may or may not have a saved nvlist for the observation 652 */ 653 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt); 654 pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 655 if (pkdlen != 0) { 656 pkd = MALLOC(pkdlen); 657 fmd_buf_read(fmep->hdl, 658 fmep->fmcase, tmpbuf, pkd, pkdlen); 659 ASSERT(ep->nvp == NULL); 
660 if (nvlist_xunpack(pkd, 661 pkdlen, &ep->nvp, &Eft_nv_hdl) != 0) 662 out(O_DIE|O_SYS, "pack of observed nvl failed"); 663 FREE(pkd); 664 } 665 666 if (ocnt == 0) 667 fmep->e0 = ep; 668 669 FREE(estr); 670 fmep->ecurrent = ep; 671 ep->count++; 672 673 /* link it into list of observations seen */ 674 ep->observations = fmep->observations; 675 fmep->observations = ep; 676 } 677 678 if (ocnt == fmep->uniqobs) { 679 (void) fme_ready(fmep); 680 return (0); 681 } 682 683 return (1); 684 } 685 686 /* 687 * restart_fme -- called during eft initialization. Reconstitutes 688 * an in-progress fme. 689 */ 690 void 691 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress) 692 { 693 nvlist_t *defect; 694 struct case_list *bad; 695 struct fme *fmep; 696 struct cfgdata *cfgdata; 697 size_t rawsz; 698 struct event *ep; 699 char *tmpbuf = alloca(OBBUFNMSZ); 700 char *sepptr; 701 char *estr; 702 int elen; 703 struct node *epnamenp = NULL; 704 int init_size; 705 extern int alloc_total(); 706 char *reason; 707 708 /* 709 * ignore solved or closed cases 710 */ 711 if (fmd_case_solved(hdl, inprogress) || 712 fmd_case_closed(hdl, inprogress)) 713 return; 714 715 fmep = alloc_fme(); 716 fmep->fmcase = inprogress; 717 fmep->hdl = hdl; 718 719 if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) { 720 out(O_ALTFP, "restart_fme: no saved posted status"); 721 Undiag_reason = UD_VAL_MISSINGINFO; 722 goto badcase; 723 } else { 724 fmd_buf_read(hdl, inprogress, WOBUF_POSTD, 725 (void *)&fmep->posted_suspects, 726 sizeof (fmep->posted_suspects)); 727 } 728 729 if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) { 730 out(O_ALTFP, "restart_fme: no saved id"); 731 Undiag_reason = UD_VAL_MISSINGINFO; 732 goto badcase; 733 } else { 734 fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id, 735 sizeof (fmep->id)); 736 } 737 if (Nextid <= fmep->id) 738 Nextid = fmep->id + 1; 739 740 out(O_ALTFP, "Replay FME %d", fmep->id); 741 742 if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) { 
743 out(O_ALTFP, "restart_fme: No config data"); 744 Undiag_reason = UD_VAL_MISSINGINFO; 745 goto badcase; 746 } 747 fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz, 748 sizeof (size_t)); 749 750 if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) { 751 out(O_ALTFP, "restart_fme: No event zero"); 752 Undiag_reason = UD_VAL_MISSINGZERO; 753 goto badcase; 754 } 755 756 if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) { 757 out(O_ALTFP, "restart_fme: no saved wait time"); 758 Undiag_reason = UD_VAL_MISSINGINFO; 759 goto badcase; 760 } else { 761 fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull, 762 sizeof (fmep->pull)); 763 } 764 765 if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) { 766 out(O_ALTFP, "restart_fme: no count of observations"); 767 Undiag_reason = UD_VAL_MISSINGINFO; 768 goto badcase; 769 } else { 770 fmd_buf_read(hdl, inprogress, WOBUF_NOBS, 771 (void *)&fmep->uniqobs, sizeof (fmep->uniqobs)); 772 } 773 774 (void) snprintf(tmpbuf, OBBUFNMSZ, "observed0"); 775 elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf); 776 if (elen == 0) { 777 out(O_ALTFP, "reconstitute_observation: no %s buffer found.", 778 tmpbuf); 779 Undiag_reason = UD_VAL_MISSINGOBS; 780 goto badcase; 781 } 782 estr = MALLOC(elen); 783 fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen); 784 sepptr = strchr(estr, '@'); 785 if (sepptr == NULL) { 786 out(O_ALTFP, "reconstitute_observation: %s: " 787 "missing @ separator in %s.", 788 tmpbuf, estr); 789 Undiag_reason = UD_VAL_MISSINGPATH; 790 FREE(estr); 791 goto badcase; 792 } 793 *sepptr = '\0'; 794 if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) { 795 out(O_ALTFP, "reconstitute_observation: %s: " 796 "trouble converting path string \"%s\" " 797 "to internal representation.", tmpbuf, sepptr + 1); 798 Undiag_reason = UD_VAL_MISSINGPATH; 799 FREE(estr); 800 goto badcase; 801 } 802 (void) prune_propagations(stable(estr), ipath(epnamenp)); 803 tree_free(epnamenp); 804 FREE(estr); 805 
806 init_size = alloc_total(); 807 out(O_ALTFP|O_STAMP, "start config_restore using %d bytes", init_size); 808 cfgdata = MALLOC(sizeof (struct cfgdata)); 809 cfgdata->cooked = NULL; 810 cfgdata->devcache = NULL; 811 cfgdata->devidcache = NULL; 812 cfgdata->tpcache = NULL; 813 cfgdata->cpucache = NULL; 814 cfgdata->raw_refcnt = 1; 815 816 if (rawsz > 0) { 817 if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) { 818 out(O_ALTFP, "restart_fme: Config data size mismatch"); 819 Undiag_reason = UD_VAL_CFGMISMATCH; 820 goto badcase; 821 } 822 cfgdata->begin = MALLOC(rawsz); 823 cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz; 824 fmd_buf_read(hdl, 825 inprogress, WOBUF_CFG, cfgdata->begin, rawsz); 826 } else { 827 cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL; 828 } 829 830 config_cook(cfgdata); 831 fmep->config = cfgdata->cooked; 832 config_free(cfgdata); 833 out(O_ALTFP|O_STAMP, "config_restore added %d bytes", 834 alloc_total() - init_size); 835 836 if ((fmep->eventtree = itree_create(fmep->config)) == NULL) { 837 /* case not properly saved or irretrievable */ 838 out(O_ALTFP, "restart_fme: NULL instance tree"); 839 Undiag_reason = UD_VAL_INSTFAIL; 840 goto badcase; 841 } 842 843 itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree); 844 845 if (reconstitute_observations(fmep) != 0) 846 goto badcase; 847 848 out(O_ALTFP|O_NONL, "FME %d replay observations: ", fmep->id); 849 for (ep = fmep->observations; ep; ep = ep->observations) { 850 out(O_ALTFP|O_NONL, " "); 851 itree_pevent_brief(O_ALTFP|O_NONL, ep); 852 } 853 out(O_ALTFP, NULL); 854 855 Open_fme_count++; 856 857 /* give the diagnosis algorithm a shot at the new FME state */ 858 fme_eval(fmep, fmep->e0r); 859 return; 860 861 badcase: 862 if (fmep->eventtree != NULL) 863 itree_free(fmep->eventtree); 864 if (fmep->config) 865 structconfig_free(fmep->config); 866 destroy_fme_bufs(fmep); 867 FREE(fmep); 868 869 /* 870 * Since we're unable to restart the case, add it to the undiagable 871 * list and 
solve and close it as appropriate. 872 */ 873 bad = MALLOC(sizeof (struct case_list)); 874 bad->next = NULL; 875 876 if (Undiagablecaselist != NULL) 877 bad->next = Undiagablecaselist; 878 Undiagablecaselist = bad; 879 bad->fmcase = inprogress; 880 881 out(O_ALTFP|O_NONL, "[case %s (unable to restart), ", 882 fmd_case_uuid(hdl, bad->fmcase)); 883 884 if (fmd_case_solved(hdl, bad->fmcase)) { 885 out(O_ALTFP|O_NONL, "already solved, "); 886 } else { 887 out(O_ALTFP|O_NONL, "solving, "); 888 defect = fmd_nvl_create_fault(hdl, 889 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 890 reason = undiag_2reason_str(Undiag_reason, NULL); 891 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 892 FREE(reason); 893 fmd_case_add_suspect(hdl, bad->fmcase, defect); 894 fmd_case_solve(hdl, bad->fmcase); 895 Undiag_reason = UD_VAL_UNKNOWN; 896 } 897 898 if (fmd_case_closed(hdl, bad->fmcase)) { 899 out(O_ALTFP, "already closed ]"); 900 } else { 901 out(O_ALTFP, "closing ]"); 902 fmd_case_close(hdl, bad->fmcase); 903 } 904 } 905 906 /*ARGSUSED*/ 907 static void 908 globals_destructor(void *left, void *right, void *arg) 909 { 910 struct evalue *evp = (struct evalue *)right; 911 if (evp->t == NODEPTR) 912 tree_free((struct node *)(uintptr_t)evp->v); 913 evp->v = (uintptr_t)NULL; 914 FREE(evp); 915 } 916 917 void 918 destroy_fme(struct fme *f) 919 { 920 stats_delete(f->Rcount); 921 stats_delete(f->Hcallcount); 922 stats_delete(f->Rcallcount); 923 stats_delete(f->Ccallcount); 924 stats_delete(f->Ecallcount); 925 stats_delete(f->Tcallcount); 926 stats_delete(f->Marrowcount); 927 stats_delete(f->diags); 928 929 if (f->eventtree != NULL) 930 itree_free(f->eventtree); 931 if (f->config) 932 structconfig_free(f->config); 933 lut_free(f->globals, globals_destructor, NULL); 934 FREE(f); 935 } 936 937 static const char * 938 fme_state2str(enum fme_state s) 939 { 940 switch (s) { 941 case FME_NOTHING: return ("NOTHING"); 942 case FME_WAIT: return ("WAIT"); 943 case FME_CREDIBLE: 
return ("CREDIBLE"); 944 case FME_DISPROVED: return ("DISPROVED"); 945 case FME_DEFERRED: return ("DEFERRED"); 946 default: return ("UNKNOWN"); 947 } 948 } 949 950 static int 951 is_problem(enum nametype t) 952 { 953 return (t == N_FAULT || t == N_DEFECT || t == N_UPSET); 954 } 955 956 static int 957 is_defect(enum nametype t) 958 { 959 return (t == N_DEFECT); 960 } 961 962 static int 963 is_upset(enum nametype t) 964 { 965 return (t == N_UPSET); 966 } 967 968 static void 969 fme_print(int flags, struct fme *fmep) 970 { 971 struct event *ep; 972 973 out(flags, "Fault Management Exercise %d", fmep->id); 974 out(flags, "\t State: %s", fme_state2str(fmep->state)); 975 out(flags|O_NONL, "\t Start time: "); 976 ptree_timeval(flags|O_NONL, &fmep->ull); 977 out(flags, NULL); 978 if (fmep->wull) { 979 out(flags|O_NONL, "\t Wait time: "); 980 ptree_timeval(flags|O_NONL, &fmep->wull); 981 out(flags, NULL); 982 } 983 out(flags|O_NONL, "\t E0: "); 984 if (fmep->e0) 985 itree_pevent_brief(flags|O_NONL, fmep->e0); 986 else 987 out(flags|O_NONL, "NULL"); 988 out(flags, NULL); 989 out(flags|O_NONL, "\tObservations:"); 990 for (ep = fmep->observations; ep; ep = ep->observations) { 991 out(flags|O_NONL, " "); 992 itree_pevent_brief(flags|O_NONL, ep); 993 } 994 out(flags, NULL); 995 out(flags|O_NONL, "\tSuspect list:"); 996 for (ep = fmep->suspects; ep; ep = ep->suspects) { 997 out(flags|O_NONL, " "); 998 itree_pevent_brief(flags|O_NONL, ep); 999 } 1000 out(flags, NULL); 1001 if (fmep->eventtree != NULL) { 1002 out(flags|O_VERB2, "\t Tree:"); 1003 itree_ptree(flags|O_VERB2, fmep->eventtree); 1004 } 1005 } 1006 1007 static struct node * 1008 pathstring2epnamenp(char *path) 1009 { 1010 char *sep = "/"; 1011 struct node *ret; 1012 char *ptr; 1013 1014 if ((ptr = strtok(path, sep)) == NULL) 1015 out(O_DIE, "pathstring2epnamenp: invalid empty class"); 1016 1017 ret = tree_iname(stable(ptr), NULL, 0); 1018 1019 while ((ptr = strtok(NULL, sep)) != NULL) 1020 ret = tree_name_append(ret, 1021 
tree_iname(stable(ptr), NULL, 0)); 1022 1023 return (ret); 1024 } 1025 1026 /* 1027 * for a given upset sp, increment the corresponding SERD engine. if the 1028 * SERD engine trips, return the ename and ipp of the resulting ereport. 1029 * returns true if engine tripped and *enamep and *ippp were filled in. 1030 */ 1031 static int 1032 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep, 1033 fmd_case_t *fmcase, struct event *sp, const char **enamep, 1034 const struct ipath **ippp) 1035 { 1036 struct node *serdinst; 1037 char *serdname; 1038 char *serdresource; 1039 char *serdclass; 1040 struct node *nid; 1041 struct serd_entry *newentp; 1042 int i, serdn = -1, serdincrement = 1, len = 0; 1043 char *serdsuffix = NULL, *serdt = NULL; 1044 struct evalue *ep; 1045 1046 ASSERT(sp->t == N_UPSET); 1047 ASSERT(ffep != NULL); 1048 1049 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1050 (void *)"n", (lut_cmp)strcmp)) != NULL) { 1051 ASSERT(ep->t == UINT64); 1052 serdn = (int)ep->v; 1053 } 1054 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1055 (void *)"t", (lut_cmp)strcmp)) != NULL) { 1056 ASSERT(ep->t == STRING); 1057 serdt = (char *)(uintptr_t)ep->v; 1058 } 1059 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1060 (void *)"suffix", (lut_cmp)strcmp)) != NULL) { 1061 ASSERT(ep->t == STRING); 1062 serdsuffix = (char *)(uintptr_t)ep->v; 1063 } 1064 if ((ep = (struct evalue *)lut_lookup(sp->serdprops, 1065 (void *)"increment", (lut_cmp)strcmp)) != NULL) { 1066 ASSERT(ep->t == UINT64); 1067 serdincrement = (int)ep->v; 1068 } 1069 1070 /* 1071 * obtain instanced SERD engine from the upset sp. from this 1072 * derive serdname, the string used to identify the SERD engine. 
1073 */ 1074 serdinst = eventprop_lookup(sp, L_engine); 1075 1076 if (serdinst == NULL) 1077 return (-1); 1078 1079 len = strlen(serdinst->u.stmt.np->u.event.ename->u.name.s) + 1; 1080 if (serdsuffix != NULL) 1081 len += strlen(serdsuffix); 1082 serdclass = MALLOC(len); 1083 if (serdsuffix != NULL) 1084 (void) snprintf(serdclass, len, "%s%s", 1085 serdinst->u.stmt.np->u.event.ename->u.name.s, serdsuffix); 1086 else 1087 (void) snprintf(serdclass, len, "%s", 1088 serdinst->u.stmt.np->u.event.ename->u.name.s); 1089 serdresource = ipath2str(NULL, 1090 ipath(serdinst->u.stmt.np->u.event.epname)); 1091 len += strlen(serdresource) + 1; 1092 serdname = MALLOC(len); 1093 (void) snprintf(serdname, len, "%s@%s", serdclass, serdresource); 1094 FREE(serdresource); 1095 1096 /* handle serd engine "id" property, if there is one */ 1097 if ((nid = 1098 lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) { 1099 struct evalue *gval; 1100 char suffixbuf[200]; 1101 char *suffix; 1102 char *nserdname; 1103 size_t nname; 1104 1105 out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname); 1106 ptree_name_iter(O_ALTFP|O_NONL, nid); 1107 1108 ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t)); 1109 1110 if ((gval = lut_lookup(fmep->globals, 1111 (void *)nid->u.globid.s, NULL)) == NULL) { 1112 out(O_ALTFP, " undefined"); 1113 } else if (gval->t == UINT64) { 1114 out(O_ALTFP, " %llu", gval->v); 1115 (void) sprintf(suffixbuf, "%llu", gval->v); 1116 suffix = suffixbuf; 1117 } else { 1118 out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v); 1119 suffix = (char *)(uintptr_t)gval->v; 1120 } 1121 1122 nname = strlen(serdname) + strlen(suffix) + 2; 1123 nserdname = MALLOC(nname); 1124 (void) snprintf(nserdname, nname, "%s:%s", serdname, suffix); 1125 FREE(serdname); 1126 serdname = nserdname; 1127 } 1128 1129 /* 1130 * if the engine is empty, and we have an override for n/t then 1131 * destroy and recreate it. 
1132 */ 1133 if ((serdn != -1 || serdt != NULL) && fmd_serd_exists(hdl, serdname) && 1134 fmd_serd_empty(hdl, serdname)) 1135 fmd_serd_destroy(hdl, serdname); 1136 1137 if (!fmd_serd_exists(hdl, serdname)) { 1138 struct node *nN, *nT; 1139 const char *s; 1140 struct node *nodep; 1141 struct config *cp; 1142 char *path; 1143 uint_t nval; 1144 hrtime_t tval; 1145 int i; 1146 char *ptr; 1147 int got_n_override = 0, got_t_override = 0; 1148 1149 /* no SERD engine yet, so create it */ 1150 nodep = serdinst->u.stmt.np->u.event.epname; 1151 path = ipath2str(NULL, ipath(nodep)); 1152 cp = config_lookup(fmep->config, path, 0); 1153 FREE((void *)path); 1154 1155 /* 1156 * We allow serd paramaters to be overridden, either from 1157 * eft.conf file values (if Serd_Override is set) or from 1158 * driver properties (for "serd.io.device" engines). 1159 */ 1160 if (Serd_Override != NULL) { 1161 char *save_ptr, *ptr1, *ptr2, *ptr3; 1162 ptr3 = save_ptr = STRDUP(Serd_Override); 1163 while (*ptr3 != '\0') { 1164 ptr1 = strchr(ptr3, ','); 1165 *ptr1 = '\0'; 1166 if (strcmp(ptr3, serdclass) == 0) { 1167 ptr2 = strchr(ptr1 + 1, ','); 1168 *ptr2 = '\0'; 1169 nval = atoi(ptr1 + 1); 1170 out(O_ALTFP, "serd override %s_n %d", 1171 serdclass, nval); 1172 ptr3 = strchr(ptr2 + 1, ' '); 1173 if (ptr3) 1174 *ptr3 = '\0'; 1175 ptr = STRDUP(ptr2 + 1); 1176 out(O_ALTFP, "serd override %s_t %s", 1177 serdclass, ptr); 1178 got_n_override = 1; 1179 got_t_override = 1; 1180 break; 1181 } else { 1182 ptr2 = strchr(ptr1 + 1, ','); 1183 ptr3 = strchr(ptr2 + 1, ' '); 1184 if (ptr3 == NULL) 1185 break; 1186 } 1187 ptr3++; 1188 } 1189 FREE(save_ptr); 1190 } 1191 1192 if (cp && got_n_override == 0) { 1193 /* 1194 * convert serd engine class into property name 1195 */ 1196 char *prop_name = MALLOC(strlen(serdclass) + 3); 1197 for (i = 0; i < strlen(serdclass); i++) { 1198 if (serdclass[i] == '.') 1199 prop_name[i] = '_'; 1200 else 1201 prop_name[i] = serdclass[i]; 1202 } 1203 prop_name[i++] = '_'; 1204 
prop_name[i++] = 'n'; 1205 prop_name[i] = '\0'; 1206 if (s = config_getprop(cp, prop_name)) { 1207 nval = atoi(s); 1208 out(O_ALTFP, "serd override %s_n %s", 1209 serdclass, s); 1210 got_n_override = 1; 1211 } 1212 prop_name[i - 1] = 't'; 1213 if (s = config_getprop(cp, prop_name)) { 1214 ptr = STRDUP(s); 1215 out(O_ALTFP, "serd override %s_t %s", 1216 serdclass, s); 1217 got_t_override = 1; 1218 } 1219 FREE(prop_name); 1220 } 1221 1222 if (serdn != -1 && got_n_override == 0) { 1223 nval = serdn; 1224 out(O_ALTFP, "serd override %s_n %d", serdclass, serdn); 1225 got_n_override = 1; 1226 } 1227 if (serdt != NULL && got_t_override == 0) { 1228 ptr = STRDUP(serdt); 1229 out(O_ALTFP, "serd override %s_t %s", serdclass, serdt); 1230 got_t_override = 1; 1231 } 1232 1233 if (!got_n_override) { 1234 nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N, 1235 NULL); 1236 ASSERT(nN->t == T_NUM); 1237 nval = (uint_t)nN->u.ull; 1238 } 1239 if (!got_t_override) { 1240 nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T, 1241 NULL); 1242 ASSERT(nT->t == T_TIMEVAL); 1243 tval = (hrtime_t)nT->u.ull; 1244 } else { 1245 const unsigned long long *ullp; 1246 const char *suffix; 1247 int len; 1248 1249 len = strspn(ptr, "0123456789"); 1250 suffix = stable(&ptr[len]); 1251 ullp = (unsigned long long *)lut_lookup(Timesuffixlut, 1252 (void *)suffix, NULL); 1253 ptr[len] = '\0'; 1254 tval = strtoull(ptr, NULL, 0) * (ullp ? 
*ullp : 1ll); 1255 FREE(ptr); 1256 } 1257 fmd_serd_create(hdl, serdname, nval, tval); 1258 } 1259 1260 newentp = MALLOC(sizeof (*newentp)); 1261 newentp->ename = stable(serdclass); 1262 FREE(serdclass); 1263 newentp->ipath = ipath(serdinst->u.stmt.np->u.event.epname); 1264 newentp->hdl = hdl; 1265 if (lut_lookup(SerdEngines, newentp, (lut_cmp)serd_cmp) == NULL) { 1266 SerdEngines = lut_add(SerdEngines, (void *)newentp, 1267 (void *)newentp, (lut_cmp)serd_cmp); 1268 Serd_need_save = 1; 1269 serd_save(); 1270 } else { 1271 FREE(newentp); 1272 } 1273 1274 1275 /* 1276 * increment SERD engine. if engine fires, reset serd 1277 * engine and return trip_strcode if required. 1278 */ 1279 for (i = 0; i < serdincrement; i++) { 1280 if (fmd_serd_record(hdl, serdname, ffep)) { 1281 fmd_case_add_serd(hdl, fmcase, serdname); 1282 fmd_serd_reset(hdl, serdname); 1283 1284 if (ippp) { 1285 struct node *tripinst = 1286 lut_lookup(serdinst->u.stmt.lutp, 1287 (void *)L_trip, NULL); 1288 ASSERT(tripinst != NULL); 1289 *enamep = tripinst->u.event.ename->u.name.s; 1290 *ippp = ipath(tripinst->u.event.epname); 1291 out(O_ALTFP|O_NONL, 1292 "[engine fired: %s, sending: ", serdname); 1293 ipath_print(O_ALTFP|O_NONL, *enamep, *ippp); 1294 out(O_ALTFP, "]"); 1295 } else { 1296 out(O_ALTFP, "[engine fired: %s, no trip]", 1297 serdname); 1298 } 1299 FREE(serdname); 1300 return (1); 1301 } 1302 } 1303 1304 FREE(serdname); 1305 return (0); 1306 } 1307 1308 /* 1309 * search a suspect list for upsets. feed each upset to serd_eval() and 1310 * build up tripped[], an array of ereports produced by the firing of 1311 * any SERD engines. then feed each ereport back into 1312 * fme_receive_report(). 1313 * 1314 * returns ntrip, the number of these ereports produced. 
1315 */ 1316 static int 1317 upsets_eval(struct fme *fmep, fmd_event_t *ffep) 1318 { 1319 /* we build an array of tripped ereports that we send ourselves */ 1320 struct { 1321 const char *ename; 1322 const struct ipath *ipp; 1323 } *tripped; 1324 struct event *sp; 1325 int ntrip, nupset, i; 1326 1327 /* 1328 * count the number of upsets to determine the upper limit on 1329 * expected trip ereport strings. remember that one upset can 1330 * lead to at most one ereport. 1331 */ 1332 nupset = 0; 1333 for (sp = fmep->suspects; sp; sp = sp->suspects) { 1334 if (sp->t == N_UPSET) 1335 nupset++; 1336 } 1337 1338 if (nupset == 0) 1339 return (0); 1340 1341 /* 1342 * get to this point if we have upsets and expect some trip 1343 * ereports 1344 */ 1345 tripped = alloca(sizeof (*tripped) * nupset); 1346 bzero((void *)tripped, sizeof (*tripped) * nupset); 1347 1348 ntrip = 0; 1349 for (sp = fmep->suspects; sp; sp = sp->suspects) 1350 if (sp->t == N_UPSET && 1351 serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp, 1352 &tripped[ntrip].ename, &tripped[ntrip].ipp) == 1) 1353 ntrip++; 1354 1355 for (i = 0; i < ntrip; i++) { 1356 struct event *ep, *nep; 1357 struct fme *nfmep; 1358 fmd_case_t *fmcase; 1359 const struct ipath *ipp; 1360 const char *eventstring; 1361 int prev_verbose; 1362 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 1363 enum fme_state state; 1364 1365 /* 1366 * First try and evaluate a case with the trip ereport plus 1367 * all the other ereports that cause the trip. If that fails 1368 * to evaluate then try again with just this ereport on its own. 
1369 */ 1370 out(O_ALTFP|O_NONL, "fme_receive_report_serd: "); 1371 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp); 1372 out(O_ALTFP|O_STAMP, NULL); 1373 ep = fmep->e0; 1374 eventstring = ep->enode->u.event.ename->u.name.s; 1375 ipp = ep->ipp; 1376 1377 /* 1378 * create a duplicate fme and case 1379 */ 1380 fmcase = fmd_case_open(fmep->hdl, NULL); 1381 out(O_ALTFP|O_NONL, "duplicate fme for event ["); 1382 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1383 out(O_ALTFP, " ]"); 1384 1385 if ((nfmep = newfme(eventstring, ipp, fmep->hdl, 1386 fmcase, ffep, ep->nvp)) == NULL) { 1387 out(O_ALTFP|O_NONL, "["); 1388 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1389 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1390 continue; 1391 } 1392 1393 Open_fme_count++; 1394 nfmep->pull = fmep->pull; 1395 init_fme_bufs(nfmep); 1396 out(O_ALTFP|O_NONL, "["); 1397 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1398 out(O_ALTFP, " created FME%d, case %s]", nfmep->id, 1399 fmd_case_uuid(nfmep->hdl, nfmep->fmcase)); 1400 if (ffep) { 1401 fmd_case_setprincipal(nfmep->hdl, nfmep->fmcase, ffep); 1402 fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, ffep); 1403 nfmep->e0r = ffep; 1404 } 1405 1406 /* 1407 * add the original ereports 1408 */ 1409 for (ep = fmep->observations; ep; ep = ep->observations) { 1410 eventstring = ep->enode->u.event.ename->u.name.s; 1411 ipp = ep->ipp; 1412 out(O_ALTFP|O_NONL, "adding event ["); 1413 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1414 out(O_ALTFP, " ]"); 1415 nep = itree_lookup(nfmep->eventtree, eventstring, ipp); 1416 if (nep->count++ == 0) { 1417 nep->observations = nfmep->observations; 1418 nfmep->observations = nep; 1419 serialize_observation(nfmep, eventstring, ipp); 1420 nep->nvp = evnv_dupnvl(ep->nvp); 1421 } 1422 if (ep->ffep && ep->ffep != ffep) 1423 fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, 1424 ep->ffep); 1425 stats_counter_bump(nfmep->Rcount); 1426 } 1427 1428 /* 1429 * add the serd trigger ereport 1430 */ 1431 if ((ep = 
itree_lookup(nfmep->eventtree, tripped[i].ename, 1432 tripped[i].ipp)) == NULL) { 1433 /* 1434 * The trigger ereport is not in the instance tree. It 1435 * was presumably removed by prune_propagations() as 1436 * this combination of events is not present in the 1437 * rules. 1438 */ 1439 out(O_ALTFP, "upsets_eval: e0 not in instance tree"); 1440 Undiag_reason = UD_VAL_BADEVENTI; 1441 goto retry_lone_ereport; 1442 } 1443 out(O_ALTFP|O_NONL, "adding event ["); 1444 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp); 1445 out(O_ALTFP, " ]"); 1446 nfmep->ecurrent = ep; 1447 ep->nvp = NULL; 1448 ep->count = 1; 1449 ep->observations = nfmep->observations; 1450 nfmep->observations = ep; 1451 1452 /* 1453 * just peek first. 1454 */ 1455 nfmep->peek = 1; 1456 prev_verbose = Verbose; 1457 if (Debug == 0) 1458 Verbose = 0; 1459 lut_walk(nfmep->eventtree, (lut_cb)clear_arrows, (void *)nfmep); 1460 state = hypothesise(nfmep, nfmep->e0, nfmep->ull, &my_delay); 1461 nfmep->peek = 0; 1462 Verbose = prev_verbose; 1463 if (state == FME_DISPROVED) { 1464 out(O_ALTFP, "upsets_eval: hypothesis disproved"); 1465 Undiag_reason = UD_VAL_UNSOLVD; 1466 retry_lone_ereport: 1467 /* 1468 * However the trigger ereport on its own might be 1469 * diagnosable, so check for that. Undo the new fme 1470 * and case we just created and call fme_receive_report. 
1471 */ 1472 out(O_ALTFP|O_NONL, "["); 1473 ipath_print(O_ALTFP|O_NONL, tripped[i].ename, 1474 tripped[i].ipp); 1475 out(O_ALTFP, " retrying with just trigger ereport]"); 1476 itree_free(nfmep->eventtree); 1477 nfmep->eventtree = NULL; 1478 structconfig_free(nfmep->config); 1479 nfmep->config = NULL; 1480 destroy_fme_bufs(nfmep); 1481 fmd_case_close(nfmep->hdl, nfmep->fmcase); 1482 fme_receive_report(fmep->hdl, ffep, 1483 tripped[i].ename, tripped[i].ipp, NULL); 1484 continue; 1485 } 1486 1487 /* 1488 * and evaluate 1489 */ 1490 serialize_observation(nfmep, tripped[i].ename, tripped[i].ipp); 1491 fme_eval(nfmep, ffep); 1492 } 1493 1494 return (ntrip); 1495 } 1496 1497 /* 1498 * fme_receive_external_report -- call when an external ereport comes in 1499 * 1500 * this routine just converts the relevant information from the ereport 1501 * into a format used internally and passes it on to fme_receive_report(). 1502 */ 1503 void 1504 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 1505 const char *class) 1506 { 1507 struct node *epnamenp; 1508 fmd_case_t *fmcase; 1509 const struct ipath *ipp; 1510 nvlist_t *detector = NULL; 1511 1512 class = stable(class); 1513 1514 /* Get the component path from the ereport */ 1515 epnamenp = platform_getpath(nvl); 1516 1517 /* See if we ended up without a path. */ 1518 if (epnamenp == NULL) { 1519 /* See if class permits silent discard on unknown component. */ 1520 if (lut_lookup(Ereportenames_discard, (void *)class, NULL)) { 1521 out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport " 1522 "to component path, but silent discard allowed.", 1523 class); 1524 } else { 1525 /* 1526 * XFILE: Failure to find a component is bad unless 1527 * 'discard_if_config_unknown=1' was specified in the 1528 * ereport definition. Indicate undiagnosable. 
1529 */ 1530 Undiag_reason = UD_VAL_NOPATH; 1531 fmcase = fmd_case_open(hdl, NULL); 1532 1533 /* 1534 * We don't have a component path here (which means that 1535 * the detector was not in hc-scheme and couldn't be 1536 * converted to hc-scheme. Report the raw detector as 1537 * the suspect resource if there is one. 1538 */ 1539 (void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, 1540 &detector); 1541 publish_undiagnosable(hdl, ffep, fmcase, detector, 1542 (char *)class); 1543 } 1544 return; 1545 } 1546 1547 ipp = ipath(epnamenp); 1548 tree_free(epnamenp); 1549 fme_receive_report(hdl, ffep, class, ipp, nvl); 1550 } 1551 1552 /*ARGSUSED*/ 1553 void 1554 fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl, 1555 const char *eventstring) 1556 { 1557 char *uuid; 1558 nvlist_t **nva; 1559 uint_t nvc; 1560 const struct ipath *ipp; 1561 1562 if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 || 1563 nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 1564 &nva, &nvc) != 0) { 1565 out(O_ALTFP, "No uuid or fault list for list.repaired event"); 1566 return; 1567 } 1568 1569 out(O_ALTFP, "Processing list.repaired from case %s", uuid); 1570 1571 while (nvc-- != 0) { 1572 /* 1573 * Reset any istat or serd engine associated with this path. 
1574 */ 1575 char *path; 1576 1577 if ((ipp = platform_fault2ipath(*nva++)) == NULL) 1578 continue; 1579 1580 path = ipath2str(NULL, ipp); 1581 out(O_ALTFP, "fme_receive_repair_list: resetting state for %s", 1582 path); 1583 FREE(path); 1584 1585 lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp); 1586 istat_save(); 1587 1588 lut_walk(SerdEngines, (lut_cb)serd_reset_cb, (void *)ipp); 1589 serd_save(); 1590 } 1591 } 1592 1593 /*ARGSUSED*/ 1594 void 1595 fme_receive_topology_change(void) 1596 { 1597 lut_walk(Istats, (lut_cb)istat_counter_topo_chg_cb, NULL); 1598 istat_save(); 1599 1600 lut_walk(SerdEngines, (lut_cb)serd_topo_chg_cb, NULL); 1601 serd_save(); 1602 } 1603 1604 static int mark_arrows(struct fme *fmep, struct event *ep, int mark, 1605 unsigned long long at_latest_by, unsigned long long *pdelay, int keep); 1606 1607 /* ARGSUSED */ 1608 static void 1609 clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep) 1610 { 1611 struct bubble *bp; 1612 struct arrowlist *ap; 1613 1614 ep->cached_state = 0; 1615 ep->keep_in_tree = 0; 1616 for (bp = itree_next_bubble(ep, NULL); bp; 1617 bp = itree_next_bubble(ep, bp)) { 1618 if (bp->t != B_FROM) 1619 continue; 1620 bp->mark = 0; 1621 for (ap = itree_next_arrow(bp, NULL); ap; 1622 ap = itree_next_arrow(bp, ap)) 1623 ap->arrowp->mark = 0; 1624 } 1625 } 1626 1627 static void 1628 fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep, 1629 const char *eventstring, const struct ipath *ipp, nvlist_t *nvl) 1630 { 1631 struct event *ep; 1632 struct fme *fmep = NULL; 1633 struct fme *ofmep = NULL; 1634 struct fme *cfmep, *svfmep; 1635 int matched = 0; 1636 nvlist_t *defect; 1637 fmd_case_t *fmcase; 1638 char *reason; 1639 1640 out(O_ALTFP|O_NONL, "fme_receive_report: "); 1641 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1642 out(O_ALTFP|O_STAMP, NULL); 1643 1644 /* decide which FME it goes to */ 1645 for (fmep = FMElist; fmep; fmep = fmep->next) { 1646 int prev_verbose; 1647 unsigned long long my_delay = 
TIMEVAL_EVENTUALLY; 1648 enum fme_state state; 1649 nvlist_t *pre_peek_nvp = NULL; 1650 1651 if (fmep->overflow) { 1652 if (!(fmd_case_closed(fmep->hdl, fmep->fmcase))) 1653 ofmep = fmep; 1654 1655 continue; 1656 } 1657 1658 /* 1659 * ignore solved or closed cases 1660 */ 1661 if (fmep->posted_suspects || 1662 fmd_case_solved(fmep->hdl, fmep->fmcase) || 1663 fmd_case_closed(fmep->hdl, fmep->fmcase)) 1664 continue; 1665 1666 /* look up event in event tree for this FME */ 1667 if ((ep = itree_lookup(fmep->eventtree, 1668 eventstring, ipp)) == NULL) 1669 continue; 1670 1671 /* note observation */ 1672 fmep->ecurrent = ep; 1673 if (ep->count++ == 0) { 1674 /* link it into list of observations seen */ 1675 ep->observations = fmep->observations; 1676 fmep->observations = ep; 1677 ep->nvp = evnv_dupnvl(nvl); 1678 } else { 1679 /* use new payload values for peek */ 1680 pre_peek_nvp = ep->nvp; 1681 ep->nvp = evnv_dupnvl(nvl); 1682 } 1683 1684 /* tell hypothesise() not to mess with suspect list */ 1685 fmep->peek = 1; 1686 1687 /* don't want this to be verbose (unless Debug is set) */ 1688 prev_verbose = Verbose; 1689 if (Debug == 0) 1690 Verbose = 0; 1691 1692 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 1693 state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 1694 1695 fmep->peek = 0; 1696 1697 /* put verbose flag back */ 1698 Verbose = prev_verbose; 1699 1700 if (state != FME_DISPROVED) { 1701 /* found an FME that explains the ereport */ 1702 matched++; 1703 out(O_ALTFP|O_NONL, "["); 1704 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1705 out(O_ALTFP, " explained by FME%d]", fmep->id); 1706 1707 nvlist_free(pre_peek_nvp); 1708 1709 if (ep->count == 1) 1710 serialize_observation(fmep, eventstring, ipp); 1711 1712 if (ffep) { 1713 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1714 ep->ffep = ffep; 1715 } 1716 1717 stats_counter_bump(fmep->Rcount); 1718 1719 /* re-eval FME */ 1720 fme_eval(fmep, ffep); 1721 } else { 1722 1723 /* not a match, undo 
noting of observation */ 1724 fmep->ecurrent = NULL; 1725 if (--ep->count == 0) { 1726 /* unlink it from observations */ 1727 fmep->observations = ep->observations; 1728 ep->observations = NULL; 1729 nvlist_free(ep->nvp); 1730 ep->nvp = NULL; 1731 } else { 1732 nvlist_free(ep->nvp); 1733 ep->nvp = pre_peek_nvp; 1734 } 1735 } 1736 } 1737 1738 if (matched) 1739 return; /* explained by at least one existing FME */ 1740 1741 /* clean up closed fmes */ 1742 cfmep = ClosedFMEs; 1743 while (cfmep != NULL) { 1744 svfmep = cfmep->next; 1745 destroy_fme(cfmep); 1746 cfmep = svfmep; 1747 } 1748 ClosedFMEs = NULL; 1749 1750 if (ofmep) { 1751 out(O_ALTFP|O_NONL, "["); 1752 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1753 out(O_ALTFP, " ADDING TO OVERFLOW FME]"); 1754 if (ffep) 1755 fmd_case_add_ereport(hdl, ofmep->fmcase, ffep); 1756 1757 return; 1758 1759 } else if (Max_fme && (Open_fme_count >= Max_fme)) { 1760 out(O_ALTFP|O_NONL, "["); 1761 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1762 out(O_ALTFP, " MAX OPEN FME REACHED]"); 1763 1764 fmcase = fmd_case_open(hdl, NULL); 1765 1766 /* Create overflow fme */ 1767 if ((fmep = newfme(eventstring, ipp, hdl, fmcase, ffep, 1768 nvl)) == NULL) { 1769 out(O_ALTFP|O_NONL, "["); 1770 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1771 out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]"); 1772 return; 1773 } 1774 1775 Open_fme_count++; 1776 1777 init_fme_bufs(fmep); 1778 fmep->overflow = B_TRUE; 1779 1780 if (ffep) 1781 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1782 1783 Undiag_reason = UD_VAL_MAXFME; 1784 defect = fmd_nvl_create_fault(hdl, 1785 undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL); 1786 reason = undiag_2reason_str(Undiag_reason, NULL); 1787 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 1788 FREE(reason); 1789 fmd_case_add_suspect(hdl, fmep->fmcase, defect); 1790 fmd_case_solve(hdl, fmep->fmcase); 1791 Undiag_reason = UD_VAL_UNKNOWN; 1792 return; 1793 } 1794 1795 /* open a case */ 1796 fmcase = 
fmd_case_open(hdl, NULL); 1797 1798 /* start a new FME */ 1799 if ((fmep = newfme(eventstring, ipp, hdl, fmcase, ffep, nvl)) == NULL) { 1800 out(O_ALTFP|O_NONL, "["); 1801 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1802 out(O_ALTFP, " CANNOT DIAGNOSE]"); 1803 return; 1804 } 1805 1806 Open_fme_count++; 1807 1808 init_fme_bufs(fmep); 1809 1810 out(O_ALTFP|O_NONL, "["); 1811 ipath_print(O_ALTFP|O_NONL, eventstring, ipp); 1812 out(O_ALTFP, " created FME%d, case %s]", fmep->id, 1813 fmd_case_uuid(hdl, fmep->fmcase)); 1814 1815 ep = fmep->e0; 1816 ASSERT(ep != NULL); 1817 1818 /* note observation */ 1819 fmep->ecurrent = ep; 1820 if (ep->count++ == 0) { 1821 /* link it into list of observations seen */ 1822 ep->observations = fmep->observations; 1823 fmep->observations = ep; 1824 ep->nvp = evnv_dupnvl(nvl); 1825 serialize_observation(fmep, eventstring, ipp); 1826 } else { 1827 /* new payload overrides any previous */ 1828 nvlist_free(ep->nvp); 1829 ep->nvp = evnv_dupnvl(nvl); 1830 } 1831 1832 stats_counter_bump(fmep->Rcount); 1833 1834 if (ffep) { 1835 fmd_case_add_ereport(hdl, fmep->fmcase, ffep); 1836 fmd_case_setprincipal(hdl, fmep->fmcase, ffep); 1837 fmep->e0r = ffep; 1838 ep->ffep = ffep; 1839 } 1840 1841 /* give the diagnosis algorithm a shot at the new FME state */ 1842 fme_eval(fmep, ffep); 1843 } 1844 1845 void 1846 fme_status(int flags) 1847 { 1848 struct fme *fmep; 1849 1850 if (FMElist == NULL) { 1851 out(flags, "No fault management exercises underway."); 1852 return; 1853 } 1854 1855 for (fmep = FMElist; fmep; fmep = fmep->next) 1856 fme_print(flags, fmep); 1857 } 1858 1859 /* 1860 * "indent" routines used mostly for nicely formatted debug output, but also 1861 * for sanity checking for infinite recursion bugs. 
1862 */ 1863 1864 #define MAX_INDENT 1024 1865 static const char *indent_s[MAX_INDENT]; 1866 static int current_indent; 1867 1868 static void 1869 indent_push(const char *s) 1870 { 1871 if (current_indent < MAX_INDENT) 1872 indent_s[current_indent++] = s; 1873 else 1874 out(O_DIE, "unexpected recursion depth (%d)", current_indent); 1875 } 1876 1877 static void 1878 indent_set(const char *s) 1879 { 1880 current_indent = 0; 1881 indent_push(s); 1882 } 1883 1884 static void 1885 indent_pop(void) 1886 { 1887 if (current_indent > 0) 1888 current_indent--; 1889 else 1890 out(O_DIE, "recursion underflow"); 1891 } 1892 1893 static void 1894 indent(void) 1895 { 1896 int i; 1897 if (!Verbose) 1898 return; 1899 for (i = 0; i < current_indent; i++) 1900 out(O_ALTFP|O_VERB|O_NONL, indent_s[i]); 1901 } 1902 1903 #define SLNEW 1 1904 #define SLCHANGED 2 1905 #define SLWAIT 3 1906 #define SLDISPROVED 4 1907 1908 static void 1909 print_suspects(int circumstance, struct fme *fmep) 1910 { 1911 struct event *ep; 1912 1913 out(O_ALTFP|O_NONL, "["); 1914 if (circumstance == SLCHANGED) { 1915 out(O_ALTFP|O_NONL, "FME%d diagnosis changed. 
state: %s, " 1916 "suspect list:", fmep->id, fme_state2str(fmep->state)); 1917 } else if (circumstance == SLWAIT) { 1918 out(O_ALTFP|O_NONL, "FME%d set wait timer %ld ", fmep->id, 1919 fmep->timer); 1920 ptree_timeval(O_ALTFP|O_NONL, &fmep->wull); 1921 } else if (circumstance == SLDISPROVED) { 1922 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id); 1923 } else { 1924 out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id); 1925 } 1926 1927 if (circumstance == SLWAIT || circumstance == SLDISPROVED) { 1928 out(O_ALTFP, "]"); 1929 return; 1930 } 1931 1932 for (ep = fmep->suspects; ep; ep = ep->suspects) { 1933 out(O_ALTFP|O_NONL, " "); 1934 itree_pevent_brief(O_ALTFP|O_NONL, ep); 1935 } 1936 out(O_ALTFP, "]"); 1937 } 1938 1939 static struct node * 1940 eventprop_lookup(struct event *ep, const char *propname) 1941 { 1942 return (lut_lookup(ep->props, (void *)propname, NULL)); 1943 } 1944 1945 #define MAXDIGITIDX 23 1946 static char numbuf[MAXDIGITIDX + 1]; 1947 1948 static int 1949 node2uint(struct node *n, uint_t *valp) 1950 { 1951 struct evalue value; 1952 struct lut *globals = NULL; 1953 1954 if (n == NULL) 1955 return (1); 1956 1957 /* 1958 * check value.v since we are being asked to convert an unsigned 1959 * long long int to an unsigned int 1960 */ 1961 if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) || 1962 value.t != UINT64 || value.v > (1ULL << 32)) 1963 return (1); 1964 1965 *valp = (uint_t)value.v; 1966 1967 return (0); 1968 } 1969 1970 static nvlist_t * 1971 node2fmri(struct node *n) 1972 { 1973 nvlist_t **pa, *f, *p; 1974 struct node *nc; 1975 uint_t depth = 0; 1976 char *numstr, *nullbyte; 1977 char *failure; 1978 int err, i; 1979 1980 /* XXX do we need to be able to handle a non-T_NAME node? 
*/ 1981 if (n == NULL || n->t != T_NAME) 1982 return (NULL); 1983 1984 for (nc = n; nc != NULL; nc = nc->u.name.next) { 1985 if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM) 1986 break; 1987 depth++; 1988 } 1989 1990 if (nc != NULL) { 1991 /* We bailed early, something went wrong */ 1992 return (NULL); 1993 } 1994 1995 if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0) 1996 out(O_DIE|O_SYS, "alloc of fmri nvl failed"); 1997 pa = alloca(depth * sizeof (nvlist_t *)); 1998 for (i = 0; i < depth; i++) 1999 pa[i] = NULL; 2000 2001 err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC); 2002 err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION); 2003 err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, ""); 2004 err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth); 2005 if (err != 0) { 2006 failure = "basic construction of FMRI failed"; 2007 goto boom; 2008 } 2009 2010 numbuf[MAXDIGITIDX] = '\0'; 2011 nullbyte = &numbuf[MAXDIGITIDX]; 2012 i = 0; 2013 2014 for (nc = n; nc != NULL; nc = nc->u.name.next) { 2015 err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl); 2016 if (err != 0) { 2017 failure = "alloc of an hc-pair failed"; 2018 goto boom; 2019 } 2020 err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s); 2021 numstr = ulltostr(nc->u.name.child->u.ull, nullbyte); 2022 err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr); 2023 if (err != 0) { 2024 failure = "construction of an hc-pair failed"; 2025 goto boom; 2026 } 2027 pa[i++] = p; 2028 } 2029 2030 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 2031 if (err == 0) { 2032 for (i = 0; i < depth; i++) 2033 nvlist_free(pa[i]); 2034 return (f); 2035 } 2036 failure = "addition of hc-pair array to FMRI failed"; 2037 2038 boom: 2039 for (i = 0; i < depth; i++) 2040 nvlist_free(pa[i]); 2041 nvlist_free(f); 2042 out(O_DIE, "%s", failure); 2043 /*NOTREACHED*/ 2044 return (NULL); 2045 } 2046 2047 /* an ipath cache entry is an array of these, with s==NULL at the end */ 2048 struct 
ipath { 2049 const char *s; /* component name (in stable) */ 2050 int i; /* instance number */ 2051 }; 2052 2053 static nvlist_t * 2054 ipath2fmri(struct ipath *ipath) 2055 { 2056 nvlist_t **pa, *f, *p; 2057 uint_t depth = 0; 2058 char *numstr, *nullbyte; 2059 char *failure; 2060 int err, i; 2061 struct ipath *ipp; 2062 2063 for (ipp = ipath; ipp->s != NULL; ipp++) 2064 depth++; 2065 2066 if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0) 2067 out(O_DIE|O_SYS, "alloc of fmri nvl failed"); 2068 pa = alloca(depth * sizeof (nvlist_t *)); 2069 for (i = 0; i < depth; i++) 2070 pa[i] = NULL; 2071 2072 err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC); 2073 err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION); 2074 err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, ""); 2075 err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth); 2076 if (err != 0) { 2077 failure = "basic construction of FMRI failed"; 2078 goto boom; 2079 } 2080 2081 numbuf[MAXDIGITIDX] = '\0'; 2082 nullbyte = &numbuf[MAXDIGITIDX]; 2083 i = 0; 2084 2085 for (ipp = ipath; ipp->s != NULL; ipp++) { 2086 err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl); 2087 if (err != 0) { 2088 failure = "alloc of an hc-pair failed"; 2089 goto boom; 2090 } 2091 err = nvlist_add_string(p, FM_FMRI_HC_NAME, ipp->s); 2092 numstr = ulltostr(ipp->i, nullbyte); 2093 err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr); 2094 if (err != 0) { 2095 failure = "construction of an hc-pair failed"; 2096 goto boom; 2097 } 2098 pa[i++] = p; 2099 } 2100 2101 err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth); 2102 if (err == 0) { 2103 for (i = 0; i < depth; i++) 2104 nvlist_free(pa[i]); 2105 return (f); 2106 } 2107 failure = "addition of hc-pair array to FMRI failed"; 2108 2109 boom: 2110 for (i = 0; i < depth; i++) 2111 nvlist_free(pa[i]); 2112 nvlist_free(f); 2113 out(O_DIE, "%s", failure); 2114 /*NOTREACHED*/ 2115 return (NULL); 2116 } 2117 2118 static uint8_t 2119 percentof(uint_t part, uint_t 
whole) 2120 { 2121 unsigned long long p = part * 1000; 2122 2123 return ((p / whole / 10) + (((p / whole % 10) >= 5) ? 1 : 0)); 2124 } 2125 2126 struct rsl { 2127 struct event *suspect; 2128 nvlist_t *asru; 2129 nvlist_t *fru; 2130 nvlist_t *rsrc; 2131 }; 2132 2133 static void publish_suspects(struct fme *fmep, struct rsl *srl); 2134 2135 /* 2136 * rslfree -- free internal members of struct rsl not expected to be 2137 * freed elsewhere. 2138 */ 2139 static void 2140 rslfree(struct rsl *freeme) 2141 { 2142 nvlist_free(freeme->asru); 2143 nvlist_free(freeme->fru); 2144 if (freeme->rsrc != freeme->asru) 2145 nvlist_free(freeme->rsrc); 2146 } 2147 2148 /* 2149 * rslcmp -- compare two rsl structures. Use the following 2150 * comparisons to establish cardinality: 2151 * 2152 * 1. Name of the suspect's class. (simple strcmp) 2153 * 2. Name of the suspect's ASRU. (trickier, since nvlist) 2154 * 2155 */ 2156 static int 2157 rslcmp(const void *a, const void *b) 2158 { 2159 struct rsl *r1 = (struct rsl *)a; 2160 struct rsl *r2 = (struct rsl *)b; 2161 int rv; 2162 2163 rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s, 2164 r2->suspect->enode->u.event.ename->u.name.s); 2165 if (rv != 0) 2166 return (rv); 2167 2168 if (r1->rsrc == NULL && r2->rsrc == NULL) 2169 return (0); 2170 if (r1->rsrc == NULL) 2171 return (-1); 2172 if (r2->rsrc == NULL) 2173 return (1); 2174 return (evnv_cmpnvl(r1->rsrc, r2->rsrc, 0)); 2175 } 2176 2177 /* 2178 * get_resources -- for a given suspect, determine what ASRU, FRU and 2179 * RSRC nvlists should be advertised in the final suspect list. 2180 */ 2181 void 2182 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot) 2183 { 2184 struct node *asrudef, *frudef; 2185 const struct ipath *asrupath, *frupath; 2186 nvlist_t *asru = NULL, *fru = NULL; 2187 nvlist_t *rsrc = NULL; 2188 char *pathstr; 2189 2190 /* 2191 * First find any ASRU and/or FRU defined in the 2192 * initial fault tree. 
2193 */ 2194 asrudef = eventprop_lookup(sp, L_ASRU); 2195 frudef = eventprop_lookup(sp, L_FRU); 2196 2197 /* 2198 * Create ipaths based on those definitions 2199 */ 2200 asrupath = ipath(asrudef); 2201 frupath = ipath(frudef); 2202 2203 /* 2204 * Allow for platform translations of the FMRIs 2205 */ 2206 pathstr = ipath2str(NULL, sp->ipp); 2207 platform_unit_translate(is_defect(sp->t), croot, TOPO_PROP_RESOURCE, 2208 &rsrc, pathstr); 2209 FREE(pathstr); 2210 2211 pathstr = ipath2str(NULL, asrupath); 2212 platform_unit_translate(is_defect(sp->t), croot, TOPO_PROP_ASRU, 2213 &asru, pathstr); 2214 FREE(pathstr); 2215 2216 pathstr = ipath2str(NULL, frupath); 2217 platform_unit_translate(is_defect(sp->t), croot, TOPO_PROP_FRU, 2218 &fru, pathstr); 2219 FREE(pathstr); 2220 2221 rsrcs->suspect = sp; 2222 rsrcs->asru = asru; 2223 rsrcs->fru = fru; 2224 rsrcs->rsrc = rsrc; 2225 } 2226 2227 /* 2228 * trim_suspects -- prior to publishing, we may need to remove some 2229 * suspects from the list. If we're auto-closing upsets, we don't 2230 * want any of those in the published list. If the ASRUs for multiple 2231 * defects resolve to the same ASRU (driver) we only want to publish 2232 * that as a single suspect. 
2233 */ 2234 static int 2235 trim_suspects(struct fme *fmep, struct rsl *begin, struct rsl *begin2, 2236 fmd_event_t *ffep) 2237 { 2238 struct event *ep; 2239 struct rsl *rp = begin; 2240 struct rsl *rp2 = begin2; 2241 int mess_zero_count = 0; 2242 int serd_rval; 2243 uint_t messval; 2244 2245 /* remove any unwanted upsets and populate our array */ 2246 for (ep = fmep->psuspects; ep; ep = ep->psuspects) { 2247 if (is_upset(ep->t)) 2248 continue; 2249 serd_rval = serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, ep, 2250 NULL, NULL); 2251 if (serd_rval == 0) 2252 continue; 2253 if (node2uint(eventprop_lookup(ep, L_message), 2254 &messval) == 0 && messval == 0) { 2255 get_resources(ep, rp2, fmep->config); 2256 rp2++; 2257 mess_zero_count++; 2258 } else { 2259 get_resources(ep, rp, fmep->config); 2260 rp++; 2261 fmep->nsuspects++; 2262 } 2263 } 2264 return (mess_zero_count); 2265 } 2266 2267 /* 2268 * addpayloadprop -- add a payload prop to a problem 2269 */ 2270 static void 2271 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault) 2272 { 2273 nvlist_t *rsrc, *hcs; 2274 2275 ASSERT(fault != NULL); 2276 ASSERT(lhs != NULL); 2277 ASSERT(rhs != NULL); 2278 2279 if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, &rsrc) != 0) 2280 out(O_DIE, "cannot add payloadprop \"%s\" to fault", lhs); 2281 2282 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) { 2283 out(O_ALTFP|O_VERB2, "addpayloadprop: create hc_specific"); 2284 if (nvlist_xalloc(&hcs, NV_UNIQUE_NAME, &Eft_nv_hdl) != 0) 2285 out(O_DIE, 2286 "cannot add payloadprop \"%s\" to fault", lhs); 2287 if (nvlist_add_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, hcs) != 0) 2288 out(O_DIE, 2289 "cannot add payloadprop \"%s\" to fault", lhs); 2290 nvlist_free(hcs); 2291 if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) 2292 out(O_DIE, 2293 "cannot add payloadprop \"%s\" to fault", lhs); 2294 } else 2295 out(O_ALTFP|O_VERB2, "addpayloadprop: reuse hc_specific"); 2296 2297 if (rhs->t == UINT64) { 2298 
out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v); 2299 2300 if (nvlist_add_uint64(hcs, lhs, rhs->v) != 0) 2301 out(O_DIE, 2302 "cannot add payloadprop \"%s\" to fault", lhs); 2303 } else { 2304 out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"", 2305 lhs, (char *)(uintptr_t)rhs->v); 2306 2307 if (nvlist_add_string(hcs, lhs, (char *)(uintptr_t)rhs->v) != 0) 2308 out(O_DIE, 2309 "cannot add payloadprop \"%s\" to fault", lhs); 2310 } 2311 } 2312 2313 static char *Istatbuf; 2314 static char *Istatbufptr; 2315 static int Istatsz; 2316 2317 /* 2318 * istataddsize -- calculate size of istat and add it to Istatsz 2319 */ 2320 /*ARGSUSED2*/ 2321 static void 2322 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2323 { 2324 int val; 2325 2326 ASSERT(lhs != NULL); 2327 ASSERT(rhs != NULL); 2328 2329 if ((val = stats_counter_value(rhs)) == 0) 2330 return; /* skip zero-valued stats */ 2331 2332 /* count up the size of the stat name */ 2333 Istatsz += ipath2strlen(lhs->ename, lhs->ipath); 2334 Istatsz++; /* for the trailing NULL byte */ 2335 2336 /* count up the size of the stat value */ 2337 Istatsz += snprintf(NULL, 0, "%d", val); 2338 Istatsz++; /* for the trailing NULL byte */ 2339 } 2340 2341 /* 2342 * istat2str -- serialize an istat, writing result to *Istatbufptr 2343 */ 2344 /*ARGSUSED2*/ 2345 static void 2346 istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg) 2347 { 2348 char *str; 2349 int len; 2350 int val; 2351 2352 ASSERT(lhs != NULL); 2353 ASSERT(rhs != NULL); 2354 2355 if ((val = stats_counter_value(rhs)) == 0) 2356 return; /* skip zero-valued stats */ 2357 2358 /* serialize the stat name */ 2359 str = ipath2str(lhs->ename, lhs->ipath); 2360 len = strlen(str); 2361 2362 ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]); 2363 (void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr); 2364 Istatbufptr += len; 2365 FREE(str); 2366 *Istatbufptr++ = '\0'; 2367 2368 /* serialize the stat value */ 2369 
Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr, 2370 "%d", val); 2371 *Istatbufptr++ = '\0'; 2372 2373 ASSERT(Istatbufptr <= &Istatbuf[Istatsz]); 2374 } 2375 2376 void 2377 istat_save() 2378 { 2379 if (Istat_need_save == 0) 2380 return; 2381 2382 /* figure out how big the serialzed info is */ 2383 Istatsz = 0; 2384 lut_walk(Istats, (lut_cb)istataddsize, NULL); 2385 2386 if (Istatsz == 0) { 2387 /* no stats to save */ 2388 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2389 return; 2390 } 2391 2392 /* create the serialized buffer */ 2393 Istatbufptr = Istatbuf = MALLOC(Istatsz); 2394 lut_walk(Istats, (lut_cb)istat2str, NULL); 2395 2396 /* clear out current saved stats */ 2397 fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS); 2398 2399 /* write out the new version */ 2400 fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz); 2401 FREE(Istatbuf); 2402 2403 Istat_need_save = 0; 2404 } 2405 2406 int 2407 istat_cmp(struct istat_entry *ent1, struct istat_entry *ent2) 2408 { 2409 if (ent1->ename != ent2->ename) 2410 return (ent2->ename - ent1->ename); 2411 if (ent1->ipath != ent2->ipath) 2412 return ((char *)ent2->ipath - (char *)ent1->ipath); 2413 2414 return (0); 2415 } 2416 2417 /* 2418 * istat-verify -- verify the component associated with a stat still exists 2419 * 2420 * if the component no longer exists, this routine resets the stat and 2421 * returns 0. if the component still exists, it returns 1. 2422 */ 2423 static int 2424 istat_verify(struct node *snp, struct istat_entry *entp) 2425 { 2426 struct stats *statp; 2427 nvlist_t *fmri; 2428 2429 fmri = node2fmri(snp->u.event.epname); 2430 if (platform_path_exists(fmri)) { 2431 nvlist_free(fmri); 2432 return (1); 2433 } 2434 nvlist_free(fmri); 2435 2436 /* component no longer in system. 
zero out the associated stats */ 2437 if ((statp = (struct stats *) 2438 lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL || 2439 stats_counter_value(statp) == 0) 2440 return (0); /* stat is already reset */ 2441 2442 Istat_need_save = 1; 2443 stats_counter_reset(statp); 2444 return (0); 2445 } 2446 2447 static void 2448 istat_bump(struct node *snp, int n) 2449 { 2450 struct stats *statp; 2451 struct istat_entry ent; 2452 2453 ASSERT(snp != NULL); 2454 ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t)); 2455 ASSERT(snp->u.event.epname != NULL); 2456 2457 /* class name should be hoisted into a single stable entry */ 2458 ASSERT(snp->u.event.ename->u.name.next == NULL); 2459 ent.ename = snp->u.event.ename->u.name.s; 2460 ent.ipath = ipath(snp->u.event.epname); 2461 2462 if (!istat_verify(snp, &ent)) { 2463 /* component no longer exists in system, nothing to do */ 2464 return; 2465 } 2466 2467 if ((statp = (struct stats *) 2468 lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) { 2469 /* need to create the counter */ 2470 int cnt = 0; 2471 struct node *np; 2472 char *sname; 2473 char *snamep; 2474 struct istat_entry *newentp; 2475 2476 /* count up the size of the stat name */ 2477 np = snp->u.event.ename; 2478 while (np != NULL) { 2479 cnt += strlen(np->u.name.s); 2480 cnt++; /* for the '.' 
or '@' */ 2481 np = np->u.name.next; 2482 } 2483 np = snp->u.event.epname; 2484 while (np != NULL) { 2485 cnt += snprintf(NULL, 0, "%s%llu", 2486 np->u.name.s, np->u.name.child->u.ull); 2487 cnt++; /* for the '/' or trailing NULL byte */ 2488 np = np->u.name.next; 2489 } 2490 2491 /* build the stat name */ 2492 snamep = sname = alloca(cnt); 2493 np = snp->u.event.ename; 2494 while (np != NULL) { 2495 snamep += snprintf(snamep, &sname[cnt] - snamep, 2496 "%s", np->u.name.s); 2497 np = np->u.name.next; 2498 if (np) 2499 *snamep++ = '.'; 2500 } 2501 *snamep++ = '@'; 2502 np = snp->u.event.epname; 2503 while (np != NULL) { 2504 snamep += snprintf(snamep, &sname[cnt] - snamep, 2505 "%s%llu", np->u.name.s, np->u.name.child->u.ull); 2506 np = np->u.name.next; 2507 if (np) 2508 *snamep++ = '/'; 2509 } 2510 *snamep++ = '\0'; 2511 2512 /* create the new stat & add it to our list */ 2513 newentp = MALLOC(sizeof (*newentp)); 2514 *newentp = ent; 2515 statp = stats_new_counter(NULL, sname, 0); 2516 Istats = lut_add(Istats, (void *)newentp, (void *)statp, 2517 (lut_cmp)istat_cmp); 2518 } 2519 2520 /* if n is non-zero, set that value instead of bumping */ 2521 if (n) { 2522 stats_counter_reset(statp); 2523 stats_counter_add(statp, n); 2524 } else 2525 stats_counter_bump(statp); 2526 Istat_need_save = 1; 2527 2528 ipath_print(O_ALTFP|O_VERB2, ent.ename, ent.ipath); 2529 out(O_ALTFP|O_VERB2, " %s to value %d", n ? "set" : "incremented", 2530 stats_counter_value(statp)); 2531 } 2532 2533 /*ARGSUSED*/ 2534 static void 2535 istat_destructor(void *left, void *right, void *arg) 2536 { 2537 struct istat_entry *entp = (struct istat_entry *)left; 2538 struct stats *statp = (struct stats *)right; 2539 FREE(entp); 2540 stats_delete(statp); 2541 } 2542 2543 /* 2544 * Callback used in a walk of the Istats to reset matching stat counters. 
2545 */ 2546 static void 2547 istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp, 2548 const struct ipath *ipp) 2549 { 2550 char *path; 2551 2552 if (entp->ipath == ipp) { 2553 path = ipath2str(entp->ename, ipp); 2554 out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path); 2555 FREE(path); 2556 stats_counter_reset(statp); 2557 Istat_need_save = 1; 2558 } 2559 } 2560 2561 /*ARGSUSED*/ 2562 static void 2563 istat_counter_topo_chg_cb(struct istat_entry *entp, struct stats *statp, 2564 void *unused) 2565 { 2566 char *path; 2567 nvlist_t *fmri; 2568 2569 fmri = ipath2fmri((struct ipath *)(entp->ipath)); 2570 if (!platform_path_exists(fmri)) { 2571 path = ipath2str(entp->ename, entp->ipath); 2572 out(O_ALTFP, "istat_counter_topo_chg_cb: not present %s", path); 2573 FREE(path); 2574 stats_counter_reset(statp); 2575 Istat_need_save = 1; 2576 } 2577 nvlist_free(fmri); 2578 } 2579 2580 void 2581 istat_fini(void) 2582 { 2583 lut_free(Istats, istat_destructor, NULL); 2584 } 2585 2586 static char *Serdbuf; 2587 static char *Serdbufptr; 2588 static int Serdsz; 2589 2590 /* 2591 * serdaddsize -- calculate size of serd and add it to Serdsz 2592 */ 2593 /*ARGSUSED*/ 2594 static void 2595 serdaddsize(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2596 { 2597 ASSERT(lhs != NULL); 2598 2599 /* count up the size of the stat name */ 2600 Serdsz += ipath2strlen(lhs->ename, lhs->ipath); 2601 Serdsz++; /* for the trailing NULL byte */ 2602 } 2603 2604 /* 2605 * serd2str -- serialize a serd engine, writing result to *Serdbufptr 2606 */ 2607 /*ARGSUSED*/ 2608 static void 2609 serd2str(const struct serd_entry *lhs, struct stats *rhs, void *arg) 2610 { 2611 char *str; 2612 int len; 2613 2614 ASSERT(lhs != NULL); 2615 2616 /* serialize the serd engine name */ 2617 str = ipath2str(lhs->ename, lhs->ipath); 2618 len = strlen(str); 2619 2620 ASSERT(Serdbufptr + len + 1 <= &Serdbuf[Serdsz]); 2621 (void) strlcpy(Serdbufptr, str, &Serdbuf[Serdsz] - Serdbufptr); 2622 
Serdbufptr += len; 2623 FREE(str); 2624 *Serdbufptr++ = '\0'; 2625 ASSERT(Serdbufptr <= &Serdbuf[Serdsz]); 2626 } 2627 2628 void 2629 serd_save() 2630 { 2631 if (Serd_need_save == 0) 2632 return; 2633 2634 /* figure out how big the serialzed info is */ 2635 Serdsz = 0; 2636 lut_walk(SerdEngines, (lut_cb)serdaddsize, NULL); 2637 2638 if (Serdsz == 0) { 2639 /* no serd engines to save */ 2640 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2641 return; 2642 } 2643 2644 /* create the serialized buffer */ 2645 Serdbufptr = Serdbuf = MALLOC(Serdsz); 2646 lut_walk(SerdEngines, (lut_cb)serd2str, NULL); 2647 2648 /* clear out current saved stats */ 2649 fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS); 2650 2651 /* write out the new version */ 2652 fmd_buf_write(Hdl, NULL, WOBUF_SERDS, Serdbuf, Serdsz); 2653 FREE(Serdbuf); 2654 Serd_need_save = 0; 2655 } 2656 2657 int 2658 serd_cmp(struct serd_entry *ent1, struct serd_entry *ent2) 2659 { 2660 if (ent1->ename != ent2->ename) 2661 return (ent2->ename - ent1->ename); 2662 if (ent1->ipath != ent2->ipath) 2663 return ((char *)ent2->ipath - (char *)ent1->ipath); 2664 2665 return (0); 2666 } 2667 2668 void 2669 fme_serd_load(fmd_hdl_t *hdl) 2670 { 2671 int sz; 2672 char *sbuf; 2673 char *sepptr; 2674 char *ptr; 2675 struct serd_entry *newentp; 2676 struct node *epname; 2677 nvlist_t *fmri; 2678 char *namestring; 2679 2680 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_SERDS)) == 0) 2681 return; 2682 sbuf = alloca(sz); 2683 fmd_buf_read(hdl, NULL, WOBUF_SERDS, sbuf, sz); 2684 ptr = sbuf; 2685 while (ptr < &sbuf[sz]) { 2686 sepptr = strchr(ptr, '@'); 2687 *sepptr = '\0'; 2688 namestring = ptr; 2689 sepptr++; 2690 ptr = sepptr; 2691 ptr += strlen(ptr); 2692 ptr++; /* move past the '\0' separating paths */ 2693 epname = pathstring2epnamenp(sepptr); 2694 fmri = node2fmri(epname); 2695 if (platform_path_exists(fmri)) { 2696 newentp = MALLOC(sizeof (*newentp)); 2697 newentp->hdl = hdl; 2698 newentp->ipath = ipath(epname); 2699 newentp->ename = 
stable(namestring); 2700 SerdEngines = lut_add(SerdEngines, (void *)newentp, 2701 (void *)newentp, (lut_cmp)serd_cmp); 2702 } else 2703 Serd_need_save = 1; 2704 tree_free(epname); 2705 nvlist_free(fmri); 2706 } 2707 /* save it back again in case some of the paths no longer exist */ 2708 serd_save(); 2709 } 2710 2711 /*ARGSUSED*/ 2712 static void 2713 serd_destructor(void *left, void *right, void *arg) 2714 { 2715 struct serd_entry *entp = (struct serd_entry *)left; 2716 FREE(entp); 2717 } 2718 2719 /* 2720 * Callback used in a walk of the SerdEngines to reset matching serd engines. 2721 */ 2722 /*ARGSUSED*/ 2723 static void 2724 serd_reset_cb(struct serd_entry *entp, void *unused, const struct ipath *ipp) 2725 { 2726 char *path; 2727 2728 if (entp->ipath == ipp) { 2729 path = ipath2str(entp->ename, ipp); 2730 out(O_ALTFP, "serd_reset_cb: resetting %s", path); 2731 fmd_serd_reset(entp->hdl, path); 2732 FREE(path); 2733 Serd_need_save = 1; 2734 } 2735 } 2736 2737 /*ARGSUSED*/ 2738 static void 2739 serd_topo_chg_cb(struct serd_entry *entp, void *unused, void *unused2) 2740 { 2741 char *path; 2742 nvlist_t *fmri; 2743 2744 fmri = ipath2fmri((struct ipath *)(entp->ipath)); 2745 if (!platform_path_exists(fmri)) { 2746 path = ipath2str(entp->ename, entp->ipath); 2747 out(O_ALTFP, "serd_topo_chg_cb: not present %s", path); 2748 fmd_serd_reset(entp->hdl, path); 2749 FREE(path); 2750 Serd_need_save = 1; 2751 } 2752 nvlist_free(fmri); 2753 } 2754 2755 void 2756 serd_fini(void) 2757 { 2758 lut_free(SerdEngines, serd_destructor, NULL); 2759 } 2760 2761 static void 2762 publish_suspects(struct fme *fmep, struct rsl *srl) 2763 { 2764 struct rsl *rp; 2765 nvlist_t *fault; 2766 uint8_t cert; 2767 uint_t *frs; 2768 uint_t frsum, fr; 2769 uint_t messval; 2770 uint_t retireval; 2771 uint_t responseval; 2772 struct node *snp; 2773 int frcnt, fridx; 2774 boolean_t allfaulty = B_TRUE; 2775 struct rsl *erl = srl + fmep->nsuspects - 1; 2776 2777 /* 2778 * sort the array 2779 */ 2780 
qsort(srl, fmep->nsuspects, sizeof (struct rsl), rslcmp);

	/* sum the fitrates */
	frs = alloca(fmep->nsuspects * sizeof (uint_t));
	fridx = frcnt = frsum = 0;

	/*
	 * First pass: record each suspect's FITrate in frs[] (in sorted
	 * order) and accumulate the total.  A missing or zero FITrate is
	 * replaced by 1.
	 */
	for (rp = srl; rp <= erl; rp++) {
		struct node *n;

		n = eventprop_lookup(rp->suspect, L_FITrate);
		if (node2uint(n, &fr) != 0) {
			out(O_DEBUG|O_NONL, "event ");
			ipath_print(O_DEBUG|O_NONL,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    rp->suspect->ipp);
			out(O_VERB, " has no FITrate (using 1)");
			fr = 1;
		} else if (fr == 0) {
			out(O_DEBUG|O_NONL, "event ");
			ipath_print(O_DEBUG|O_NONL,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    rp->suspect->ipp);
			out(O_VERB, " has zero FITrate (using 1)");
			fr = 1;
		}

		frs[fridx++] = fr;
		frsum += fr;
		frcnt++;
	}

	/* Add them in reverse order of our sort, as fmd reverses order */
	for (rp = erl; rp >= srl; rp--) {
		/*
		 * fridx walks frs[] backwards in step with rp, so each
		 * suspect's certainty is its own share of the FITrate sum
		 */
		cert = percentof(frs[--fridx], frsum);
		fault = fmd_nvl_create_fault(fmep->hdl,
		    rp->suspect->enode->u.event.ename->u.name.s,
		    cert,
		    rp->asru,
		    rp->fru,
		    rp->rsrc);
		if (fault == NULL)
			out(O_DIE, "fault creation failed");
		/* if "message" property exists, add it to the fault */
		if (node2uint(eventprop_lookup(rp->suspect, L_message),
		    &messval) == 0) {

			out(O_ALTFP,
			    "[FME%d, %s adds message=%d to suspect list]",
			    fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    messval);
			if (nvlist_add_boolean_value(fault,
			    FM_SUSPECT_MESSAGE,
			    (messval) ? B_TRUE : B_FALSE) != 0) {
				out(O_DIE, "cannot add no-message to fault");
			}
		}

		/* if "retire" property exists, add it to the fault */
		if (node2uint(eventprop_lookup(rp->suspect, L_retire),
		    &retireval) == 0) {

			out(O_ALTFP,
			    "[FME%d, %s adds retire=%d to suspect list]",
			    fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    retireval);
			if (nvlist_add_boolean_value(fault,
			    FM_SUSPECT_RETIRE,
			    (retireval) ? B_TRUE : B_FALSE) != 0) {
				out(O_DIE, "cannot add no-retire to fault");
			}
		}

		/* if "response" property exists, add it to the fault */
		if (node2uint(eventprop_lookup(rp->suspect, L_response),
		    &responseval) == 0) {

			out(O_ALTFP,
			    "[FME%d, %s adds response=%d to suspect list]",
			    fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s,
			    responseval);
			if (nvlist_add_boolean_value(fault,
			    FM_SUSPECT_RESPONSE,
			    (responseval) ? B_TRUE : B_FALSE) != 0) {
				out(O_DIE, "cannot add no-response to fault");
			}
		}

		/* add any payload properties */
		lut_walk(rp->suspect->payloadprops,
		    (lut_cb)addpayloadprop, (void *)fault);
		/*
		 * The slot's asru/fru/rsrc lists are released here; only
		 * rp->suspect is used from this slot afterwards.
		 */
		rslfree(rp);

		/*
		 * If "action" property exists, evaluate it;  this must be done
		 * before the allfaulty check below since some actions may
		 * modify the asru to be used in fmd_nvl_fmri_has_fault.  This
		 * needs to be restructured if any new actions are introduced
		 * that have effects that we do not want to be visible if
		 * we decide not to publish in the dupclose check below.
		 */
		if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) {
			struct evalue evalue;

			out(O_ALTFP|O_NONL,
			    "[FME%d, %s action ", fmep->id,
			    rp->suspect->enode->u.event.ename->u.name.s);
			ptree_name_iter(O_ALTFP|O_NONL, snp);
			out(O_ALTFP, "]");
			/* Action_nvl is set to this fault before eval_expr() runs */
			Action_nvl = fault;
			(void) eval_expr(snp, NULL, NULL, NULL, NULL,
			    NULL, 0, &evalue);
		}

		fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault);

		/*
		 * check if the asru is already marked as "faulty".
		 * allfaulty stays true only while every suspect seen so
		 * far has an asru that fmd reports as having a fault.
		 */
		if (allfaulty) {
			nvlist_t *asru;

			out(O_ALTFP|O_VERB, "FME%d dup check ", fmep->id);
			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect);
			out(O_ALTFP|O_VERB|O_NONL, " ");
			if (nvlist_lookup_nvlist(fault,
			    FM_FAULT_ASRU, &asru) != 0) {
				out(O_ALTFP|O_VERB, "NULL asru");
				allfaulty = B_FALSE;
			} else if (fmd_nvl_fmri_has_fault(fmep->hdl, asru,
			    FMD_HAS_FAULT_ASRU, NULL)) {
				out(O_ALTFP|O_VERB, "faulty");
			} else {
				out(O_ALTFP|O_VERB, "not faulty");
				allfaulty = B_FALSE;
			}
		}

	}

	if (!allfaulty) {
		/*
		 * don't update the count stat if all asrus are already
		 * present and unrepaired in the asru cache
		 */
		for (rp = erl; rp >= srl; rp--) {
			struct event *suspect = rp->suspect;

			if (suspect == NULL)
				continue;

			/* if "count" exists, increment the appropriate stat */
			if ((snp = eventprop_lookup(suspect,
			    L_count)) != NULL) {
				out(O_ALTFP|O_NONL,
				    "[FME%d, %s count ", fmep->id,
				    suspect->enode->u.event.ename->u.name.s);
				ptree_name_iter(O_ALTFP|O_NONL, snp);
				out(O_ALTFP, "]");
				istat_bump(snp, 0);

			}
		}
		istat_save();	/* write out any istat changes */
	}
}

static const char *
undiag_2defect_str(int ud)
{
	switch (ud) {
	case UD_VAL_MISSINGINFO:
	case UD_VAL_MISSINGOBS:
	case UD_VAL_MISSINGPATH:
	case
UD_VAL_MISSINGZERO: 2957 case UD_VAL_BADOBS: 2958 case UD_VAL_CFGMISMATCH: 2959 return (UNDIAG_DEFECT_CHKPT); 2960 2961 case UD_VAL_BADEVENTI: 2962 case UD_VAL_BADEVENTPATH: 2963 case UD_VAL_BADEVENTCLASS: 2964 case UD_VAL_INSTFAIL: 2965 case UD_VAL_NOPATH: 2966 case UD_VAL_UNSOLVD: 2967 return (UNDIAG_DEFECT_FME); 2968 2969 case UD_VAL_MAXFME: 2970 return (UNDIAG_DEFECT_LIMIT); 2971 2972 case UD_VAL_UNKNOWN: 2973 default: 2974 return (UNDIAG_DEFECT_UNKNOWN); 2975 } 2976 } 2977 2978 static const char * 2979 undiag_2fault_str(int ud) 2980 { 2981 switch (ud) { 2982 case UD_VAL_BADEVENTI: 2983 case UD_VAL_BADEVENTPATH: 2984 case UD_VAL_BADEVENTCLASS: 2985 case UD_VAL_INSTFAIL: 2986 case UD_VAL_NOPATH: 2987 case UD_VAL_UNSOLVD: 2988 return (UNDIAG_FAULT_FME); 2989 default: 2990 return (NULL); 2991 } 2992 } 2993 2994 static char * 2995 undiag_2reason_str(int ud, char *arg) 2996 { 2997 const char *ptr; 2998 char *buf; 2999 int with_arg = 0; 3000 3001 switch (ud) { 3002 case UD_VAL_BADEVENTPATH: 3003 ptr = UD_STR_BADEVENTPATH; 3004 with_arg = 1; 3005 break; 3006 case UD_VAL_BADEVENTCLASS: 3007 ptr = UD_STR_BADEVENTCLASS; 3008 with_arg = 1; 3009 break; 3010 case UD_VAL_BADEVENTI: 3011 ptr = UD_STR_BADEVENTI; 3012 with_arg = 1; 3013 break; 3014 case UD_VAL_BADOBS: 3015 ptr = UD_STR_BADOBS; 3016 break; 3017 case UD_VAL_CFGMISMATCH: 3018 ptr = UD_STR_CFGMISMATCH; 3019 break; 3020 case UD_VAL_INSTFAIL: 3021 ptr = UD_STR_INSTFAIL; 3022 with_arg = 1; 3023 break; 3024 case UD_VAL_MAXFME: 3025 ptr = UD_STR_MAXFME; 3026 break; 3027 case UD_VAL_MISSINGINFO: 3028 ptr = UD_STR_MISSINGINFO; 3029 break; 3030 case UD_VAL_MISSINGOBS: 3031 ptr = UD_STR_MISSINGOBS; 3032 break; 3033 case UD_VAL_MISSINGPATH: 3034 ptr = UD_STR_MISSINGPATH; 3035 break; 3036 case UD_VAL_MISSINGZERO: 3037 ptr = UD_STR_MISSINGZERO; 3038 break; 3039 case UD_VAL_NOPATH: 3040 ptr = UD_STR_NOPATH; 3041 with_arg = 1; 3042 break; 3043 case UD_VAL_UNSOLVD: 3044 ptr = UD_STR_UNSOLVD; 3045 break; 3046 case UD_VAL_UNKNOWN: 
3047 default: 3048 ptr = UD_STR_UNKNOWN; 3049 break; 3050 } 3051 if (with_arg) { 3052 buf = MALLOC(strlen(ptr) + strlen(arg) - 1); 3053 (void) sprintf(buf, ptr, arg); 3054 } else { 3055 buf = MALLOC(strlen(ptr) + 1); 3056 (void) sprintf(buf, ptr); 3057 } 3058 return (buf); 3059 } 3060 3061 static void 3062 publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep, fmd_case_t *fmcase, 3063 nvlist_t *detector, char *arg) 3064 { 3065 struct case_list *newcase; 3066 nvlist_t *defect, *fault; 3067 const char *faultstr; 3068 char *reason = undiag_2reason_str(Undiag_reason, arg); 3069 3070 out(O_ALTFP, 3071 "[undiagnosable ereport received, " 3072 "creating and closing a new case (%s)]", reason); 3073 3074 newcase = MALLOC(sizeof (struct case_list)); 3075 newcase->next = NULL; 3076 newcase->fmcase = fmcase; 3077 if (Undiagablecaselist != NULL) 3078 newcase->next = Undiagablecaselist; 3079 Undiagablecaselist = newcase; 3080 3081 if (ffep != NULL) 3082 fmd_case_add_ereport(hdl, newcase->fmcase, ffep); 3083 3084 /* add defect */ 3085 defect = fmd_nvl_create_fault(hdl, 3086 undiag_2defect_str(Undiag_reason), 50, NULL, NULL, detector); 3087 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 3088 (void) nvlist_add_boolean_value(defect, FM_SUSPECT_RETIRE, B_FALSE); 3089 (void) nvlist_add_boolean_value(defect, FM_SUSPECT_RESPONSE, B_FALSE); 3090 fmd_case_add_suspect(hdl, newcase->fmcase, defect); 3091 3092 /* add fault if appropriate */ 3093 faultstr = undiag_2fault_str(Undiag_reason); 3094 if (faultstr != NULL) { 3095 fault = fmd_nvl_create_fault(hdl, faultstr, 50, NULL, NULL, 3096 detector); 3097 (void) nvlist_add_string(fault, UNDIAG_REASON, reason); 3098 (void) nvlist_add_boolean_value(fault, FM_SUSPECT_RETIRE, 3099 B_FALSE); 3100 (void) nvlist_add_boolean_value(fault, FM_SUSPECT_RESPONSE, 3101 B_FALSE); 3102 fmd_case_add_suspect(hdl, newcase->fmcase, fault); 3103 } 3104 FREE(reason); 3105 3106 /* solve and close case */ 3107 fmd_case_solve(hdl, newcase->fmcase); 3108 
fmd_case_close(hdl, newcase->fmcase); 3109 Undiag_reason = UD_VAL_UNKNOWN; 3110 } 3111 3112 static void 3113 fme_undiagnosable(struct fme *f) 3114 { 3115 nvlist_t *defect, *fault, *detector = NULL; 3116 struct event *ep; 3117 char *pathstr; 3118 const char *faultstr; 3119 char *reason = undiag_2reason_str(Undiag_reason, NULL); 3120 3121 out(O_ALTFP, "[solving/closing FME%d, case %s (%s)]", 3122 f->id, fmd_case_uuid(f->hdl, f->fmcase), reason); 3123 3124 for (ep = f->observations; ep; ep = ep->observations) { 3125 3126 if (ep->ffep != f->e0r) 3127 fmd_case_add_ereport(f->hdl, f->fmcase, ep->ffep); 3128 3129 pathstr = ipath2str(NULL, ipath(platform_getpath(ep->nvp))); 3130 platform_unit_translate(0, f->config, TOPO_PROP_RESOURCE, 3131 &detector, pathstr); 3132 FREE(pathstr); 3133 3134 /* add defect */ 3135 defect = fmd_nvl_create_fault(f->hdl, 3136 undiag_2defect_str(Undiag_reason), 50 / f->uniqobs, 3137 NULL, NULL, detector); 3138 (void) nvlist_add_string(defect, UNDIAG_REASON, reason); 3139 (void) nvlist_add_boolean_value(defect, FM_SUSPECT_RETIRE, 3140 B_FALSE); 3141 (void) nvlist_add_boolean_value(defect, FM_SUSPECT_RESPONSE, 3142 B_FALSE); 3143 fmd_case_add_suspect(f->hdl, f->fmcase, defect); 3144 3145 /* add fault if appropriate */ 3146 faultstr = undiag_2fault_str(Undiag_reason); 3147 if (faultstr == NULL) 3148 continue; 3149 fault = fmd_nvl_create_fault(f->hdl, faultstr, 50 / f->uniqobs, 3150 NULL, NULL, detector); 3151 (void) nvlist_add_string(fault, UNDIAG_REASON, reason); 3152 (void) nvlist_add_boolean_value(fault, FM_SUSPECT_RETIRE, 3153 B_FALSE); 3154 (void) nvlist_add_boolean_value(fault, FM_SUSPECT_RESPONSE, 3155 B_FALSE); 3156 fmd_case_add_suspect(f->hdl, f->fmcase, fault); 3157 nvlist_free(detector); 3158 } 3159 FREE(reason); 3160 fmd_case_solve(f->hdl, f->fmcase); 3161 fmd_case_close(f->hdl, f->fmcase); 3162 Undiag_reason = UD_VAL_UNKNOWN; 3163 } 3164 3165 /* 3166 * fme_close_case 3167 * 3168 * Find the requested case amongst our fmes and close it. 
Free up 3169 * the related fme. 3170 */ 3171 void 3172 fme_close_case(fmd_hdl_t *hdl, fmd_case_t *fmcase) 3173 { 3174 struct case_list *ucasep, *prevcasep = NULL; 3175 struct fme *prev = NULL; 3176 struct fme *fmep; 3177 3178 for (ucasep = Undiagablecaselist; ucasep; ucasep = ucasep->next) { 3179 if (fmcase != ucasep->fmcase) { 3180 prevcasep = ucasep; 3181 continue; 3182 } 3183 3184 if (prevcasep == NULL) 3185 Undiagablecaselist = Undiagablecaselist->next; 3186 else 3187 prevcasep->next = ucasep->next; 3188 3189 FREE(ucasep); 3190 return; 3191 } 3192 3193 for (fmep = FMElist; fmep; fmep = fmep->next) { 3194 if (fmep->hdl == hdl && fmep->fmcase == fmcase) 3195 break; 3196 prev = fmep; 3197 } 3198 3199 if (fmep == NULL) { 3200 out(O_WARN, "Eft asked to close unrecognized case [%s].", 3201 fmd_case_uuid(hdl, fmcase)); 3202 return; 3203 } 3204 3205 if (EFMElist == fmep) 3206 EFMElist = prev; 3207 3208 if (prev == NULL) 3209 FMElist = FMElist->next; 3210 else 3211 prev->next = fmep->next; 3212 3213 fmep->next = NULL; 3214 3215 /* Get rid of any timer this fme has set */ 3216 if (fmep->wull != 0) 3217 fmd_timer_remove(fmep->hdl, fmep->timer); 3218 3219 if (ClosedFMEs == NULL) { 3220 ClosedFMEs = fmep; 3221 } else { 3222 fmep->next = ClosedFMEs; 3223 ClosedFMEs = fmep; 3224 } 3225 3226 Open_fme_count--; 3227 3228 /* See if we can close the overflow FME */ 3229 if (Open_fme_count <= Max_fme) { 3230 for (fmep = FMElist; fmep; fmep = fmep->next) { 3231 if (fmep->overflow && !(fmd_case_closed(fmep->hdl, 3232 fmep->fmcase))) 3233 break; 3234 } 3235 3236 if (fmep != NULL) 3237 fmd_case_close(fmep->hdl, fmep->fmcase); 3238 } 3239 } 3240 3241 /* 3242 * fme_set_timer() 3243 * If the time we need to wait for the given FME is less than the 3244 * current timer, kick that old timer out and establish a new one. 
3245 */ 3246 static int 3247 fme_set_timer(struct fme *fmep, unsigned long long wull) 3248 { 3249 out(O_ALTFP|O_VERB|O_NONL, " fme_set_timer: request to wait "); 3250 ptree_timeval(O_ALTFP|O_VERB, &wull); 3251 3252 if (wull <= fmep->pull) { 3253 out(O_ALTFP|O_VERB|O_NONL, "already have waited at least "); 3254 ptree_timeval(O_ALTFP|O_VERB, &fmep->pull); 3255 out(O_ALTFP|O_VERB, NULL); 3256 /* we've waited at least wull already, don't need timer */ 3257 return (0); 3258 } 3259 3260 out(O_ALTFP|O_VERB|O_NONL, " currently "); 3261 if (fmep->wull != 0) { 3262 out(O_ALTFP|O_VERB|O_NONL, "waiting "); 3263 ptree_timeval(O_ALTFP|O_VERB, &fmep->wull); 3264 out(O_ALTFP|O_VERB, NULL); 3265 } else { 3266 out(O_ALTFP|O_VERB|O_NONL, "not waiting"); 3267 out(O_ALTFP|O_VERB, NULL); 3268 } 3269 3270 if (fmep->wull != 0) 3271 if (wull >= fmep->wull) 3272 /* New timer would fire later than established timer */ 3273 return (0); 3274 3275 if (fmep->wull != 0) { 3276 fmd_timer_remove(fmep->hdl, fmep->timer); 3277 } 3278 3279 fmep->timer = fmd_timer_install(fmep->hdl, (void *)fmep, 3280 fmep->e0r, wull); 3281 out(O_ALTFP|O_VERB, "timer set, id is %ld", fmep->timer); 3282 fmep->wull = wull; 3283 return (1); 3284 } 3285 3286 void 3287 fme_timer_fired(struct fme *fmep, id_t tid) 3288 { 3289 struct fme *ffmep = NULL; 3290 3291 for (ffmep = FMElist; ffmep; ffmep = ffmep->next) 3292 if (ffmep == fmep) 3293 break; 3294 3295 if (ffmep == NULL) { 3296 out(O_WARN, "Timer fired for an FME (%p) not in FMEs list.", 3297 (void *)fmep); 3298 return; 3299 } 3300 3301 out(O_ALTFP|O_VERB, "Timer fired %lx", tid); 3302 fmep->pull = fmep->wull; 3303 fmep->wull = 0; 3304 fmd_buf_write(fmep->hdl, fmep->fmcase, 3305 WOBUF_PULL, (void *)&fmep->pull, sizeof (fmep->pull)); 3306 3307 fme_eval(fmep, fmep->e0r); 3308 } 3309 3310 /* 3311 * Preserve the fme's suspect list in its psuspects list, NULLing the 3312 * suspects list in the meantime. 
3313 */ 3314 static void 3315 save_suspects(struct fme *fmep) 3316 { 3317 struct event *ep; 3318 struct event *nextep; 3319 3320 /* zero out the previous suspect list */ 3321 for (ep = fmep->psuspects; ep; ep = nextep) { 3322 nextep = ep->psuspects; 3323 ep->psuspects = NULL; 3324 } 3325 fmep->psuspects = NULL; 3326 3327 /* zero out the suspect list, copying it to previous suspect list */ 3328 fmep->psuspects = fmep->suspects; 3329 for (ep = fmep->suspects; ep; ep = nextep) { 3330 nextep = ep->suspects; 3331 ep->psuspects = ep->suspects; 3332 ep->suspects = NULL; 3333 ep->is_suspect = 0; 3334 } 3335 fmep->suspects = NULL; 3336 fmep->nsuspects = 0; 3337 } 3338 3339 /* 3340 * Retrieve the fme's suspect list from its psuspects list. 3341 */ 3342 static void 3343 restore_suspects(struct fme *fmep) 3344 { 3345 struct event *ep; 3346 struct event *nextep; 3347 3348 fmep->nsuspects = 0; 3349 fmep->suspects = fmep->psuspects; 3350 for (ep = fmep->psuspects; ep; ep = nextep) { 3351 fmep->nsuspects++; 3352 nextep = ep->psuspects; 3353 ep->suspects = ep->psuspects; 3354 } 3355 } 3356 3357 /* 3358 * this is what we use to call the Emrys prototype code instead of main() 3359 */ 3360 static void 3361 fme_eval(struct fme *fmep, fmd_event_t *ffep) 3362 { 3363 struct event *ep; 3364 unsigned long long my_delay = TIMEVAL_EVENTUALLY; 3365 struct rsl *srl = NULL; 3366 struct rsl *srl2 = NULL; 3367 int mess_zero_count; 3368 int rpcnt; 3369 3370 save_suspects(fmep); 3371 3372 out(O_ALTFP, "Evaluate FME %d", fmep->id); 3373 indent_set(" "); 3374 3375 lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep); 3376 fmep->state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay); 3377 3378 out(O_ALTFP|O_NONL, "FME%d state: %s, suspect list:", fmep->id, 3379 fme_state2str(fmep->state)); 3380 for (ep = fmep->suspects; ep; ep = ep->suspects) { 3381 out(O_ALTFP|O_NONL, " "); 3382 itree_pevent_brief(O_ALTFP|O_NONL, ep); 3383 } 3384 out(O_ALTFP, NULL); 3385 3386 switch (fmep->state) { 3387 
case FME_CREDIBLE:
		print_suspects(SLNEW, fmep);
		(void) upsets_eval(fmep, ffep);

		/*
		 * we may have already posted suspects in upsets_eval() which
		 * can recurse into fme_eval() again. If so then just return.
		 */
		if (fmep->posted_suspects)
			return;

		stats_counter_bump(fmep->diags);
		/*
		 * remember the suspect count before save_suspects() zeroes
		 * it; it sizes both arrays allocated below
		 */
		rpcnt = fmep->nsuspects;
		save_suspects(fmep);

		/*
		 * create two lists, one for "message=1" faults and one for
		 * "message=0" faults. If we have a mixture we will generate
		 * two separate suspect lists.
		 */
		srl = MALLOC(rpcnt * sizeof (struct rsl));
		bzero(srl, rpcnt * sizeof (struct rsl));
		srl2 = MALLOC(rpcnt * sizeof (struct rsl));
		bzero(srl2, rpcnt * sizeof (struct rsl));
		/*
		 * trim_suspects() fills srl (counting into fmep->nsuspects)
		 * and srl2 (returning its count)
		 */
		mess_zero_count = trim_suspects(fmep, srl, srl2, ffep);

		/*
		 * If the resulting suspect list has no members, we're
		 * done so simply close the case. Otherwise sort and publish.
		 */
		if (fmep->nsuspects == 0 && mess_zero_count == 0) {
			out(O_ALTFP,
			    "[FME%d, case %s (all suspects are upsets)]",
			    fmep->id, fmd_case_uuid(fmep->hdl, fmep->fmcase));
			fmd_case_close(fmep->hdl, fmep->fmcase);
		} else if (fmep->nsuspects != 0 && mess_zero_count == 0) {
			/* only the first list has entries */
			publish_suspects(fmep, srl);
			out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
			    fmd_case_uuid(fmep->hdl, fmep->fmcase));
			fmd_case_solve(fmep->hdl, fmep->fmcase);
		} else if (fmep->nsuspects == 0 && mess_zero_count != 0) {
			/*
			 * only the message=0 list has entries; publish it
			 * on this case, with nsuspects set to its length
			 */
			fmep->nsuspects = mess_zero_count;
			publish_suspects(fmep, srl2);
			out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
			    fmd_case_uuid(fmep->hdl, fmep->fmcase));
			fmd_case_solve(fmep->hdl, fmep->fmcase);
		} else {
			struct event *obsp;
			struct fme *nfmep;

			publish_suspects(fmep, srl);
			out(O_ALTFP, "[solving FME%d, case %s]", fmep->id,
			    fmd_case_uuid(fmep->hdl, fmep->fmcase));
			fmd_case_solve(fmep->hdl, fmep->fmcase);

			/*
			 * Got both message=0 and message=1 so create a
			 * duplicate case. Also need a temporary duplicate fme
			 * structure for use by publish_suspects().
			 */
			nfmep = alloc_fme();
			nfmep->id = Nextid++;
			nfmep->hdl = fmep->hdl;
			nfmep->nsuspects = mess_zero_count;
			nfmep->fmcase = fmd_case_open(fmep->hdl, NULL);
			out(O_ALTFP|O_STAMP,
			    "[creating parallel FME%d, case %s]", nfmep->id,
			    fmd_case_uuid(nfmep->hdl, nfmep->fmcase));
			Open_fme_count++;
			if (ffep) {
				fmd_case_setprincipal(nfmep->hdl,
				    nfmep->fmcase, ffep);
				fmd_case_add_ereport(nfmep->hdl,
				    nfmep->fmcase, ffep);
			}
			/* copy the remaining observed ereports to the new case */
			for (obsp = fmep->observations; obsp;
			    obsp = obsp->observations)
				if (obsp->ffep && obsp->ffep != ffep)
					fmd_case_add_ereport(nfmep->hdl,
					    nfmep->fmcase, obsp->ffep);

			publish_suspects(nfmep, srl2);
			out(O_ALTFP, "[solving FME%d, case %s]", nfmep->id,
			    fmd_case_uuid(nfmep->hdl, nfmep->fmcase));
			fmd_case_solve(nfmep->hdl, nfmep->fmcase);
			/* the temporary fme structure itself is released here */
			FREE(nfmep);
		}
		FREE(srl);
		FREE(srl2);
		restore_suspects(fmep);

		/* record, in core and in WOBUF_POSTD, that suspects were posted */
		fmep->posted_suspects = 1;
		fmd_buf_write(fmep->hdl, fmep->fmcase,
		    WOBUF_POSTD,
		    (void *)&fmep->posted_suspects,
		    sizeof (fmep->posted_suspects));

		/*
		 * Now the suspects have been posted, we can clear up
		 * the instance tree as we won't be looking at it again.
		 * Also cancel the timer as the case is now solved.
		 */
		if (fmep->wull != 0) {
			fmd_timer_remove(fmep->hdl, fmep->timer);
			fmep->wull = 0;
		}
		break;

	case FME_WAIT:
		ASSERT(my_delay > fmep->ull);
		(void) fme_set_timer(fmep, my_delay);
		print_suspects(SLWAIT, fmep);
		itree_prune(fmep->eventtree);
		/* returns without reaching the cleanup after the switch */
		return;

	case FME_DISPROVED:
		print_suspects(SLDISPROVED, fmep);
		Undiag_reason = UD_VAL_UNSOLVD;
		fme_undiagnosable(fmep);
		break;
	}

	/*
	 * Reached from the CREDIBLE and DISPROVED cases: release the
	 * instance tree and configuration and destroy the fme's buffers.
	 */
	itree_free(fmep->eventtree);
	fmep->eventtree = NULL;
	structconfig_free(fmep->config);
	fmep->config = NULL;
	destroy_fme_bufs(fmep);
}

static void indent(void);
static int triggered(struct fme *fmep, struct event *ep, int mark);
static enum fme_state effects_test(struct fme *fmep,
    struct event *fault_event, unsigned long long at_latest_by,
    unsigned long long *pdelay);
static enum fme_state requirements_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay);
static enum fme_state causes_test(struct fme *fmep, struct event *ep,
    unsigned long long at_latest_by, unsigned long long *pdelay);

static int
checkconstraints(struct fme *fmep, struct arrow *arrowp)
{
	struct constraintlist *ctp;
	struct evalue value;
	char *sep = "";

	if (arrowp->forever_false) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, " Forever false constraint: ");
		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
			out(O_ALTFP|O_VERB|O_NONL, sep);
			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
			sep = ", ";
		}
		out(O_ALTFP|O_VERB, NULL);
		return (0);
	}
	if (arrowp->forever_true) {
		indent();
		out(O_ALTFP|O_VERB|O_NONL, " Forever true constraint: ");
		for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) {
			out(O_ALTFP|O_VERB|O_NONL, sep);
			ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0);
			sep =
", "; 3551 } 3552 out(O_ALTFP|O_VERB, NULL); 3553 return (1); 3554 } 3555 3556 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3557 if (eval_expr(ctp->cnode, NULL, NULL, 3558 &fmep->globals, fmep->config, 3559 arrowp, 0, &value)) { 3560 /* evaluation successful */ 3561 if (value.t == UNDEFINED || value.v == 0) { 3562 /* known false */ 3563 arrowp->forever_false = 1; 3564 indent(); 3565 out(O_ALTFP|O_VERB|O_NONL, 3566 " False constraint: "); 3567 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3568 out(O_ALTFP|O_VERB, NULL); 3569 return (0); 3570 } 3571 } else { 3572 /* evaluation unsuccessful -- unknown value */ 3573 indent(); 3574 out(O_ALTFP|O_VERB|O_NONL, 3575 " Deferred constraint: "); 3576 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3577 out(O_ALTFP|O_VERB, NULL); 3578 return (1); 3579 } 3580 } 3581 /* known true */ 3582 arrowp->forever_true = 1; 3583 indent(); 3584 out(O_ALTFP|O_VERB|O_NONL, " True constraint: "); 3585 for (ctp = arrowp->constraints; ctp != NULL; ctp = ctp->next) { 3586 out(O_ALTFP|O_VERB|O_NONL, sep); 3587 ptree(O_ALTFP|O_VERB|O_NONL, ctp->cnode, 1, 0); 3588 sep = ", "; 3589 } 3590 out(O_ALTFP|O_VERB, NULL); 3591 return (1); 3592 } 3593 3594 static int 3595 triggered(struct fme *fmep, struct event *ep, int mark) 3596 { 3597 struct bubble *bp; 3598 struct arrowlist *ap; 3599 int count = 0; 3600 3601 stats_counter_bump(fmep->Tcallcount); 3602 for (bp = itree_next_bubble(ep, NULL); bp; 3603 bp = itree_next_bubble(ep, bp)) { 3604 if (bp->t != B_TO) 3605 continue; 3606 for (ap = itree_next_arrow(bp, NULL); ap; 3607 ap = itree_next_arrow(bp, ap)) { 3608 /* check count of marks against K in the bubble */ 3609 if ((ap->arrowp->mark & mark) && 3610 ++count >= bp->nork) 3611 return (1); 3612 } 3613 } 3614 return (0); 3615 } 3616 3617 static int 3618 mark_arrows(struct fme *fmep, struct event *ep, int mark, 3619 unsigned long long at_latest_by, unsigned long long *pdelay, int keep) 3620 { 3621 struct bubble *bp; 3622 struct 
arrowlist *ap; 3623 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3624 unsigned long long my_delay; 3625 enum fme_state result; 3626 int retval = 0; 3627 3628 for (bp = itree_next_bubble(ep, NULL); bp; 3629 bp = itree_next_bubble(ep, bp)) { 3630 if (bp->t != B_FROM) 3631 continue; 3632 stats_counter_bump(fmep->Marrowcount); 3633 for (ap = itree_next_arrow(bp, NULL); ap; 3634 ap = itree_next_arrow(bp, ap)) { 3635 struct event *ep2 = ap->arrowp->head->myevent; 3636 /* 3637 * if we're clearing marks, we can avoid doing 3638 * all that work evaluating constraints. 3639 */ 3640 if (mark == 0) { 3641 if (ap->arrowp->arrow_marked == 0) 3642 continue; 3643 ap->arrowp->arrow_marked = 0; 3644 ap->arrowp->mark &= ~EFFECTS_COUNTER; 3645 if (keep && (ep2->cached_state & 3646 (WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT))) 3647 ep2->keep_in_tree = 1; 3648 ep2->cached_state &= 3649 ~(WAIT_EFFECT|CREDIBLE_EFFECT|PARENT_WAIT); 3650 (void) mark_arrows(fmep, ep2, mark, 0, NULL, 3651 keep); 3652 continue; 3653 } 3654 ap->arrowp->arrow_marked = 1; 3655 if (ep2->cached_state & REQMNTS_DISPROVED) { 3656 indent(); 3657 out(O_ALTFP|O_VERB|O_NONL, 3658 " ALREADY DISPROVED "); 3659 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3660 out(O_ALTFP|O_VERB, NULL); 3661 continue; 3662 } 3663 if (ep2->cached_state & WAIT_EFFECT) { 3664 indent(); 3665 out(O_ALTFP|O_VERB|O_NONL, 3666 " ALREADY EFFECTS WAIT "); 3667 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3668 out(O_ALTFP|O_VERB, NULL); 3669 continue; 3670 } 3671 if (ep2->cached_state & CREDIBLE_EFFECT) { 3672 indent(); 3673 out(O_ALTFP|O_VERB|O_NONL, 3674 " ALREADY EFFECTS CREDIBLE "); 3675 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3676 out(O_ALTFP|O_VERB, NULL); 3677 continue; 3678 } 3679 if ((ep2->cached_state & PARENT_WAIT) && 3680 (mark & PARENT_WAIT)) { 3681 indent(); 3682 out(O_ALTFP|O_VERB|O_NONL, 3683 " ALREADY PARENT EFFECTS WAIT "); 3684 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3685 out(O_ALTFP|O_VERB, NULL); 3686 
continue; 3687 } 3688 platform_set_payloadnvp(ep2->nvp); 3689 if (checkconstraints(fmep, ap->arrowp) == 0) { 3690 platform_set_payloadnvp(NULL); 3691 indent(); 3692 out(O_ALTFP|O_VERB|O_NONL, 3693 " CONSTRAINTS FAIL "); 3694 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3695 out(O_ALTFP|O_VERB, NULL); 3696 continue; 3697 } 3698 platform_set_payloadnvp(NULL); 3699 ap->arrowp->mark |= EFFECTS_COUNTER; 3700 if (!triggered(fmep, ep2, EFFECTS_COUNTER)) { 3701 indent(); 3702 out(O_ALTFP|O_VERB|O_NONL, 3703 " K-COUNT NOT YET MET "); 3704 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3705 out(O_ALTFP|O_VERB, NULL); 3706 continue; 3707 } 3708 ep2->cached_state &= ~PARENT_WAIT; 3709 /* 3710 * if we've reached an ereport and no propagation time 3711 * is specified, use the Hesitate value 3712 */ 3713 if (ep2->t == N_EREPORT && at_latest_by == 0ULL && 3714 ap->arrowp->maxdelay == 0ULL) { 3715 out(O_ALTFP|O_VERB|O_NONL, " default wait "); 3716 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3717 out(O_ALTFP|O_VERB, NULL); 3718 result = requirements_test(fmep, ep2, Hesitate, 3719 &my_delay); 3720 } else { 3721 result = requirements_test(fmep, ep2, 3722 at_latest_by + ap->arrowp->maxdelay, 3723 &my_delay); 3724 } 3725 if (result == FME_WAIT) { 3726 retval = WAIT_EFFECT; 3727 if (overall_delay > my_delay) 3728 overall_delay = my_delay; 3729 ep2->cached_state |= WAIT_EFFECT; 3730 indent(); 3731 out(O_ALTFP|O_VERB|O_NONL, " EFFECTS WAIT "); 3732 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3733 out(O_ALTFP|O_VERB, NULL); 3734 indent_push(" E"); 3735 if (mark_arrows(fmep, ep2, PARENT_WAIT, 3736 at_latest_by, &my_delay, 0) == 3737 WAIT_EFFECT) { 3738 retval = WAIT_EFFECT; 3739 if (overall_delay > my_delay) 3740 overall_delay = my_delay; 3741 } 3742 indent_pop(); 3743 } else if (result == FME_DISPROVED) { 3744 indent(); 3745 out(O_ALTFP|O_VERB|O_NONL, 3746 " EFFECTS DISPROVED "); 3747 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3748 out(O_ALTFP|O_VERB, NULL); 3749 } else { 
3750 ep2->cached_state |= mark; 3751 indent(); 3752 if (mark == CREDIBLE_EFFECT) 3753 out(O_ALTFP|O_VERB|O_NONL, 3754 " EFFECTS CREDIBLE "); 3755 else 3756 out(O_ALTFP|O_VERB|O_NONL, 3757 " PARENT EFFECTS WAIT "); 3758 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep2); 3759 out(O_ALTFP|O_VERB, NULL); 3760 indent_push(" E"); 3761 if (mark_arrows(fmep, ep2, mark, at_latest_by, 3762 &my_delay, 0) == WAIT_EFFECT) { 3763 retval = WAIT_EFFECT; 3764 if (overall_delay > my_delay) 3765 overall_delay = my_delay; 3766 } 3767 indent_pop(); 3768 } 3769 } 3770 } 3771 if (retval == WAIT_EFFECT) 3772 *pdelay = overall_delay; 3773 return (retval); 3774 } 3775 3776 static enum fme_state 3777 effects_test(struct fme *fmep, struct event *fault_event, 3778 unsigned long long at_latest_by, unsigned long long *pdelay) 3779 { 3780 struct event *error_event; 3781 enum fme_state return_value = FME_CREDIBLE; 3782 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3783 unsigned long long my_delay; 3784 3785 stats_counter_bump(fmep->Ecallcount); 3786 indent_push(" E"); 3787 indent(); 3788 out(O_ALTFP|O_VERB|O_NONL, "->"); 3789 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3790 out(O_ALTFP|O_VERB, NULL); 3791 3792 if (mark_arrows(fmep, fault_event, CREDIBLE_EFFECT, at_latest_by, 3793 &my_delay, 0) == WAIT_EFFECT) { 3794 return_value = FME_WAIT; 3795 if (overall_delay > my_delay) 3796 overall_delay = my_delay; 3797 } 3798 for (error_event = fmep->observations; 3799 error_event; error_event = error_event->observations) { 3800 indent(); 3801 out(O_ALTFP|O_VERB|O_NONL, " "); 3802 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, error_event); 3803 if (!(error_event->cached_state & CREDIBLE_EFFECT)) { 3804 if (error_event->cached_state & 3805 (PARENT_WAIT|WAIT_EFFECT)) { 3806 out(O_ALTFP|O_VERB, " NOT YET triggered"); 3807 continue; 3808 } 3809 return_value = FME_DISPROVED; 3810 out(O_ALTFP|O_VERB, " NOT triggered"); 3811 break; 3812 } else { 3813 out(O_ALTFP|O_VERB, " triggered"); 3814 } 3815 } 
3816 if (return_value == FME_DISPROVED) { 3817 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 0); 3818 } else { 3819 fault_event->keep_in_tree = 1; 3820 (void) mark_arrows(fmep, fault_event, 0, 0, NULL, 1); 3821 } 3822 3823 indent(); 3824 out(O_ALTFP|O_VERB|O_NONL, "<-EFFECTS %s ", 3825 fme_state2str(return_value)); 3826 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, fault_event); 3827 out(O_ALTFP|O_VERB, NULL); 3828 indent_pop(); 3829 if (return_value == FME_WAIT) 3830 *pdelay = overall_delay; 3831 return (return_value); 3832 } 3833 3834 static enum fme_state 3835 requirements_test(struct fme *fmep, struct event *ep, 3836 unsigned long long at_latest_by, unsigned long long *pdelay) 3837 { 3838 int waiting_events; 3839 int credible_events; 3840 int deferred_events; 3841 enum fme_state return_value = FME_CREDIBLE; 3842 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 3843 unsigned long long arrow_delay; 3844 unsigned long long my_delay; 3845 struct event *ep2; 3846 struct bubble *bp; 3847 struct arrowlist *ap; 3848 3849 if (ep->cached_state & REQMNTS_CREDIBLE) { 3850 indent(); 3851 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY CREDIBLE "); 3852 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3853 out(O_ALTFP|O_VERB, NULL); 3854 return (FME_CREDIBLE); 3855 } 3856 if (ep->cached_state & REQMNTS_DISPROVED) { 3857 indent(); 3858 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY DISPROVED "); 3859 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3860 out(O_ALTFP|O_VERB, NULL); 3861 return (FME_DISPROVED); 3862 } 3863 if (ep->cached_state & REQMNTS_WAIT) { 3864 indent(); 3865 *pdelay = ep->cached_delay; 3866 out(O_ALTFP|O_VERB|O_NONL, " REQMNTS ALREADY WAIT "); 3867 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3868 out(O_ALTFP|O_VERB|O_NONL, ", wait for: "); 3869 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3870 out(O_ALTFP|O_VERB, NULL); 3871 return (FME_WAIT); 3872 } 3873 stats_counter_bump(fmep->Rcallcount); 3874 indent_push(" R"); 3875 indent(); 3876 
out(O_ALTFP|O_VERB|O_NONL, "->"); 3877 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3878 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 3879 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3880 out(O_ALTFP|O_VERB, NULL); 3881 3882 if (ep->t == N_EREPORT) { 3883 if (ep->count == 0) { 3884 if (fmep->pull >= at_latest_by) { 3885 return_value = FME_DISPROVED; 3886 } else { 3887 ep->cached_delay = *pdelay = at_latest_by; 3888 return_value = FME_WAIT; 3889 } 3890 } 3891 3892 indent(); 3893 switch (return_value) { 3894 case FME_CREDIBLE: 3895 ep->cached_state |= REQMNTS_CREDIBLE; 3896 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS CREDIBLE "); 3897 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3898 break; 3899 case FME_DISPROVED: 3900 ep->cached_state |= REQMNTS_DISPROVED; 3901 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 3902 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3903 break; 3904 case FME_WAIT: 3905 ep->cached_state |= REQMNTS_WAIT; 3906 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS WAIT "); 3907 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 3908 out(O_ALTFP|O_VERB|O_NONL, " to "); 3909 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 3910 break; 3911 default: 3912 out(O_DIE, "requirements_test: unexpected fme_state"); 3913 break; 3914 } 3915 out(O_ALTFP|O_VERB, NULL); 3916 indent_pop(); 3917 3918 return (return_value); 3919 } 3920 3921 /* this event is not a report, descend the tree */ 3922 for (bp = itree_next_bubble(ep, NULL); bp; 3923 bp = itree_next_bubble(ep, bp)) { 3924 int n; 3925 3926 if (bp->t != B_FROM) 3927 continue; 3928 3929 n = bp->nork; 3930 3931 credible_events = 0; 3932 waiting_events = 0; 3933 deferred_events = 0; 3934 arrow_delay = TIMEVAL_EVENTUALLY; 3935 /* 3936 * n is -1 for 'A' so adjust it. 3937 * XXX just count up the arrows for now. 
3938 */ 3939 if (n < 0) { 3940 n = 0; 3941 for (ap = itree_next_arrow(bp, NULL); ap; 3942 ap = itree_next_arrow(bp, ap)) 3943 n++; 3944 indent(); 3945 out(O_ALTFP|O_VERB, " Bubble Counted N=%d", n); 3946 } else { 3947 indent(); 3948 out(O_ALTFP|O_VERB, " Bubble N=%d", n); 3949 } 3950 3951 if (n == 0) 3952 continue; 3953 if (!(bp->mark & (BUBBLE_ELIDED|BUBBLE_OK))) { 3954 for (ap = itree_next_arrow(bp, NULL); ap; 3955 ap = itree_next_arrow(bp, ap)) { 3956 ep2 = ap->arrowp->head->myevent; 3957 platform_set_payloadnvp(ep2->nvp); 3958 (void) checkconstraints(fmep, ap->arrowp); 3959 if (!ap->arrowp->forever_false) { 3960 /* 3961 * if all arrows are invalidated by the 3962 * constraints, then we should elide the 3963 * whole bubble to be consistant with 3964 * the tree creation time behaviour 3965 */ 3966 bp->mark |= BUBBLE_OK; 3967 platform_set_payloadnvp(NULL); 3968 break; 3969 } 3970 platform_set_payloadnvp(NULL); 3971 } 3972 } 3973 for (ap = itree_next_arrow(bp, NULL); ap; 3974 ap = itree_next_arrow(bp, ap)) { 3975 ep2 = ap->arrowp->head->myevent; 3976 if (n <= credible_events) 3977 break; 3978 3979 ap->arrowp->mark |= REQMNTS_COUNTER; 3980 if (triggered(fmep, ep2, REQMNTS_COUNTER)) 3981 /* XXX adding max timevals! 
*/ 3982 switch (requirements_test(fmep, ep2, 3983 at_latest_by + ap->arrowp->maxdelay, 3984 &my_delay)) { 3985 case FME_DEFERRED: 3986 deferred_events++; 3987 break; 3988 case FME_CREDIBLE: 3989 credible_events++; 3990 break; 3991 case FME_DISPROVED: 3992 break; 3993 case FME_WAIT: 3994 if (my_delay < arrow_delay) 3995 arrow_delay = my_delay; 3996 waiting_events++; 3997 break; 3998 default: 3999 out(O_DIE, 4000 "Bug in requirements_test."); 4001 } 4002 else 4003 deferred_events++; 4004 } 4005 if (!(bp->mark & BUBBLE_OK) && waiting_events == 0) { 4006 bp->mark |= BUBBLE_ELIDED; 4007 continue; 4008 } 4009 indent(); 4010 out(O_ALTFP|O_VERB, " Credible: %d Waiting %d", 4011 credible_events + deferred_events, waiting_events); 4012 if (credible_events + deferred_events + waiting_events < n) { 4013 /* Can never meet requirements */ 4014 ep->cached_state |= REQMNTS_DISPROVED; 4015 indent(); 4016 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS DISPROVED "); 4017 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4018 out(O_ALTFP|O_VERB, NULL); 4019 indent_pop(); 4020 return (FME_DISPROVED); 4021 } 4022 if (credible_events + deferred_events < n) { 4023 /* will have to wait */ 4024 /* wait time is shortest known */ 4025 if (arrow_delay < overall_delay) 4026 overall_delay = arrow_delay; 4027 return_value = FME_WAIT; 4028 } else if (credible_events < n) { 4029 if (return_value != FME_WAIT) 4030 return_value = FME_DEFERRED; 4031 } 4032 } 4033 4034 /* 4035 * don't mark as FME_DEFERRED. If this event isn't reached by another 4036 * path, then this will be considered FME_CREDIBLE. But if it is 4037 * reached by a different path so the K-count is met, then might 4038 * get overridden by FME_WAIT or FME_DISPROVED. 
4039 */ 4040 if (return_value == FME_WAIT) { 4041 ep->cached_state |= REQMNTS_WAIT; 4042 ep->cached_delay = *pdelay = overall_delay; 4043 } else if (return_value == FME_CREDIBLE) { 4044 ep->cached_state |= REQMNTS_CREDIBLE; 4045 } 4046 indent(); 4047 out(O_ALTFP|O_VERB|O_NONL, "<-REQMNTS %s ", 4048 fme_state2str(return_value)); 4049 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4050 out(O_ALTFP|O_VERB, NULL); 4051 indent_pop(); 4052 return (return_value); 4053 } 4054 4055 static enum fme_state 4056 causes_test(struct fme *fmep, struct event *ep, 4057 unsigned long long at_latest_by, unsigned long long *pdelay) 4058 { 4059 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 4060 unsigned long long my_delay; 4061 int credible_results = 0; 4062 int waiting_results = 0; 4063 enum fme_state fstate; 4064 struct event *tail_event; 4065 struct bubble *bp; 4066 struct arrowlist *ap; 4067 int k = 1; 4068 4069 stats_counter_bump(fmep->Ccallcount); 4070 indent_push(" C"); 4071 indent(); 4072 out(O_ALTFP|O_VERB|O_NONL, "->"); 4073 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4074 out(O_ALTFP|O_VERB, NULL); 4075 4076 for (bp = itree_next_bubble(ep, NULL); bp; 4077 bp = itree_next_bubble(ep, bp)) { 4078 if (bp->t != B_TO) 4079 continue; 4080 k = bp->nork; /* remember the K value */ 4081 for (ap = itree_next_arrow(bp, NULL); ap; 4082 ap = itree_next_arrow(bp, ap)) { 4083 int do_not_follow = 0; 4084 4085 /* 4086 * if we get to the same event multiple times 4087 * only worry about the first one. 
4088 */ 4089 if (ap->arrowp->tail->myevent->cached_state & 4090 CAUSES_TESTED) { 4091 indent(); 4092 out(O_ALTFP|O_VERB|O_NONL, 4093 " causes test already run for "); 4094 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 4095 ap->arrowp->tail->myevent); 4096 out(O_ALTFP|O_VERB, NULL); 4097 continue; 4098 } 4099 4100 /* 4101 * see if false constraint prevents us 4102 * from traversing this arrow 4103 */ 4104 platform_set_payloadnvp(ep->nvp); 4105 if (checkconstraints(fmep, ap->arrowp) == 0) 4106 do_not_follow = 1; 4107 platform_set_payloadnvp(NULL); 4108 if (do_not_follow) { 4109 indent(); 4110 out(O_ALTFP|O_VERB|O_NONL, 4111 " False arrow from "); 4112 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, 4113 ap->arrowp->tail->myevent); 4114 out(O_ALTFP|O_VERB, NULL); 4115 continue; 4116 } 4117 4118 ap->arrowp->tail->myevent->cached_state |= 4119 CAUSES_TESTED; 4120 tail_event = ap->arrowp->tail->myevent; 4121 fstate = hypothesise(fmep, tail_event, at_latest_by, 4122 &my_delay); 4123 4124 switch (fstate) { 4125 case FME_WAIT: 4126 if (my_delay < overall_delay) 4127 overall_delay = my_delay; 4128 waiting_results++; 4129 break; 4130 case FME_CREDIBLE: 4131 credible_results++; 4132 break; 4133 case FME_DISPROVED: 4134 break; 4135 default: 4136 out(O_DIE, "Bug in causes_test"); 4137 } 4138 } 4139 } 4140 /* compare against K */ 4141 if (credible_results + waiting_results < k) { 4142 indent(); 4143 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES DISPROVED "); 4144 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4145 out(O_ALTFP|O_VERB, NULL); 4146 indent_pop(); 4147 return (FME_DISPROVED); 4148 } 4149 if (waiting_results != 0) { 4150 *pdelay = overall_delay; 4151 indent(); 4152 out(O_ALTFP|O_VERB|O_NONL, "<-CAUSES WAIT "); 4153 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4154 out(O_ALTFP|O_VERB|O_NONL, " to "); 4155 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 4156 out(O_ALTFP|O_VERB, NULL); 4157 indent_pop(); 4158 return (FME_WAIT); 4159 } 4160 indent(); 4161 out(O_ALTFP|O_VERB|O_NONL, 
"<-CAUSES CREDIBLE "); 4162 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4163 out(O_ALTFP|O_VERB, NULL); 4164 indent_pop(); 4165 return (FME_CREDIBLE); 4166 } 4167 4168 static enum fme_state 4169 hypothesise(struct fme *fmep, struct event *ep, 4170 unsigned long long at_latest_by, unsigned long long *pdelay) 4171 { 4172 enum fme_state rtr, otr; 4173 unsigned long long my_delay; 4174 unsigned long long overall_delay = TIMEVAL_EVENTUALLY; 4175 4176 stats_counter_bump(fmep->Hcallcount); 4177 indent_push(" H"); 4178 indent(); 4179 out(O_ALTFP|O_VERB|O_NONL, "->"); 4180 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4181 out(O_ALTFP|O_VERB|O_NONL, ", at latest by: "); 4182 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &at_latest_by); 4183 out(O_ALTFP|O_VERB, NULL); 4184 4185 rtr = requirements_test(fmep, ep, at_latest_by, &my_delay); 4186 if ((rtr == FME_WAIT) && (my_delay < overall_delay)) 4187 overall_delay = my_delay; 4188 if (rtr != FME_DISPROVED) { 4189 if (is_problem(ep->t)) { 4190 otr = effects_test(fmep, ep, at_latest_by, &my_delay); 4191 if (otr != FME_DISPROVED) { 4192 if (fmep->peek == 0 && ep->is_suspect == 0) { 4193 ep->suspects = fmep->suspects; 4194 ep->is_suspect = 1; 4195 fmep->suspects = ep; 4196 fmep->nsuspects++; 4197 } 4198 } 4199 } else 4200 otr = causes_test(fmep, ep, at_latest_by, &my_delay); 4201 if ((otr == FME_WAIT) && (my_delay < overall_delay)) 4202 overall_delay = my_delay; 4203 if ((otr != FME_DISPROVED) && 4204 ((rtr == FME_WAIT) || (otr == FME_WAIT))) 4205 *pdelay = overall_delay; 4206 } 4207 if (rtr == FME_DISPROVED) { 4208 indent(); 4209 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4210 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4211 out(O_ALTFP|O_VERB, " (doesn't meet requirements)"); 4212 indent_pop(); 4213 return (FME_DISPROVED); 4214 } 4215 if ((otr == FME_DISPROVED) && is_problem(ep->t)) { 4216 indent(); 4217 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4218 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4219 out(O_ALTFP|O_VERB, " 
(doesn't explain all reports)"); 4220 indent_pop(); 4221 return (FME_DISPROVED); 4222 } 4223 if (otr == FME_DISPROVED) { 4224 indent(); 4225 out(O_ALTFP|O_VERB|O_NONL, "<-DISPROVED "); 4226 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4227 out(O_ALTFP|O_VERB, " (causes are not credible)"); 4228 indent_pop(); 4229 return (FME_DISPROVED); 4230 } 4231 if ((rtr == FME_WAIT) || (otr == FME_WAIT)) { 4232 indent(); 4233 out(O_ALTFP|O_VERB|O_NONL, "<-WAIT "); 4234 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4235 out(O_ALTFP|O_VERB|O_NONL, " to "); 4236 ptree_timeval(O_ALTFP|O_VERB|O_NONL, &overall_delay); 4237 out(O_ALTFP|O_VERB, NULL); 4238 indent_pop(); 4239 return (FME_WAIT); 4240 } 4241 indent(); 4242 out(O_ALTFP|O_VERB|O_NONL, "<-CREDIBLE "); 4243 itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, ep); 4244 out(O_ALTFP|O_VERB, NULL); 4245 indent_pop(); 4246 return (FME_CREDIBLE); 4247 } 4248 4249 /* 4250 * fme_istat_load -- reconstitute any persistent istats 4251 */ 4252 void 4253 fme_istat_load(fmd_hdl_t *hdl) 4254 { 4255 int sz; 4256 char *sbuf; 4257 char *ptr; 4258 4259 if ((sz = fmd_buf_size(hdl, NULL, WOBUF_ISTATS)) == 0) { 4260 out(O_ALTFP, "fme_istat_load: No stats"); 4261 return; 4262 } 4263 4264 sbuf = alloca(sz); 4265 4266 fmd_buf_read(hdl, NULL, WOBUF_ISTATS, sbuf, sz); 4267 4268 /* 4269 * pick apart the serialized stats 4270 * 4271 * format is: 4272 * <class-name>, '@', <path>, '\0', <value>, '\0' 4273 * for example: 4274 * "stat.first@stat0/path0\02\0stat.second@stat0/path1\023\0" 4275 * 4276 * since this is parsing our own serialized data, any parsing issues 4277 * are fatal, so we check for them all with ASSERT() below. 
4278 */ 4279 ptr = sbuf; 4280 while (ptr < &sbuf[sz]) { 4281 char *sepptr; 4282 struct node *np; 4283 int val; 4284 4285 sepptr = strchr(ptr, '@'); 4286 ASSERT(sepptr != NULL); 4287 *sepptr = '\0'; 4288 4289 /* construct the event */ 4290 np = newnode(T_EVENT, NULL, 0); 4291 np->u.event.ename = newnode(T_NAME, NULL, 0); 4292 np->u.event.ename->u.name.t = N_STAT; 4293 np->u.event.ename->u.name.s = stable(ptr); 4294 np->u.event.ename->u.name.it = IT_ENAME; 4295 np->u.event.ename->u.name.last = np->u.event.ename; 4296 4297 ptr = sepptr + 1; 4298 ASSERT(ptr < &sbuf[sz]); 4299 ptr += strlen(ptr); 4300 ptr++; /* move past the '\0' separating path from value */ 4301 ASSERT(ptr < &sbuf[sz]); 4302 ASSERT(isdigit(*ptr)); 4303 val = atoi(ptr); 4304 ASSERT(val > 0); 4305 ptr += strlen(ptr); 4306 ptr++; /* move past the final '\0' for this entry */ 4307 4308 np->u.event.epname = pathstring2epnamenp(sepptr + 1); 4309 ASSERT(np->u.event.epname != NULL); 4310 4311 istat_bump(np, val); 4312 tree_free(np); 4313 } 4314 4315 istat_save(); 4316 } 4317