1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * FMD Case Subsystem 29 * 30 * Diagnosis engines are expected to group telemetry events related to the 31 * diagnosis of a particular problem on the system into a set of cases. The 32 * diagnosis engine may have any number of cases open at a given point in time. 33 * Some cases may eventually be *solved* by associating a suspect list of one 34 * or more problems with the case, at which point fmd publishes a list.suspect 35 * event for the case and it becomes visible to administrators and agents. 36 * 37 * Every case is named using a UUID, and is globally visible in the case hash. 38 * Cases are reference-counted, except for the reference from the case hash 39 * itself. Consumers of case references include modules, which store active 40 * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code. 41 * 42 * Cases obey the following state machine. 
In states UNSOLVED, SOLVED, and 43 * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine 44 * or transport) and the case is referenced by the mod_cases list. Once the 45 * case reaches the CLOSED or REPAIRED states, a case's module changes to refer 46 * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases. 47 * 48 * +------------+ 49 * +----------| UNSOLVED | 50 * | +------------+ 51 * | 1 | 52 * | | 53 * | +-------v----+ 54 * 2 | | SOLVED | 55 * | +------------+ 56 * | 3 | 5 | 57 * +------------+ | | 58 * | | | 59 * +-v---v----v-+ 60 * | CLOSE_WAIT | 61 * +------------+ 62 * | | | 63 * +-----------+ | +------------+ 64 * | 4 | | 65 * v +-----v------+ | 66 * discard | CLOSED | 6 | 67 * +------------+ | 68 * | | 69 * | +------------+ 70 * 7 | | 71 * +-----v----v-+ 72 * | REPAIRED | 73 * +------------+ 74 * | 75 * 8 | 76 * +-----v------+ 77 * | RESOLVED | 78 * +------------+ 79 * | 80 * v 81 * discard 82 * 83 * The state machine changes are triggered by calls to fmd_case_transition() 84 * from various locations inside of fmd, as described below: 85 * 86 * [1] Called by: fmd_case_solve() 87 * Actions: FMD_CF_SOLVED flag is set in ci_flags 88 * conviction policy is applied to suspect list 89 * suspects convicted are marked faulty (F) in R$ 90 * list.suspect event logged and dispatched 91 * 92 * [2] Called by: fmd_case_close(), fmd_case_uuclose() 93 * Actions: diagnosis engine fmdo_close() entry point scheduled 94 * case discarded upon exit from CLOSE_WAIT 95 * 96 * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose() 97 * Actions: FMD_CF_ISOLATED flag is set in ci_flags 98 * suspects convicted (F) are marked unusable (U) in R$ 99 * diagnosis engine fmdo_close() entry point scheduled 100 * case transitions to CLOSED [4] upon exit from CLOSE_WAIT 101 * 102 * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns) 103 * Actions: list.isolated event dispatched 104 * case deleted from 
module's list of open cases 105 * 106 * [5] Called by: fmd_case_repair(), fmd_case_update() 107 * Actions: FMD_CF_REPAIR flag is set in ci_flags 108 * diagnosis engine fmdo_close() entry point scheduled 109 * case transitions to REPAIRED [6] upon exit from CLOSE_WAIT 110 * 111 * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns) 112 * Actions: suspects convicted are marked non faulty (!F) in R$ 113 * list.repaired or list.updated event dispatched 114 * 115 * [7] Called by: fmd_case_repair(), fmd_case_update() 116 * Actions: FMD_CF_REPAIR flag is set in ci_flags 117 * suspects convicted are marked non faulty (!F) in R$ 118 * list.repaired or list.updated event dispatched 119 * 120 * [8] Called by: fmd_case_uuresolve() 121 * Actions: list.resolved event dispatched 122 * case is discarded 123 */ 124 125 #include <sys/fm/protocol.h> 126 #include <uuid/uuid.h> 127 #include <alloca.h> 128 129 #include <fmd_alloc.h> 130 #include <fmd_module.h> 131 #include <fmd_error.h> 132 #include <fmd_conf.h> 133 #include <fmd_case.h> 134 #include <fmd_string.h> 135 #include <fmd_subr.h> 136 #include <fmd_protocol.h> 137 #include <fmd_event.h> 138 #include <fmd_eventq.h> 139 #include <fmd_dispq.h> 140 #include <fmd_buf.h> 141 #include <fmd_log.h> 142 #include <fmd_asru.h> 143 #include <fmd_fmri.h> 144 #include <fmd_xprt.h> 145 146 #include <fmd.h> 147 148 static const char *const _fmd_case_snames[] = { 149 "UNSOLVED", /* FMD_CASE_UNSOLVED */ 150 "SOLVED", /* FMD_CASE_SOLVED */ 151 "CLOSE_WAIT", /* FMD_CASE_CLOSE_WAIT */ 152 "CLOSED", /* FMD_CASE_CLOSED */ 153 "REPAIRED", /* FMD_CASE_REPAIRED */ 154 "RESOLVED" /* FMD_CASE_RESOLVED */ 155 }; 156 157 static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *); 158 159 fmd_case_hash_t * 160 fmd_case_hash_create(void) 161 { 162 fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP); 163 164 (void) pthread_rwlock_init(&chp->ch_lock, NULL); 165 chp->ch_hashlen = fmd.d_str_buckets; 166 chp->ch_hash = 
fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP); 167 chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, 168 FMD_SLEEP); 169 chp->ch_count = 0; 170 171 return (chp); 172 } 173 174 /* 175 * Destroy the case hash. Unlike most of our hash tables, no active references 176 * are kept by the case hash itself; all references come from other subsystems. 177 * The hash must be destroyed after all modules are unloaded; if anything was 178 * present in the hash it would be by definition a reference count leak. 179 */ 180 void 181 fmd_case_hash_destroy(fmd_case_hash_t *chp) 182 { 183 fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen); 184 fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen); 185 fmd_free(chp, sizeof (fmd_case_hash_t)); 186 } 187 188 /* 189 * Take a snapshot of the case hash by placing an additional hold on each 190 * member in an auxiliary array, and then call 'func' for each case. 191 */ 192 void 193 fmd_case_hash_apply(fmd_case_hash_t *chp, 194 void (*func)(fmd_case_t *, void *), void *arg) 195 { 196 fmd_case_impl_t *cp, **cps, **cpp; 197 uint_t cpc, i; 198 199 (void) pthread_rwlock_rdlock(&chp->ch_lock); 200 201 cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP); 202 cpc = chp->ch_count; 203 204 for (i = 0; i < chp->ch_hashlen; i++) { 205 for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) 206 *cpp++ = fmd_case_tryhold(cp); 207 } 208 209 ASSERT(cpp == cps + cpc); 210 (void) pthread_rwlock_unlock(&chp->ch_lock); 211 212 for (i = 0; i < cpc; i++) { 213 if (cps[i] != NULL) { 214 func((fmd_case_t *)cps[i], arg); 215 fmd_case_rele((fmd_case_t *)cps[i]); 216 } 217 } 218 219 fmd_free(cps, cpc * sizeof (fmd_case_t *)); 220 } 221 222 static void 223 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 224 { 225 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 226 227 cip->ci_code_next = chp->ch_code_hash[h]; 228 chp->ch_code_hash[h] = cip; 229 } 230 231 static void 232 
fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 233 { 234 fmd_case_impl_t **pp, *cp; 235 236 if (cip->ci_code) { 237 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 238 239 pp = &chp->ch_code_hash[h]; 240 for (cp = *pp; cp != NULL; cp = cp->ci_code_next) { 241 if (cp != cip) 242 pp = &cp->ci_code_next; 243 else 244 break; 245 } 246 if (cp != NULL) { 247 *pp = cp->ci_code_next; 248 cp->ci_code_next = NULL; 249 } 250 } 251 } 252 253 /* 254 * Look up the diagcode for this case and cache it in ci_code. If no suspects 255 * were defined for this case or if the lookup fails, the event dictionary or 256 * module code is broken, and we set the event code to a precomputed default. 257 */ 258 static const char * 259 fmd_case_mkcode(fmd_case_t *cp) 260 { 261 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 262 fmd_case_susp_t *cis; 263 fmd_case_hash_t *chp = fmd.d_cases; 264 265 char **keys, **keyp; 266 const char *s; 267 268 ASSERT(MUTEX_HELD(&cip->ci_lock)); 269 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 270 271 /* 272 * delete any existing entry from code hash if it is on it 273 */ 274 fmd_case_code_hash_delete(chp, cip); 275 276 fmd_free(cip->ci_code, cip->ci_codelen); 277 cip->ci_codelen = cip->ci_mod->mod_codelen; 278 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 279 keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1)); 280 281 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 282 if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0) 283 keyp++; 284 } 285 286 *keyp = NULL; /* mark end of keys[] array for libdiagcode */ 287 288 if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code( 289 cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) { 290 (void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s); 291 fmd_free(cip->ci_code, cip->ci_codelen); 292 cip->ci_codelen = strlen(s) + 1; 293 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 294 (void) strcpy(cip->ci_code, s); 295 } 296 297 /* 298 * add into 
hash of solved cases 299 */ 300 fmd_case_code_hash_insert(chp, cip); 301 302 return (cip->ci_code); 303 } 304 305 typedef struct { 306 int *fcl_countp; 307 int fcl_maxcount; 308 uint8_t *fcl_ba; 309 nvlist_t **fcl_nva; 310 int *fcl_msgp; 311 } fmd_case_lst_t; 312 313 static void 314 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg) 315 { 316 fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg; 317 boolean_t b; 318 int state; 319 320 if (*entryp->fcl_countp >= entryp->fcl_maxcount) 321 return; 322 if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE, 323 &b) == 0 && b == B_FALSE) 324 *entryp->fcl_msgp = B_FALSE; 325 entryp->fcl_ba[*entryp->fcl_countp] = 0; 326 state = fmd_asru_al_getstate(alp); 327 if (state & FMD_ASRU_DEGRADED) 328 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED; 329 if (state & FMD_ASRU_UNUSABLE) 330 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE; 331 if (state & FMD_ASRU_FAULTY) 332 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY; 333 if (!(state & FMD_ASRU_PRESENT)) 334 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT; 335 if (alp->al_reason == FMD_ASRU_REPAIRED) 336 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED; 337 else if (alp->al_reason == FMD_ASRU_REPLACED) 338 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED; 339 else if (alp->al_reason == FMD_ASRU_ACQUITTED) 340 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED; 341 entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event; 342 (*entryp->fcl_countp)++; 343 } 344 345 static void 346 fmd_case_faulty(fmd_asru_link_t *alp, void *arg) 347 { 348 int *faultyp = (int *)arg; 349 350 *faultyp |= (alp->al_flags & FMD_ASRU_FAULTY); 351 } 352 353 static void 354 fmd_case_usable(fmd_asru_link_t *alp, void *arg) 355 { 356 int *usablep = (int *)arg; 357 358 *usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE); 359 } 360 361 static void 362 fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg) 363 { 364 int 
*not_faultyp = (int *)arg; 365 366 *not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY); 367 } 368 369 /* 370 * Have we got any suspects with an asru that are still unusable and present? 371 */ 372 static void 373 fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg) 374 { 375 int *rvalp = (int *)arg; 376 int state; 377 nvlist_t *asru; 378 379 /* 380 * if this a proxy case and this suspect doesn't have an local asru 381 * then state is unknown so we must assume it may still be unusable. 382 */ 383 if ((alp->al_flags & FMD_ASRU_PROXY) && 384 !(alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU)) { 385 *rvalp |= B_TRUE; 386 return; 387 } 388 389 state = fmd_asru_al_getstate(alp); 390 if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0) 391 return; 392 *rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT)); 393 } 394 395 nvlist_t * 396 fmd_case_mkevent(fmd_case_t *cp, const char *class) 397 { 398 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 399 nvlist_t **nva, *nvl; 400 uint8_t *ba; 401 int msg = B_TRUE; 402 const char *code; 403 fmd_case_lst_t fcl; 404 int count = 0; 405 406 (void) pthread_mutex_lock(&cip->ci_lock); 407 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 408 409 nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); 410 ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects); 411 412 /* 413 * For each suspect associated with the case, store its fault event 414 * nvlist in 'nva'. We also look to see if any of the suspect faults 415 * have asked not to be messaged. If any of them have made such a 416 * request, propagate that attribute to the composite list.* event. 417 * Finally, store each suspect's faulty status into the bitmap 'ba'. 
418 */ 419 fcl.fcl_countp = &count; 420 fcl.fcl_maxcount = cip->ci_nsuspects; 421 fcl.fcl_msgp = &msg; 422 fcl.fcl_ba = ba; 423 fcl.fcl_nva = nva; 424 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl); 425 426 if (cip->ci_code == NULL) 427 (void) fmd_case_mkcode(cp); 428 /* 429 * For repair and updated event, we lookup diagcode from dict using key 430 * "list.repaired" or "list.updated" or "list.resolved". 431 */ 432 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 433 (void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code); 434 else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) 435 (void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code); 436 else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0) 437 (void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code); 438 else 439 code = cip->ci_code; 440 441 if (msg == B_FALSE) 442 cip->ci_flags |= FMD_CF_INVISIBLE; 443 444 /* 445 * Use the ci_diag_de if one has been saved (eg for an injected fault). 446 * Otherwise use the authority for the current module. 447 */ 448 nvl = fmd_protocol_list(class, cip->ci_diag_de == NULL ? 
449 cip->ci_mod->mod_fmri : cip->ci_diag_de, cip->ci_uuid, code, count, 450 nva, ba, msg, &cip->ci_tv); 451 452 (void) pthread_mutex_unlock(&cip->ci_lock); 453 return (nvl); 454 } 455 456 static boolean_t 457 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem) 458 { 459 nvlist_t *new_rsrc; 460 nvlist_t *rsrc; 461 char *new_name = NULL; 462 char *name = NULL; 463 ssize_t new_namelen; 464 ssize_t namelen; 465 int fmri_present = 1; 466 int new_fmri_present = 1; 467 int match = B_FALSE; 468 fmd_topo_t *ftp = fmd_topo_hold(); 469 470 if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0) 471 fmri_present = 0; 472 else { 473 if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1) 474 goto done; 475 name = fmd_alloc(namelen + 1, FMD_SLEEP); 476 if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1) 477 goto done; 478 } 479 if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0) 480 new_fmri_present = 0; 481 else { 482 if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1) 483 goto done; 484 new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP); 485 if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1) 486 goto done; 487 } 488 match = (fmri_present == new_fmri_present && 489 (fmri_present == 0 || 490 topo_fmri_strcmp(ftp->ft_hdl, name, new_name))); 491 done: 492 if (name != NULL) 493 fmd_free(name, namelen + 1); 494 if (new_name != NULL) 495 fmd_free(new_name, new_namelen + 1); 496 fmd_topo_rele(ftp); 497 return (match); 498 } 499 500 static int 501 fmd_case_match_suspect(fmd_case_susp_t *cis, fmd_case_susp_t *xcis) 502 { 503 char *class, *new_class; 504 505 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_ASRU)) 506 return (0); 507 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, 508 FM_FAULT_RESOURCE)) 509 return (0); 510 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_FRU)) 511 return (0); 512 (void) nvlist_lookup_string(xcis->cis_nvl, FM_CLASS, &class); 513 (void) nvlist_lookup_string(cis->cis_nvl, FM_CLASS, 
&new_class); 514 return (strcmp(class, new_class) == 0); 515 } 516 517 /* 518 * see if an identical suspect list already exists in the cache 519 */ 520 static int 521 fmd_case_check_for_dups(fmd_case_t *cp) 522 { 523 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp, *xcip; 524 fmd_case_hash_t *chp = fmd.d_cases; 525 fmd_case_susp_t *xcis, *cis; 526 int match = 0, match_susp; 527 uint_t h; 528 529 (void) pthread_rwlock_rdlock(&chp->ch_lock); 530 531 /* 532 * Find all cases with this code 533 */ 534 h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 535 for (xcip = chp->ch_code_hash[h]; xcip != NULL; 536 xcip = xcip->ci_code_next) { 537 /* 538 * only look for any cases (apart from this one) 539 * whose code and number of suspects match 540 */ 541 if (xcip == cip || fmd_case_tryhold(xcip) == NULL) 542 continue; 543 if (strcmp(xcip->ci_code, cip->ci_code) != 0 || 544 xcip->ci_nsuspects != cip->ci_nsuspects) { 545 fmd_case_rele((fmd_case_t *)xcip); 546 continue; 547 } 548 549 /* 550 * For each suspect in one list, check if there 551 * is an identical suspect in the other list 552 */ 553 match = 1; 554 for (xcis = xcip->ci_suspects; xcis != NULL; 555 xcis = xcis->cis_next) { 556 match_susp = 0; 557 for (cis = cip->ci_suspects; cis != NULL; 558 cis = cis->cis_next) { 559 if (fmd_case_match_suspect(cis, xcis) == 1) { 560 match_susp = 1; 561 break; 562 } 563 } 564 if (match_susp == 0) { 565 match = 0; 566 break; 567 } 568 } 569 fmd_case_rele((fmd_case_t *)xcip); 570 if (match) { 571 (void) pthread_rwlock_unlock(&chp->ch_lock); 572 return (1); 573 } 574 } 575 (void) pthread_rwlock_unlock(&chp->ch_lock); 576 return (0); 577 } 578 579 /* 580 * Convict suspects in a case by applying a conviction policy and updating the 581 * resource cache prior to emitting the list.suspect event for the given case. 582 * At present, our policy is very simple: convict every suspect in the case. 
 * In the future, this policy can be extended and made configurable to permit:
 *
 * - convicting the suspect with the highest FIT rate
 * - convicting the suspect with the cheapest FRU
 * - convicting the suspect with the FRU that is in a depot's inventory
 * - convicting the suspect with the longest lifetime
 *
 * and so forth.  A word to the wise: this problem is significantly harder than
 * it seems at first glance.  Future work should heed the following advice:
 *
 * Hacking the policy into C code here is a very bad idea.  The policy needs to
 * be decided upon very carefully and fundamentally encodes knowledge of what
 * suspect list combinations can be emitted by what diagnosis engines.  As such
 * fmd's code is the wrong location, because that would require fmd itself to
 * be updated for every diagnosis engine change, defeating the entire design.
 * The FMA Event Registry knows the suspect list combinations: policy inputs
 * can be derived from it and used to produce per-module policy configuration.
 *
 * If the policy needs to be dynamic and not statically fixed at either fmd
 * startup or module load time, any implementation of dynamic policy retrieval
 * must employ some kind of caching mechanism or be part of a built-in module.
 * The fmd_case_convict() function is called with locks held inside of fmd and
 * is not a place where unbounded blocking on some inter-process or inter-
 * system communication to another service (e.g. another daemon) can occur.
607 */ 608 static int 609 fmd_case_convict(fmd_case_t *cp) 610 { 611 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 612 fmd_asru_hash_t *ahp = fmd.d_asrus; 613 614 fmd_case_susp_t *cis; 615 fmd_asru_link_t *alp; 616 617 (void) pthread_mutex_lock(&cip->ci_lock); 618 if (cip->ci_code == NULL) 619 (void) fmd_case_mkcode(cp); 620 else if (cip->ci_precanned) 621 fmd_case_code_hash_insert(fmd.d_cases, cip); 622 if (fmd_case_check_for_dups(cp) == 1) { 623 (void) pthread_mutex_unlock(&cip->ci_lock); 624 return (1); 625 } 626 627 /* 628 * no suspect list already exists - allocate new cache entries 629 */ 630 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 631 if ((alp = fmd_asru_hash_create_entry(ahp, 632 cp, cis->cis_nvl)) == NULL) { 633 fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in " 634 "%s: %s\n", cip->ci_uuid, fmd_strerror(errno)); 635 continue; 636 } 637 alp->al_flags |= FMD_ASRU_PRESENT; 638 alp->al_asru->asru_flags |= FMD_ASRU_PRESENT; 639 (void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0); 640 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 641 } 642 643 (void) pthread_mutex_unlock(&cip->ci_lock); 644 return (0); 645 } 646 647 void 648 fmd_case_publish(fmd_case_t *cp, uint_t state) 649 { 650 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 651 fmd_event_t *e; 652 nvlist_t *nvl; 653 char *class; 654 655 if (state == FMD_CASE_CURRENT) 656 state = cip->ci_state; /* use current state */ 657 658 switch (state) { 659 case FMD_CASE_SOLVED: 660 (void) pthread_mutex_lock(&cip->ci_lock); 661 662 /* 663 * If we already have a code, then case is already solved. 
664 */ 665 if (cip->ci_precanned == 0 && cip->ci_xprt == NULL && 666 cip->ci_code != NULL) { 667 (void) pthread_mutex_unlock(&cip->ci_lock); 668 break; 669 } 670 671 if (cip->ci_tv_valid == 0) { 672 fmd_time_gettimeofday(&cip->ci_tv); 673 cip->ci_tv_valid = 1; 674 } 675 (void) pthread_mutex_unlock(&cip->ci_lock); 676 677 if (fmd_case_convict(cp) == 1) { /* dupclose */ 678 cip->ci_flags &= ~FMD_CF_SOLVED; 679 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0); 680 break; 681 } 682 if (cip->ci_xprt != NULL) { 683 /* 684 * For proxy, save some information about the transport 685 * in the resource cache. 686 */ 687 int count = 0; 688 fmd_asru_set_on_proxy_t fasp; 689 fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)cip->ci_xprt; 690 691 fasp.fasp_countp = &count; 692 fasp.fasp_maxcount = cip->ci_nsuspects; 693 fasp.fasp_proxy_asru = cip->ci_proxy_asru; 694 fasp.fasp_proxy_external = xip->xi_flags & 695 FMD_XPRT_EXTERNAL; 696 fasp.fasp_proxy_rdonly = ((xip->xi_flags & 697 FMD_XPRT_RDWR) == FMD_XPRT_RDONLY); 698 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 699 fmd_asru_set_on_proxy, &fasp); 700 } 701 nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS); 702 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 703 704 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 705 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 706 fmd_log_append(fmd.d_fltlog, e, cp); 707 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 708 fmd_dispq_dispatch(fmd.d_disp, e, class); 709 710 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 711 cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++; 712 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 713 714 break; 715 716 case FMD_CASE_CLOSE_WAIT: 717 fmd_case_hold(cp); 718 e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp); 719 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 720 721 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 722 cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++; 723 (void) 
pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 724 725 break; 726 727 case FMD_CASE_CLOSED: 728 nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS); 729 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 730 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 731 fmd_dispq_dispatch(fmd.d_disp, e, class); 732 break; 733 734 case FMD_CASE_REPAIRED: 735 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 736 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 737 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 738 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 739 fmd_log_append(fmd.d_fltlog, e, cp); 740 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 741 fmd_dispq_dispatch(fmd.d_disp, e, class); 742 break; 743 744 case FMD_CASE_RESOLVED: 745 nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS); 746 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 747 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 748 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 749 fmd_log_append(fmd.d_fltlog, e, cp); 750 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 751 fmd_dispq_dispatch(fmd.d_disp, e, class); 752 break; 753 } 754 } 755 756 fmd_case_t * 757 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid) 758 { 759 fmd_case_impl_t *cip; 760 uint_t h; 761 762 (void) pthread_rwlock_rdlock(&chp->ch_lock); 763 h = fmd_strhash(uuid) % chp->ch_hashlen; 764 765 for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) { 766 if (strcmp(cip->ci_uuid, uuid) == 0) 767 break; 768 } 769 770 /* 771 * If deleting bit is set, treat the case as if it doesn't exist. 
772 */ 773 if (cip != NULL) 774 cip = fmd_case_tryhold(cip); 775 776 if (cip == NULL) 777 (void) fmd_set_errno(EFMD_CASE_INVAL); 778 779 (void) pthread_rwlock_unlock(&chp->ch_lock); 780 return ((fmd_case_t *)cip); 781 } 782 783 static fmd_case_impl_t * 784 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 785 { 786 fmd_case_impl_t *eip; 787 uint_t h; 788 789 (void) pthread_rwlock_wrlock(&chp->ch_lock); 790 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 791 792 for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) { 793 if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 && 794 fmd_case_tryhold(eip) != NULL) { 795 (void) pthread_rwlock_unlock(&chp->ch_lock); 796 return (eip); /* uuid already present */ 797 } 798 } 799 800 cip->ci_next = chp->ch_hash[h]; 801 chp->ch_hash[h] = cip; 802 803 chp->ch_count++; 804 ASSERT(chp->ch_count != 0); 805 806 (void) pthread_rwlock_unlock(&chp->ch_lock); 807 return (cip); 808 } 809 810 static void 811 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 812 { 813 fmd_case_impl_t *cp, **pp; 814 uint_t h; 815 816 ASSERT(MUTEX_HELD(&cip->ci_lock)); 817 818 cip->ci_flags |= FMD_CF_DELETING; 819 (void) pthread_mutex_unlock(&cip->ci_lock); 820 821 (void) pthread_rwlock_wrlock(&chp->ch_lock); 822 823 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 824 pp = &chp->ch_hash[h]; 825 826 for (cp = *pp; cp != NULL; cp = cp->ci_next) { 827 if (cp != cip) 828 pp = &cp->ci_next; 829 else 830 break; 831 } 832 833 if (cp == NULL) { 834 fmd_panic("case %p (%s) not found on hash chain %u\n", 835 (void *)cip, cip->ci_uuid, h); 836 } 837 838 *pp = cp->ci_next; 839 cp->ci_next = NULL; 840 841 /* 842 * delete from code hash if it is on it 843 */ 844 fmd_case_code_hash_delete(chp, cip); 845 846 ASSERT(chp->ch_count != 0); 847 chp->ch_count--; 848 849 (void) pthread_rwlock_unlock(&chp->ch_lock); 850 851 (void) pthread_mutex_lock(&cip->ci_lock); 852 ASSERT(cip->ci_flags & FMD_CF_DELETING); 853 } 854 855 fmd_case_t * 856 
fmd_case_create(fmd_module_t *mp, void *data) 857 { 858 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 859 fmd_case_impl_t *eip = NULL; 860 uuid_t uuid; 861 862 (void) pthread_mutex_init(&cip->ci_lock, NULL); 863 fmd_buf_hash_create(&cip->ci_bufs); 864 865 fmd_module_hold(mp); 866 cip->ci_mod = mp; 867 cip->ci_refs = 1; 868 cip->ci_state = FMD_CASE_UNSOLVED; 869 cip->ci_flags = FMD_CF_DIRTY; 870 cip->ci_data = data; 871 872 /* 873 * Calling libuuid: get a clue. The library interfaces cleverly do not 874 * define any constant for the length of an unparse string, and do not 875 * permit the caller to specify a buffer length for safety. The spec 876 * says it will be 36 bytes, but we make it tunable just in case. 877 */ 878 (void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen); 879 cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP); 880 881 /* 882 * We expect this loop to execute only once, but code it defensively 883 * against the possibility of libuuid bugs. Keep generating uuids and 884 * attempting to do a hash insert until we get a unique one. 
885 */ 886 do { 887 if (eip != NULL) 888 fmd_case_rele((fmd_case_t *)eip); 889 uuid_generate(uuid); 890 uuid_unparse(uuid, cip->ci_uuid); 891 } while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip); 892 893 ASSERT(fmd_module_locked(mp)); 894 fmd_list_append(&mp->mod_cases, cip); 895 fmd_module_setcdirty(mp); 896 897 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 898 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 899 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 900 901 return ((fmd_case_t *)cip); 902 } 903 904 static void 905 fmd_case_destroy_suspects(fmd_case_impl_t *cip) 906 { 907 fmd_case_susp_t *cis, *ncis; 908 909 ASSERT(MUTEX_HELD(&cip->ci_lock)); 910 911 if (cip->ci_proxy_asru) 912 fmd_free(cip->ci_proxy_asru, sizeof (uint8_t) * 913 cip->ci_nsuspects); 914 if (cip->ci_diag_de) 915 nvlist_free(cip->ci_diag_de); 916 if (cip->ci_diag_asru) 917 fmd_free(cip->ci_diag_asru, sizeof (uint8_t) * 918 cip->ci_nsuspects); 919 920 for (cis = cip->ci_suspects; cis != NULL; cis = ncis) { 921 ncis = cis->cis_next; 922 nvlist_free(cis->cis_nvl); 923 fmd_free(cis, sizeof (fmd_case_susp_t)); 924 } 925 926 cip->ci_suspects = NULL; 927 cip->ci_nsuspects = 0; 928 } 929 930 fmd_case_t * 931 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp, 932 uint_t state, const char *uuid, const char *code) 933 { 934 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 935 fmd_case_impl_t *eip; 936 937 ASSERT(state < FMD_CASE_RESOLVED); 938 939 (void) pthread_mutex_init(&cip->ci_lock, NULL); 940 fmd_buf_hash_create(&cip->ci_bufs); 941 942 fmd_module_hold(mp); 943 cip->ci_mod = mp; 944 cip->ci_xprt = xp; 945 cip->ci_refs = 1; 946 cip->ci_state = state; 947 cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP); 948 cip->ci_uuidlen = strlen(cip->ci_uuid); 949 cip->ci_code = fmd_strdup(code, FMD_SLEEP); 950 cip->ci_codelen = cip->ci_code ? 
strlen(cip->ci_code) + 1 : 0; 951 952 if (state > FMD_CASE_CLOSE_WAIT) 953 cip->ci_flags |= FMD_CF_SOLVED; 954 955 /* 956 * Insert the case into the global case hash. If the specified UUID is 957 * already present, check to see if it is an orphan: if so, reclaim it; 958 * otherwise if it is owned by a different module then return NULL. 959 */ 960 if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) { 961 (void) pthread_mutex_lock(&cip->ci_lock); 962 cip->ci_refs--; /* decrement to zero */ 963 fmd_case_destroy((fmd_case_t *)cip, B_FALSE); 964 965 cip = eip; /* switch 'cip' to the existing case */ 966 (void) pthread_mutex_lock(&cip->ci_lock); 967 968 /* 969 * If the ASRU cache is trying to recreate an orphan, then just 970 * return the existing case that we found without changing it. 971 */ 972 if (mp == fmd.d_rmod) { 973 /* 974 * In case the case has already been created from 975 * a checkpoint file we need to set up code now. 976 */ 977 if (cip->ci_state < FMD_CASE_CLOSED) { 978 if (code != NULL && cip->ci_code == NULL) { 979 cip->ci_code = fmd_strdup(code, 980 FMD_SLEEP); 981 cip->ci_codelen = cip->ci_code ? 982 strlen(cip->ci_code) + 1 : 0; 983 fmd_case_code_hash_insert(fmd.d_cases, 984 cip); 985 } 986 } 987 988 /* 989 * When recreating an orphan case, state passed in may 990 * either be CLOSED (faulty) or REPAIRED (!faulty). If 991 * any suspects are still CLOSED (faulty) then the 992 * overall state needs to be CLOSED. 993 */ 994 if (cip->ci_state == FMD_CASE_REPAIRED && 995 state == FMD_CASE_CLOSED) 996 cip->ci_state = FMD_CASE_CLOSED; 997 (void) pthread_mutex_unlock(&cip->ci_lock); 998 fmd_case_rele((fmd_case_t *)cip); 999 return ((fmd_case_t *)cip); 1000 } 1001 1002 /* 1003 * If the existing case isn't an orphan or is being proxied, 1004 * then we have a UUID conflict: return failure to the caller. 
1005 */ 1006 if (cip->ci_mod != fmd.d_rmod || xp != NULL) { 1007 (void) pthread_mutex_unlock(&cip->ci_lock); 1008 fmd_case_rele((fmd_case_t *)cip); 1009 return (NULL); 1010 } 1011 1012 /* 1013 * If the new module is reclaiming an orphaned case, remove 1014 * the case from the root module, switch ci_mod, and then fall 1015 * through to adding the case to the new owner module 'mp'. 1016 */ 1017 fmd_module_lock(cip->ci_mod); 1018 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1019 fmd_module_unlock(cip->ci_mod); 1020 1021 fmd_module_rele(cip->ci_mod); 1022 cip->ci_mod = mp; 1023 fmd_module_hold(mp); 1024 1025 /* 1026 * It's possible that fmd crashed or was restarted during a 1027 * previous solve operation between the asru cache being created 1028 * and the ckpt file being updated to SOLVED. Thus when the DE 1029 * recreates the case here from the checkpoint file, the state 1030 * will be UNSOLVED and yet we are having to reclaim because 1031 * the case was in the asru cache. If this happens, revert the 1032 * case back to the UNSOLVED state and let the DE solve it again 1033 */ 1034 if (state == FMD_CASE_UNSOLVED) { 1035 fmd_asru_hash_delete_case(fmd.d_asrus, 1036 (fmd_case_t *)cip); 1037 fmd_case_destroy_suspects(cip); 1038 fmd_case_code_hash_delete(fmd.d_cases, cip); 1039 fmd_free(cip->ci_code, cip->ci_codelen); 1040 cip->ci_code = NULL; 1041 cip->ci_codelen = 0; 1042 cip->ci_tv_valid = 0; 1043 } 1044 1045 cip->ci_state = state; 1046 1047 (void) pthread_mutex_unlock(&cip->ci_lock); 1048 fmd_case_rele((fmd_case_t *)cip); 1049 } else { 1050 /* 1051 * add into hash of solved cases 1052 */ 1053 if (cip->ci_code) 1054 fmd_case_code_hash_insert(fmd.d_cases, cip); 1055 } 1056 1057 ASSERT(fmd_module_locked(mp)); 1058 fmd_list_append(&mp->mod_cases, cip); 1059 1060 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1061 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 1062 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1063 1064 return ((fmd_case_t 
*)cip); 1065 } 1066 1067 void 1068 fmd_case_destroy(fmd_case_t *cp, int visible) 1069 { 1070 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1071 fmd_case_item_t *cit, *ncit; 1072 1073 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1074 ASSERT(cip->ci_refs == 0); 1075 1076 if (visible) { 1077 TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid)); 1078 fmd_case_hash_delete(fmd.d_cases, cip); 1079 } 1080 1081 for (cit = cip->ci_items; cit != NULL; cit = ncit) { 1082 ncit = cit->cit_next; 1083 fmd_event_rele(cit->cit_event); 1084 fmd_free(cit, sizeof (fmd_case_item_t)); 1085 } 1086 1087 fmd_case_destroy_suspects(cip); 1088 1089 if (cip->ci_principal != NULL) 1090 fmd_event_rele(cip->ci_principal); 1091 1092 fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1); 1093 fmd_free(cip->ci_code, cip->ci_codelen); 1094 (void) fmd_buf_hash_destroy(&cip->ci_bufs); 1095 1096 fmd_module_rele(cip->ci_mod); 1097 fmd_free(cip, sizeof (fmd_case_impl_t)); 1098 } 1099 1100 void 1101 fmd_case_hold(fmd_case_t *cp) 1102 { 1103 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1104 1105 (void) pthread_mutex_lock(&cip->ci_lock); 1106 fmd_case_hold_locked(cp); 1107 (void) pthread_mutex_unlock(&cip->ci_lock); 1108 } 1109 1110 void 1111 fmd_case_hold_locked(fmd_case_t *cp) 1112 { 1113 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1114 1115 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1116 if (cip->ci_flags & FMD_CF_DELETING) 1117 fmd_panic("attempt to hold a deleting case %p (%s)\n", 1118 (void *)cip, cip->ci_uuid); 1119 cip->ci_refs++; 1120 ASSERT(cip->ci_refs != 0); 1121 } 1122 1123 static fmd_case_impl_t * 1124 fmd_case_tryhold(fmd_case_impl_t *cip) 1125 { 1126 /* 1127 * If the case's "deleting" bit is unset, hold and return case, 1128 * otherwise, return NULL. 
1129 */ 1130 (void) pthread_mutex_lock(&cip->ci_lock); 1131 if (cip->ci_flags & FMD_CF_DELETING) { 1132 (void) pthread_mutex_unlock(&cip->ci_lock); 1133 cip = NULL; 1134 } else { 1135 fmd_case_hold_locked((fmd_case_t *)cip); 1136 (void) pthread_mutex_unlock(&cip->ci_lock); 1137 } 1138 return (cip); 1139 } 1140 1141 void 1142 fmd_case_rele(fmd_case_t *cp) 1143 { 1144 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1145 1146 (void) pthread_mutex_lock(&cip->ci_lock); 1147 ASSERT(cip->ci_refs != 0); 1148 1149 if (--cip->ci_refs == 0) 1150 fmd_case_destroy((fmd_case_t *)cip, B_TRUE); 1151 else 1152 (void) pthread_mutex_unlock(&cip->ci_lock); 1153 } 1154 1155 void 1156 fmd_case_rele_locked(fmd_case_t *cp) 1157 { 1158 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1159 1160 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1161 --cip->ci_refs; 1162 ASSERT(cip->ci_refs != 0); 1163 } 1164 1165 int 1166 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep) 1167 { 1168 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1169 fmd_case_item_t *cit; 1170 fmd_event_t *oep; 1171 uint_t state; 1172 int new; 1173 1174 fmd_event_hold(ep); 1175 (void) pthread_mutex_lock(&cip->ci_lock); 1176 1177 if (cip->ci_flags & FMD_CF_SOLVED) 1178 state = FMD_EVS_DIAGNOSED; 1179 else 1180 state = FMD_EVS_ACCEPTED; 1181 1182 oep = cip->ci_principal; 1183 cip->ci_principal = ep; 1184 1185 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1186 if (cit->cit_event == ep) 1187 break; 1188 } 1189 1190 cip->ci_flags |= FMD_CF_DIRTY; 1191 new = cit == NULL && ep != oep; 1192 1193 (void) pthread_mutex_unlock(&cip->ci_lock); 1194 1195 fmd_module_setcdirty(cip->ci_mod); 1196 fmd_event_transition(ep, state); 1197 1198 if (oep != NULL) 1199 fmd_event_rele(oep); 1200 1201 return (new); 1202 } 1203 1204 int 1205 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep) 1206 { 1207 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1208 fmd_case_item_t *cit; 1209 uint_t state; 1210 int new; 1211 1212 (void) 
pthread_mutex_lock(&cip->ci_lock); 1213 1214 if (cip->ci_flags & FMD_CF_SOLVED) 1215 state = FMD_EVS_DIAGNOSED; 1216 else 1217 state = FMD_EVS_ACCEPTED; 1218 1219 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1220 if (cit->cit_event == ep) 1221 break; 1222 } 1223 1224 new = cit == NULL && ep != cip->ci_principal; 1225 1226 /* 1227 * If the event is already in the case or the case is already solved, 1228 * there is no reason to save it: just transition it appropriately. 1229 */ 1230 if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) { 1231 (void) pthread_mutex_unlock(&cip->ci_lock); 1232 fmd_event_transition(ep, state); 1233 return (new); 1234 } 1235 1236 cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP); 1237 fmd_event_hold(ep); 1238 1239 cit->cit_next = cip->ci_items; 1240 cit->cit_event = ep; 1241 1242 cip->ci_items = cit; 1243 cip->ci_nitems++; 1244 1245 cip->ci_flags |= FMD_CF_DIRTY; 1246 (void) pthread_mutex_unlock(&cip->ci_lock); 1247 1248 fmd_module_setcdirty(cip->ci_mod); 1249 fmd_event_transition(ep, state); 1250 1251 return (new); 1252 } 1253 1254 void 1255 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl) 1256 { 1257 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1258 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1259 1260 (void) pthread_mutex_lock(&cip->ci_lock); 1261 ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT); 1262 cip->ci_flags |= FMD_CF_DIRTY; 1263 1264 cis->cis_next = cip->ci_suspects; 1265 cis->cis_nvl = nvl; 1266 1267 cip->ci_suspects = cis; 1268 cip->ci_nsuspects++; 1269 1270 (void) pthread_mutex_unlock(&cip->ci_lock); 1271 if (cip->ci_xprt == NULL) 1272 fmd_module_setcdirty(cip->ci_mod); 1273 } 1274 1275 void 1276 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl) 1277 { 1278 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1279 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1280 boolean_t b; 1281 1282 (void) pthread_mutex_lock(&cip->ci_lock); 1283 1284 cis->cis_next = 
cip->ci_suspects; 1285 cis->cis_nvl = nvl; 1286 1287 if (nvlist_lookup_boolean_value(nvl, 1288 FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE) 1289 cip->ci_flags |= FMD_CF_INVISIBLE; 1290 1291 cip->ci_suspects = cis; 1292 cip->ci_nsuspects++; 1293 1294 (void) pthread_mutex_unlock(&cip->ci_lock); 1295 } 1296 1297 void 1298 fmd_case_reset_suspects(fmd_case_t *cp) 1299 { 1300 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1301 1302 (void) pthread_mutex_lock(&cip->ci_lock); 1303 ASSERT(cip->ci_state < FMD_CASE_SOLVED); 1304 1305 fmd_case_destroy_suspects(cip); 1306 cip->ci_flags |= FMD_CF_DIRTY; 1307 1308 (void) pthread_mutex_unlock(&cip->ci_lock); 1309 fmd_module_setcdirty(cip->ci_mod); 1310 } 1311 1312 /*ARGSUSED*/ 1313 static void 1314 fmd_case_unusable(fmd_asru_link_t *alp, void *arg) 1315 { 1316 (void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE); 1317 } 1318 1319 /* 1320 * Grab ci_lock and update the case state and set the dirty bit. Then perform 1321 * whatever actions and emit whatever events are appropriate for the state. 1322 * Refer to the topmost block comment explaining the state machine for details. 
 */
void
fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit;
	fmd_event_t *e;
	int resolved = 0;	/* set once the case leaves mod_cases for good */
	int any_unusable_and_present = 0;

	ASSERT(state <= FMD_CASE_RESOLVED);
	(void) pthread_mutex_lock(&cip->ci_lock);

	/*
	 * If neither the case nor this request carries FMD_CF_SOLVED, strip
	 * the ISOLATED/REPAIRED/RESOLVED flags from the request before they
	 * are merged into ci_flags.
	 */
	if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED))
		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED | FMD_CF_RESOLVED);

	/* Flags are merged even when the state itself does not advance. */
	cip->ci_flags |= flags;

	if (cip->ci_state >= state) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return; /* already in specified state */
	}

	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));

	cip->ci_state = state;
	cip->ci_flags |= FMD_CF_DIRTY;

	/* Only non-proxied cases owned by a non-root module dirty the module. */
	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
		fmd_module_setcdirty(cip->ci_mod);

	switch (state) {
	case FMD_CASE_SOLVED:
		/* Every event in the case, and the principal, is now diagnosed. */
		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);

		if (cip->ci_principal != NULL) {
			fmd_event_transition(cip->ci_principal,
			    FMD_EVS_DIAGNOSED);
		}
		break;

	case FMD_CASE_CLOSE_WAIT:
		/*
		 * If the case was never solved, do not change ASRUs.
		 * If the case was never fmd_case_closed, do not change ASRUs.
		 * If the case was repaired, do not change ASRUs.
		 */
		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
		    FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED))
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_case_unusable, NULL);

		/*
		 * If an orphaned case transitions to CLOSE_WAIT, the owning
		 * module is no longer loaded: continue on to CASE_CLOSED.
		 */
		if (fmd_case_orphaned(cp))
			state = cip->ci_state = FMD_CASE_CLOSED;
		break;

	case FMD_CASE_REPAIRED:
		ASSERT(cip->ci_xprt != NULL || fmd_case_orphaned(cp));

		/*
		 * If we've been requested to transition straight on to the
		 * RESOLVED state (which can happen with fault proxying where a
		 * list.resolved or a uuresolved is received from the other
		 * side), or if all suspects are already either usable or not
		 * present then transition straight to RESOLVED state,
		 * publishing both the list.repaired and list.resolved. For a
		 * proxy, if we discover here that all suspects are already
		 * either usable or not present, notify the diag side instead
		 * using fmd_xprt_uuresolved().
		 */
		if (flags & FMD_CF_RESOLVED) {
			/*
			 * NOTE(review): the proxy branch deletes from
			 * mod_cases without taking the module lock here;
			 * presumably the caller already holds it -- confirm.
			 */
			if (cip->ci_xprt != NULL) {
				fmd_list_delete(&cip->ci_mod->mod_cases, cip);
			} else {
				fmd_module_lock(cip->ci_mod);
				fmd_list_delete(&cip->ci_mod->mod_cases, cip);
				fmd_module_unlock(cip->ci_mod);
			}
		} else {
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_case_unusable_and_present,
			    &any_unusable_and_present);
			if (any_unusable_and_present)
				break;	/* stay in REPAIRED */
			if (cip->ci_xprt != NULL) {
				fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid);
				break;	/* stay in REPAIRED on the proxy */
			}
			fmd_module_lock(cip->ci_mod);
			fmd_list_delete(&cip->ci_mod->mod_cases, cip);
			fmd_module_unlock(cip->ci_mod);
		}

		/*
		 * Going straight on to RESOLVED.  'state' is still REPAIRED
		 * for the publish call below, which is made with ci_lock
		 * dropped; the lock is retaken before leaving the switch
		 * because the common code after the switch unlocks it.
		 */
		cip->ci_state = FMD_CASE_RESOLVED;
		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_publish(cp, state);
		TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
		    _fmd_case_snames[FMD_CASE_REPAIRED],
		    _fmd_case_snames[FMD_CASE_RESOLVED]));
		state = FMD_CASE_RESOLVED;
		resolved = 1;
		(void) pthread_mutex_lock(&cip->ci_lock);
		break;

	case FMD_CASE_RESOLVED:
		/*
		 * For a proxy, no need to check that all suspects are already
		 * either usable or not present - this request has come from
		 * the diagnosing side which makes the final decision on this.
		 */
		if (cip->ci_xprt != NULL) {
			fmd_list_delete(&cip->ci_mod->mod_cases, cip);
			resolved = 1;
			break;
		}

		ASSERT(fmd_case_orphaned(cp));

		/*
		 * If all suspects are already either usable or not present then
		 * carry on, publish list.resolved and discard the case.
		 */
		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
		    fmd_case_unusable_and_present, &any_unusable_and_present);
		if (any_unusable_and_present) {
			/*
			 * Nothing is published on this path.  Note that
			 * ci_state was already set to RESOLVED above.
			 */
			(void) pthread_mutex_unlock(&cip->ci_lock);
			return;
		}

		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);
		resolved = 1;
		break;
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);

	/*
	 * If the module has initialized, then publish the appropriate event
	 * for the new case state.  If not, we are being called from the
	 * checkpoint code during module load, in which case the module's
	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
	 * may not be open yet, which will prevent us from computing the event
	 * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
	 * event in our queue: this won't be processed until _fmd_init is done.
	 */
	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
		fmd_case_publish(cp, state);
	else {
		/* A hold is taken before the case is handed to the queued event. */
		fmd_case_hold(cp);
		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
	}

	if (resolved) {
		/*
		 * If we transitioned to RESOLVED, adjust the reference count to
		 * reflect our removal from fmd.d_rmod->mod_cases above.  If the
		 * caller has not placed an additional hold on the case, it
		 * will now be freed.
		 */
		(void) pthread_mutex_lock(&cip->ci_lock);
		fmd_asru_hash_delete_case(fmd.d_asrus, cp);
		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_rele(cp);
	}
}

/*
 * Transition the specified case to *at least* the specified state by first
 * re-validating the suspect list using the resource cache.  This function is
 * employed by the checkpoint code when restoring a saved, solved case to see
 * if the state of the case has effectively changed while fmd was not running
 * or the module was not loaded.
 *
 * 'state' must be at least FMD_CASE_SOLVED (asserted).  If no suspect is
 * reported usable, the target state is raised to at least CLOSE_WAIT and
 * FMD_CF_ISOLATED is added to 'flags' before calling fmd_case_transition().
 */
void
fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	int usable = 0;		/* are any suspects usable? */

	ASSERT(state >= FMD_CASE_SOLVED);
	(void) pthread_mutex_lock(&cip->ci_lock);

	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable);

	(void) pthread_mutex_unlock(&cip->ci_lock);

	if (!usable) {
		state = MAX(state, FMD_CASE_CLOSE_WAIT);
		flags |= FMD_CF_ISOLATED;
	}

	fmd_case_transition(cp, state, flags);
}

/*
 * Set FMD_CF_DIRTY on the case (under ci_lock) and mark its owning module
 * dirty as well.
 */
void
fmd_case_setdirty(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	(void) pthread_mutex_lock(&cip->ci_lock);
	cip->ci_flags |= FMD_CF_DIRTY;
	(void) pthread_mutex_unlock(&cip->ci_lock);

	fmd_module_setcdirty(cip->ci_mod);
}

/*
 * Clear FMD_CF_DIRTY on the case (under ci_lock).  The module's dirty state
 * is not touched.
 */
void
fmd_case_clrdirty(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	(void) pthread_mutex_lock(&cip->ci_lock);
	cip->ci_flags &= ~FMD_CF_DIRTY;
	(void) pthread_mutex_unlock(&cip->ci_lock);
}

/*
 * If the case is dirty, commit each of its events, its principal event (if
 * any) and its buffer hash, then clear FMD_CF_DIRTY.  A clean case is left
 * untouched.
 */
void
fmd_case_commit(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit;

	(void) pthread_mutex_lock(&cip->ci_lock);

	if (cip->ci_flags & FMD_CF_DIRTY) {
		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
			fmd_event_commit(cit->cit_event);

		if (cip->ci_principal != NULL)
			fmd_event_commit(cip->ci_principal);

		fmd_buf_hash_commit(&cip->ci_bufs);
		cip->ci_flags &= ~FMD_CF_DIRTY;
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);
}

/*
 * On proxy side, send back repair/acquit/etc request to diagnosing side
 *
 * The case must have a transport (asserted).  Two stack arrays sized by
 * ci_nsuspects are handed to the fmd_case_set_lst callback through 'fcl';
 * only 'ba' and the resulting 'count' are passed on to fmd_xprt_updated(),
 * together with the case's ci_proxy_asru array.
 */
void
fmd_case_xprt_updated(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	nvlist_t **nva;
	uint8_t *ba;
	int msg = B_TRUE;
	int count = 0;
	fmd_case_lst_t fcl;

	ASSERT(cip->ci_xprt != NULL);
	(void) pthread_mutex_lock(&cip->ci_lock);
	/* alloca() storage lives until this function returns */
	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
	fcl.fcl_countp = &count;
	fcl.fcl_maxcount = cip->ci_nsuspects;
	fcl.fcl_msgp = &msg;
	fcl.fcl_ba = ba;
	fcl.fcl_nva = nva;
	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
	(void) pthread_mutex_unlock(&cip->ci_lock);
	fmd_xprt_updated(cip->ci_xprt, cip->ci_uuid, ba, cip->ci_proxy_asru,
	    count);
}

/*
 * fmd_case_update_status() can be called on either the proxy side when a
 * list.suspect is received, or on the diagnosing side when an update request
 * is received from the proxy. It updates the status in the resource cache.
1603 */ 1604 void 1605 fmd_case_update_status(fmd_case_t *cp, uint8_t *statusp, uint8_t *proxy_asrup, 1606 uint8_t *diag_asrup) 1607 { 1608 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1609 int count = 0; 1610 fmd_asru_update_status_t faus; 1611 1612 /* 1613 * update status of resource cache entries 1614 */ 1615 faus.faus_countp = &count; 1616 faus.faus_maxcount = cip->ci_nsuspects; 1617 faus.faus_ba = statusp; 1618 faus.faus_proxy_asru = proxy_asrup; 1619 faus.faus_diag_asru = diag_asrup; 1620 faus.faus_is_proxy = (cip->ci_xprt != NULL); 1621 (void) pthread_mutex_lock(&cip->ci_lock); 1622 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_update_status, 1623 &faus); 1624 (void) pthread_mutex_unlock(&cip->ci_lock); 1625 } 1626 1627 /* 1628 * Called on either the proxy side or the diag side when a repair has taken 1629 * place on the other side but this side may know the asru "contains" 1630 * relationships. 1631 */ 1632 void 1633 fmd_case_update_containees(fmd_case_t *cp) 1634 { 1635 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1636 1637 (void) pthread_mutex_lock(&cip->ci_lock); 1638 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1639 fmd_asru_update_containees, NULL); 1640 (void) pthread_mutex_unlock(&cip->ci_lock); 1641 } 1642 1643 /* 1644 * fmd_case_close_status() is called on diagnosing side when proxy side 1645 * has had a uuclose. It updates the status in the resource cache. 
1646 */ 1647 void 1648 fmd_case_close_status(fmd_case_t *cp) 1649 { 1650 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1651 int count = 0; 1652 fmd_asru_close_status_t facs; 1653 1654 /* 1655 * update status of resource cache entries 1656 */ 1657 facs.facs_countp = &count; 1658 facs.facs_maxcount = cip->ci_nsuspects; 1659 (void) pthread_mutex_lock(&cip->ci_lock); 1660 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_close_status, 1661 &facs); 1662 (void) pthread_mutex_unlock(&cip->ci_lock); 1663 } 1664 1665 /* 1666 * Indicate that the case may need to change state because one or more of the 1667 * ASRUs named as a suspect has changed state. We examine all the suspects 1668 * and if none are still faulty, we initiate a case close transition. 1669 */ 1670 void 1671 fmd_case_update(fmd_case_t *cp) 1672 { 1673 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1674 uint_t cstate; 1675 int faulty = 0; 1676 1677 (void) pthread_mutex_lock(&cip->ci_lock); 1678 cstate = cip->ci_state; 1679 1680 if (cip->ci_state < FMD_CASE_SOLVED) { 1681 (void) pthread_mutex_unlock(&cip->ci_lock); 1682 return; /* update is not appropriate */ 1683 } 1684 1685 if (cip->ci_flags & FMD_CF_REPAIRED) { 1686 (void) pthread_mutex_unlock(&cip->ci_lock); 1687 return; /* already repaired */ 1688 } 1689 1690 TRACE((FMD_DBG_CASE, "case update %s", cip->ci_uuid)); 1691 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 1692 (void) pthread_mutex_unlock(&cip->ci_lock); 1693 1694 if (faulty) { 1695 nvlist_t *nvl; 1696 fmd_event_t *e; 1697 char *class; 1698 1699 TRACE((FMD_DBG_CASE, "sending list.updated %s", cip->ci_uuid)); 1700 nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS); 1701 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1702 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1703 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 1704 fmd_log_append(fmd.d_fltlog, e, cp); 1705 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 1706 fmd_dispq_dispatch(fmd.d_disp, e, 
class); 1707 return; /* one or more suspects are still marked faulty */ 1708 } 1709 1710 if (cstate == FMD_CASE_CLOSED) 1711 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1712 else 1713 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1714 } 1715 1716 /* 1717 * Delete a closed case from the module's case list once the fmdo_close() entry 1718 * point has run to completion. If the case is owned by a transport module, 1719 * tell the transport to proxy a case close on the other end of the transport. 1720 * Transition to the appropriate next state based on ci_flags. This 1721 * function represents the end of CLOSE_WAIT and transitions the case to either 1722 * CLOSED or REPAIRED or discards it entirely because it was never solved; 1723 * refer to the topmost block comment explaining the state machine for details. 1724 */ 1725 void 1726 fmd_case_delete(fmd_case_t *cp) 1727 { 1728 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1729 fmd_modstat_t *msp; 1730 size_t buftotal; 1731 1732 TRACE((FMD_DBG_CASE, "case delete %s", cip->ci_uuid)); 1733 ASSERT(fmd_module_locked(cip->ci_mod)); 1734 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1735 buftotal = fmd_buf_hash_destroy(&cip->ci_bufs); 1736 1737 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1738 msp = cip->ci_mod->mod_stats; 1739 1740 ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0); 1741 msp->ms_caseopen.fmds_value.ui64--; 1742 1743 ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal); 1744 msp->ms_buftotal.fmds_value.ui64 -= buftotal; 1745 1746 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1747 1748 if (cip->ci_xprt == NULL) 1749 fmd_module_setcdirty(cip->ci_mod); 1750 1751 fmd_module_rele(cip->ci_mod); 1752 cip->ci_mod = fmd.d_rmod; 1753 fmd_module_hold(cip->ci_mod); 1754 1755 /* 1756 * If the case has been solved, then retain it 1757 * on the root module's case list at least until we're transitioned. 1758 * Otherwise free the case with our final fmd_case_rele() below. 
1759 */ 1760 if (cip->ci_flags & FMD_CF_SOLVED) { 1761 fmd_module_lock(cip->ci_mod); 1762 fmd_list_append(&cip->ci_mod->mod_cases, cip); 1763 fmd_module_unlock(cip->ci_mod); 1764 fmd_case_hold(cp); 1765 } 1766 1767 /* 1768 * Transition onwards to REPAIRED or CLOSED as originally requested. 1769 * Note that for proxy case if we're transitioning to CLOSED it means 1770 * the case was isolated locally, so call fmd_xprt_uuclose() to notify 1771 * the diagnosing side. No need to notify the diagnosing side if we are 1772 * transitioning to REPAIRED as we only do this when requested to do 1773 * so by the diagnosing side anyway. 1774 */ 1775 if (cip->ci_flags & FMD_CF_REPAIRED) 1776 fmd_case_transition(cp, FMD_CASE_REPAIRED, 0); 1777 else if (cip->ci_flags & FMD_CF_ISOLATED) { 1778 fmd_case_transition(cp, FMD_CASE_CLOSED, 0); 1779 if (cip->ci_xprt != NULL) 1780 fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid); 1781 } 1782 1783 fmd_case_rele(cp); 1784 } 1785 1786 void 1787 fmd_case_discard(fmd_case_t *cp, boolean_t delete_from_asru_cache) 1788 { 1789 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1790 1791 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1792 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--; 1793 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1794 1795 ASSERT(fmd_module_locked(cip->ci_mod)); 1796 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1797 if (delete_from_asru_cache) { 1798 (void) pthread_mutex_lock(&cip->ci_lock); 1799 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 1800 (void) pthread_mutex_unlock(&cip->ci_lock); 1801 } 1802 fmd_case_rele(cp); 1803 } 1804 1805 /* 1806 * Indicate that the problem corresponding to a case has been repaired by 1807 * clearing the faulty bit on each ASRU named as a suspect. If the case hasn't 1808 * already been closed, this function initiates the transition to CLOSE_WAIT. 
1809 * The caller must have the case held from fmd_case_hash_lookup(), so we can 1810 * grab and drop ci_lock without the case being able to be freed in between. 1811 */ 1812 int 1813 fmd_case_repair(fmd_case_t *cp) 1814 { 1815 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1816 uint_t cstate; 1817 fmd_asru_rep_arg_t fara; 1818 1819 (void) pthread_mutex_lock(&cip->ci_lock); 1820 cstate = cip->ci_state; 1821 1822 if (cstate < FMD_CASE_SOLVED) { 1823 (void) pthread_mutex_unlock(&cip->ci_lock); 1824 return (fmd_set_errno(EFMD_CASE_STATE)); 1825 } 1826 1827 if (cip->ci_flags & FMD_CF_REPAIRED) { 1828 (void) pthread_mutex_unlock(&cip->ci_lock); 1829 return (0); /* already repaired */ 1830 } 1831 1832 TRACE((FMD_DBG_CASE, "case repair %s", cip->ci_uuid)); 1833 fara.fara_reason = FMD_ASRU_REPAIRED; 1834 fara.fara_bywhat = FARA_BY_CASE; 1835 fara.fara_rval = NULL; 1836 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara); 1837 (void) pthread_mutex_unlock(&cip->ci_lock); 1838 1839 /* 1840 * if this is a proxied case, send the repair across the transport. 1841 * The remote side will then do the repair and send a list.repaired back 1842 * again such that we can finally repair the case on this side. 
1843 */ 1844 if (cip->ci_xprt != NULL) { 1845 fmd_case_xprt_updated(cp); 1846 return (0); 1847 } 1848 1849 if (cstate == FMD_CASE_CLOSED) 1850 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1851 else 1852 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1853 1854 return (0); 1855 } 1856 1857 int 1858 fmd_case_acquit(fmd_case_t *cp) 1859 { 1860 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1861 uint_t cstate; 1862 fmd_asru_rep_arg_t fara; 1863 1864 (void) pthread_mutex_lock(&cip->ci_lock); 1865 cstate = cip->ci_state; 1866 1867 if (cstate < FMD_CASE_SOLVED) { 1868 (void) pthread_mutex_unlock(&cip->ci_lock); 1869 return (fmd_set_errno(EFMD_CASE_STATE)); 1870 } 1871 1872 if (cip->ci_flags & FMD_CF_REPAIRED) { 1873 (void) pthread_mutex_unlock(&cip->ci_lock); 1874 return (0); /* already repaired */ 1875 } 1876 1877 TRACE((FMD_DBG_CASE, "case acquit %s", cip->ci_uuid)); 1878 fara.fara_reason = FMD_ASRU_ACQUITTED; 1879 fara.fara_bywhat = FARA_BY_CASE; 1880 fara.fara_rval = NULL; 1881 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara); 1882 (void) pthread_mutex_unlock(&cip->ci_lock); 1883 1884 /* 1885 * if this is a proxied case, send the repair across the transport. 1886 * The remote side will then do the repair and send a list.repaired back 1887 * again such that we can finally repair the case on this side. 
1888 */ 1889 if (cip->ci_xprt != NULL) { 1890 fmd_case_xprt_updated(cp); 1891 return (0); 1892 } 1893 1894 if (cstate == FMD_CASE_CLOSED) 1895 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1896 else 1897 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1898 1899 return (0); 1900 } 1901 1902 int 1903 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep) 1904 { 1905 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1906 fmd_case_item_t *cit; 1907 uint_t state; 1908 int rv = 0; 1909 1910 (void) pthread_mutex_lock(&cip->ci_lock); 1911 1912 if (cip->ci_state >= FMD_CASE_SOLVED) 1913 state = FMD_EVS_DIAGNOSED; 1914 else 1915 state = FMD_EVS_ACCEPTED; 1916 1917 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1918 if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0) 1919 break; 1920 } 1921 1922 if (rv == 0 && cip->ci_principal != NULL) 1923 rv = fmd_event_equal(ep, cip->ci_principal); 1924 1925 (void) pthread_mutex_unlock(&cip->ci_lock); 1926 1927 if (rv != 0) 1928 fmd_event_transition(ep, state); 1929 1930 return (rv); 1931 } 1932 1933 int 1934 fmd_case_orphaned(fmd_case_t *cp) 1935 { 1936 return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod); 1937 } 1938 1939 void 1940 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec) 1941 { 1942 ((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec; 1943 ((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec; 1944 ((fmd_case_impl_t *)cp)->ci_tv_valid = 1; 1945 } 1946 1947 void 1948 fmd_case_set_de_fmri(fmd_case_t *cp, nvlist_t *nvl) 1949 { 1950 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1951 1952 if (cip->ci_diag_de) 1953 nvlist_free(cip->ci_diag_de); 1954 cip->ci_diag_de = nvl; 1955 } 1956 1957 void 1958 fmd_case_setcode(fmd_case_t *cp, char *code) 1959 { 1960 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1961 1962 cip->ci_code = fmd_strdup(code, FMD_SLEEP); 1963 cip->ci_codelen = cip->ci_code ? 
strlen(cip->ci_code) + 1 : 0; 1964 } 1965 1966 /*ARGSUSED*/ 1967 void 1968 fmd_case_repair_replay_case(fmd_case_t *cp, void *arg) 1969 { 1970 int not_faulty = 0; 1971 int faulty = 0; 1972 nvlist_t *nvl; 1973 fmd_event_t *e; 1974 char *class; 1975 int any_unusable_and_present = 0; 1976 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1977 1978 if (cip->ci_state < FMD_CASE_SOLVED || cip->ci_xprt != NULL) 1979 return; 1980 1981 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 1982 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty, 1983 ¬_faulty); 1984 1985 if (cip->ci_state >= FMD_CASE_REPAIRED && !faulty) { 1986 /* 1987 * If none of the suspects is faulty, replay the list.repaired. 1988 * If all suspects are already either usable or not present then 1989 * also transition straight to RESOLVED state. 1990 */ 1991 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1992 fmd_case_unusable_and_present, &any_unusable_and_present); 1993 if (!any_unusable_and_present) { 1994 fmd_module_lock(cip->ci_mod); 1995 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1996 fmd_module_unlock(cip->ci_mod); 1997 cip->ci_state = FMD_CASE_RESOLVED; 1998 1999 TRACE((FMD_DBG_CASE, "replay sending list.repaired %s", 2000 cip->ci_uuid)); 2001 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 2002 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2003 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, 2004 class); 2005 fmd_dispq_dispatch(fmd.d_disp, e, class); 2006 2007 TRACE((FMD_DBG_CASE, "replay sending list.resolved %s", 2008 cip->ci_uuid)); 2009 fmd_case_publish(cp, FMD_CASE_RESOLVED); 2010 (void) pthread_mutex_lock(&cip->ci_lock); 2011 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 2012 (void) pthread_mutex_unlock(&cip->ci_lock); 2013 fmd_case_rele(cp); 2014 } else { 2015 TRACE((FMD_DBG_CASE, "replay sending list.repaired %s", 2016 cip->ci_uuid)); 2017 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 2018 (void) nvlist_lookup_string(nvl, FM_CLASS, 
&class); 2019 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, 2020 class); 2021 fmd_dispq_dispatch(fmd.d_disp, e, class); 2022 } 2023 } else if (faulty && not_faulty) { 2024 /* 2025 * if some but not all of the suspects are not faulty, replay 2026 * the list.updated. 2027 */ 2028 TRACE((FMD_DBG_CASE, "replay sending list.updated %s", 2029 cip->ci_uuid)); 2030 nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS); 2031 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2032 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 2033 fmd_dispq_dispatch(fmd.d_disp, e, class); 2034 } 2035 } 2036 2037 void 2038 fmd_case_repair_replay() 2039 { 2040 fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL); 2041 } 2042