/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * FMD Case Subsystem
 *
 * Diagnosis engines are expected to group telemetry events related to the
 * diagnosis of a particular problem on the system into a set of cases. The
 * diagnosis engine may have any number of cases open at a given point in time.
 * Some cases may eventually be *solved* by associating a suspect list of one
 * or more problems with the case, at which point fmd publishes a list.suspect
 * event for the case and it becomes visible to administrators and agents.
 *
 * Every case is named using a UUID, and is globally visible in the case hash.
 * Cases are reference-counted, except for the reference from the case hash
 * itself. Consumers of case references include modules, which store active
 * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
 *
 * Cases obey the following state machine. In states UNSOLVED, SOLVED, and
 * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
 * or transport) and the case is referenced by the mod_cases list. Once the
 * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
 * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
 *
 *                   +------------+
 *        +----------|  UNSOLVED  |
 *        |          +------------+
 *        |                1 |
 *        |                  |
 *        |          +-------v----+
 *      2 |          |   SOLVED   |
 *        |          +------------+
 *        |              3 |  5 |
 *        +------------+   |    |
 *                     |   |    |
 *                   +-v---v----v-+
 *                   | CLOSE_WAIT |
 *                   +------------+
 *                     |   |    |
 *         +-----------+   |    +------------+
 *         |             4 |                 |
 *         v         +-----v------+          |
 *      discard      |   CLOSED   |        6 |
 *                   +------------+          |
 *                         |                 |
 *                         |    +------------+
 *                       7 |    |
 *                   +-----v----v-+
 *                   |  REPAIRED  |
 *                   +------------+
 *                         |
 *                       8 |
 *                   +-----v------+
 *                   |  RESOLVED  |
 *                   +------------+
 *                         |
 *                         v
 *                      discard
 *
 * The state machine changes are triggered by calls to fmd_case_transition()
 * from various locations inside of fmd, as described below:
 *
 * [1] Called by: fmd_case_solve()
 *     Actions:   FMD_CF_SOLVED flag is set in ci_flags
 *                conviction policy is applied to suspect list
 *                suspects convicted are marked faulty (F) in R$
 *                list.suspect event logged and dispatched
 *
 * [2] Called by: fmd_case_close(), fmd_case_uuclose()
 *     Actions:   diagnosis engine fmdo_close() entry point scheduled
 *                case discarded upon exit from CLOSE_WAIT
 *
 * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
 *     Actions:   FMD_CF_ISOLATED flag is set in ci_flags
 *                suspects convicted (F) are marked unusable (U) in R$
 *                diagnosis engine fmdo_close() entry point scheduled
 *                case transitions to CLOSED [4] upon exit from CLOSE_WAIT
 *
 * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
 *     Actions:   list.isolated event dispatched
 *                case deleted from
module's list of open cases 105 * 106 * [5] Called by: fmd_case_repair(), fmd_case_update() 107 * Actions: FMD_CF_REPAIR flag is set in ci_flags 108 * diagnosis engine fmdo_close() entry point scheduled 109 * case transitions to REPAIRED [6] upon exit from CLOSE_WAIT 110 * 111 * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns) 112 * Actions: suspects convicted are marked non faulty (!F) in R$ 113 * list.repaired or list.updated event dispatched 114 * 115 * [7] Called by: fmd_case_repair(), fmd_case_update() 116 * Actions: FMD_CF_REPAIR flag is set in ci_flags 117 * suspects convicted are marked non faulty (!F) in R$ 118 * list.repaired or list.updated event dispatched 119 * 120 * [8] Called by: fmd_case_uuresolve() 121 * Actions: list.resolved event dispatched 122 * case is discarded 123 */ 124 125 #include <sys/fm/protocol.h> 126 #include <uuid/uuid.h> 127 #include <alloca.h> 128 129 #include <fmd_alloc.h> 130 #include <fmd_module.h> 131 #include <fmd_error.h> 132 #include <fmd_conf.h> 133 #include <fmd_case.h> 134 #include <fmd_string.h> 135 #include <fmd_subr.h> 136 #include <fmd_protocol.h> 137 #include <fmd_event.h> 138 #include <fmd_eventq.h> 139 #include <fmd_dispq.h> 140 #include <fmd_buf.h> 141 #include <fmd_log.h> 142 #include <fmd_asru.h> 143 #include <fmd_fmri.h> 144 #include <fmd_xprt.h> 145 146 #include <fmd.h> 147 148 static const char *const _fmd_case_snames[] = { 149 "UNSOLVED", /* FMD_CASE_UNSOLVED */ 150 "SOLVED", /* FMD_CASE_SOLVED */ 151 "CLOSE_WAIT", /* FMD_CASE_CLOSE_WAIT */ 152 "CLOSED", /* FMD_CASE_CLOSED */ 153 "REPAIRED", /* FMD_CASE_REPAIRED */ 154 "RESOLVED" /* FMD_CASE_RESOLVED */ 155 }; 156 157 static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *); 158 159 fmd_case_hash_t * 160 fmd_case_hash_create(void) 161 { 162 fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP); 163 164 (void) pthread_rwlock_init(&chp->ch_lock, NULL); 165 chp->ch_hashlen = fmd.d_str_buckets; 166 chp->ch_hash = 
fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP); 167 chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, 168 FMD_SLEEP); 169 chp->ch_count = 0; 170 171 return (chp); 172 } 173 174 /* 175 * Destroy the case hash. Unlike most of our hash tables, no active references 176 * are kept by the case hash itself; all references come from other subsystems. 177 * The hash must be destroyed after all modules are unloaded; if anything was 178 * present in the hash it would be by definition a reference count leak. 179 */ 180 void 181 fmd_case_hash_destroy(fmd_case_hash_t *chp) 182 { 183 fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen); 184 fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen); 185 fmd_free(chp, sizeof (fmd_case_hash_t)); 186 } 187 188 /* 189 * Take a snapshot of the case hash by placing an additional hold on each 190 * member in an auxiliary array, and then call 'func' for each case. 191 */ 192 void 193 fmd_case_hash_apply(fmd_case_hash_t *chp, 194 void (*func)(fmd_case_t *, void *), void *arg) 195 { 196 fmd_case_impl_t *cp, **cps, **cpp; 197 uint_t cpc, i; 198 199 (void) pthread_rwlock_rdlock(&chp->ch_lock); 200 201 cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP); 202 cpc = chp->ch_count; 203 204 for (i = 0; i < chp->ch_hashlen; i++) { 205 for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) 206 *cpp++ = fmd_case_tryhold(cp); 207 } 208 209 ASSERT(cpp == cps + cpc); 210 (void) pthread_rwlock_unlock(&chp->ch_lock); 211 212 for (i = 0; i < cpc; i++) { 213 if (cps[i] != NULL) { 214 func((fmd_case_t *)cps[i], arg); 215 fmd_case_rele((fmd_case_t *)cps[i]); 216 } 217 } 218 219 fmd_free(cps, cpc * sizeof (fmd_case_t *)); 220 } 221 222 static void 223 fmd_case_hash_apply_except_current(fmd_case_hash_t *chp, 224 void (*func)(fmd_case_t *, void *), void *arg, fmd_case_t *current) 225 { 226 fmd_case_impl_t *cp, **cps, **cpp; 227 uint_t cpc, i; 228 229 (void) pthread_rwlock_rdlock(&chp->ch_lock); 230 231 
cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP); 232 cpc = chp->ch_count; 233 234 for (i = 0; i < chp->ch_hashlen; i++) { 235 for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) 236 if (cp != (fmd_case_impl_t *)current) 237 *cpp++ = fmd_case_tryhold(cp); 238 else 239 *cpp++ = cp; 240 } 241 242 ASSERT(cpp == cps + cpc); 243 (void) pthread_rwlock_unlock(&chp->ch_lock); 244 245 for (i = 0; i < cpc; i++) { 246 if (cps[i] != NULL && cps[i] != (fmd_case_impl_t *)current) { 247 func((fmd_case_t *)cps[i], arg); 248 fmd_case_rele((fmd_case_t *)cps[i]); 249 } 250 } 251 252 fmd_free(cps, cpc * sizeof (fmd_case_t *)); 253 } 254 255 static void 256 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 257 { 258 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 259 260 cip->ci_code_next = chp->ch_code_hash[h]; 261 chp->ch_code_hash[h] = cip; 262 } 263 264 static void 265 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 266 { 267 fmd_case_impl_t **pp, *cp; 268 269 if (cip->ci_code) { 270 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 271 272 pp = &chp->ch_code_hash[h]; 273 for (cp = *pp; cp != NULL; cp = cp->ci_code_next) { 274 if (cp != cip) 275 pp = &cp->ci_code_next; 276 else 277 break; 278 } 279 if (cp != NULL) { 280 *pp = cp->ci_code_next; 281 cp->ci_code_next = NULL; 282 } 283 } 284 } 285 286 /* 287 * Look up the diagcode for this case and cache it in ci_code. If no suspects 288 * were defined for this case or if the lookup fails, the event dictionary or 289 * module code is broken, and we set the event code to a precomputed default. 
290 */ 291 static const char * 292 fmd_case_mkcode(fmd_case_t *cp) 293 { 294 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 295 fmd_case_susp_t *cis; 296 fmd_case_hash_t *chp = fmd.d_cases; 297 298 char **keys, **keyp; 299 const char *s; 300 301 ASSERT(MUTEX_HELD(&cip->ci_lock)); 302 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 303 304 /* 305 * delete any existing entry from code hash if it is on it 306 */ 307 fmd_case_code_hash_delete(chp, cip); 308 309 fmd_free(cip->ci_code, cip->ci_codelen); 310 cip->ci_codelen = cip->ci_mod->mod_codelen; 311 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 312 keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1)); 313 314 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 315 if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0) 316 keyp++; 317 } 318 319 *keyp = NULL; /* mark end of keys[] array for libdiagcode */ 320 321 if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code( 322 cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) { 323 (void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s); 324 fmd_free(cip->ci_code, cip->ci_codelen); 325 cip->ci_codelen = strlen(s) + 1; 326 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 327 (void) strcpy(cip->ci_code, s); 328 } 329 330 /* 331 * add into hash of solved cases 332 */ 333 fmd_case_code_hash_insert(chp, cip); 334 335 return (cip->ci_code); 336 } 337 338 typedef struct { 339 int *fcl_countp; 340 int fcl_maxcount; 341 uint8_t *fcl_ba; 342 nvlist_t **fcl_nva; 343 int *fcl_msgp; 344 } fmd_case_lst_t; 345 346 static void 347 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg) 348 { 349 fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg; 350 boolean_t b; 351 int state; 352 353 if (*entryp->fcl_countp >= entryp->fcl_maxcount) 354 return; 355 if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE, 356 &b) == 0 && b == B_FALSE) 357 *entryp->fcl_msgp = B_FALSE; 358 entryp->fcl_ba[*entryp->fcl_countp] = 0; 359 state = fmd_asru_al_getstate(alp); 
360 if (state & FMD_ASRU_DEGRADED) 361 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED; 362 if (state & FMD_ASRU_UNUSABLE) 363 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE; 364 if (state & FMD_ASRU_FAULTY) 365 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY; 366 if (!(state & FMD_ASRU_PRESENT)) 367 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT; 368 if (alp->al_reason == FMD_ASRU_REPAIRED) 369 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED; 370 else if (alp->al_reason == FMD_ASRU_REPLACED) 371 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED; 372 else if (alp->al_reason == FMD_ASRU_ACQUITTED) 373 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED; 374 entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event; 375 (*entryp->fcl_countp)++; 376 } 377 378 static void 379 fmd_case_faulty(fmd_asru_link_t *alp, void *arg) 380 { 381 int *faultyp = (int *)arg; 382 383 *faultyp |= (alp->al_flags & FMD_ASRU_FAULTY); 384 } 385 386 static void 387 fmd_case_usable(fmd_asru_link_t *alp, void *arg) 388 { 389 int *usablep = (int *)arg; 390 391 *usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE); 392 } 393 394 static void 395 fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg) 396 { 397 int *not_faultyp = (int *)arg; 398 399 *not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY); 400 } 401 402 /* 403 * Have we got any suspects with an asru that are still unusable and present? 404 */ 405 static void 406 fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg) 407 { 408 int *rvalp = (int *)arg; 409 int state; 410 nvlist_t *asru; 411 412 /* 413 * if this a proxy case and this suspect doesn't have an local asru 414 * then state is unknown so we must assume it may still be unusable. 
415 */ 416 if ((alp->al_flags & FMD_ASRU_PROXY) && 417 !(alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU)) { 418 *rvalp |= B_TRUE; 419 return; 420 } 421 422 state = fmd_asru_al_getstate(alp); 423 if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0) 424 return; 425 *rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT)); 426 } 427 428 nvlist_t * 429 fmd_case_mkevent(fmd_case_t *cp, const char *class) 430 { 431 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 432 nvlist_t **nva, *nvl; 433 uint8_t *ba; 434 int msg = B_TRUE; 435 const char *code; 436 fmd_case_lst_t fcl; 437 int count = 0; 438 439 (void) pthread_mutex_lock(&cip->ci_lock); 440 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 441 442 nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); 443 ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects); 444 445 /* 446 * For each suspect associated with the case, store its fault event 447 * nvlist in 'nva'. We also look to see if any of the suspect faults 448 * have asked not to be messaged. If any of them have made such a 449 * request, propagate that attribute to the composite list.* event. 450 * Finally, store each suspect's faulty status into the bitmap 'ba'. 451 */ 452 fcl.fcl_countp = &count; 453 fcl.fcl_maxcount = cip->ci_nsuspects; 454 fcl.fcl_msgp = &msg; 455 fcl.fcl_ba = ba; 456 fcl.fcl_nva = nva; 457 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl); 458 459 if (cip->ci_code == NULL) 460 (void) fmd_case_mkcode(cp); 461 /* 462 * For repair and updated event, we lookup diagcode from dict using key 463 * "list.repaired" or "list.updated" or "list.resolved". 
464 */ 465 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 466 (void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code); 467 else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) 468 (void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code); 469 else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0) 470 (void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code); 471 else 472 code = cip->ci_code; 473 474 if (msg == B_FALSE) 475 cip->ci_flags |= FMD_CF_INVISIBLE; 476 477 /* 478 * Use the ci_diag_de if one has been saved (eg for an injected fault). 479 * Otherwise use the authority for the current module. 480 */ 481 nvl = fmd_protocol_list(class, cip->ci_diag_de == NULL ? 482 cip->ci_mod->mod_fmri : cip->ci_diag_de, cip->ci_uuid, code, count, 483 nva, ba, msg, &cip->ci_tv, cip->ci_injected); 484 485 (void) pthread_mutex_unlock(&cip->ci_lock); 486 return (nvl); 487 } 488 489 static int fmd_case_match_on_faulty_overlap = 1; 490 static int fmd_case_match_on_acquit_overlap = 1; 491 static int fmd_case_auto_acquit_isolated = 1; 492 static int fmd_case_auto_acquit_non_acquitted = 1; 493 static int fmd_case_too_recent = 10; /* time in seconds */ 494 495 static boolean_t 496 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem) 497 { 498 nvlist_t *new_rsrc; 499 nvlist_t *rsrc; 500 char *new_name = NULL; 501 char *name = NULL; 502 ssize_t new_namelen; 503 ssize_t namelen; 504 int fmri_present = 1; 505 int new_fmri_present = 1; 506 int match = B_FALSE; 507 fmd_topo_t *ftp = fmd_topo_hold(); 508 509 if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0) 510 fmri_present = 0; 511 else { 512 if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1) 513 goto done; 514 name = fmd_alloc(namelen + 1, FMD_SLEEP); 515 if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1) 516 goto done; 517 } 518 if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0) 519 new_fmri_present = 0; 520 else { 521 if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1) 522 goto done; 523 new_name = 
fmd_alloc(new_namelen + 1, FMD_SLEEP); 524 if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1) 525 goto done; 526 } 527 match = (fmri_present == new_fmri_present && 528 (fmri_present == 0 || 529 topo_fmri_strcmp(ftp->ft_hdl, name, new_name))); 530 done: 531 if (name != NULL) 532 fmd_free(name, namelen + 1); 533 if (new_name != NULL) 534 fmd_free(new_name, new_namelen + 1); 535 fmd_topo_rele(ftp); 536 return (match); 537 } 538 539 static int 540 fmd_case_match_suspect(nvlist_t *nvl1, nvlist_t *nvl2) 541 { 542 char *class, *new_class; 543 544 if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_ASRU)) 545 return (0); 546 if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_RESOURCE)) 547 return (0); 548 if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_FRU)) 549 return (0); 550 (void) nvlist_lookup_string(nvl2, FM_CLASS, &class); 551 (void) nvlist_lookup_string(nvl1, FM_CLASS, &new_class); 552 return (strcmp(class, new_class) == 0); 553 } 554 555 typedef struct { 556 int *fcms_countp; 557 int fcms_maxcount; 558 fmd_case_impl_t *fcms_cip; 559 uint8_t *fcms_new_susp_state; 560 uint8_t *fcms_old_susp_state; 561 uint8_t *fcms_old_match_state; 562 } fcms_t; 563 #define SUSPECT_STATE_FAULTY 0x1 564 #define SUSPECT_STATE_ISOLATED 0x2 565 #define SUSPECT_STATE_REMOVED 0x4 566 #define SUSPECT_STATE_ACQUITED 0x8 567 #define SUSPECT_STATE_REPAIRED 0x10 568 #define SUSPECT_STATE_REPLACED 0x20 569 #define SUSPECT_STATE_NO_MATCH 0x1 570 571 /* 572 * This is called for each suspect in the old case. Compare it against each 573 * suspect in the new case, setting fcms_old_susp_state and fcms_new_susp_state 574 * as appropriate. fcms_new_susp_state will left as 0 if the suspect is not 575 * found in the old case. 
576 */ 577 static void 578 fmd_case_match_suspects(fmd_asru_link_t *alp, void *arg) 579 { 580 fcms_t *fcmsp = (fcms_t *)arg; 581 fmd_case_impl_t *cip = fcmsp->fcms_cip; 582 fmd_case_susp_t *cis; 583 int i = 0; 584 int state = fmd_asru_al_getstate(alp); 585 586 if (*fcmsp->fcms_countp >= fcmsp->fcms_maxcount) 587 return; 588 589 if (!(state & FMD_ASRU_PRESENT) || (!(state & FMD_ASRU_FAULTY) && 590 alp->al_reason == FMD_ASRU_REMOVED)) 591 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 592 SUSPECT_STATE_REMOVED; 593 else if ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_FAULTY)) 594 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 595 SUSPECT_STATE_ISOLATED; 596 else if (state & FMD_ASRU_FAULTY) 597 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 598 SUSPECT_STATE_FAULTY; 599 else if (alp->al_reason == FMD_ASRU_REPLACED) 600 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 601 SUSPECT_STATE_REPLACED; 602 else if (alp->al_reason == FMD_ASRU_ACQUITTED) 603 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 604 SUSPECT_STATE_ACQUITED; 605 else 606 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 607 SUSPECT_STATE_REPAIRED; 608 609 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next, i++) 610 if (fmd_case_match_suspect(cis->cis_nvl, alp->al_event) == 1) 611 break; 612 if (cis != NULL) 613 fcmsp->fcms_new_susp_state[i] = 614 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp]; 615 else 616 fcmsp->fcms_old_match_state[*fcmsp->fcms_countp] |= 617 SUSPECT_STATE_NO_MATCH; 618 (*fcmsp->fcms_countp)++; 619 } 620 621 typedef struct { 622 int *fca_do_update; 623 fmd_case_impl_t *fca_cip; 624 } fca_t; 625 626 /* 627 * Re-fault all acquitted suspects that are still present in the new list. 
628 */ 629 static void 630 fmd_case_fault_acquitted_matching(fmd_asru_link_t *alp, void *arg) 631 { 632 fca_t *fcap = (fca_t *)arg; 633 fmd_case_impl_t *cip = fcap->fca_cip; 634 fmd_case_susp_t *cis; 635 int state = fmd_asru_al_getstate(alp); 636 637 if (!(state & FMD_ASRU_FAULTY) && 638 alp->al_reason == FMD_ASRU_ACQUITTED) { 639 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) 640 if (fmd_case_match_suspect(cis->cis_nvl, 641 alp->al_event) == 1) 642 break; 643 if (cis != NULL) { 644 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 645 *fcap->fca_do_update = 1; 646 } 647 } 648 } 649 650 /* 651 * Re-fault all suspects that are still present in the new list. 652 */ 653 static void 654 fmd_case_fault_all_matching(fmd_asru_link_t *alp, void *arg) 655 { 656 fca_t *fcap = (fca_t *)arg; 657 fmd_case_impl_t *cip = fcap->fca_cip; 658 fmd_case_susp_t *cis; 659 int state = fmd_asru_al_getstate(alp); 660 661 if (!(state & FMD_ASRU_FAULTY)) { 662 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) 663 if (fmd_case_match_suspect(cis->cis_nvl, 664 alp->al_event) == 1) 665 break; 666 if (cis != NULL) { 667 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 668 *fcap->fca_do_update = 1; 669 } 670 } 671 } 672 673 /* 674 * Acquit all suspects that are no longer present in the new list. 675 */ 676 static void 677 fmd_case_acquit_no_match(fmd_asru_link_t *alp, void *arg) 678 { 679 fca_t *fcap = (fca_t *)arg; 680 fmd_case_impl_t *cip = fcap->fca_cip; 681 fmd_case_susp_t *cis; 682 int state = fmd_asru_al_getstate(alp); 683 684 if (state & FMD_ASRU_FAULTY) { 685 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) 686 if (fmd_case_match_suspect(cis->cis_nvl, 687 alp->al_event) == 1) 688 break; 689 if (cis == NULL) { 690 (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, 691 FMD_ASRU_ACQUITTED); 692 *fcap->fca_do_update = 1; 693 } 694 } 695 } 696 697 /* 698 * Acquit all isolated suspects. 
699 */ 700 static void 701 fmd_case_acquit_isolated(fmd_asru_link_t *alp, void *arg) 702 { 703 int *do_update = (int *)arg; 704 int state = fmd_asru_al_getstate(alp); 705 706 if ((state & FMD_ASRU_PRESENT) && (state & FMD_ASRU_UNUSABLE) && 707 (state & FMD_ASRU_FAULTY)) { 708 (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, 709 FMD_ASRU_ACQUITTED); 710 *do_update = 1; 711 } 712 } 713 714 /* 715 * Acquit suspect which matches specified nvlist 716 */ 717 static void 718 fmd_case_acquit_suspect(fmd_asru_link_t *alp, void *arg) 719 { 720 nvlist_t *nvl = (nvlist_t *)arg; 721 int state = fmd_asru_al_getstate(alp); 722 723 if ((state & FMD_ASRU_FAULTY) && 724 fmd_case_match_suspect(nvl, alp->al_event) == 1) 725 (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, 726 FMD_ASRU_ACQUITTED); 727 } 728 729 typedef struct { 730 fmd_case_impl_t *fccd_cip; 731 uint8_t *fccd_new_susp_state; 732 uint8_t *fccd_new_match_state; 733 int *fccd_discard_new; 734 int *fccd_adjust_new; 735 } fccd_t; 736 737 /* 738 * see if a matching suspect list already exists in the cache 739 */ 740 static void 741 fmd_case_check_for_dups(fmd_case_t *old_cp, void *arg) 742 { 743 fccd_t *fccdp = (fccd_t *)arg; 744 fmd_case_impl_t *new_cip = fccdp->fccd_cip; 745 fmd_case_impl_t *old_cip = (fmd_case_impl_t *)old_cp; 746 int i, count = 0, do_update = 0, got_isolated_overlap = 0; 747 int got_faulty_overlap = 0; 748 int got_acquit_overlap = 0; 749 boolean_t too_recent; 750 uint64_t most_recent = 0; 751 fcms_t fcms; 752 fca_t fca; 753 uint8_t *new_susp_state; 754 uint8_t *old_susp_state; 755 uint8_t *old_match_state; 756 757 new_susp_state = alloca(new_cip->ci_nsuspects * sizeof (uint8_t)); 758 for (i = 0; i < new_cip->ci_nsuspects; i++) 759 new_susp_state[i] = 0; 760 old_susp_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t)); 761 for (i = 0; i < old_cip->ci_nsuspects; i++) 762 old_susp_state[i] = 0; 763 old_match_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t)); 764 for (i = 0; i < 
old_cip->ci_nsuspects; i++) 765 old_match_state[i] = 0; 766 767 /* 768 * Compare with each suspect in the existing case. 769 */ 770 fcms.fcms_countp = &count; 771 fcms.fcms_maxcount = old_cip->ci_nsuspects; 772 fcms.fcms_cip = new_cip; 773 fcms.fcms_new_susp_state = new_susp_state; 774 fcms.fcms_old_susp_state = old_susp_state; 775 fcms.fcms_old_match_state = old_match_state; 776 fmd_asru_hash_apply_by_case(fmd.d_asrus, (fmd_case_t *)old_cip, 777 fmd_case_match_suspects, &fcms); 778 779 /* 780 * If we have some faulty, non-isolated suspects that overlap, then most 781 * likely it is the suspects that overlap in the suspect lists that are 782 * to blame. So we can consider this to be a match. 783 */ 784 for (i = 0; i < new_cip->ci_nsuspects; i++) 785 if (new_susp_state[i] == SUSPECT_STATE_FAULTY) 786 got_faulty_overlap = 1; 787 if (got_faulty_overlap && fmd_case_match_on_faulty_overlap) 788 goto got_match; 789 790 /* 791 * If we have no faulty, non-isolated suspects in the old case, but we 792 * do have some acquitted suspects that overlap, then most likely it is 793 * the acquitted suspects that overlap in the suspect lists that are 794 * to blame. So we can consider this to be a match. 795 */ 796 for (i = 0; i < new_cip->ci_nsuspects; i++) 797 if (new_susp_state[i] == SUSPECT_STATE_ACQUITED) 798 got_acquit_overlap = 1; 799 for (i = 0; i < old_cip->ci_nsuspects; i++) 800 if (old_susp_state[i] == SUSPECT_STATE_FAULTY) 801 got_acquit_overlap = 0; 802 if (got_acquit_overlap && fmd_case_match_on_acquit_overlap) 803 goto got_match; 804 805 /* 806 * Check that all suspects in the new list are present in the old list. 807 * Return if we find one that isn't. 808 */ 809 for (i = 0; i < new_cip->ci_nsuspects; i++) 810 if (new_susp_state[i] == 0) 811 return; 812 813 /* 814 * Check that all suspects in the old list are present in the new list 815 * *or* they are isolated or removed/replaced (which would explain why 816 * they are not present in the new list). 
Return if we find one that is 817 * faulty and unisolated or repaired or acquitted, and that is not 818 * present in the new case. 819 */ 820 for (i = 0; i < old_cip->ci_nsuspects; i++) 821 if (old_match_state[i] == SUSPECT_STATE_NO_MATCH && 822 (old_susp_state[i] == SUSPECT_STATE_FAULTY || 823 old_susp_state[i] == SUSPECT_STATE_ACQUITED || 824 old_susp_state[i] == SUSPECT_STATE_REPAIRED)) 825 return; 826 827 got_match: 828 /* 829 * If the old case is already in repaired/resolved state, we can't 830 * do anything more with it, so keep the new case, but acquit some 831 * of the suspects if appropriate. 832 */ 833 if (old_cip->ci_state >= FMD_CASE_REPAIRED) { 834 if (fmd_case_auto_acquit_non_acquitted) { 835 *fccdp->fccd_adjust_new = 1; 836 for (i = 0; i < new_cip->ci_nsuspects; i++) { 837 fccdp->fccd_new_susp_state[i] |= 838 new_susp_state[i]; 839 if (new_susp_state[i] == 0) 840 fccdp->fccd_new_susp_state[i] = 841 SUSPECT_STATE_NO_MATCH; 842 } 843 } 844 return; 845 } 846 847 /* 848 * Otherwise discard the new case and keep the old, again updating the 849 * state of the suspects as appropriate 850 */ 851 *fccdp->fccd_discard_new = 1; 852 fca.fca_cip = new_cip; 853 fca.fca_do_update = &do_update; 854 855 /* 856 * See if new case occurred within fmd_case_too_recent seconds of the 857 * most recent modification to the old case and if so don't do 858 * auto-acquit. This avoids problems if a flood of ereports come in and 859 * they don't all get diagnosed before the first case causes some of 860 * the devices to be isolated making it appear that an isolated device 861 * was in the suspect list. 862 */ 863 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 864 fmd_asru_most_recent, &most_recent); 865 too_recent = (new_cip->ci_tv.tv_sec - most_recent < 866 fmd_case_too_recent); 867 868 if (got_faulty_overlap) { 869 /* 870 * Acquit any suspects not present in the new list, plus 871 * any that are are present but are isolated. 
872 */ 873 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 874 fmd_case_acquit_no_match, &fca); 875 if (fmd_case_auto_acquit_isolated && !too_recent) 876 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 877 fmd_case_acquit_isolated, &do_update); 878 } else if (got_acquit_overlap) { 879 /* 880 * Re-fault the acquitted matching suspects and acquit all 881 * isolated suspects. 882 */ 883 if (fmd_case_auto_acquit_isolated && !too_recent) { 884 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 885 fmd_case_fault_acquitted_matching, &fca); 886 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 887 fmd_case_acquit_isolated, &do_update); 888 } 889 } else if (fmd_case_auto_acquit_isolated) { 890 /* 891 * To get here, there must be no faulty or acquitted suspects, 892 * but there must be at least one isolated suspect. Just acquit 893 * non-matching isolated suspects. If there are no matching 894 * isolated suspects, then re-fault all matching suspects. 895 */ 896 for (i = 0; i < new_cip->ci_nsuspects; i++) 897 if (new_susp_state[i] == SUSPECT_STATE_ISOLATED) 898 got_isolated_overlap = 1; 899 if (!got_isolated_overlap) 900 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 901 fmd_case_fault_all_matching, &fca); 902 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 903 fmd_case_acquit_no_match, &fca); 904 } 905 906 /* 907 * If we've updated anything in the old case, call fmd_case_update() 908 */ 909 if (do_update) 910 fmd_case_update(old_cp); 911 } 912 913 /* 914 * Convict suspects in a case by applying a conviction policy and updating the 915 * resource cache prior to emitting the list.suspect event for the given case. 916 * At present, our policy is very simple: convict every suspect in the case. 
 * In the future, this policy can be extended and made configurable to permit:
 *
 * - convicting the suspect with the highest FIT rate
 * - convicting the suspect with the cheapest FRU
 * - convicting the suspect with the FRU that is in a depot's inventory
 * - convicting the suspect with the longest lifetime
 *
 * and so forth. A word to the wise: this problem is significantly harder than
 * it seems at first glance. Future work should heed the following advice:
 *
 * Hacking the policy into C code here is a very bad idea. The policy needs to
 * be decided upon very carefully and fundamentally encodes knowledge of what
 * suspect list combinations can be emitted by what diagnosis engines. As such
 * fmd's code is the wrong location, because that would require fmd itself to
 * be updated for every diagnosis engine change, defeating the entire design.
 * The FMA Event Registry knows the suspect list combinations: policy inputs
 * can be derived from it and used to produce per-module policy configuration.
 *
 * If the policy needs to be dynamic and not statically fixed at either fmd
 * startup or module load time, any implementation of dynamic policy retrieval
 * must employ some kind of caching mechanism or be part of a built-in module.
 * The fmd_case_convict() function is called with locks held inside of fmd and
 * is not a place where unbounded blocking on some inter-process or inter-
 * system communication to another service (e.g. another daemon) can occur.
941 */ 942 static int 943 fmd_case_convict(fmd_case_t *cp) 944 { 945 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 946 fmd_asru_hash_t *ahp = fmd.d_asrus; 947 int discard_new = 0, i; 948 fmd_case_susp_t *cis; 949 fmd_asru_link_t *alp; 950 uint8_t *new_susp_state; 951 uint8_t *new_match_state; 952 int adjust_new = 0; 953 fccd_t fccd; 954 955 (void) pthread_mutex_lock(&cip->ci_lock); 956 if (cip->ci_code == NULL) 957 (void) fmd_case_mkcode(cp); 958 else if (cip->ci_precanned) 959 fmd_case_code_hash_insert(fmd.d_cases, cip); 960 961 /* 962 * First we must see if any matching cases already exist. 963 */ 964 new_susp_state = alloca(cip->ci_nsuspects * sizeof (uint8_t)); 965 for (i = 0; i < cip->ci_nsuspects; i++) 966 new_susp_state[i] = 0; 967 new_match_state = alloca(cip->ci_nsuspects * sizeof (uint8_t)); 968 for (i = 0; i < cip->ci_nsuspects; i++) 969 new_match_state[i] = 0; 970 fccd.fccd_cip = cip; 971 fccd.fccd_adjust_new = &adjust_new; 972 fccd.fccd_new_susp_state = new_susp_state; 973 fccd.fccd_new_match_state = new_match_state; 974 fccd.fccd_discard_new = &discard_new; 975 fmd_case_hash_apply_except_current(fmd.d_cases, fmd_case_check_for_dups, 976 &fccd, cp); 977 978 if (discard_new) { 979 /* 980 * We've found an existing case that is a match and it is not 981 * already in repaired or resolved state. So we can close this 982 * one as a duplicate. 
983 */ 984 (void) pthread_mutex_unlock(&cip->ci_lock); 985 return (1); 986 } 987 988 /* 989 * Allocate new cache entries 990 */ 991 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 992 if ((alp = fmd_asru_hash_create_entry(ahp, 993 cp, cis->cis_nvl)) == NULL) { 994 fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in " 995 "%s: %s\n", cip->ci_uuid, fmd_strerror(errno)); 996 continue; 997 } 998 alp->al_flags |= FMD_ASRU_PRESENT; 999 alp->al_asru->asru_flags |= FMD_ASRU_PRESENT; 1000 (void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0); 1001 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 1002 } 1003 1004 if (adjust_new) { 1005 int some_suspect = 0, some_not_suspect = 0; 1006 1007 /* 1008 * There is one or more matching case but they are already in 1009 * repaired or resolved state. So we need to keep the new 1010 * case, but we can adjust it. Repaired/removed/replaced 1011 * suspects are unlikely to be to blame (unless there are 1012 * actually two separate faults). So if we have a combination of 1013 * repaired/replaced/removed suspects and acquitted suspects in 1014 * the old lists, then we should acquit in the new list those 1015 * that were repaired/replaced/removed in the old. 
1016 */ 1017 for (i = 0; i < cip->ci_nsuspects; i++) { 1018 if ((new_susp_state[i] & SUSPECT_STATE_REPLACED) || 1019 (new_susp_state[i] & SUSPECT_STATE_REPAIRED) || 1020 (new_susp_state[i] & SUSPECT_STATE_REMOVED) || 1021 (new_match_state[i] & SUSPECT_STATE_NO_MATCH)) 1022 some_not_suspect = 1; 1023 else 1024 some_suspect = 1; 1025 } 1026 if (some_suspect && some_not_suspect) { 1027 for (cis = cip->ci_suspects, i = 0; cis != NULL; 1028 cis = cis->cis_next, i++) 1029 if ((new_susp_state[i] & 1030 SUSPECT_STATE_REPLACED) || 1031 (new_susp_state[i] & 1032 SUSPECT_STATE_REPAIRED) || 1033 (new_susp_state[i] & 1034 SUSPECT_STATE_REMOVED) || 1035 (new_match_state[i] & 1036 SUSPECT_STATE_NO_MATCH)) 1037 fmd_asru_hash_apply_by_case(fmd.d_asrus, 1038 cp, fmd_case_acquit_suspect, 1039 cis->cis_nvl); 1040 } 1041 } 1042 1043 (void) pthread_mutex_unlock(&cip->ci_lock); 1044 return (0); 1045 } 1046 1047 void 1048 fmd_case_publish(fmd_case_t *cp, uint_t state) 1049 { 1050 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1051 fmd_event_t *e; 1052 nvlist_t *nvl; 1053 char *class; 1054 1055 if (state == FMD_CASE_CURRENT) 1056 state = cip->ci_state; /* use current state */ 1057 1058 switch (state) { 1059 case FMD_CASE_SOLVED: 1060 (void) pthread_mutex_lock(&cip->ci_lock); 1061 1062 /* 1063 * If we already have a code, then case is already solved. 1064 */ 1065 if (cip->ci_precanned == 0 && cip->ci_xprt == NULL && 1066 cip->ci_code != NULL) { 1067 (void) pthread_mutex_unlock(&cip->ci_lock); 1068 break; 1069 } 1070 1071 if (cip->ci_tv_valid == 0) { 1072 fmd_time_gettimeofday(&cip->ci_tv); 1073 cip->ci_tv_valid = 1; 1074 } 1075 (void) pthread_mutex_unlock(&cip->ci_lock); 1076 1077 if (fmd_case_convict(cp) == 1) { /* dupclose */ 1078 cip->ci_flags &= ~FMD_CF_SOLVED; 1079 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0); 1080 break; 1081 } 1082 if (cip->ci_xprt != NULL) { 1083 /* 1084 * For proxy, save some information about the transport 1085 * in the resource cache. 
1086 */ 1087 int count = 0; 1088 fmd_asru_set_on_proxy_t fasp; 1089 fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)cip->ci_xprt; 1090 1091 fasp.fasp_countp = &count; 1092 fasp.fasp_maxcount = cip->ci_nsuspects; 1093 fasp.fasp_proxy_asru = cip->ci_proxy_asru; 1094 fasp.fasp_proxy_external = xip->xi_flags & 1095 FMD_XPRT_EXTERNAL; 1096 fasp.fasp_proxy_rdonly = ((xip->xi_flags & 1097 FMD_XPRT_RDWR) == FMD_XPRT_RDONLY); 1098 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1099 fmd_asru_set_on_proxy, &fasp); 1100 } 1101 nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS); 1102 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1103 1104 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1105 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 1106 fmd_log_append(fmd.d_fltlog, e, cp); 1107 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 1108 fmd_dispq_dispatch(fmd.d_disp, e, class); 1109 1110 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1111 cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++; 1112 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1113 1114 break; 1115 1116 case FMD_CASE_CLOSE_WAIT: 1117 fmd_case_hold(cp); 1118 e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp); 1119 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 1120 1121 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1122 cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++; 1123 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1124 1125 break; 1126 1127 case FMD_CASE_CLOSED: 1128 nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS); 1129 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1130 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1131 fmd_dispq_dispatch(fmd.d_disp, e, class); 1132 break; 1133 1134 case FMD_CASE_REPAIRED: 1135 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 1136 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1137 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1138 (void) 
pthread_rwlock_rdlock(&fmd.d_log_lock); 1139 fmd_log_append(fmd.d_fltlog, e, cp); 1140 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 1141 fmd_dispq_dispatch(fmd.d_disp, e, class); 1142 break; 1143 1144 case FMD_CASE_RESOLVED: 1145 nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS); 1146 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1147 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1148 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 1149 fmd_log_append(fmd.d_fltlog, e, cp); 1150 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 1151 fmd_dispq_dispatch(fmd.d_disp, e, class); 1152 break; 1153 } 1154 } 1155 1156 fmd_case_t * 1157 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid) 1158 { 1159 fmd_case_impl_t *cip; 1160 uint_t h; 1161 1162 (void) pthread_rwlock_rdlock(&chp->ch_lock); 1163 h = fmd_strhash(uuid) % chp->ch_hashlen; 1164 1165 for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) { 1166 if (strcmp(cip->ci_uuid, uuid) == 0) 1167 break; 1168 } 1169 1170 /* 1171 * If deleting bit is set, treat the case as if it doesn't exist. 
1172 */ 1173 if (cip != NULL) 1174 cip = fmd_case_tryhold(cip); 1175 1176 if (cip == NULL) 1177 (void) fmd_set_errno(EFMD_CASE_INVAL); 1178 1179 (void) pthread_rwlock_unlock(&chp->ch_lock); 1180 return ((fmd_case_t *)cip); 1181 } 1182 1183 static fmd_case_impl_t * 1184 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 1185 { 1186 fmd_case_impl_t *eip; 1187 uint_t h; 1188 1189 (void) pthread_rwlock_wrlock(&chp->ch_lock); 1190 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 1191 1192 for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) { 1193 if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 && 1194 fmd_case_tryhold(eip) != NULL) { 1195 (void) pthread_rwlock_unlock(&chp->ch_lock); 1196 return (eip); /* uuid already present */ 1197 } 1198 } 1199 1200 cip->ci_next = chp->ch_hash[h]; 1201 chp->ch_hash[h] = cip; 1202 1203 chp->ch_count++; 1204 ASSERT(chp->ch_count != 0); 1205 1206 (void) pthread_rwlock_unlock(&chp->ch_lock); 1207 return (cip); 1208 } 1209 1210 static void 1211 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 1212 { 1213 fmd_case_impl_t *cp, **pp; 1214 uint_t h; 1215 1216 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1217 1218 cip->ci_flags |= FMD_CF_DELETING; 1219 (void) pthread_mutex_unlock(&cip->ci_lock); 1220 1221 (void) pthread_rwlock_wrlock(&chp->ch_lock); 1222 1223 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 1224 pp = &chp->ch_hash[h]; 1225 1226 for (cp = *pp; cp != NULL; cp = cp->ci_next) { 1227 if (cp != cip) 1228 pp = &cp->ci_next; 1229 else 1230 break; 1231 } 1232 1233 if (cp == NULL) { 1234 fmd_panic("case %p (%s) not found on hash chain %u\n", 1235 (void *)cip, cip->ci_uuid, h); 1236 } 1237 1238 *pp = cp->ci_next; 1239 cp->ci_next = NULL; 1240 1241 /* 1242 * delete from code hash if it is on it 1243 */ 1244 fmd_case_code_hash_delete(chp, cip); 1245 1246 ASSERT(chp->ch_count != 0); 1247 chp->ch_count--; 1248 1249 (void) pthread_rwlock_unlock(&chp->ch_lock); 1250 1251 (void) pthread_mutex_lock(&cip->ci_lock); 1252 
ASSERT(cip->ci_flags & FMD_CF_DELETING); 1253 } 1254 1255 fmd_case_t * 1256 fmd_case_create(fmd_module_t *mp, void *data) 1257 { 1258 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 1259 fmd_case_impl_t *eip = NULL; 1260 uuid_t uuid; 1261 1262 (void) pthread_mutex_init(&cip->ci_lock, NULL); 1263 fmd_buf_hash_create(&cip->ci_bufs); 1264 1265 fmd_module_hold(mp); 1266 cip->ci_mod = mp; 1267 cip->ci_refs = 1; 1268 cip->ci_state = FMD_CASE_UNSOLVED; 1269 cip->ci_flags = FMD_CF_DIRTY; 1270 cip->ci_data = data; 1271 1272 /* 1273 * Calling libuuid: get a clue. The library interfaces cleverly do not 1274 * define any constant for the length of an unparse string, and do not 1275 * permit the caller to specify a buffer length for safety. The spec 1276 * says it will be 36 bytes, but we make it tunable just in case. 1277 */ 1278 (void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen); 1279 cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP); 1280 1281 /* 1282 * We expect this loop to execute only once, but code it defensively 1283 * against the possibility of libuuid bugs. Keep generating uuids and 1284 * attempting to do a hash insert until we get a unique one. 
1285 */ 1286 do { 1287 if (eip != NULL) 1288 fmd_case_rele((fmd_case_t *)eip); 1289 uuid_generate(uuid); 1290 uuid_unparse(uuid, cip->ci_uuid); 1291 } while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip); 1292 1293 ASSERT(fmd_module_locked(mp)); 1294 fmd_list_append(&mp->mod_cases, cip); 1295 fmd_module_setcdirty(mp); 1296 1297 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1298 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 1299 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1300 1301 return ((fmd_case_t *)cip); 1302 } 1303 1304 static void 1305 fmd_case_destroy_suspects(fmd_case_impl_t *cip) 1306 { 1307 fmd_case_susp_t *cis, *ncis; 1308 1309 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1310 1311 if (cip->ci_proxy_asru) 1312 fmd_free(cip->ci_proxy_asru, sizeof (uint8_t) * 1313 cip->ci_nsuspects); 1314 if (cip->ci_diag_de) 1315 nvlist_free(cip->ci_diag_de); 1316 if (cip->ci_diag_asru) 1317 fmd_free(cip->ci_diag_asru, sizeof (uint8_t) * 1318 cip->ci_nsuspects); 1319 1320 for (cis = cip->ci_suspects; cis != NULL; cis = ncis) { 1321 ncis = cis->cis_next; 1322 nvlist_free(cis->cis_nvl); 1323 fmd_free(cis, sizeof (fmd_case_susp_t)); 1324 } 1325 1326 cip->ci_suspects = NULL; 1327 cip->ci_nsuspects = 0; 1328 } 1329 1330 fmd_case_t * 1331 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp, 1332 uint_t state, const char *uuid, const char *code) 1333 { 1334 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 1335 fmd_case_impl_t *eip; 1336 1337 (void) pthread_mutex_init(&cip->ci_lock, NULL); 1338 fmd_buf_hash_create(&cip->ci_bufs); 1339 1340 fmd_module_hold(mp); 1341 cip->ci_mod = mp; 1342 cip->ci_xprt = xp; 1343 cip->ci_refs = 1; 1344 cip->ci_state = state; 1345 cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP); 1346 cip->ci_uuidlen = strlen(cip->ci_uuid); 1347 cip->ci_code = fmd_strdup(code, FMD_SLEEP); 1348 cip->ci_codelen = cip->ci_code ? 
strlen(cip->ci_code) + 1 : 0; 1349 1350 if (state > FMD_CASE_CLOSE_WAIT) 1351 cip->ci_flags |= FMD_CF_SOLVED; 1352 1353 /* 1354 * Insert the case into the global case hash. If the specified UUID is 1355 * already present, check to see if it is an orphan: if so, reclaim it; 1356 * otherwise if it is owned by a different module then return NULL. 1357 */ 1358 if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) { 1359 (void) pthread_mutex_lock(&cip->ci_lock); 1360 cip->ci_refs--; /* decrement to zero */ 1361 fmd_case_destroy((fmd_case_t *)cip, B_FALSE); 1362 1363 cip = eip; /* switch 'cip' to the existing case */ 1364 (void) pthread_mutex_lock(&cip->ci_lock); 1365 1366 /* 1367 * If the ASRU cache is trying to recreate an orphan, then just 1368 * return the existing case that we found without changing it. 1369 */ 1370 if (mp == fmd.d_rmod) { 1371 /* 1372 * In case the case has already been created from 1373 * a checkpoint file we need to set up code now. 1374 */ 1375 if (cip->ci_state < FMD_CASE_CLOSED) { 1376 if (code != NULL && cip->ci_code == NULL) { 1377 cip->ci_code = fmd_strdup(code, 1378 FMD_SLEEP); 1379 cip->ci_codelen = cip->ci_code ? 1380 strlen(cip->ci_code) + 1 : 0; 1381 fmd_case_code_hash_insert(fmd.d_cases, 1382 cip); 1383 } 1384 } 1385 1386 /* 1387 * When recreating an orphan case, state passed in may 1388 * be CLOSED (faulty) or REPAIRED/RESOLVED (!faulty). If 1389 * any suspects are still CLOSED (faulty) then the 1390 * overall state needs to be CLOSED. 1391 */ 1392 if ((cip->ci_state == FMD_CASE_REPAIRED || 1393 cip->ci_state == FMD_CASE_RESOLVED) && 1394 state == FMD_CASE_CLOSED) 1395 cip->ci_state = FMD_CASE_CLOSED; 1396 (void) pthread_mutex_unlock(&cip->ci_lock); 1397 fmd_case_rele((fmd_case_t *)cip); 1398 return ((fmd_case_t *)cip); 1399 } 1400 1401 /* 1402 * If the existing case isn't an orphan or is being proxied, 1403 * then we have a UUID conflict: return failure to the caller. 
1404 */ 1405 if (cip->ci_mod != fmd.d_rmod || xp != NULL) { 1406 (void) pthread_mutex_unlock(&cip->ci_lock); 1407 fmd_case_rele((fmd_case_t *)cip); 1408 return (NULL); 1409 } 1410 1411 /* 1412 * If the new module is reclaiming an orphaned case, remove 1413 * the case from the root module, switch ci_mod, and then fall 1414 * through to adding the case to the new owner module 'mp'. 1415 */ 1416 fmd_module_lock(cip->ci_mod); 1417 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1418 fmd_module_unlock(cip->ci_mod); 1419 1420 fmd_module_rele(cip->ci_mod); 1421 cip->ci_mod = mp; 1422 fmd_module_hold(mp); 1423 1424 /* 1425 * It's possible that fmd crashed or was restarted during a 1426 * previous solve operation between the asru cache being created 1427 * and the ckpt file being updated to SOLVED. Thus when the DE 1428 * recreates the case here from the checkpoint file, the state 1429 * will be UNSOLVED and yet we are having to reclaim because 1430 * the case was in the asru cache. If this happens, revert the 1431 * case back to the UNSOLVED state and let the DE solve it again 1432 */ 1433 if (state == FMD_CASE_UNSOLVED) { 1434 fmd_asru_hash_delete_case(fmd.d_asrus, 1435 (fmd_case_t *)cip); 1436 fmd_case_destroy_suspects(cip); 1437 fmd_case_code_hash_delete(fmd.d_cases, cip); 1438 fmd_free(cip->ci_code, cip->ci_codelen); 1439 cip->ci_code = NULL; 1440 cip->ci_codelen = 0; 1441 cip->ci_tv_valid = 0; 1442 } 1443 1444 cip->ci_state = state; 1445 1446 (void) pthread_mutex_unlock(&cip->ci_lock); 1447 fmd_case_rele((fmd_case_t *)cip); 1448 } else { 1449 /* 1450 * add into hash of solved cases 1451 */ 1452 if (cip->ci_code) 1453 fmd_case_code_hash_insert(fmd.d_cases, cip); 1454 } 1455 1456 ASSERT(fmd_module_locked(mp)); 1457 fmd_list_append(&mp->mod_cases, cip); 1458 1459 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1460 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 1461 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1462 1463 return ((fmd_case_t 
*)cip); 1464 } 1465 1466 void 1467 fmd_case_destroy(fmd_case_t *cp, int visible) 1468 { 1469 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1470 fmd_case_item_t *cit, *ncit; 1471 1472 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1473 ASSERT(cip->ci_refs == 0); 1474 1475 if (visible) { 1476 TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid)); 1477 fmd_case_hash_delete(fmd.d_cases, cip); 1478 } 1479 1480 for (cit = cip->ci_items; cit != NULL; cit = ncit) { 1481 ncit = cit->cit_next; 1482 fmd_event_rele(cit->cit_event); 1483 fmd_free(cit, sizeof (fmd_case_item_t)); 1484 } 1485 1486 fmd_case_destroy_suspects(cip); 1487 1488 if (cip->ci_principal != NULL) 1489 fmd_event_rele(cip->ci_principal); 1490 1491 fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1); 1492 fmd_free(cip->ci_code, cip->ci_codelen); 1493 (void) fmd_buf_hash_destroy(&cip->ci_bufs); 1494 1495 fmd_module_rele(cip->ci_mod); 1496 fmd_free(cip, sizeof (fmd_case_impl_t)); 1497 } 1498 1499 void 1500 fmd_case_hold(fmd_case_t *cp) 1501 { 1502 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1503 1504 (void) pthread_mutex_lock(&cip->ci_lock); 1505 fmd_case_hold_locked(cp); 1506 (void) pthread_mutex_unlock(&cip->ci_lock); 1507 } 1508 1509 void 1510 fmd_case_hold_locked(fmd_case_t *cp) 1511 { 1512 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1513 1514 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1515 if (cip->ci_flags & FMD_CF_DELETING) 1516 fmd_panic("attempt to hold a deleting case %p (%s)\n", 1517 (void *)cip, cip->ci_uuid); 1518 cip->ci_refs++; 1519 ASSERT(cip->ci_refs != 0); 1520 } 1521 1522 static fmd_case_impl_t * 1523 fmd_case_tryhold(fmd_case_impl_t *cip) 1524 { 1525 /* 1526 * If the case's "deleting" bit is unset, hold and return case, 1527 * otherwise, return NULL. 
1528 */ 1529 (void) pthread_mutex_lock(&cip->ci_lock); 1530 if (cip->ci_flags & FMD_CF_DELETING) { 1531 (void) pthread_mutex_unlock(&cip->ci_lock); 1532 cip = NULL; 1533 } else { 1534 fmd_case_hold_locked((fmd_case_t *)cip); 1535 (void) pthread_mutex_unlock(&cip->ci_lock); 1536 } 1537 return (cip); 1538 } 1539 1540 void 1541 fmd_case_rele(fmd_case_t *cp) 1542 { 1543 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1544 1545 (void) pthread_mutex_lock(&cip->ci_lock); 1546 ASSERT(cip->ci_refs != 0); 1547 1548 if (--cip->ci_refs == 0) 1549 fmd_case_destroy((fmd_case_t *)cip, B_TRUE); 1550 else 1551 (void) pthread_mutex_unlock(&cip->ci_lock); 1552 } 1553 1554 void 1555 fmd_case_rele_locked(fmd_case_t *cp) 1556 { 1557 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1558 1559 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1560 --cip->ci_refs; 1561 ASSERT(cip->ci_refs != 0); 1562 } 1563 1564 int 1565 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep) 1566 { 1567 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1568 fmd_case_item_t *cit; 1569 fmd_event_t *oep; 1570 uint_t state; 1571 int new; 1572 1573 fmd_event_hold(ep); 1574 (void) pthread_mutex_lock(&cip->ci_lock); 1575 1576 if (cip->ci_flags & FMD_CF_SOLVED) 1577 state = FMD_EVS_DIAGNOSED; 1578 else 1579 state = FMD_EVS_ACCEPTED; 1580 1581 oep = cip->ci_principal; 1582 cip->ci_principal = ep; 1583 1584 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1585 if (cit->cit_event == ep) 1586 break; 1587 } 1588 1589 cip->ci_flags |= FMD_CF_DIRTY; 1590 new = cit == NULL && ep != oep; 1591 1592 (void) pthread_mutex_unlock(&cip->ci_lock); 1593 1594 fmd_module_setcdirty(cip->ci_mod); 1595 fmd_event_transition(ep, state); 1596 1597 if (oep != NULL) 1598 fmd_event_rele(oep); 1599 1600 return (new); 1601 } 1602 1603 int 1604 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep) 1605 { 1606 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1607 fmd_case_item_t *cit; 1608 uint_t state; 1609 int new; 1610 boolean_t injected; 1611 1612 
(void) pthread_mutex_lock(&cip->ci_lock); 1613 1614 if (cip->ci_flags & FMD_CF_SOLVED) 1615 state = FMD_EVS_DIAGNOSED; 1616 else 1617 state = FMD_EVS_ACCEPTED; 1618 1619 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1620 if (cit->cit_event == ep) 1621 break; 1622 } 1623 1624 new = cit == NULL && ep != cip->ci_principal; 1625 1626 /* 1627 * If the event is already in the case or the case is already solved, 1628 * there is no reason to save it: just transition it appropriately. 1629 */ 1630 if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) { 1631 (void) pthread_mutex_unlock(&cip->ci_lock); 1632 fmd_event_transition(ep, state); 1633 return (new); 1634 } 1635 1636 cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP); 1637 fmd_event_hold(ep); 1638 1639 if (nvlist_lookup_boolean_value(((fmd_event_impl_t *)ep)->ev_nvl, 1640 "__injected", &injected) == 0 && injected) 1641 fmd_case_set_injected(cp); 1642 1643 cit->cit_next = cip->ci_items; 1644 cit->cit_event = ep; 1645 1646 cip->ci_items = cit; 1647 cip->ci_nitems++; 1648 1649 cip->ci_flags |= FMD_CF_DIRTY; 1650 (void) pthread_mutex_unlock(&cip->ci_lock); 1651 1652 fmd_module_setcdirty(cip->ci_mod); 1653 fmd_event_transition(ep, state); 1654 1655 return (new); 1656 } 1657 1658 void 1659 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl) 1660 { 1661 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1662 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1663 1664 (void) pthread_mutex_lock(&cip->ci_lock); 1665 ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT); 1666 cip->ci_flags |= FMD_CF_DIRTY; 1667 1668 cis->cis_next = cip->ci_suspects; 1669 cis->cis_nvl = nvl; 1670 1671 cip->ci_suspects = cis; 1672 cip->ci_nsuspects++; 1673 1674 (void) pthread_mutex_unlock(&cip->ci_lock); 1675 if (cip->ci_xprt == NULL) 1676 fmd_module_setcdirty(cip->ci_mod); 1677 } 1678 1679 void 1680 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl) 1681 { 1682 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1683 
fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1684 boolean_t b; 1685 1686 (void) pthread_mutex_lock(&cip->ci_lock); 1687 1688 cis->cis_next = cip->ci_suspects; 1689 cis->cis_nvl = nvl; 1690 1691 if (nvlist_lookup_boolean_value(nvl, 1692 FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE) 1693 cip->ci_flags |= FMD_CF_INVISIBLE; 1694 1695 cip->ci_suspects = cis; 1696 cip->ci_nsuspects++; 1697 1698 (void) pthread_mutex_unlock(&cip->ci_lock); 1699 } 1700 1701 void 1702 fmd_case_reset_suspects(fmd_case_t *cp) 1703 { 1704 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1705 1706 (void) pthread_mutex_lock(&cip->ci_lock); 1707 ASSERT(cip->ci_state < FMD_CASE_SOLVED); 1708 1709 fmd_case_destroy_suspects(cip); 1710 cip->ci_flags |= FMD_CF_DIRTY; 1711 1712 (void) pthread_mutex_unlock(&cip->ci_lock); 1713 fmd_module_setcdirty(cip->ci_mod); 1714 } 1715 1716 /*ARGSUSED*/ 1717 static void 1718 fmd_case_unusable(fmd_asru_link_t *alp, void *arg) 1719 { 1720 (void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE); 1721 } 1722 1723 /* 1724 * Grab ci_lock and update the case state and set the dirty bit. Then perform 1725 * whatever actions and emit whatever events are appropriate for the state. 1726 * Refer to the topmost block comment explaining the state machine for details. 
1727 */ 1728 void 1729 fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) 1730 { 1731 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1732 fmd_case_item_t *cit; 1733 fmd_event_t *e; 1734 int resolved = 0; 1735 int any_unusable_and_present = 0; 1736 1737 ASSERT(state <= FMD_CASE_RESOLVED); 1738 (void) pthread_mutex_lock(&cip->ci_lock); 1739 1740 if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED)) 1741 flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED | FMD_CF_RESOLVED); 1742 1743 cip->ci_flags |= flags; 1744 1745 if (cip->ci_state >= state) { 1746 (void) pthread_mutex_unlock(&cip->ci_lock); 1747 return; /* already in specified state */ 1748 } 1749 1750 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid, 1751 _fmd_case_snames[cip->ci_state], _fmd_case_snames[state])); 1752 1753 cip->ci_state = state; 1754 cip->ci_flags |= FMD_CF_DIRTY; 1755 1756 if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod) 1757 fmd_module_setcdirty(cip->ci_mod); 1758 1759 switch (state) { 1760 case FMD_CASE_SOLVED: 1761 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 1762 fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED); 1763 1764 if (cip->ci_principal != NULL) { 1765 fmd_event_transition(cip->ci_principal, 1766 FMD_EVS_DIAGNOSED); 1767 } 1768 break; 1769 1770 case FMD_CASE_CLOSE_WAIT: 1771 /* 1772 * If the case was never solved, do not change ASRUs. 1773 * If the case was never fmd_case_closed, do not change ASRUs. 1774 * If the case was repaired, do not change ASRUs. 1775 */ 1776 if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED | 1777 FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED)) 1778 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1779 fmd_case_unusable, NULL); 1780 1781 /* 1782 * If an orphaned case transitions to CLOSE_WAIT, the owning 1783 * module is no longer loaded: continue on to CASE_CLOSED. 
1784 */ 1785 if (fmd_case_orphaned(cp)) 1786 state = cip->ci_state = FMD_CASE_CLOSED; 1787 break; 1788 1789 case FMD_CASE_REPAIRED: 1790 ASSERT(cip->ci_xprt != NULL || fmd_case_orphaned(cp)); 1791 1792 /* 1793 * If we've been requested to transition straight on to the 1794 * RESOLVED state (which can happen with fault proxying where a 1795 * list.resolved or a uuresolved is received from the other 1796 * side), or if all suspects are already either usable or not 1797 * present then transition straight to RESOLVED state, 1798 * publishing both the list.repaired and list.resolved. For a 1799 * proxy, if we discover here that all suspects are already 1800 * either usable or not present, notify the diag side instead 1801 * using fmd_xprt_uuresolved(). 1802 */ 1803 if (flags & FMD_CF_RESOLVED) { 1804 if (cip->ci_xprt != NULL) 1805 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1806 } else { 1807 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1808 fmd_case_unusable_and_present, 1809 &any_unusable_and_present); 1810 if (any_unusable_and_present) 1811 break; 1812 if (cip->ci_xprt != NULL) { 1813 fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid); 1814 break; 1815 } 1816 } 1817 1818 cip->ci_state = FMD_CASE_RESOLVED; 1819 (void) pthread_mutex_unlock(&cip->ci_lock); 1820 fmd_case_publish(cp, state); 1821 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid, 1822 _fmd_case_snames[FMD_CASE_REPAIRED], 1823 _fmd_case_snames[FMD_CASE_RESOLVED])); 1824 state = FMD_CASE_RESOLVED; 1825 resolved = 1; 1826 (void) pthread_mutex_lock(&cip->ci_lock); 1827 break; 1828 1829 case FMD_CASE_RESOLVED: 1830 /* 1831 * For a proxy, no need to check that all suspects are already 1832 * either usable or not present - this request has come from 1833 * the diagnosing side which makes the final decision on this. 
1834 */ 1835 if (cip->ci_xprt != NULL) { 1836 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1837 resolved = 1; 1838 break; 1839 } 1840 1841 ASSERT(fmd_case_orphaned(cp)); 1842 1843 /* 1844 * If all suspects are already either usable or not present then 1845 * carry on, publish list.resolved and discard the case. 1846 */ 1847 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1848 fmd_case_unusable_and_present, &any_unusable_and_present); 1849 if (any_unusable_and_present) { 1850 (void) pthread_mutex_unlock(&cip->ci_lock); 1851 return; 1852 } 1853 1854 resolved = 1; 1855 break; 1856 } 1857 1858 (void) pthread_mutex_unlock(&cip->ci_lock); 1859 1860 /* 1861 * If the module has initialized, then publish the appropriate event 1862 * for the new case state. If not, we are being called from the 1863 * checkpoint code during module load, in which case the module's 1864 * _fmd_init() routine hasn't finished yet, and our event dictionaries 1865 * may not be open yet, which will prevent us from computing the event 1866 * code. Defer the call to fmd_case_publish() by enqueuing a PUBLISH 1867 * event in our queue: this won't be processed until _fmd_init is done. 1868 */ 1869 if (cip->ci_mod->mod_flags & FMD_MOD_INIT) 1870 fmd_case_publish(cp, state); 1871 else { 1872 fmd_case_hold(cp); 1873 e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp); 1874 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 1875 } 1876 1877 if (resolved) { 1878 if (cip->ci_xprt != NULL) { 1879 /* 1880 * If we transitioned to RESOLVED, adjust the reference 1881 * count to reflect our removal from 1882 * fmd.d_rmod->mod_cases above. If the caller has not 1883 * placed an additional hold on the case, it will now 1884 * be freed. 
1885 */ 1886 (void) pthread_mutex_lock(&cip->ci_lock); 1887 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 1888 (void) pthread_mutex_unlock(&cip->ci_lock); 1889 fmd_case_rele(cp); 1890 } else { 1891 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1892 fmd_asru_log_resolved, NULL); 1893 (void) pthread_mutex_lock(&cip->ci_lock); 1894 /* mark as "ready to be discarded */ 1895 cip->ci_flags |= FMD_CF_RES_CMPL; 1896 (void) pthread_mutex_unlock(&cip->ci_lock); 1897 } 1898 } 1899 } 1900 1901 /* 1902 * Discard any case if it is in RESOLVED state (and if check_if_aged argument 1903 * is set if all suspects have passed the rsrc.aged time). 1904 */ 1905 void 1906 fmd_case_discard_resolved(fmd_case_t *cp, void *arg) 1907 { 1908 int check_if_aged = *(int *)arg; 1909 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1910 1911 /* 1912 * First check if case has completed transition to resolved. 1913 */ 1914 (void) pthread_mutex_lock(&cip->ci_lock); 1915 if (!(cip->ci_flags & FMD_CF_RES_CMPL)) { 1916 (void) pthread_mutex_unlock(&cip->ci_lock); 1917 return; 1918 } 1919 1920 /* 1921 * Now if check_is_aged is set, see if all suspects have aged. 1922 */ 1923 if (check_if_aged) { 1924 int aged = 1; 1925 1926 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1927 fmd_asru_check_if_aged, &aged); 1928 if (!aged) { 1929 (void) pthread_mutex_unlock(&cip->ci_lock); 1930 return; 1931 } 1932 } 1933 1934 /* 1935 * Finally discard the case, clearing FMD_CF_RES_CMPL so we don't 1936 * do it twice. 1937 */ 1938 fmd_module_lock(cip->ci_mod); 1939 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1940 fmd_module_unlock(cip->ci_mod); 1941 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 1942 cip->ci_flags &= ~FMD_CF_RES_CMPL; 1943 (void) pthread_mutex_unlock(&cip->ci_lock); 1944 fmd_case_rele(cp); 1945 } 1946 1947 /* 1948 * Transition the specified case to *at least* the specified state by first 1949 * re-validating the suspect list using the resource cache. 
This function is 1950 * employed by the checkpoint code when restoring a saved, solved case to see 1951 * if the state of the case has effectively changed while fmd was not running 1952 * or the module was not loaded. 1953 */ 1954 void 1955 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags) 1956 { 1957 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1958 1959 int usable = 0; /* are any suspects usable? */ 1960 1961 ASSERT(state >= FMD_CASE_SOLVED); 1962 (void) pthread_mutex_lock(&cip->ci_lock); 1963 1964 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable); 1965 1966 (void) pthread_mutex_unlock(&cip->ci_lock); 1967 1968 if (!usable) { 1969 state = MAX(state, FMD_CASE_CLOSE_WAIT); 1970 flags |= FMD_CF_ISOLATED; 1971 } 1972 1973 fmd_case_transition(cp, state, flags); 1974 } 1975 1976 void 1977 fmd_case_setdirty(fmd_case_t *cp) 1978 { 1979 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1980 1981 (void) pthread_mutex_lock(&cip->ci_lock); 1982 cip->ci_flags |= FMD_CF_DIRTY; 1983 (void) pthread_mutex_unlock(&cip->ci_lock); 1984 1985 fmd_module_setcdirty(cip->ci_mod); 1986 } 1987 1988 void 1989 fmd_case_clrdirty(fmd_case_t *cp) 1990 { 1991 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1992 1993 (void) pthread_mutex_lock(&cip->ci_lock); 1994 cip->ci_flags &= ~FMD_CF_DIRTY; 1995 (void) pthread_mutex_unlock(&cip->ci_lock); 1996 } 1997 1998 void 1999 fmd_case_commit(fmd_case_t *cp) 2000 { 2001 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2002 fmd_case_item_t *cit; 2003 2004 (void) pthread_mutex_lock(&cip->ci_lock); 2005 2006 if (cip->ci_flags & FMD_CF_DIRTY) { 2007 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 2008 fmd_event_commit(cit->cit_event); 2009 2010 if (cip->ci_principal != NULL) 2011 fmd_event_commit(cip->ci_principal); 2012 2013 fmd_buf_hash_commit(&cip->ci_bufs); 2014 cip->ci_flags &= ~FMD_CF_DIRTY; 2015 } 2016 2017 (void) pthread_mutex_unlock(&cip->ci_lock); 2018 } 2019 2020 /* 2021 * On proxy side, send back 
repair/acquit/etc request to diagnosing side 2022 */ 2023 void 2024 fmd_case_xprt_updated(fmd_case_t *cp) 2025 { 2026 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2027 nvlist_t **nva; 2028 uint8_t *ba; 2029 int msg = B_TRUE; 2030 int count = 0; 2031 fmd_case_lst_t fcl; 2032 2033 ASSERT(cip->ci_xprt != NULL); 2034 (void) pthread_mutex_lock(&cip->ci_lock); 2035 ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects); 2036 nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); 2037 fcl.fcl_countp = &count; 2038 fcl.fcl_maxcount = cip->ci_nsuspects; 2039 fcl.fcl_msgp = &msg; 2040 fcl.fcl_ba = ba; 2041 fcl.fcl_nva = nva; 2042 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl); 2043 (void) pthread_mutex_unlock(&cip->ci_lock); 2044 fmd_xprt_updated(cip->ci_xprt, cip->ci_uuid, ba, cip->ci_proxy_asru, 2045 count); 2046 } 2047 2048 /* 2049 * fmd_case_update_status() can be called on either the proxy side when a 2050 * list.suspect is received, or on the diagnosing side when an update request 2051 * is received from the proxy. It updates the status in the resource cache. 
2052 */ 2053 void 2054 fmd_case_update_status(fmd_case_t *cp, uint8_t *statusp, uint8_t *proxy_asrup, 2055 uint8_t *diag_asrup) 2056 { 2057 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2058 int count = 0; 2059 fmd_asru_update_status_t faus; 2060 2061 /* 2062 * update status of resource cache entries 2063 */ 2064 faus.faus_countp = &count; 2065 faus.faus_maxcount = cip->ci_nsuspects; 2066 faus.faus_ba = statusp; 2067 faus.faus_proxy_asru = proxy_asrup; 2068 faus.faus_diag_asru = diag_asrup; 2069 faus.faus_is_proxy = (cip->ci_xprt != NULL); 2070 (void) pthread_mutex_lock(&cip->ci_lock); 2071 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_update_status, 2072 &faus); 2073 (void) pthread_mutex_unlock(&cip->ci_lock); 2074 } 2075 2076 /* 2077 * Called on either the proxy side or the diag side when a repair has taken 2078 * place on the other side but this side may know the asru "contains" 2079 * relationships. 2080 */ 2081 void 2082 fmd_case_update_containees(fmd_case_t *cp) 2083 { 2084 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2085 2086 (void) pthread_mutex_lock(&cip->ci_lock); 2087 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 2088 fmd_asru_update_containees, NULL); 2089 (void) pthread_mutex_unlock(&cip->ci_lock); 2090 } 2091 2092 /* 2093 * fmd_case_close_status() is called on diagnosing side when proxy side 2094 * has had a uuclose. It updates the status in the resource cache. 
2095 */ 2096 void 2097 fmd_case_close_status(fmd_case_t *cp) 2098 { 2099 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2100 int count = 0; 2101 fmd_asru_close_status_t facs; 2102 2103 /* 2104 * update status of resource cache entries 2105 */ 2106 facs.facs_countp = &count; 2107 facs.facs_maxcount = cip->ci_nsuspects; 2108 (void) pthread_mutex_lock(&cip->ci_lock); 2109 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_close_status, 2110 &facs); 2111 (void) pthread_mutex_unlock(&cip->ci_lock); 2112 } 2113 2114 /* 2115 * Indicate that the case may need to change state because one or more of the 2116 * ASRUs named as a suspect has changed state. We examine all the suspects 2117 * and if none are still faulty, we initiate a case close transition. 2118 */ 2119 void 2120 fmd_case_update(fmd_case_t *cp) 2121 { 2122 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2123 uint_t cstate; 2124 int faulty = 0; 2125 2126 (void) pthread_mutex_lock(&cip->ci_lock); 2127 cstate = cip->ci_state; 2128 2129 if (cip->ci_state < FMD_CASE_SOLVED) { 2130 (void) pthread_mutex_unlock(&cip->ci_lock); 2131 return; /* update is not appropriate */ 2132 } 2133 2134 if (cip->ci_flags & FMD_CF_REPAIRED) { 2135 (void) pthread_mutex_unlock(&cip->ci_lock); 2136 return; /* already repaired */ 2137 } 2138 2139 TRACE((FMD_DBG_CASE, "case update %s", cip->ci_uuid)); 2140 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 2141 (void) pthread_mutex_unlock(&cip->ci_lock); 2142 2143 if (faulty) { 2144 nvlist_t *nvl; 2145 fmd_event_t *e; 2146 char *class; 2147 2148 TRACE((FMD_DBG_CASE, "sending list.updated %s", cip->ci_uuid)); 2149 nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS); 2150 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2151 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 2152 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 2153 fmd_log_append(fmd.d_fltlog, e, cp); 2154 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 2155 fmd_dispq_dispatch(fmd.d_disp, e, 
class); 2156 return; /* one or more suspects are still marked faulty */ 2157 } 2158 2159 if (cstate == FMD_CASE_CLOSED) 2160 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 2161 else 2162 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 2163 } 2164 2165 /* 2166 * Delete a closed case from the module's case list once the fmdo_close() entry 2167 * point has run to completion. If the case is owned by a transport module, 2168 * tell the transport to proxy a case close on the other end of the transport. 2169 * Transition to the appropriate next state based on ci_flags. This 2170 * function represents the end of CLOSE_WAIT and transitions the case to either 2171 * CLOSED or REPAIRED or discards it entirely because it was never solved; 2172 * refer to the topmost block comment explaining the state machine for details. 2173 */ 2174 void 2175 fmd_case_delete(fmd_case_t *cp) 2176 { 2177 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2178 fmd_modstat_t *msp; 2179 size_t buftotal; 2180 2181 TRACE((FMD_DBG_CASE, "case delete %s", cip->ci_uuid)); 2182 ASSERT(fmd_module_locked(cip->ci_mod)); 2183 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 2184 buftotal = fmd_buf_hash_destroy(&cip->ci_bufs); 2185 2186 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 2187 msp = cip->ci_mod->mod_stats; 2188 2189 ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0); 2190 msp->ms_caseopen.fmds_value.ui64--; 2191 2192 ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal); 2193 msp->ms_buftotal.fmds_value.ui64 -= buftotal; 2194 2195 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 2196 2197 if (cip->ci_xprt == NULL) 2198 fmd_module_setcdirty(cip->ci_mod); 2199 2200 fmd_module_rele(cip->ci_mod); 2201 cip->ci_mod = fmd.d_rmod; 2202 fmd_module_hold(cip->ci_mod); 2203 2204 /* 2205 * If the case has been solved, then retain it 2206 * on the root module's case list at least until we're transitioned. 2207 * Otherwise free the case with our final fmd_case_rele() below. 
2208 */ 2209 if (cip->ci_flags & FMD_CF_SOLVED) { 2210 fmd_module_lock(cip->ci_mod); 2211 fmd_list_append(&cip->ci_mod->mod_cases, cip); 2212 fmd_module_unlock(cip->ci_mod); 2213 fmd_case_hold(cp); 2214 } 2215 2216 /* 2217 * Transition onwards to REPAIRED or CLOSED as originally requested. 2218 * Note that for proxy case if we're transitioning to CLOSED it means 2219 * the case was isolated locally, so call fmd_xprt_uuclose() to notify 2220 * the diagnosing side. No need to notify the diagnosing side if we are 2221 * transitioning to REPAIRED as we only do this when requested to do 2222 * so by the diagnosing side anyway. 2223 */ 2224 if (cip->ci_flags & FMD_CF_REPAIRED) 2225 fmd_case_transition(cp, FMD_CASE_REPAIRED, 0); 2226 else if (cip->ci_flags & FMD_CF_ISOLATED) { 2227 fmd_case_transition(cp, FMD_CASE_CLOSED, 0); 2228 if (cip->ci_xprt != NULL) 2229 fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid); 2230 } 2231 2232 fmd_case_rele(cp); 2233 } 2234 2235 void 2236 fmd_case_discard(fmd_case_t *cp, boolean_t delete_from_asru_cache) 2237 { 2238 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2239 2240 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 2241 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--; 2242 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 2243 2244 ASSERT(fmd_module_locked(cip->ci_mod)); 2245 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 2246 if (delete_from_asru_cache) { 2247 (void) pthread_mutex_lock(&cip->ci_lock); 2248 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 2249 (void) pthread_mutex_unlock(&cip->ci_lock); 2250 } 2251 fmd_case_rele(cp); 2252 } 2253 2254 /* 2255 * Indicate that the problem corresponding to a case has been repaired by 2256 * clearing the faulty bit on each ASRU named as a suspect. If the case hasn't 2257 * already been closed, this function initiates the transition to CLOSE_WAIT. 
2258 * The caller must have the case held from fmd_case_hash_lookup(), so we can 2259 * grab and drop ci_lock without the case being able to be freed in between. 2260 */ 2261 int 2262 fmd_case_repair(fmd_case_t *cp) 2263 { 2264 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2265 uint_t cstate; 2266 fmd_asru_rep_arg_t fara; 2267 2268 (void) pthread_mutex_lock(&cip->ci_lock); 2269 cstate = cip->ci_state; 2270 2271 if (cstate < FMD_CASE_SOLVED) { 2272 (void) pthread_mutex_unlock(&cip->ci_lock); 2273 return (fmd_set_errno(EFMD_CASE_STATE)); 2274 } 2275 2276 if (cip->ci_flags & FMD_CF_REPAIRED) { 2277 (void) pthread_mutex_unlock(&cip->ci_lock); 2278 return (0); /* already repaired */ 2279 } 2280 2281 TRACE((FMD_DBG_CASE, "case repair %s", cip->ci_uuid)); 2282 fara.fara_reason = FMD_ASRU_REPAIRED; 2283 fara.fara_bywhat = FARA_BY_CASE; 2284 fara.fara_rval = NULL; 2285 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara); 2286 (void) pthread_mutex_unlock(&cip->ci_lock); 2287 2288 /* 2289 * if this is a proxied case, send the repair across the transport. 2290 * The remote side will then do the repair and send a list.repaired back 2291 * again such that we can finally repair the case on this side. 
2292 */ 2293 if (cip->ci_xprt != NULL) { 2294 fmd_case_xprt_updated(cp); 2295 return (0); 2296 } 2297 2298 if (cstate == FMD_CASE_CLOSED) 2299 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 2300 else 2301 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 2302 2303 return (0); 2304 } 2305 2306 int 2307 fmd_case_acquit(fmd_case_t *cp) 2308 { 2309 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2310 uint_t cstate; 2311 fmd_asru_rep_arg_t fara; 2312 2313 (void) pthread_mutex_lock(&cip->ci_lock); 2314 cstate = cip->ci_state; 2315 2316 if (cstate < FMD_CASE_SOLVED) { 2317 (void) pthread_mutex_unlock(&cip->ci_lock); 2318 return (fmd_set_errno(EFMD_CASE_STATE)); 2319 } 2320 2321 if (cip->ci_flags & FMD_CF_REPAIRED) { 2322 (void) pthread_mutex_unlock(&cip->ci_lock); 2323 return (0); /* already repaired */ 2324 } 2325 2326 TRACE((FMD_DBG_CASE, "case acquit %s", cip->ci_uuid)); 2327 fara.fara_reason = FMD_ASRU_ACQUITTED; 2328 fara.fara_bywhat = FARA_BY_CASE; 2329 fara.fara_rval = NULL; 2330 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara); 2331 (void) pthread_mutex_unlock(&cip->ci_lock); 2332 2333 /* 2334 * if this is a proxied case, send the repair across the transport. 2335 * The remote side will then do the repair and send a list.repaired back 2336 * again such that we can finally repair the case on this side. 
2337 */ 2338 if (cip->ci_xprt != NULL) { 2339 fmd_case_xprt_updated(cp); 2340 return (0); 2341 } 2342 2343 if (cstate == FMD_CASE_CLOSED) 2344 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 2345 else 2346 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 2347 2348 return (0); 2349 } 2350 2351 int 2352 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep) 2353 { 2354 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2355 fmd_case_item_t *cit; 2356 uint_t state; 2357 int rv = 0; 2358 2359 (void) pthread_mutex_lock(&cip->ci_lock); 2360 2361 if (cip->ci_state >= FMD_CASE_SOLVED) 2362 state = FMD_EVS_DIAGNOSED; 2363 else 2364 state = FMD_EVS_ACCEPTED; 2365 2366 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 2367 if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0) 2368 break; 2369 } 2370 2371 if (rv == 0 && cip->ci_principal != NULL) 2372 rv = fmd_event_equal(ep, cip->ci_principal); 2373 2374 (void) pthread_mutex_unlock(&cip->ci_lock); 2375 2376 if (rv != 0) 2377 fmd_event_transition(ep, state); 2378 2379 return (rv); 2380 } 2381 2382 int 2383 fmd_case_orphaned(fmd_case_t *cp) 2384 { 2385 return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod); 2386 } 2387 2388 void 2389 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec) 2390 { 2391 ((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec; 2392 ((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec; 2393 ((fmd_case_impl_t *)cp)->ci_tv_valid = 1; 2394 } 2395 2396 void 2397 fmd_case_set_injected(fmd_case_t *cp) 2398 { 2399 ((fmd_case_impl_t *)cp)->ci_injected = 1; 2400 } 2401 2402 void 2403 fmd_case_set_de_fmri(fmd_case_t *cp, nvlist_t *nvl) 2404 { 2405 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2406 2407 if (cip->ci_diag_de) 2408 nvlist_free(cip->ci_diag_de); 2409 cip->ci_diag_de = nvl; 2410 } 2411 2412 void 2413 fmd_case_setcode(fmd_case_t *cp, char *code) 2414 { 2415 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2416 2417 cip->ci_code = fmd_strdup(code, FMD_SLEEP); 2418 
cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0; 2419 } 2420 2421 /*ARGSUSED*/ 2422 static void 2423 fmd_case_repair_replay_case(fmd_case_t *cp, void *arg) 2424 { 2425 int not_faulty = 0; 2426 int faulty = 0; 2427 nvlist_t *nvl; 2428 fmd_event_t *e; 2429 char *class; 2430 int any_unusable_and_present = 0; 2431 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2432 2433 if (cip->ci_state < FMD_CASE_SOLVED || cip->ci_xprt != NULL) 2434 return; 2435 2436 if (cip->ci_state == FMD_CASE_RESOLVED) { 2437 cip->ci_flags |= FMD_CF_RES_CMPL; 2438 return; 2439 } 2440 2441 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 2442 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty, 2443 ¬_faulty); 2444 2445 if (cip->ci_state >= FMD_CASE_REPAIRED && !faulty) { 2446 /* 2447 * If none of the suspects is faulty, replay the list.repaired. 2448 * If all suspects are already either usable or not present then 2449 * also transition straight to RESOLVED state. 2450 */ 2451 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 2452 fmd_case_unusable_and_present, &any_unusable_and_present); 2453 if (!any_unusable_and_present) { 2454 cip->ci_state = FMD_CASE_RESOLVED; 2455 2456 TRACE((FMD_DBG_CASE, "replay sending list.repaired %s", 2457 cip->ci_uuid)); 2458 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 2459 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2460 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, 2461 class); 2462 fmd_dispq_dispatch(fmd.d_disp, e, class); 2463 2464 TRACE((FMD_DBG_CASE, "replay sending list.resolved %s", 2465 cip->ci_uuid)); 2466 fmd_case_publish(cp, FMD_CASE_RESOLVED); 2467 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 2468 fmd_asru_log_resolved, NULL); 2469 cip->ci_flags |= FMD_CF_RES_CMPL; 2470 } else { 2471 TRACE((FMD_DBG_CASE, "replay sending list.repaired %s", 2472 cip->ci_uuid)); 2473 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 2474 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2475 e = 
fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, 2476 class); 2477 fmd_dispq_dispatch(fmd.d_disp, e, class); 2478 } 2479 } else if (faulty && not_faulty) { 2480 /* 2481 * if some but not all of the suspects are not faulty, replay 2482 * the list.updated. 2483 */ 2484 TRACE((FMD_DBG_CASE, "replay sending list.updated %s", 2485 cip->ci_uuid)); 2486 nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS); 2487 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2488 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 2489 fmd_dispq_dispatch(fmd.d_disp, e, class); 2490 } 2491 } 2492 2493 void 2494 fmd_case_repair_replay() 2495 { 2496 fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL); 2497 } 2498