/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * FMD Case Subsystem
 *
 * Diagnosis engines are expected to group telemetry events related to the
 * diagnosis of a particular problem on the system into a set of cases. The
 * diagnosis engine may have any number of cases open at a given point in time.
 * Some cases may eventually be *solved* by associating a suspect list of one
 * or more problems with the case, at which point fmd publishes a list.suspect
 * event for the case and it becomes visible to administrators and agents.
 *
 * Every case is named using a UUID, and is globally visible in the case hash.
 * Cases are reference-counted, except for the reference from the case hash
 * itself. Consumers of case references include modules, which store active
 * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
 *
 * Cases obey the following state machine. In states UNSOLVED, SOLVED, and
 * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
 * or transport) and the case is referenced by the mod_cases list. Once the
 * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
 * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
 *
 *                +------------+
 *     +----------|  UNSOLVED  |
 *     |          +------------+
 *     |              1 |
 *     |                |
 *     |        +-------v----+
 *   2 |        |    SOLVED  |
 *     |        +------------+
 *     |              3 |  5 |
 *     +------------+   |    |
 *                  |   |    |
 *                +-v---v----v-+
 *                | CLOSE_WAIT |
 *                +------------+
 *                  |   |    |
 *      +-----------+   |    +------------+
 *      |             4 |                 |
 *      v         +-----v------+          |
 *   discard      |   CLOSED   |        6 |
 *                +------------+          |
 *                      |                 |
 *                      |    +------------+
 *                    7 |    |
 *                +-----v----v-+
 *                |  REPAIRED  |
 *                +------------+
 *                      |
 *                    8 |
 *                +-----v------+
 *                |  RESOLVED  |
 *                +------------+
 *                      |
 *                      v
 *                   discard
 *
 * The state machine changes are triggered by calls to fmd_case_transition()
 * from various locations inside of fmd, as described below:
 *
 * [1] Called by: fmd_case_solve()
 *     Actions:   FMD_CF_SOLVED flag is set in ci_flags
 *                conviction policy is applied to suspect list
 *                suspects convicted are marked faulty (F) in R$
 *                list.suspect event logged and dispatched
 *
 * [2] Called by: fmd_case_close(), fmd_case_uuclose()
 *     Actions:   diagnosis engine fmdo_close() entry point scheduled
 *                case discarded upon exit from CLOSE_WAIT
 *
 * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
 *     Actions:   FMD_CF_ISOLATED flag is set in ci_flags
 *                suspects convicted (F) are marked unusable (U) in R$
 *                diagnosis engine fmdo_close() entry point scheduled
 *                case transitions to CLOSED [4] upon exit from CLOSE_WAIT
 *
 * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
 *     Actions:   list.isolated event dispatched
 *                case deleted from
module's list of open cases 105 * 106 * [5] Called by: fmd_case_repair(), fmd_case_update() 107 * Actions: FMD_CF_REPAIR flag is set in ci_flags 108 * diagnosis engine fmdo_close() entry point scheduled 109 * case transitions to REPAIRED [6] upon exit from CLOSE_WAIT 110 * 111 * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns) 112 * Actions: suspects convicted are marked non faulty (!F) in R$ 113 * list.repaired or list.updated event dispatched 114 * 115 * [7] Called by: fmd_case_repair(), fmd_case_update() 116 * Actions: FMD_CF_REPAIR flag is set in ci_flags 117 * suspects convicted are marked non faulty (!F) in R$ 118 * list.repaired or list.updated event dispatched 119 * 120 * [8] Called by: fmd_case_uuresolve() 121 * Actions: list.resolved event dispatched 122 * case is discarded 123 */ 124 125 #include <sys/fm/protocol.h> 126 #include <uuid/uuid.h> 127 #include <alloca.h> 128 129 #include <fmd_alloc.h> 130 #include <fmd_module.h> 131 #include <fmd_error.h> 132 #include <fmd_conf.h> 133 #include <fmd_case.h> 134 #include <fmd_string.h> 135 #include <fmd_subr.h> 136 #include <fmd_protocol.h> 137 #include <fmd_event.h> 138 #include <fmd_eventq.h> 139 #include <fmd_dispq.h> 140 #include <fmd_buf.h> 141 #include <fmd_log.h> 142 #include <fmd_asru.h> 143 #include <fmd_fmri.h> 144 #include <fmd_xprt.h> 145 146 #include <fmd.h> 147 148 static const char *const _fmd_case_snames[] = { 149 "UNSOLVED", /* FMD_CASE_UNSOLVED */ 150 "SOLVED", /* FMD_CASE_SOLVED */ 151 "CLOSE_WAIT", /* FMD_CASE_CLOSE_WAIT */ 152 "CLOSED", /* FMD_CASE_CLOSED */ 153 "REPAIRED", /* FMD_CASE_REPAIRED */ 154 "RESOLVED" /* FMD_CASE_RESOLVED */ 155 }; 156 157 static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *); 158 159 fmd_case_hash_t * 160 fmd_case_hash_create(void) 161 { 162 fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP); 163 164 (void) pthread_rwlock_init(&chp->ch_lock, NULL); 165 chp->ch_hashlen = fmd.d_str_buckets; 166 chp->ch_hash = 
fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP); 167 chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, 168 FMD_SLEEP); 169 chp->ch_count = 0; 170 171 return (chp); 172 } 173 174 /* 175 * Destroy the case hash. Unlike most of our hash tables, no active references 176 * are kept by the case hash itself; all references come from other subsystems. 177 * The hash must be destroyed after all modules are unloaded; if anything was 178 * present in the hash it would be by definition a reference count leak. 179 */ 180 void 181 fmd_case_hash_destroy(fmd_case_hash_t *chp) 182 { 183 fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen); 184 fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen); 185 fmd_free(chp, sizeof (fmd_case_hash_t)); 186 } 187 188 /* 189 * Take a snapshot of the case hash by placing an additional hold on each 190 * member in an auxiliary array, and then call 'func' for each case. 191 */ 192 void 193 fmd_case_hash_apply(fmd_case_hash_t *chp, 194 void (*func)(fmd_case_t *, void *), void *arg) 195 { 196 fmd_case_impl_t *cp, **cps, **cpp; 197 uint_t cpc, i; 198 199 (void) pthread_rwlock_rdlock(&chp->ch_lock); 200 201 cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP); 202 cpc = chp->ch_count; 203 204 for (i = 0; i < chp->ch_hashlen; i++) { 205 for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) 206 *cpp++ = fmd_case_tryhold(cp); 207 } 208 209 ASSERT(cpp == cps + cpc); 210 (void) pthread_rwlock_unlock(&chp->ch_lock); 211 212 for (i = 0; i < cpc; i++) { 213 if (cps[i] != NULL) { 214 func((fmd_case_t *)cps[i], arg); 215 fmd_case_rele((fmd_case_t *)cps[i]); 216 } 217 } 218 219 fmd_free(cps, cpc * sizeof (fmd_case_t *)); 220 } 221 222 static void 223 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 224 { 225 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 226 227 cip->ci_code_next = chp->ch_code_hash[h]; 228 chp->ch_code_hash[h] = cip; 229 } 230 231 static void 232 
fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 233 { 234 fmd_case_impl_t **pp, *cp; 235 236 if (cip->ci_code) { 237 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 238 239 pp = &chp->ch_code_hash[h]; 240 for (cp = *pp; cp != NULL; cp = cp->ci_code_next) { 241 if (cp != cip) 242 pp = &cp->ci_code_next; 243 else 244 break; 245 } 246 if (cp != NULL) { 247 *pp = cp->ci_code_next; 248 cp->ci_code_next = NULL; 249 } 250 } 251 } 252 253 /* 254 * Look up the diagcode for this case and cache it in ci_code. If no suspects 255 * were defined for this case or if the lookup fails, the event dictionary or 256 * module code is broken, and we set the event code to a precomputed default. 257 */ 258 static const char * 259 fmd_case_mkcode(fmd_case_t *cp) 260 { 261 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 262 fmd_case_susp_t *cis; 263 fmd_case_hash_t *chp = fmd.d_cases; 264 265 char **keys, **keyp; 266 const char *s; 267 268 ASSERT(MUTEX_HELD(&cip->ci_lock)); 269 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 270 271 /* 272 * delete any existing entry from code hash if it is on it 273 */ 274 fmd_case_code_hash_delete(chp, cip); 275 276 fmd_free(cip->ci_code, cip->ci_codelen); 277 cip->ci_codelen = cip->ci_mod->mod_codelen; 278 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 279 keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1)); 280 281 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 282 if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0) 283 keyp++; 284 } 285 286 *keyp = NULL; /* mark end of keys[] array for libdiagcode */ 287 288 if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code( 289 cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) { 290 (void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s); 291 fmd_free(cip->ci_code, cip->ci_codelen); 292 cip->ci_codelen = strlen(s) + 1; 293 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 294 (void) strcpy(cip->ci_code, s); 295 } 296 297 /* 298 * add into 
hash of solved cases 299 */ 300 fmd_case_code_hash_insert(chp, cip); 301 302 return (cip->ci_code); 303 } 304 305 typedef struct { 306 int *fcl_countp; 307 int fcl_maxcount; 308 uint8_t *fcl_ba; 309 nvlist_t **fcl_nva; 310 int *fcl_msgp; 311 } fmd_case_lst_t; 312 313 static void 314 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg) 315 { 316 fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg; 317 boolean_t b; 318 int state; 319 320 if (*entryp->fcl_countp >= entryp->fcl_maxcount) 321 return; 322 if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE, 323 &b) == 0 && b == B_FALSE) 324 *entryp->fcl_msgp = B_FALSE; 325 entryp->fcl_ba[*entryp->fcl_countp] = 0; 326 state = fmd_asru_al_getstate(alp); 327 if (state & FMD_ASRU_DEGRADED) 328 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED; 329 if (state & FMD_ASRU_UNUSABLE) 330 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE; 331 if (state & FMD_ASRU_FAULTY) 332 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY; 333 if (!(state & FMD_ASRU_PRESENT)) 334 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT; 335 if (alp->al_reason == FMD_ASRU_REPAIRED) 336 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED; 337 else if (alp->al_reason == FMD_ASRU_REPLACED) 338 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED; 339 else if (alp->al_reason == FMD_ASRU_ACQUITTED) 340 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED; 341 entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event; 342 (*entryp->fcl_countp)++; 343 } 344 345 static void 346 fmd_case_faulty(fmd_asru_link_t *alp, void *arg) 347 { 348 int *faultyp = (int *)arg; 349 350 *faultyp |= (alp->al_flags & FMD_ASRU_FAULTY); 351 } 352 353 static void 354 fmd_case_usable(fmd_asru_link_t *alp, void *arg) 355 { 356 int *usablep = (int *)arg; 357 358 *usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE); 359 } 360 361 static void 362 fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg) 363 { 364 int 
*not_faultyp = (int *)arg; 365 366 *not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY); 367 } 368 369 /* 370 * Have we got any suspects with an asru that are still unusable and present? 371 */ 372 static void 373 fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg) 374 { 375 int *rvalp = (int *)arg; 376 int state; 377 nvlist_t *asru; 378 379 /* 380 * if this a proxy case and this suspect doesn't have an local asru 381 * then state is unknown so we must assume it may still be unusable. 382 */ 383 if ((alp->al_flags & FMD_ASRU_PROXY) && 384 !(alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU)) { 385 *rvalp |= B_TRUE; 386 return; 387 } 388 389 state = fmd_asru_al_getstate(alp); 390 if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0) 391 return; 392 *rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT)); 393 } 394 395 nvlist_t * 396 fmd_case_mkevent(fmd_case_t *cp, const char *class) 397 { 398 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 399 nvlist_t **nva, *nvl; 400 uint8_t *ba; 401 int msg = B_TRUE; 402 const char *code; 403 fmd_case_lst_t fcl; 404 int count = 0; 405 406 (void) pthread_mutex_lock(&cip->ci_lock); 407 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 408 409 nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); 410 ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects); 411 412 /* 413 * For each suspect associated with the case, store its fault event 414 * nvlist in 'nva'. We also look to see if any of the suspect faults 415 * have asked not to be messaged. If any of them have made such a 416 * request, propagate that attribute to the composite list.* event. 417 * Finally, store each suspect's faulty status into the bitmap 'ba'. 
418 */ 419 fcl.fcl_countp = &count; 420 fcl.fcl_maxcount = cip->ci_nsuspects; 421 fcl.fcl_msgp = &msg; 422 fcl.fcl_ba = ba; 423 fcl.fcl_nva = nva; 424 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl); 425 426 if (cip->ci_code == NULL) 427 (void) fmd_case_mkcode(cp); 428 /* 429 * For repair and updated event, we lookup diagcode from dict using key 430 * "list.repaired" or "list.updated" or "list.resolved". 431 */ 432 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 433 (void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code); 434 else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) 435 (void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code); 436 else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0) 437 (void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code); 438 else 439 code = cip->ci_code; 440 441 if (msg == B_FALSE) 442 cip->ci_flags |= FMD_CF_INVISIBLE; 443 444 /* 445 * Use the ci_diag_de if one has been saved (eg for an injected fault). 446 * Otherwise use the authority for the current module. 447 */ 448 nvl = fmd_protocol_list(class, cip->ci_diag_de == NULL ? 
449 cip->ci_mod->mod_fmri : cip->ci_diag_de, cip->ci_uuid, code, count, 450 nva, ba, msg, &cip->ci_tv, cip->ci_injected); 451 452 (void) pthread_mutex_unlock(&cip->ci_lock); 453 return (nvl); 454 } 455 456 static int fmd_case_match_on_faulty_overlap = 1; 457 static int fmd_case_match_on_acquit_overlap = 1; 458 static int fmd_case_auto_acquit_isolated = 1; 459 static int fmd_case_auto_acquit_non_acquitted = 1; 460 static int fmd_case_too_recent = 10; /* time in seconds */ 461 462 static boolean_t 463 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem) 464 { 465 nvlist_t *new_rsrc; 466 nvlist_t *rsrc; 467 char *new_name = NULL; 468 char *name = NULL; 469 ssize_t new_namelen; 470 ssize_t namelen; 471 int fmri_present = 1; 472 int new_fmri_present = 1; 473 int match = B_FALSE; 474 fmd_topo_t *ftp = fmd_topo_hold(); 475 476 if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0) 477 fmri_present = 0; 478 else { 479 if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1) 480 goto done; 481 name = fmd_alloc(namelen + 1, FMD_SLEEP); 482 if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1) 483 goto done; 484 } 485 if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0) 486 new_fmri_present = 0; 487 else { 488 if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1) 489 goto done; 490 new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP); 491 if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1) 492 goto done; 493 } 494 match = (fmri_present == new_fmri_present && 495 (fmri_present == 0 || 496 topo_fmri_strcmp(ftp->ft_hdl, name, new_name))); 497 done: 498 if (name != NULL) 499 fmd_free(name, namelen + 1); 500 if (new_name != NULL) 501 fmd_free(new_name, new_namelen + 1); 502 fmd_topo_rele(ftp); 503 return (match); 504 } 505 506 static int 507 fmd_case_match_suspect(nvlist_t *nvl1, nvlist_t *nvl2) 508 { 509 char *class, *new_class; 510 511 if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_ASRU)) 512 return (0); 513 if (!fmd_case_compare_elem(nvl1, nvl2, 
FM_FAULT_RESOURCE)) 514 return (0); 515 if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_FRU)) 516 return (0); 517 (void) nvlist_lookup_string(nvl2, FM_CLASS, &class); 518 (void) nvlist_lookup_string(nvl1, FM_CLASS, &new_class); 519 return (strcmp(class, new_class) == 0); 520 } 521 522 typedef struct { 523 int *fcms_countp; 524 int fcms_maxcount; 525 fmd_case_impl_t *fcms_cip; 526 uint8_t *fcms_new_susp_state; 527 uint8_t *fcms_old_susp_state; 528 uint8_t *fcms_old_match_state; 529 } fcms_t; 530 #define SUSPECT_STATE_FAULTY 0x1 531 #define SUSPECT_STATE_ISOLATED 0x2 532 #define SUSPECT_STATE_REMOVED 0x4 533 #define SUSPECT_STATE_ACQUITED 0x8 534 #define SUSPECT_STATE_REPAIRED 0x10 535 #define SUSPECT_STATE_REPLACED 0x20 536 #define SUSPECT_STATE_NO_MATCH 0x1 537 538 /* 539 * This is called for each suspect in the old case. Compare it against each 540 * suspect in the new case, setting fcms_old_susp_state and fcms_new_susp_state 541 * as appropriate. fcms_new_susp_state will left as 0 if the suspect is not 542 * found in the old case. 
543 */ 544 static void 545 fmd_case_match_suspects(fmd_asru_link_t *alp, void *arg) 546 { 547 fcms_t *fcmsp = (fcms_t *)arg; 548 fmd_case_impl_t *cip = fcmsp->fcms_cip; 549 fmd_case_susp_t *cis; 550 int i = 0; 551 int state = fmd_asru_al_getstate(alp); 552 553 if (*fcmsp->fcms_countp >= fcmsp->fcms_maxcount) 554 return; 555 556 if (!(state & FMD_ASRU_PRESENT) || (!(state & FMD_ASRU_FAULTY) && 557 alp->al_reason == FMD_ASRU_REMOVED)) 558 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 559 SUSPECT_STATE_REMOVED; 560 else if ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_FAULTY)) 561 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 562 SUSPECT_STATE_ISOLATED; 563 else if (state & FMD_ASRU_FAULTY) 564 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 565 SUSPECT_STATE_FAULTY; 566 else if (alp->al_reason == FMD_ASRU_REPLACED) 567 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 568 SUSPECT_STATE_REPLACED; 569 else if (alp->al_reason == FMD_ASRU_ACQUITTED) 570 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 571 SUSPECT_STATE_ACQUITED; 572 else 573 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 574 SUSPECT_STATE_REPAIRED; 575 576 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next, i++) 577 if (fmd_case_match_suspect(cis->cis_nvl, alp->al_event) == 1) 578 break; 579 if (cis != NULL) 580 fcmsp->fcms_new_susp_state[i] = 581 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp]; 582 else 583 fcmsp->fcms_old_match_state[*fcmsp->fcms_countp] |= 584 SUSPECT_STATE_NO_MATCH; 585 (*fcmsp->fcms_countp)++; 586 } 587 588 typedef struct { 589 int *fca_do_update; 590 fmd_case_impl_t *fca_cip; 591 } fca_t; 592 593 /* 594 * Re-fault all acquitted suspects that are still present in the new list. 
595 */ 596 static void 597 fmd_case_fault_acquitted_matching(fmd_asru_link_t *alp, void *arg) 598 { 599 fca_t *fcap = (fca_t *)arg; 600 fmd_case_impl_t *cip = fcap->fca_cip; 601 fmd_case_susp_t *cis; 602 int state = fmd_asru_al_getstate(alp); 603 604 if (!(state & FMD_ASRU_FAULTY) && 605 alp->al_reason == FMD_ASRU_ACQUITTED) { 606 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) 607 if (fmd_case_match_suspect(cis->cis_nvl, 608 alp->al_event) == 1) 609 break; 610 if (cis != NULL) { 611 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 612 *fcap->fca_do_update = 1; 613 } 614 } 615 } 616 617 /* 618 * Re-fault all suspects that are still present in the new list. 619 */ 620 static void 621 fmd_case_fault_all_matching(fmd_asru_link_t *alp, void *arg) 622 { 623 fca_t *fcap = (fca_t *)arg; 624 fmd_case_impl_t *cip = fcap->fca_cip; 625 fmd_case_susp_t *cis; 626 int state = fmd_asru_al_getstate(alp); 627 628 if (!(state & FMD_ASRU_FAULTY)) { 629 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) 630 if (fmd_case_match_suspect(cis->cis_nvl, 631 alp->al_event) == 1) 632 break; 633 if (cis != NULL) { 634 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 635 *fcap->fca_do_update = 1; 636 } 637 } 638 } 639 640 /* 641 * Acquit all suspects that are no longer present in the new list. 642 */ 643 static void 644 fmd_case_acquit_no_match(fmd_asru_link_t *alp, void *arg) 645 { 646 fca_t *fcap = (fca_t *)arg; 647 fmd_case_impl_t *cip = fcap->fca_cip; 648 fmd_case_susp_t *cis; 649 int state = fmd_asru_al_getstate(alp); 650 651 if (state & FMD_ASRU_FAULTY) { 652 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) 653 if (fmd_case_match_suspect(cis->cis_nvl, 654 alp->al_event) == 1) 655 break; 656 if (cis == NULL) { 657 (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, 658 FMD_ASRU_ACQUITTED); 659 *fcap->fca_do_update = 1; 660 } 661 } 662 } 663 664 /* 665 * Acquit all isolated suspects. 
 */
static void
fmd_case_acquit_isolated(fmd_asru_link_t *alp, void *arg)
{
	int *do_update = (int *)arg;
	int state = fmd_asru_al_getstate(alp);

	/* "Isolated" here means present, unusable and faulty. */
	if ((state & FMD_ASRU_PRESENT) && (state & FMD_ASRU_UNUSABLE) &&
	    (state & FMD_ASRU_FAULTY)) {
		(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
		    FMD_ASRU_ACQUITTED);
		*do_update = 1;
	}
}

/*
 * Acquit suspect which matches specified nvlist
 */
static void
fmd_case_acquit_suspect(fmd_asru_link_t *alp, void *arg)
{
	nvlist_t *nvl = (nvlist_t *)arg;
	int state = fmd_asru_al_getstate(alp);

	if ((state & FMD_ASRU_FAULTY) &&
	    fmd_case_match_suspect(nvl, alp->al_event) == 1)
		(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
		    FMD_ASRU_ACQUITTED);
}

/*
 * Argument block for fmd_case_check_for_dups(): the new case, its per-suspect
 * state arrays (accumulated across all old cases examined), and the two
 * result flags.
 */
typedef struct {
	fmd_case_impl_t *fccd_cip;
	uint8_t *fccd_new_susp_state;
	uint8_t *fccd_new_match_state;
	int *fccd_discard_new;
	int *fccd_adjust_new;
} fccd_t;

/*
 * see if a matching suspect list already exists in the cache
 *
 * Called once per existing case ('old_cp') with 'arg' pointing at an fccd_t
 * describing the new case.  On a match, either *fccd_adjust_new is set (old
 * case already repaired/resolved) or *fccd_discard_new is set and the old
 * case's suspects are re-faulted/acquitted as described below.
 *
 * NOTE(review): fccd_new_match_state is never written here; unmatched new
 * suspects are instead recorded by overwriting fccd_new_susp_state[i] with
 * SUSPECT_STATE_NO_MATCH, which has the same value (0x1) as
 * SUSPECT_STATE_FAULTY -- confirm this is intended.
 */
static void
fmd_case_check_for_dups(fmd_case_t *old_cp, void *arg)
{
	fccd_t *fccdp = (fccd_t *)arg;
	fmd_case_impl_t *new_cip = fccdp->fccd_cip;
	fmd_case_impl_t *old_cip = (fmd_case_impl_t *)old_cp;
	int i, count = 0, do_update = 0, got_isolated_overlap = 0;
	int got_faulty_overlap = 0;
	int got_acquit_overlap = 0;
	boolean_t too_recent;
	uint64_t most_recent = 0;
	fcms_t fcms;
	fca_t fca;
	uint8_t *new_susp_state;
	uint8_t *old_susp_state;
	uint8_t *old_match_state;

	/* Zeroed scratch arrays, one entry per suspect in each case. */
	new_susp_state = alloca(new_cip->ci_nsuspects * sizeof (uint8_t));
	for (i = 0; i < new_cip->ci_nsuspects; i++)
		new_susp_state[i] = 0;
	old_susp_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
	for (i = 0; i < old_cip->ci_nsuspects; i++)
		old_susp_state[i] = 0;
	old_match_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
	for (i = 0; i < old_cip->ci_nsuspects; i++)
		old_match_state[i] = 0;

	/*
	 * Compare with each suspect in the existing case.
	 */
	fcms.fcms_countp = &count;
	fcms.fcms_maxcount = old_cip->ci_nsuspects;
	fcms.fcms_cip = new_cip;
	fcms.fcms_new_susp_state = new_susp_state;
	fcms.fcms_old_susp_state = old_susp_state;
	fcms.fcms_old_match_state = old_match_state;
	fmd_asru_hash_apply_by_case(fmd.d_asrus, (fmd_case_t *)old_cip,
	    fmd_case_match_suspects, &fcms);

	/*
	 * If we have some faulty, non-isolated suspects that overlap, then most
	 * likely it is the suspects that overlap in the suspect lists that are
	 * to blame.  So we can consider this to be a match.
	 */
	for (i = 0; i < new_cip->ci_nsuspects; i++)
		if (new_susp_state[i] == SUSPECT_STATE_FAULTY)
			got_faulty_overlap = 1;
	if (got_faulty_overlap && fmd_case_match_on_faulty_overlap)
		goto got_match;

	/*
	 * If we have no faulty, non-isolated suspects in the old case, but we
	 * do have some acquitted suspects that overlap, then most likely it is
	 * the acquitted suspects that overlap in the suspect lists that are
	 * to blame.  So we can consider this to be a match.
	 */
	for (i = 0; i < new_cip->ci_nsuspects; i++)
		if (new_susp_state[i] == SUSPECT_STATE_ACQUITED)
			got_acquit_overlap = 1;
	for (i = 0; i < old_cip->ci_nsuspects; i++)
		if (old_susp_state[i] == SUSPECT_STATE_FAULTY)
			got_acquit_overlap = 0;
	if (got_acquit_overlap && fmd_case_match_on_acquit_overlap)
		goto got_match;

	/*
	 * Check that all suspects in the new list are present in the old list.
	 * Return if we find one that isn't.
	 */
	for (i = 0; i < new_cip->ci_nsuspects; i++)
		if (new_susp_state[i] == 0)
			return;

	/*
	 * Check that all suspects in the old list are present in the new list
	 * *or* they are isolated or removed/replaced (which would explain why
	 * they are not present in the new list).  Return if we find one that is
	 * faulty and unisolated or repaired or acquitted, and that is not
	 * present in the new case.
	 */
	for (i = 0; i < old_cip->ci_nsuspects; i++)
		if (old_match_state[i] == SUSPECT_STATE_NO_MATCH &&
		    (old_susp_state[i] == SUSPECT_STATE_FAULTY ||
		    old_susp_state[i] == SUSPECT_STATE_ACQUITED ||
		    old_susp_state[i] == SUSPECT_STATE_REPAIRED))
			return;

got_match:
	/*
	 * If the old case is already in repaired/resolved state, we can't
	 * do anything more with it, so keep the new case, but acquit some
	 * of the suspects if appropriate.
	 */
	if (old_cip->ci_state >= FMD_CASE_REPAIRED) {
		if (fmd_case_auto_acquit_non_acquitted) {
			*fccdp->fccd_adjust_new = 1;
			for (i = 0; i < new_cip->ci_nsuspects; i++) {
				fccdp->fccd_new_susp_state[i] |=
				    new_susp_state[i];
				if (new_susp_state[i] == 0)
					fccdp->fccd_new_susp_state[i] =
					    SUSPECT_STATE_NO_MATCH;
			}
		}
		return;
	}

	/*
	 * Otherwise discard the new case and keep the old, again updating the
	 * state of the suspects as appropriate
	 */
	*fccdp->fccd_discard_new = 1;
	fca.fca_cip = new_cip;
	fca.fca_do_update = &do_update;

	/*
	 * See if new case occurred within fmd_case_too_recent seconds of the
	 * most recent modification to the old case and if so don't do
	 * auto-acquit.  This avoids problems if a flood of ereports come in and
	 * they don't all get diagnosed before the first case causes some of
	 * the devices to be isolated making it appear that an isolated device
	 * was in the suspect list.
	 */
	fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
	    fmd_asru_most_recent, &most_recent);
	too_recent = (new_cip->ci_tv.tv_sec - most_recent <
	    fmd_case_too_recent);

	if (got_faulty_overlap) {
		/*
		 * Acquit any suspects not present in the new list, plus
		 * any that are present but are isolated.
		 */
		fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
		    fmd_case_acquit_no_match, &fca);
		if (fmd_case_auto_acquit_isolated && !too_recent)
			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
			    fmd_case_acquit_isolated, &do_update);
	} else if (got_acquit_overlap) {
		/*
		 * Re-fault the acquitted matching suspects and acquit all
		 * isolated suspects.
		 */
		if (fmd_case_auto_acquit_isolated && !too_recent) {
			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
			    fmd_case_fault_acquitted_matching, &fca);
			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
			    fmd_case_acquit_isolated, &do_update);
		}
	} else if (fmd_case_auto_acquit_isolated) {
		/*
		 * To get here, there must be no faulty or acquitted suspects,
		 * but there must be at least one isolated suspect.  Just acquit
		 * non-matching isolated suspects.  If there are no matching
		 * isolated suspects, then re-fault all matching suspects.
		 */
		for (i = 0; i < new_cip->ci_nsuspects; i++)
			if (new_susp_state[i] == SUSPECT_STATE_ISOLATED)
				got_isolated_overlap = 1;
		if (!got_isolated_overlap)
			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
			    fmd_case_fault_all_matching, &fca);
		fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
		    fmd_case_acquit_no_match, &fca);
	}

	/*
	 * If we've updated anything in the old case, call fmd_case_update()
	 */
	if (do_update)
		fmd_case_update(old_cp);
}

/*
 * Convict suspects in a case by applying a conviction policy and updating the
 * resource cache prior to emitting the list.suspect event for the given case.
 * At present, our policy is very simple: convict every suspect in the case.
 * In the future, this policy can be extended and made configurable to permit:
 *
 * - convicting the suspect with the highest FIT rate
 * - convicting the suspect with the cheapest FRU
 * - convicting the suspect with the FRU that is in a depot's inventory
 * - convicting the suspect with the longest lifetime
 *
 * and so forth.  A word to the wise: this problem is significantly harder than
 * it seems at first glance.  Future work should heed the following advice:
 *
 * Hacking the policy into C code here is a very bad idea.  The policy needs to
 * be decided upon very carefully and fundamentally encodes knowledge of what
 * suspect list combinations can be emitted by what diagnosis engines.  As such
 * fmd's code is the wrong location, because that would require fmd itself to
 * be updated for every diagnosis engine change, defeating the entire design.
 * The FMA Event Registry knows the suspect list combinations: policy inputs
 * can be derived from it and used to produce per-module policy configuration.
 *
 * If the policy needs to be dynamic and not statically fixed at either fmd
 * startup or module load time, any implementation of dynamic policy retrieval
 * must employ some kind of caching mechanism or be part of a built-in module.
 * The fmd_case_convict() function is called with locks held inside of fmd and
 * is not a place where unbounded blocking on some inter-process or inter-
 * system communication to another service (e.g. another daemon) can occur.
 */
static int
fmd_case_convict(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_asru_hash_t *ahp = fmd.d_asrus;
	int discard_new = 0, i;
	fmd_case_susp_t *cis;
	fmd_asru_link_t *alp;
	uint8_t *new_susp_state;
	uint8_t *new_match_state;
	int adjust_new = 0;
	fccd_t fccd;
	fmd_case_impl_t *ncp, **cps, **cpp;
	uint_t cpc;
	fmd_case_hash_t *chp;

	/*
	 * First we must see if any matching cases already exist.
	 */
	new_susp_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
	for (i = 0; i < cip->ci_nsuspects; i++)
		new_susp_state[i] = 0;
	new_match_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
	for (i = 0; i < cip->ci_nsuspects; i++)
		new_match_state[i] = 0;
	fccd.fccd_cip = cip;
	fccd.fccd_adjust_new = &adjust_new;
	fccd.fccd_new_susp_state = new_susp_state;
	fccd.fccd_new_match_state = new_match_state;
	fccd.fccd_discard_new = &discard_new;

	/*
	 * Hold all cases
	 */
	chp = fmd.d_cases;
	(void) pthread_rwlock_rdlock(&chp->ch_lock);
	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
	cpc = chp->ch_count;
	for (i = 0; i < chp->ch_hashlen; i++)
		for (ncp = chp->ch_hash[i]; ncp != NULL; ncp = ncp->ci_next)
			*cpp++ = fmd_case_tryhold(ncp);
	ASSERT(cpp == cps + cpc);
	(void) pthread_rwlock_unlock(&chp->ch_lock);

	/*
	 * Run fmd_case_check_for_dups() on all cases except the current one.
	 * Entries that could not be held are NULL and are skipped.
	 */
	for (i = 0; i < cpc; i++) {
		if (cps[i] != NULL) {
			if (cps[i] != (fmd_case_impl_t *)cp)
				fmd_case_check_for_dups((fmd_case_t *)cps[i],
				    &fccd);
			fmd_case_rele((fmd_case_t *)cps[i]);
		}
	}
	fmd_free(cps, cpc * sizeof (fmd_case_t *));

	/*
	 * Make sure the case has a code and is on the code hash: compute the
	 * code if there is none, or insert the existing code for a precanned
	 * case.
	 */
	(void) pthread_mutex_lock(&cip->ci_lock);
	if (cip->ci_code == NULL)
		(void) fmd_case_mkcode(cp);
	else if (cip->ci_precanned)
		fmd_case_code_hash_insert(fmd.d_cases, cip);

	if (discard_new) {
		/*
		 * We've found an existing case that is a match and it is not
		 * already in repaired or resolved state.  So we can close this
		 * one as a duplicate.
		 */
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return (1);
	}

	/*
	 * Allocate new cache entries
	 */
	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
		if ((alp = fmd_asru_hash_create_entry(ahp,
		    cp, cis->cis_nvl)) == NULL) {
			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
			continue;
		}
		/* Every new entry starts out present, usable and faulty. */
		alp->al_flags |= FMD_ASRU_PRESENT;
		alp->al_asru->asru_flags |= FMD_ASRU_PRESENT;
		(void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0);
		(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
	}

	if (adjust_new) {
		int some_suspect = 0, some_not_suspect = 0;

		/*
		 * There is one or more matching case but they are already in
		 * repaired or resolved state.  So we need to keep the new
		 * case, but we can adjust it.  Repaired/removed/replaced
		 * suspects are unlikely to be to blame (unless there are
		 * actually two separate faults).  So if we have a combination of
		 * repaired/replaced/removed suspects and acquitted suspects in
		 * the old lists, then we should acquit in the new list those
		 * that were repaired/replaced/removed in the old.
		 */
		for (i = 0; i < cip->ci_nsuspects; i++) {
			if ((new_susp_state[i] & SUSPECT_STATE_REPLACED) ||
			    (new_susp_state[i] & SUSPECT_STATE_REPAIRED) ||
			    (new_susp_state[i] & SUSPECT_STATE_REMOVED) ||
			    (new_match_state[i] & SUSPECT_STATE_NO_MATCH))
				some_not_suspect = 1;
			else
				some_suspect = 1;
		}
		/* Only acquit when at least one suspect would remain. */
		if (some_suspect && some_not_suspect) {
			for (cis = cip->ci_suspects, i = 0; cis != NULL;
			    cis = cis->cis_next, i++)
				if ((new_susp_state[i] &
				    SUSPECT_STATE_REPLACED) ||
				    (new_susp_state[i] &
				    SUSPECT_STATE_REPAIRED) ||
				    (new_susp_state[i] &
				    SUSPECT_STATE_REMOVED) ||
				    (new_match_state[i] &
				    SUSPECT_STATE_NO_MATCH))
					fmd_asru_hash_apply_by_case(fmd.d_asrus,
					    cp, fmd_case_acquit_suspect,
					    cis->cis_nvl);
		}
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);
	return (0);
}

void
fmd_case_publish(fmd_case_t *cp, uint_t state)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_event_t *e;
	nvlist_t *nvl;
	char *class;

	if (state == FMD_CASE_CURRENT)
		state = cip->ci_state;	/* use current state */

	switch (state) {
	case FMD_CASE_SOLVED:
		(void) pthread_mutex_lock(&cip->ci_lock);

		/*
		 * If we already have a code, then case is already solved.
		 */
		if (cip->ci_precanned == 0 && cip->ci_xprt == NULL &&
		    cip->ci_code != NULL) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			break;
		}

		if (cip->ci_tv_valid == 0) {
			fmd_time_gettimeofday(&cip->ci_tv);
			cip->ci_tv_valid = 1;
		}
		(void) pthread_mutex_unlock(&cip->ci_lock);

		if (fmd_case_convict(cp) == 1) { /* dupclose */
			cip->ci_flags &= ~FMD_CF_SOLVED;
			fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0);
			break;
		}
		if (cip->ci_xprt != NULL) {
			/*
			 * For proxy, save some information about the transport
			 * in the resource cache.
1080 */ 1081 int count = 0; 1082 fmd_asru_set_on_proxy_t fasp; 1083 fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)cip->ci_xprt; 1084 1085 fasp.fasp_countp = &count; 1086 fasp.fasp_maxcount = cip->ci_nsuspects; 1087 fasp.fasp_proxy_asru = cip->ci_proxy_asru; 1088 fasp.fasp_proxy_external = xip->xi_flags & 1089 FMD_XPRT_EXTERNAL; 1090 fasp.fasp_proxy_rdonly = ((xip->xi_flags & 1091 FMD_XPRT_RDWR) == FMD_XPRT_RDONLY); 1092 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1093 fmd_asru_set_on_proxy, &fasp); 1094 } 1095 nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS); 1096 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1097 1098 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1099 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 1100 fmd_log_append(fmd.d_fltlog, e, cp); 1101 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 1102 fmd_dispq_dispatch(fmd.d_disp, e, class); 1103 1104 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1105 cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++; 1106 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1107 1108 break; 1109 1110 case FMD_CASE_CLOSE_WAIT: 1111 fmd_case_hold(cp); 1112 e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp); 1113 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 1114 1115 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1116 cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++; 1117 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1118 1119 break; 1120 1121 case FMD_CASE_CLOSED: 1122 nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS); 1123 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1124 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1125 fmd_dispq_dispatch(fmd.d_disp, e, class); 1126 break; 1127 1128 case FMD_CASE_REPAIRED: 1129 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 1130 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1131 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1132 (void) 
pthread_rwlock_rdlock(&fmd.d_log_lock); 1133 fmd_log_append(fmd.d_fltlog, e, cp); 1134 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 1135 fmd_dispq_dispatch(fmd.d_disp, e, class); 1136 break; 1137 1138 case FMD_CASE_RESOLVED: 1139 nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS); 1140 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1141 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1142 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 1143 fmd_log_append(fmd.d_fltlog, e, cp); 1144 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 1145 fmd_dispq_dispatch(fmd.d_disp, e, class); 1146 break; 1147 } 1148 } 1149 1150 fmd_case_t * 1151 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid) 1152 { 1153 fmd_case_impl_t *cip; 1154 uint_t h; 1155 1156 (void) pthread_rwlock_rdlock(&chp->ch_lock); 1157 h = fmd_strhash(uuid) % chp->ch_hashlen; 1158 1159 for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) { 1160 if (strcmp(cip->ci_uuid, uuid) == 0) 1161 break; 1162 } 1163 1164 /* 1165 * If deleting bit is set, treat the case as if it doesn't exist. 
1166 */ 1167 if (cip != NULL) 1168 cip = fmd_case_tryhold(cip); 1169 1170 if (cip == NULL) 1171 (void) fmd_set_errno(EFMD_CASE_INVAL); 1172 1173 (void) pthread_rwlock_unlock(&chp->ch_lock); 1174 return ((fmd_case_t *)cip); 1175 } 1176 1177 static fmd_case_impl_t * 1178 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 1179 { 1180 fmd_case_impl_t *eip; 1181 uint_t h; 1182 1183 (void) pthread_rwlock_wrlock(&chp->ch_lock); 1184 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 1185 1186 for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) { 1187 if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 && 1188 fmd_case_tryhold(eip) != NULL) { 1189 (void) pthread_rwlock_unlock(&chp->ch_lock); 1190 return (eip); /* uuid already present */ 1191 } 1192 } 1193 1194 cip->ci_next = chp->ch_hash[h]; 1195 chp->ch_hash[h] = cip; 1196 1197 chp->ch_count++; 1198 ASSERT(chp->ch_count != 0); 1199 1200 (void) pthread_rwlock_unlock(&chp->ch_lock); 1201 return (cip); 1202 } 1203 1204 static void 1205 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 1206 { 1207 fmd_case_impl_t *cp, **pp; 1208 uint_t h; 1209 1210 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1211 1212 cip->ci_flags |= FMD_CF_DELETING; 1213 (void) pthread_mutex_unlock(&cip->ci_lock); 1214 1215 (void) pthread_rwlock_wrlock(&chp->ch_lock); 1216 1217 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 1218 pp = &chp->ch_hash[h]; 1219 1220 for (cp = *pp; cp != NULL; cp = cp->ci_next) { 1221 if (cp != cip) 1222 pp = &cp->ci_next; 1223 else 1224 break; 1225 } 1226 1227 if (cp == NULL) { 1228 fmd_panic("case %p (%s) not found on hash chain %u\n", 1229 (void *)cip, cip->ci_uuid, h); 1230 } 1231 1232 *pp = cp->ci_next; 1233 cp->ci_next = NULL; 1234 1235 /* 1236 * delete from code hash if it is on it 1237 */ 1238 fmd_case_code_hash_delete(chp, cip); 1239 1240 ASSERT(chp->ch_count != 0); 1241 chp->ch_count--; 1242 1243 (void) pthread_rwlock_unlock(&chp->ch_lock); 1244 1245 (void) pthread_mutex_lock(&cip->ci_lock); 1246 
ASSERT(cip->ci_flags & FMD_CF_DELETING); 1247 } 1248 1249 fmd_case_t * 1250 fmd_case_create(fmd_module_t *mp, void *data) 1251 { 1252 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 1253 fmd_case_impl_t *eip = NULL; 1254 uuid_t uuid; 1255 1256 (void) pthread_mutex_init(&cip->ci_lock, NULL); 1257 fmd_buf_hash_create(&cip->ci_bufs); 1258 1259 fmd_module_hold(mp); 1260 cip->ci_mod = mp; 1261 cip->ci_refs = 1; 1262 cip->ci_state = FMD_CASE_UNSOLVED; 1263 cip->ci_flags = FMD_CF_DIRTY; 1264 cip->ci_data = data; 1265 1266 /* 1267 * Calling libuuid: get a clue. The library interfaces cleverly do not 1268 * define any constant for the length of an unparse string, and do not 1269 * permit the caller to specify a buffer length for safety. The spec 1270 * says it will be 36 bytes, but we make it tunable just in case. 1271 */ 1272 (void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen); 1273 cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP); 1274 1275 /* 1276 * We expect this loop to execute only once, but code it defensively 1277 * against the possibility of libuuid bugs. Keep generating uuids and 1278 * attempting to do a hash insert until we get a unique one. 
1279 */ 1280 do { 1281 if (eip != NULL) 1282 fmd_case_rele((fmd_case_t *)eip); 1283 uuid_generate(uuid); 1284 uuid_unparse(uuid, cip->ci_uuid); 1285 } while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip); 1286 1287 ASSERT(fmd_module_locked(mp)); 1288 fmd_list_append(&mp->mod_cases, cip); 1289 fmd_module_setcdirty(mp); 1290 1291 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1292 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 1293 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1294 1295 return ((fmd_case_t *)cip); 1296 } 1297 1298 static void 1299 fmd_case_destroy_suspects(fmd_case_impl_t *cip) 1300 { 1301 fmd_case_susp_t *cis, *ncis; 1302 1303 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1304 1305 if (cip->ci_proxy_asru) 1306 fmd_free(cip->ci_proxy_asru, sizeof (uint8_t) * 1307 cip->ci_nsuspects); 1308 if (cip->ci_diag_de) 1309 nvlist_free(cip->ci_diag_de); 1310 if (cip->ci_diag_asru) 1311 fmd_free(cip->ci_diag_asru, sizeof (uint8_t) * 1312 cip->ci_nsuspects); 1313 1314 for (cis = cip->ci_suspects; cis != NULL; cis = ncis) { 1315 ncis = cis->cis_next; 1316 nvlist_free(cis->cis_nvl); 1317 fmd_free(cis, sizeof (fmd_case_susp_t)); 1318 } 1319 1320 cip->ci_suspects = NULL; 1321 cip->ci_nsuspects = 0; 1322 } 1323 1324 fmd_case_t * 1325 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp, 1326 uint_t state, const char *uuid, const char *code) 1327 { 1328 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 1329 fmd_case_impl_t *eip; 1330 1331 (void) pthread_mutex_init(&cip->ci_lock, NULL); 1332 fmd_buf_hash_create(&cip->ci_bufs); 1333 1334 fmd_module_hold(mp); 1335 cip->ci_mod = mp; 1336 cip->ci_xprt = xp; 1337 cip->ci_refs = 1; 1338 cip->ci_state = state; 1339 cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP); 1340 cip->ci_uuidlen = strlen(cip->ci_uuid); 1341 cip->ci_code = fmd_strdup(code, FMD_SLEEP); 1342 cip->ci_codelen = cip->ci_code ? 
strlen(cip->ci_code) + 1 : 0; 1343 1344 if (state > FMD_CASE_CLOSE_WAIT) 1345 cip->ci_flags |= FMD_CF_SOLVED; 1346 1347 /* 1348 * Insert the case into the global case hash. If the specified UUID is 1349 * already present, check to see if it is an orphan: if so, reclaim it; 1350 * otherwise if it is owned by a different module then return NULL. 1351 */ 1352 if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) { 1353 (void) pthread_mutex_lock(&cip->ci_lock); 1354 cip->ci_refs--; /* decrement to zero */ 1355 fmd_case_destroy((fmd_case_t *)cip, B_FALSE); 1356 1357 cip = eip; /* switch 'cip' to the existing case */ 1358 (void) pthread_mutex_lock(&cip->ci_lock); 1359 1360 /* 1361 * If the ASRU cache is trying to recreate an orphan, then just 1362 * return the existing case that we found without changing it. 1363 */ 1364 if (mp == fmd.d_rmod) { 1365 /* 1366 * In case the case has already been created from 1367 * a checkpoint file we need to set up code now. 1368 */ 1369 if (cip->ci_state < FMD_CASE_CLOSED) { 1370 if (code != NULL && cip->ci_code == NULL) { 1371 cip->ci_code = fmd_strdup(code, 1372 FMD_SLEEP); 1373 cip->ci_codelen = cip->ci_code ? 1374 strlen(cip->ci_code) + 1 : 0; 1375 fmd_case_code_hash_insert(fmd.d_cases, 1376 cip); 1377 } 1378 } 1379 1380 /* 1381 * When recreating an orphan case, state passed in may 1382 * be CLOSED (faulty) or REPAIRED/RESOLVED (!faulty). If 1383 * any suspects are still CLOSED (faulty) then the 1384 * overall state needs to be CLOSED. 1385 */ 1386 if ((cip->ci_state == FMD_CASE_REPAIRED || 1387 cip->ci_state == FMD_CASE_RESOLVED) && 1388 state == FMD_CASE_CLOSED) 1389 cip->ci_state = FMD_CASE_CLOSED; 1390 (void) pthread_mutex_unlock(&cip->ci_lock); 1391 fmd_case_rele((fmd_case_t *)cip); 1392 return ((fmd_case_t *)cip); 1393 } 1394 1395 /* 1396 * If the existing case isn't an orphan or is being proxied, 1397 * then we have a UUID conflict: return failure to the caller. 
1398 */ 1399 if (cip->ci_mod != fmd.d_rmod || xp != NULL) { 1400 (void) pthread_mutex_unlock(&cip->ci_lock); 1401 fmd_case_rele((fmd_case_t *)cip); 1402 return (NULL); 1403 } 1404 1405 /* 1406 * If the new module is reclaiming an orphaned case, remove 1407 * the case from the root module, switch ci_mod, and then fall 1408 * through to adding the case to the new owner module 'mp'. 1409 */ 1410 fmd_module_lock(cip->ci_mod); 1411 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1412 fmd_module_unlock(cip->ci_mod); 1413 1414 fmd_module_rele(cip->ci_mod); 1415 cip->ci_mod = mp; 1416 fmd_module_hold(mp); 1417 1418 /* 1419 * It's possible that fmd crashed or was restarted during a 1420 * previous solve operation between the asru cache being created 1421 * and the ckpt file being updated to SOLVED. Thus when the DE 1422 * recreates the case here from the checkpoint file, the state 1423 * will be UNSOLVED and yet we are having to reclaim because 1424 * the case was in the asru cache. If this happens, revert the 1425 * case back to the UNSOLVED state and let the DE solve it again 1426 */ 1427 if (state == FMD_CASE_UNSOLVED) { 1428 fmd_asru_hash_delete_case(fmd.d_asrus, 1429 (fmd_case_t *)cip); 1430 fmd_case_destroy_suspects(cip); 1431 fmd_case_code_hash_delete(fmd.d_cases, cip); 1432 fmd_free(cip->ci_code, cip->ci_codelen); 1433 cip->ci_code = NULL; 1434 cip->ci_codelen = 0; 1435 cip->ci_tv_valid = 0; 1436 } 1437 1438 cip->ci_state = state; 1439 1440 (void) pthread_mutex_unlock(&cip->ci_lock); 1441 fmd_case_rele((fmd_case_t *)cip); 1442 } else { 1443 /* 1444 * add into hash of solved cases 1445 */ 1446 if (cip->ci_code) 1447 fmd_case_code_hash_insert(fmd.d_cases, cip); 1448 } 1449 1450 ASSERT(fmd_module_locked(mp)); 1451 fmd_list_append(&mp->mod_cases, cip); 1452 1453 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1454 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 1455 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1456 1457 return ((fmd_case_t 
*)cip); 1458 } 1459 1460 void 1461 fmd_case_destroy(fmd_case_t *cp, int visible) 1462 { 1463 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1464 fmd_case_item_t *cit, *ncit; 1465 1466 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1467 ASSERT(cip->ci_refs == 0); 1468 1469 if (visible) { 1470 TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid)); 1471 fmd_case_hash_delete(fmd.d_cases, cip); 1472 } 1473 1474 for (cit = cip->ci_items; cit != NULL; cit = ncit) { 1475 ncit = cit->cit_next; 1476 fmd_event_rele(cit->cit_event); 1477 fmd_free(cit, sizeof (fmd_case_item_t)); 1478 } 1479 1480 fmd_case_destroy_suspects(cip); 1481 1482 if (cip->ci_principal != NULL) 1483 fmd_event_rele(cip->ci_principal); 1484 1485 fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1); 1486 fmd_free(cip->ci_code, cip->ci_codelen); 1487 (void) fmd_buf_hash_destroy(&cip->ci_bufs); 1488 1489 fmd_module_rele(cip->ci_mod); 1490 fmd_free(cip, sizeof (fmd_case_impl_t)); 1491 } 1492 1493 void 1494 fmd_case_hold(fmd_case_t *cp) 1495 { 1496 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1497 1498 (void) pthread_mutex_lock(&cip->ci_lock); 1499 fmd_case_hold_locked(cp); 1500 (void) pthread_mutex_unlock(&cip->ci_lock); 1501 } 1502 1503 void 1504 fmd_case_hold_locked(fmd_case_t *cp) 1505 { 1506 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1507 1508 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1509 if (cip->ci_flags & FMD_CF_DELETING) 1510 fmd_panic("attempt to hold a deleting case %p (%s)\n", 1511 (void *)cip, cip->ci_uuid); 1512 cip->ci_refs++; 1513 ASSERT(cip->ci_refs != 0); 1514 } 1515 1516 static fmd_case_impl_t * 1517 fmd_case_tryhold(fmd_case_impl_t *cip) 1518 { 1519 /* 1520 * If the case's "deleting" bit is unset, hold and return case, 1521 * otherwise, return NULL. 
1522 */ 1523 (void) pthread_mutex_lock(&cip->ci_lock); 1524 if (cip->ci_flags & FMD_CF_DELETING) { 1525 (void) pthread_mutex_unlock(&cip->ci_lock); 1526 cip = NULL; 1527 } else { 1528 fmd_case_hold_locked((fmd_case_t *)cip); 1529 (void) pthread_mutex_unlock(&cip->ci_lock); 1530 } 1531 return (cip); 1532 } 1533 1534 void 1535 fmd_case_rele(fmd_case_t *cp) 1536 { 1537 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1538 1539 (void) pthread_mutex_lock(&cip->ci_lock); 1540 ASSERT(cip->ci_refs != 0); 1541 1542 if (--cip->ci_refs == 0) 1543 fmd_case_destroy((fmd_case_t *)cip, B_TRUE); 1544 else 1545 (void) pthread_mutex_unlock(&cip->ci_lock); 1546 } 1547 1548 void 1549 fmd_case_rele_locked(fmd_case_t *cp) 1550 { 1551 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1552 1553 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1554 --cip->ci_refs; 1555 ASSERT(cip->ci_refs != 0); 1556 } 1557 1558 int 1559 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep) 1560 { 1561 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1562 fmd_case_item_t *cit; 1563 fmd_event_t *oep; 1564 uint_t state; 1565 int new; 1566 1567 fmd_event_hold(ep); 1568 (void) pthread_mutex_lock(&cip->ci_lock); 1569 1570 if (cip->ci_flags & FMD_CF_SOLVED) 1571 state = FMD_EVS_DIAGNOSED; 1572 else 1573 state = FMD_EVS_ACCEPTED; 1574 1575 oep = cip->ci_principal; 1576 cip->ci_principal = ep; 1577 1578 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1579 if (cit->cit_event == ep) 1580 break; 1581 } 1582 1583 cip->ci_flags |= FMD_CF_DIRTY; 1584 new = cit == NULL && ep != oep; 1585 1586 (void) pthread_mutex_unlock(&cip->ci_lock); 1587 1588 fmd_module_setcdirty(cip->ci_mod); 1589 fmd_event_transition(ep, state); 1590 1591 if (oep != NULL) 1592 fmd_event_rele(oep); 1593 1594 return (new); 1595 } 1596 1597 int 1598 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep) 1599 { 1600 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1601 fmd_case_item_t *cit; 1602 uint_t state; 1603 int new; 1604 boolean_t injected; 1605 1606 
(void) pthread_mutex_lock(&cip->ci_lock); 1607 1608 if (cip->ci_flags & FMD_CF_SOLVED) 1609 state = FMD_EVS_DIAGNOSED; 1610 else 1611 state = FMD_EVS_ACCEPTED; 1612 1613 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1614 if (cit->cit_event == ep) 1615 break; 1616 } 1617 1618 new = cit == NULL && ep != cip->ci_principal; 1619 1620 /* 1621 * If the event is already in the case or the case is already solved, 1622 * there is no reason to save it: just transition it appropriately. 1623 */ 1624 if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) { 1625 (void) pthread_mutex_unlock(&cip->ci_lock); 1626 fmd_event_transition(ep, state); 1627 return (new); 1628 } 1629 1630 cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP); 1631 fmd_event_hold(ep); 1632 1633 if (nvlist_lookup_boolean_value(((fmd_event_impl_t *)ep)->ev_nvl, 1634 "__injected", &injected) == 0 && injected) 1635 fmd_case_set_injected(cp); 1636 1637 cit->cit_next = cip->ci_items; 1638 cit->cit_event = ep; 1639 1640 cip->ci_items = cit; 1641 cip->ci_nitems++; 1642 1643 cip->ci_flags |= FMD_CF_DIRTY; 1644 (void) pthread_mutex_unlock(&cip->ci_lock); 1645 1646 fmd_module_setcdirty(cip->ci_mod); 1647 fmd_event_transition(ep, state); 1648 1649 return (new); 1650 } 1651 1652 void 1653 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl) 1654 { 1655 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1656 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1657 1658 (void) pthread_mutex_lock(&cip->ci_lock); 1659 ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT); 1660 cip->ci_flags |= FMD_CF_DIRTY; 1661 1662 cis->cis_next = cip->ci_suspects; 1663 cis->cis_nvl = nvl; 1664 1665 cip->ci_suspects = cis; 1666 cip->ci_nsuspects++; 1667 1668 (void) pthread_mutex_unlock(&cip->ci_lock); 1669 if (cip->ci_xprt == NULL) 1670 fmd_module_setcdirty(cip->ci_mod); 1671 } 1672 1673 void 1674 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl) 1675 { 1676 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1677 
fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1678 boolean_t b; 1679 1680 (void) pthread_mutex_lock(&cip->ci_lock); 1681 1682 cis->cis_next = cip->ci_suspects; 1683 cis->cis_nvl = nvl; 1684 1685 if (nvlist_lookup_boolean_value(nvl, 1686 FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE) 1687 cip->ci_flags |= FMD_CF_INVISIBLE; 1688 1689 cip->ci_suspects = cis; 1690 cip->ci_nsuspects++; 1691 1692 (void) pthread_mutex_unlock(&cip->ci_lock); 1693 } 1694 1695 void 1696 fmd_case_reset_suspects(fmd_case_t *cp) 1697 { 1698 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1699 1700 (void) pthread_mutex_lock(&cip->ci_lock); 1701 ASSERT(cip->ci_state < FMD_CASE_SOLVED); 1702 1703 fmd_case_destroy_suspects(cip); 1704 cip->ci_flags |= FMD_CF_DIRTY; 1705 1706 (void) pthread_mutex_unlock(&cip->ci_lock); 1707 fmd_module_setcdirty(cip->ci_mod); 1708 } 1709 1710 /*ARGSUSED*/ 1711 static void 1712 fmd_case_unusable(fmd_asru_link_t *alp, void *arg) 1713 { 1714 (void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE); 1715 } 1716 1717 /* 1718 * Grab ci_lock and update the case state and set the dirty bit. Then perform 1719 * whatever actions and emit whatever events are appropriate for the state. 1720 * Refer to the topmost block comment explaining the state machine for details. 
1721 */ 1722 void 1723 fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) 1724 { 1725 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1726 fmd_case_item_t *cit; 1727 fmd_event_t *e; 1728 int resolved = 0; 1729 int any_unusable_and_present = 0; 1730 1731 ASSERT(state <= FMD_CASE_RESOLVED); 1732 (void) pthread_mutex_lock(&cip->ci_lock); 1733 1734 if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED)) 1735 flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED | FMD_CF_RESOLVED); 1736 1737 cip->ci_flags |= flags; 1738 1739 if (cip->ci_state >= state) { 1740 (void) pthread_mutex_unlock(&cip->ci_lock); 1741 return; /* already in specified state */ 1742 } 1743 1744 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid, 1745 _fmd_case_snames[cip->ci_state], _fmd_case_snames[state])); 1746 1747 cip->ci_state = state; 1748 cip->ci_flags |= FMD_CF_DIRTY; 1749 1750 if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod) 1751 fmd_module_setcdirty(cip->ci_mod); 1752 1753 switch (state) { 1754 case FMD_CASE_SOLVED: 1755 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 1756 fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED); 1757 1758 if (cip->ci_principal != NULL) { 1759 fmd_event_transition(cip->ci_principal, 1760 FMD_EVS_DIAGNOSED); 1761 } 1762 break; 1763 1764 case FMD_CASE_CLOSE_WAIT: 1765 /* 1766 * If the case was never solved, do not change ASRUs. 1767 * If the case was never fmd_case_closed, do not change ASRUs. 1768 * If the case was repaired, do not change ASRUs. 1769 */ 1770 if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED | 1771 FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED)) 1772 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1773 fmd_case_unusable, NULL); 1774 1775 /* 1776 * If an orphaned case transitions to CLOSE_WAIT, the owning 1777 * module is no longer loaded: continue on to CASE_CLOSED or 1778 * CASE_REPAIRED as appropriate. 
1779 */ 1780 if (fmd_case_orphaned(cp)) { 1781 if (cip->ci_flags & FMD_CF_REPAIRED) { 1782 state = cip->ci_state = FMD_CASE_REPAIRED; 1783 TRACE((FMD_DBG_CASE, "case %s %s->%s", 1784 cip->ci_uuid, 1785 _fmd_case_snames[FMD_CASE_CLOSE_WAIT], 1786 _fmd_case_snames[FMD_CASE_REPAIRED])); 1787 goto do_repair; 1788 } else { 1789 state = cip->ci_state = FMD_CASE_CLOSED; 1790 TRACE((FMD_DBG_CASE, "case %s %s->%s", 1791 cip->ci_uuid, 1792 _fmd_case_snames[FMD_CASE_CLOSE_WAIT], 1793 _fmd_case_snames[FMD_CASE_CLOSED])); 1794 } 1795 } 1796 break; 1797 1798 case FMD_CASE_REPAIRED: 1799 do_repair: 1800 ASSERT(cip->ci_xprt != NULL || fmd_case_orphaned(cp)); 1801 1802 /* 1803 * If we've been requested to transition straight on to the 1804 * RESOLVED state (which can happen with fault proxying where a 1805 * list.resolved or a uuresolved is received from the other 1806 * side), or if all suspects are already either usable or not 1807 * present then transition straight to RESOLVED state, 1808 * publishing both the list.repaired and list.resolved. For a 1809 * proxy, if we discover here that all suspects are already 1810 * either usable or not present, notify the diag side instead 1811 * using fmd_xprt_uuresolved(). 
1812 */ 1813 if (flags & FMD_CF_RESOLVED) { 1814 if (cip->ci_xprt != NULL) 1815 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1816 } else { 1817 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1818 fmd_case_unusable_and_present, 1819 &any_unusable_and_present); 1820 if (any_unusable_and_present) 1821 break; 1822 if (cip->ci_xprt != NULL) { 1823 fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid); 1824 break; 1825 } 1826 } 1827 1828 cip->ci_state = FMD_CASE_RESOLVED; 1829 (void) pthread_mutex_unlock(&cip->ci_lock); 1830 fmd_case_publish(cp, state); 1831 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid, 1832 _fmd_case_snames[FMD_CASE_REPAIRED], 1833 _fmd_case_snames[FMD_CASE_RESOLVED])); 1834 state = FMD_CASE_RESOLVED; 1835 resolved = 1; 1836 (void) pthread_mutex_lock(&cip->ci_lock); 1837 break; 1838 1839 case FMD_CASE_RESOLVED: 1840 /* 1841 * For a proxy, no need to check that all suspects are already 1842 * either usable or not present - this request has come from 1843 * the diagnosing side which makes the final decision on this. 1844 */ 1845 if (cip->ci_xprt != NULL) { 1846 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1847 resolved = 1; 1848 break; 1849 } 1850 1851 ASSERT(fmd_case_orphaned(cp)); 1852 1853 /* 1854 * If all suspects are already either usable or not present then 1855 * carry on, publish list.resolved and discard the case. 1856 */ 1857 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1858 fmd_case_unusable_and_present, &any_unusable_and_present); 1859 if (any_unusable_and_present) { 1860 (void) pthread_mutex_unlock(&cip->ci_lock); 1861 return; 1862 } 1863 1864 resolved = 1; 1865 break; 1866 } 1867 1868 (void) pthread_mutex_unlock(&cip->ci_lock); 1869 1870 /* 1871 * If the module has initialized, then publish the appropriate event 1872 * for the new case state. 
If not, we are being called from the 1873 * checkpoint code during module load, in which case the module's 1874 * _fmd_init() routine hasn't finished yet, and our event dictionaries 1875 * may not be open yet, which will prevent us from computing the event 1876 * code. Defer the call to fmd_case_publish() by enqueuing a PUBLISH 1877 * event in our queue: this won't be processed until _fmd_init is done. 1878 */ 1879 if (cip->ci_mod->mod_flags & FMD_MOD_INIT) 1880 fmd_case_publish(cp, state); 1881 else { 1882 fmd_case_hold(cp); 1883 e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp); 1884 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 1885 } 1886 1887 if (resolved) { 1888 if (cip->ci_xprt != NULL) { 1889 /* 1890 * If we transitioned to RESOLVED, adjust the reference 1891 * count to reflect our removal from 1892 * fmd.d_rmod->mod_cases above. If the caller has not 1893 * placed an additional hold on the case, it will now 1894 * be freed. 1895 */ 1896 (void) pthread_mutex_lock(&cip->ci_lock); 1897 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 1898 (void) pthread_mutex_unlock(&cip->ci_lock); 1899 fmd_case_rele(cp); 1900 } else { 1901 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1902 fmd_asru_log_resolved, NULL); 1903 (void) pthread_mutex_lock(&cip->ci_lock); 1904 /* mark as "ready to be discarded */ 1905 cip->ci_flags |= FMD_CF_RES_CMPL; 1906 (void) pthread_mutex_unlock(&cip->ci_lock); 1907 } 1908 } 1909 } 1910 1911 /* 1912 * Discard any case if it is in RESOLVED state (and if check_if_aged argument 1913 * is set if all suspects have passed the rsrc.aged time). 1914 */ 1915 void 1916 fmd_case_discard_resolved(fmd_case_t *cp, void *arg) 1917 { 1918 int check_if_aged = *(int *)arg; 1919 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1920 1921 /* 1922 * First check if case has completed transition to resolved. 
1923 */ 1924 (void) pthread_mutex_lock(&cip->ci_lock); 1925 if (!(cip->ci_flags & FMD_CF_RES_CMPL)) { 1926 (void) pthread_mutex_unlock(&cip->ci_lock); 1927 return; 1928 } 1929 1930 /* 1931 * Now if check_is_aged is set, see if all suspects have aged. 1932 */ 1933 if (check_if_aged) { 1934 int aged = 1; 1935 1936 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1937 fmd_asru_check_if_aged, &aged); 1938 if (!aged) { 1939 (void) pthread_mutex_unlock(&cip->ci_lock); 1940 return; 1941 } 1942 } 1943 1944 /* 1945 * Finally discard the case, clearing FMD_CF_RES_CMPL so we don't 1946 * do it twice. 1947 */ 1948 fmd_module_lock(cip->ci_mod); 1949 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1950 fmd_module_unlock(cip->ci_mod); 1951 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 1952 cip->ci_flags &= ~FMD_CF_RES_CMPL; 1953 (void) pthread_mutex_unlock(&cip->ci_lock); 1954 fmd_case_rele(cp); 1955 } 1956 1957 /* 1958 * Transition the specified case to *at least* the specified state by first 1959 * re-validating the suspect list using the resource cache. This function is 1960 * employed by the checkpoint code when restoring a saved, solved case to see 1961 * if the state of the case has effectively changed while fmd was not running 1962 * or the module was not loaded. 1963 */ 1964 void 1965 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags) 1966 { 1967 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1968 1969 int usable = 0; /* are any suspects usable? 
*/ 1970 1971 ASSERT(state >= FMD_CASE_SOLVED); 1972 (void) pthread_mutex_lock(&cip->ci_lock); 1973 1974 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable); 1975 1976 (void) pthread_mutex_unlock(&cip->ci_lock); 1977 1978 if (!usable) { 1979 state = MAX(state, FMD_CASE_CLOSE_WAIT); 1980 flags |= FMD_CF_ISOLATED; 1981 } 1982 1983 fmd_case_transition(cp, state, flags); 1984 } 1985 1986 void 1987 fmd_case_setdirty(fmd_case_t *cp) 1988 { 1989 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1990 1991 (void) pthread_mutex_lock(&cip->ci_lock); 1992 cip->ci_flags |= FMD_CF_DIRTY; 1993 (void) pthread_mutex_unlock(&cip->ci_lock); 1994 1995 fmd_module_setcdirty(cip->ci_mod); 1996 } 1997 1998 void 1999 fmd_case_clrdirty(fmd_case_t *cp) 2000 { 2001 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2002 2003 (void) pthread_mutex_lock(&cip->ci_lock); 2004 cip->ci_flags &= ~FMD_CF_DIRTY; 2005 (void) pthread_mutex_unlock(&cip->ci_lock); 2006 } 2007 2008 void 2009 fmd_case_commit(fmd_case_t *cp) 2010 { 2011 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2012 fmd_case_item_t *cit; 2013 2014 (void) pthread_mutex_lock(&cip->ci_lock); 2015 2016 if (cip->ci_flags & FMD_CF_DIRTY) { 2017 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 2018 fmd_event_commit(cit->cit_event); 2019 2020 if (cip->ci_principal != NULL) 2021 fmd_event_commit(cip->ci_principal); 2022 2023 fmd_buf_hash_commit(&cip->ci_bufs); 2024 cip->ci_flags &= ~FMD_CF_DIRTY; 2025 } 2026 2027 (void) pthread_mutex_unlock(&cip->ci_lock); 2028 } 2029 2030 /* 2031 * On proxy side, send back repair/acquit/etc request to diagnosing side 2032 */ 2033 void 2034 fmd_case_xprt_updated(fmd_case_t *cp) 2035 { 2036 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2037 nvlist_t **nva; 2038 uint8_t *ba; 2039 int msg = B_TRUE; 2040 int count = 0; 2041 fmd_case_lst_t fcl; 2042 2043 ASSERT(cip->ci_xprt != NULL); 2044 (void) pthread_mutex_lock(&cip->ci_lock); 2045 ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects); 2046 nva = 
alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); 2047 fcl.fcl_countp = &count; 2048 fcl.fcl_maxcount = cip->ci_nsuspects; 2049 fcl.fcl_msgp = &msg; 2050 fcl.fcl_ba = ba; 2051 fcl.fcl_nva = nva; 2052 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl); 2053 (void) pthread_mutex_unlock(&cip->ci_lock); 2054 fmd_xprt_updated(cip->ci_xprt, cip->ci_uuid, ba, cip->ci_proxy_asru, 2055 count); 2056 } 2057 2058 /* 2059 * fmd_case_update_status() can be called on either the proxy side when a 2060 * list.suspect is received, or on the diagnosing side when an update request 2061 * is received from the proxy. It updates the status in the resource cache. 2062 */ 2063 void 2064 fmd_case_update_status(fmd_case_t *cp, uint8_t *statusp, uint8_t *proxy_asrup, 2065 uint8_t *diag_asrup) 2066 { 2067 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2068 int count = 0; 2069 fmd_asru_update_status_t faus; 2070 2071 /* 2072 * update status of resource cache entries 2073 */ 2074 faus.faus_countp = &count; 2075 faus.faus_maxcount = cip->ci_nsuspects; 2076 faus.faus_ba = statusp; 2077 faus.faus_proxy_asru = proxy_asrup; 2078 faus.faus_diag_asru = diag_asrup; 2079 faus.faus_is_proxy = (cip->ci_xprt != NULL); 2080 (void) pthread_mutex_lock(&cip->ci_lock); 2081 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_update_status, 2082 &faus); 2083 (void) pthread_mutex_unlock(&cip->ci_lock); 2084 } 2085 2086 /* 2087 * Called on either the proxy side or the diag side when a repair has taken 2088 * place on the other side but this side may know the asru "contains" 2089 * relationships. 
2090 */ 2091 void 2092 fmd_case_update_containees(fmd_case_t *cp) 2093 { 2094 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2095 2096 (void) pthread_mutex_lock(&cip->ci_lock); 2097 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 2098 fmd_asru_update_containees, NULL); 2099 (void) pthread_mutex_unlock(&cip->ci_lock); 2100 } 2101 2102 /* 2103 * fmd_case_close_status() is called on diagnosing side when proxy side 2104 * has had a uuclose. It updates the status in the resource cache. 2105 */ 2106 void 2107 fmd_case_close_status(fmd_case_t *cp) 2108 { 2109 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2110 int count = 0; 2111 fmd_asru_close_status_t facs; 2112 2113 /* 2114 * update status of resource cache entries 2115 */ 2116 facs.facs_countp = &count; 2117 facs.facs_maxcount = cip->ci_nsuspects; 2118 (void) pthread_mutex_lock(&cip->ci_lock); 2119 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_close_status, 2120 &facs); 2121 (void) pthread_mutex_unlock(&cip->ci_lock); 2122 } 2123 2124 /* 2125 * Indicate that the case may need to change state because one or more of the 2126 * ASRUs named as a suspect has changed state. We examine all the suspects 2127 * and if none are still faulty, we initiate a case close transition. 
2128 */ 2129 void 2130 fmd_case_update(fmd_case_t *cp) 2131 { 2132 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2133 uint_t cstate; 2134 int faulty = 0; 2135 2136 (void) pthread_mutex_lock(&cip->ci_lock); 2137 cstate = cip->ci_state; 2138 2139 if (cip->ci_state < FMD_CASE_SOLVED) { 2140 (void) pthread_mutex_unlock(&cip->ci_lock); 2141 return; /* update is not appropriate */ 2142 } 2143 2144 if (cip->ci_flags & FMD_CF_REPAIRED) { 2145 (void) pthread_mutex_unlock(&cip->ci_lock); 2146 return; /* already repaired */ 2147 } 2148 2149 TRACE((FMD_DBG_CASE, "case update %s", cip->ci_uuid)); 2150 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 2151 (void) pthread_mutex_unlock(&cip->ci_lock); 2152 2153 if (faulty) { 2154 nvlist_t *nvl; 2155 fmd_event_t *e; 2156 char *class; 2157 2158 TRACE((FMD_DBG_CASE, "sending list.updated %s", cip->ci_uuid)); 2159 nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS); 2160 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2161 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 2162 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 2163 fmd_log_append(fmd.d_fltlog, e, cp); 2164 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 2165 fmd_dispq_dispatch(fmd.d_disp, e, class); 2166 return; /* one or more suspects are still marked faulty */ 2167 } 2168 2169 if (cstate == FMD_CASE_CLOSED) 2170 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 2171 else 2172 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 2173 } 2174 2175 /* 2176 * Delete a closed case from the module's case list once the fmdo_close() entry 2177 * point has run to completion. If the case is owned by a transport module, 2178 * tell the transport to proxy a case close on the other end of the transport. 2179 * Transition to the appropriate next state based on ci_flags. 
This 2180 * function represents the end of CLOSE_WAIT and transitions the case to either 2181 * CLOSED or REPAIRED or discards it entirely because it was never solved; 2182 * refer to the topmost block comment explaining the state machine for details. 2183 */ 2184 void 2185 fmd_case_delete(fmd_case_t *cp) 2186 { 2187 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2188 fmd_modstat_t *msp; 2189 size_t buftotal; 2190 2191 TRACE((FMD_DBG_CASE, "case delete %s", cip->ci_uuid)); 2192 ASSERT(fmd_module_locked(cip->ci_mod)); 2193 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 2194 buftotal = fmd_buf_hash_destroy(&cip->ci_bufs); 2195 2196 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 2197 msp = cip->ci_mod->mod_stats; 2198 2199 ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0); 2200 msp->ms_caseopen.fmds_value.ui64--; 2201 2202 ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal); 2203 msp->ms_buftotal.fmds_value.ui64 -= buftotal; 2204 2205 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 2206 2207 if (cip->ci_xprt == NULL) 2208 fmd_module_setcdirty(cip->ci_mod); 2209 2210 fmd_module_rele(cip->ci_mod); 2211 cip->ci_mod = fmd.d_rmod; 2212 fmd_module_hold(cip->ci_mod); 2213 2214 /* 2215 * If the case has been solved, then retain it 2216 * on the root module's case list at least until we're transitioned. 2217 * Otherwise free the case with our final fmd_case_rele() below. 2218 */ 2219 if (cip->ci_flags & FMD_CF_SOLVED) { 2220 fmd_module_lock(cip->ci_mod); 2221 fmd_list_append(&cip->ci_mod->mod_cases, cip); 2222 fmd_module_unlock(cip->ci_mod); 2223 fmd_case_hold(cp); 2224 } 2225 2226 /* 2227 * Transition onwards to REPAIRED or CLOSED as originally requested. 2228 * Note that for proxy case if we're transitioning to CLOSED it means 2229 * the case was isolated locally, so call fmd_xprt_uuclose() to notify 2230 * the diagnosing side. 
No need to notify the diagnosing side if we are 2231 * transitioning to REPAIRED as we only do this when requested to do 2232 * so by the diagnosing side anyway. 2233 */ 2234 if (cip->ci_flags & FMD_CF_REPAIRED) 2235 fmd_case_transition(cp, FMD_CASE_REPAIRED, 0); 2236 else if (cip->ci_flags & FMD_CF_ISOLATED) { 2237 fmd_case_transition(cp, FMD_CASE_CLOSED, 0); 2238 if (cip->ci_xprt != NULL) 2239 fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid); 2240 } 2241 2242 fmd_case_rele(cp); 2243 } 2244 2245 void 2246 fmd_case_discard(fmd_case_t *cp, boolean_t delete_from_asru_cache) 2247 { 2248 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2249 2250 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 2251 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--; 2252 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 2253 2254 ASSERT(fmd_module_locked(cip->ci_mod)); 2255 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 2256 if (delete_from_asru_cache) { 2257 (void) pthread_mutex_lock(&cip->ci_lock); 2258 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 2259 (void) pthread_mutex_unlock(&cip->ci_lock); 2260 } 2261 fmd_case_rele(cp); 2262 } 2263 2264 /* 2265 * Indicate that the problem corresponding to a case has been repaired by 2266 * clearing the faulty bit on each ASRU named as a suspect. If the case hasn't 2267 * already been closed, this function initiates the transition to CLOSE_WAIT. 2268 * The caller must have the case held from fmd_case_hash_lookup(), so we can 2269 * grab and drop ci_lock without the case being able to be freed in between. 
2270 */ 2271 int 2272 fmd_case_repair(fmd_case_t *cp) 2273 { 2274 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2275 uint_t cstate; 2276 fmd_asru_rep_arg_t fara; 2277 2278 (void) pthread_mutex_lock(&cip->ci_lock); 2279 cstate = cip->ci_state; 2280 2281 if (cstate < FMD_CASE_SOLVED) { 2282 (void) pthread_mutex_unlock(&cip->ci_lock); 2283 return (fmd_set_errno(EFMD_CASE_STATE)); 2284 } 2285 2286 if (cip->ci_flags & FMD_CF_REPAIRED) { 2287 (void) pthread_mutex_unlock(&cip->ci_lock); 2288 return (0); /* already repaired */ 2289 } 2290 2291 TRACE((FMD_DBG_CASE, "case repair %s", cip->ci_uuid)); 2292 fara.fara_reason = FMD_ASRU_REPAIRED; 2293 fara.fara_bywhat = FARA_BY_CASE; 2294 fara.fara_rval = NULL; 2295 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara); 2296 (void) pthread_mutex_unlock(&cip->ci_lock); 2297 2298 /* 2299 * if this is a proxied case, send the repair across the transport. 2300 * The remote side will then do the repair and send a list.repaired back 2301 * again such that we can finally repair the case on this side. 
2302 */ 2303 if (cip->ci_xprt != NULL) { 2304 fmd_case_xprt_updated(cp); 2305 return (0); 2306 } 2307 2308 if (cstate == FMD_CASE_CLOSED) 2309 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 2310 else 2311 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 2312 2313 return (0); 2314 } 2315 2316 int 2317 fmd_case_acquit(fmd_case_t *cp) 2318 { 2319 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2320 uint_t cstate; 2321 fmd_asru_rep_arg_t fara; 2322 2323 (void) pthread_mutex_lock(&cip->ci_lock); 2324 cstate = cip->ci_state; 2325 2326 if (cstate < FMD_CASE_SOLVED) { 2327 (void) pthread_mutex_unlock(&cip->ci_lock); 2328 return (fmd_set_errno(EFMD_CASE_STATE)); 2329 } 2330 2331 if (cip->ci_flags & FMD_CF_REPAIRED) { 2332 (void) pthread_mutex_unlock(&cip->ci_lock); 2333 return (0); /* already repaired */ 2334 } 2335 2336 TRACE((FMD_DBG_CASE, "case acquit %s", cip->ci_uuid)); 2337 fara.fara_reason = FMD_ASRU_ACQUITTED; 2338 fara.fara_bywhat = FARA_BY_CASE; 2339 fara.fara_rval = NULL; 2340 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara); 2341 (void) pthread_mutex_unlock(&cip->ci_lock); 2342 2343 /* 2344 * if this is a proxied case, send the repair across the transport. 2345 * The remote side will then do the repair and send a list.repaired back 2346 * again such that we can finally repair the case on this side. 
2347 */ 2348 if (cip->ci_xprt != NULL) { 2349 fmd_case_xprt_updated(cp); 2350 return (0); 2351 } 2352 2353 if (cstate == FMD_CASE_CLOSED) 2354 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 2355 else 2356 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 2357 2358 return (0); 2359 } 2360 2361 int 2362 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep) 2363 { 2364 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2365 fmd_case_item_t *cit; 2366 uint_t state; 2367 int rv = 0; 2368 2369 (void) pthread_mutex_lock(&cip->ci_lock); 2370 2371 if (cip->ci_state >= FMD_CASE_SOLVED) 2372 state = FMD_EVS_DIAGNOSED; 2373 else 2374 state = FMD_EVS_ACCEPTED; 2375 2376 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 2377 if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0) 2378 break; 2379 } 2380 2381 if (rv == 0 && cip->ci_principal != NULL) 2382 rv = fmd_event_equal(ep, cip->ci_principal); 2383 2384 (void) pthread_mutex_unlock(&cip->ci_lock); 2385 2386 if (rv != 0) 2387 fmd_event_transition(ep, state); 2388 2389 return (rv); 2390 } 2391 2392 int 2393 fmd_case_orphaned(fmd_case_t *cp) 2394 { 2395 return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod); 2396 } 2397 2398 void 2399 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec) 2400 { 2401 ((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec; 2402 ((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec; 2403 ((fmd_case_impl_t *)cp)->ci_tv_valid = 1; 2404 } 2405 2406 void 2407 fmd_case_set_injected(fmd_case_t *cp) 2408 { 2409 ((fmd_case_impl_t *)cp)->ci_injected = 1; 2410 } 2411 2412 void 2413 fmd_case_set_de_fmri(fmd_case_t *cp, nvlist_t *nvl) 2414 { 2415 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2416 2417 if (cip->ci_diag_de) 2418 nvlist_free(cip->ci_diag_de); 2419 cip->ci_diag_de = nvl; 2420 } 2421 2422 void 2423 fmd_case_setcode(fmd_case_t *cp, char *code) 2424 { 2425 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2426 2427 cip->ci_code = fmd_strdup(code, FMD_SLEEP); 2428 
cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0; 2429 } 2430 2431 /*ARGSUSED*/ 2432 static void 2433 fmd_case_repair_replay_case(fmd_case_t *cp, void *arg) 2434 { 2435 int not_faulty = 0; 2436 int faulty = 0; 2437 nvlist_t *nvl; 2438 fmd_event_t *e; 2439 char *class; 2440 int any_unusable_and_present = 0; 2441 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2442 2443 if (cip->ci_state < FMD_CASE_SOLVED || cip->ci_xprt != NULL) 2444 return; 2445 2446 if (cip->ci_state == FMD_CASE_RESOLVED) { 2447 cip->ci_flags |= FMD_CF_RES_CMPL; 2448 return; 2449 } 2450 2451 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 2452 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty, 2453 ¬_faulty); 2454 2455 if (cip->ci_state >= FMD_CASE_REPAIRED && !faulty) { 2456 /* 2457 * If none of the suspects is faulty, replay the list.repaired. 2458 * If all suspects are already either usable or not present then 2459 * also transition straight to RESOLVED state. 2460 */ 2461 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 2462 fmd_case_unusable_and_present, &any_unusable_and_present); 2463 if (!any_unusable_and_present) { 2464 cip->ci_state = FMD_CASE_RESOLVED; 2465 2466 TRACE((FMD_DBG_CASE, "replay sending list.repaired %s", 2467 cip->ci_uuid)); 2468 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 2469 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2470 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, 2471 class); 2472 fmd_dispq_dispatch(fmd.d_disp, e, class); 2473 2474 TRACE((FMD_DBG_CASE, "replay sending list.resolved %s", 2475 cip->ci_uuid)); 2476 fmd_case_publish(cp, FMD_CASE_RESOLVED); 2477 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 2478 fmd_asru_log_resolved, NULL); 2479 cip->ci_flags |= FMD_CF_RES_CMPL; 2480 } else { 2481 TRACE((FMD_DBG_CASE, "replay sending list.repaired %s", 2482 cip->ci_uuid)); 2483 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 2484 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2485 e = 
fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, 2486 class); 2487 fmd_dispq_dispatch(fmd.d_disp, e, class); 2488 } 2489 } else if (faulty && not_faulty) { 2490 /* 2491 * if some but not all of the suspects are not faulty, replay 2492 * the list.updated. 2493 */ 2494 TRACE((FMD_DBG_CASE, "replay sending list.updated %s", 2495 cip->ci_uuid)); 2496 nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS); 2497 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2498 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 2499 fmd_dispq_dispatch(fmd.d_disp, e, class); 2500 } 2501 } 2502 2503 void 2504 fmd_case_repair_replay() 2505 { 2506 fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL); 2507 } 2508