1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * FMD Case Subsystem 29 * 30 * Diagnosis engines are expected to group telemetry events related to the 31 * diagnosis of a particular problem on the system into a set of cases. The 32 * diagnosis engine may have any number of cases open at a given point in time. 33 * Some cases may eventually be *solved* by associating a suspect list of one 34 * or more problems with the case, at which point fmd publishes a list.suspect 35 * event for the case and it becomes visible to administrators and agents. 36 * 37 * Every case is named using a UUID, and is globally visible in the case hash. 38 * Cases are reference-counted, except for the reference from the case hash 39 * itself. Consumers of case references include modules, which store active 40 * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code. 41 * 42 * Cases obey the following state machine. 
In states UNSOLVED, SOLVED, and 43 * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine 44 * or transport) and the case is referenced by the mod_cases list. Once the 45 * case reaches the CLOSED or REPAIRED states, a case's module changes to refer 46 * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases. 47 * 48 * +------------+ 49 * +----------| UNSOLVED | 50 * | +------------+ 51 * | 1 | 52 * | | 53 * | +-------v----+ 54 * 2 | | SOLVED | 55 * | +------------+ 56 * | 3 | 5 | 57 * +------------+ | | 58 * | | | 59 * +-v---v----v-+ 60 * | CLOSE_WAIT | 61 * +------------+ 62 * | | | 63 * +-----------+ | +------------+ 64 * | 4 | | 65 * v +-----v------+ | 66 * discard | CLOSED | 6 | 67 * +------------+ | 68 * | | 69 * | +------------+ 70 * 7 | | 71 * +-----v----v-+ 72 * | REPAIRED | 73 * +------------+ 74 * | 75 * 8 | 76 * +-----v------+ 77 * | RESOLVED | 78 * +------------+ 79 * | 80 * v 81 * discard 82 * 83 * The state machine changes are triggered by calls to fmd_case_transition() 84 * from various locations inside of fmd, as described below: 85 * 86 * [1] Called by: fmd_case_solve() 87 * Actions: FMD_CF_SOLVED flag is set in ci_flags 88 * conviction policy is applied to suspect list 89 * suspects convicted are marked faulty (F) in R$ 90 * list.suspect event logged and dispatched 91 * 92 * [2] Called by: fmd_case_close(), fmd_case_uuclose() 93 * Actions: diagnosis engine fmdo_close() entry point scheduled 94 * case discarded upon exit from CLOSE_WAIT 95 * 96 * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose() 97 * Actions: FMD_CF_ISOLATED flag is set in ci_flags 98 * suspects convicted (F) are marked unusable (U) in R$ 99 * diagnosis engine fmdo_close() entry point scheduled 100 * case transitions to CLOSED [4] upon exit from CLOSE_WAIT 101 * 102 * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns) 103 * Actions: list.isolated event dispatched 104 * case deleted from 
module's list of open cases 105 * 106 * [5] Called by: fmd_case_repair(), fmd_case_update() 107 * Actions: FMD_CF_REPAIR flag is set in ci_flags 108 * diagnosis engine fmdo_close() entry point scheduled 109 * case transitions to REPAIRED [6] upon exit from CLOSE_WAIT 110 * 111 * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns) 112 * Actions: suspects convicted are marked non faulty (!F) in R$ 113 * list.repaired or list.updated event dispatched 114 * 115 * [7] Called by: fmd_case_repair(), fmd_case_update() 116 * Actions: FMD_CF_REPAIR flag is set in ci_flags 117 * suspects convicted are marked non faulty (!F) in R$ 118 * list.repaired or list.updated event dispatched 119 * 120 * [8] Called by: fmd_case_uuresolve() 121 * Actions: list.resolved event dispatched 122 * case is discarded 123 */ 124 125 #include <sys/fm/protocol.h> 126 #include <uuid/uuid.h> 127 #include <alloca.h> 128 129 #include <fmd_alloc.h> 130 #include <fmd_module.h> 131 #include <fmd_error.h> 132 #include <fmd_conf.h> 133 #include <fmd_case.h> 134 #include <fmd_string.h> 135 #include <fmd_subr.h> 136 #include <fmd_protocol.h> 137 #include <fmd_event.h> 138 #include <fmd_eventq.h> 139 #include <fmd_dispq.h> 140 #include <fmd_buf.h> 141 #include <fmd_log.h> 142 #include <fmd_asru.h> 143 #include <fmd_fmri.h> 144 #include <fmd_xprt.h> 145 146 #include <fmd.h> 147 148 static const char *const _fmd_case_snames[] = { 149 "UNSOLVED", /* FMD_CASE_UNSOLVED */ 150 "SOLVED", /* FMD_CASE_SOLVED */ 151 "CLOSE_WAIT", /* FMD_CASE_CLOSE_WAIT */ 152 "CLOSED", /* FMD_CASE_CLOSED */ 153 "REPAIRED", /* FMD_CASE_REPAIRED */ 154 "RESOLVED" /* FMD_CASE_RESOLVED */ 155 }; 156 157 static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *); 158 159 fmd_case_hash_t * 160 fmd_case_hash_create(void) 161 { 162 fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP); 163 164 (void) pthread_rwlock_init(&chp->ch_lock, NULL); 165 chp->ch_hashlen = fmd.d_str_buckets; 166 chp->ch_hash = 
fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP); 167 chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, 168 FMD_SLEEP); 169 chp->ch_count = 0; 170 171 return (chp); 172 } 173 174 /* 175 * Destroy the case hash. Unlike most of our hash tables, no active references 176 * are kept by the case hash itself; all references come from other subsystems. 177 * The hash must be destroyed after all modules are unloaded; if anything was 178 * present in the hash it would be by definition a reference count leak. 179 */ 180 void 181 fmd_case_hash_destroy(fmd_case_hash_t *chp) 182 { 183 fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen); 184 fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen); 185 fmd_free(chp, sizeof (fmd_case_hash_t)); 186 } 187 188 /* 189 * Take a snapshot of the case hash by placing an additional hold on each 190 * member in an auxiliary array, and then call 'func' for each case. 191 */ 192 void 193 fmd_case_hash_apply(fmd_case_hash_t *chp, 194 void (*func)(fmd_case_t *, void *), void *arg) 195 { 196 fmd_case_impl_t *cp, **cps, **cpp; 197 uint_t cpc, i; 198 199 (void) pthread_rwlock_rdlock(&chp->ch_lock); 200 201 cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP); 202 cpc = chp->ch_count; 203 204 for (i = 0; i < chp->ch_hashlen; i++) { 205 for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) 206 *cpp++ = fmd_case_tryhold(cp); 207 } 208 209 ASSERT(cpp == cps + cpc); 210 (void) pthread_rwlock_unlock(&chp->ch_lock); 211 212 for (i = 0; i < cpc; i++) { 213 if (cps[i] != NULL) { 214 func((fmd_case_t *)cps[i], arg); 215 fmd_case_rele((fmd_case_t *)cps[i]); 216 } 217 } 218 219 fmd_free(cps, cpc * sizeof (fmd_case_t *)); 220 } 221 222 static void 223 fmd_case_hash_apply_except_current(fmd_case_hash_t *chp, 224 void (*func)(fmd_case_t *, void *), void *arg, fmd_case_t *current) 225 { 226 fmd_case_impl_t *cp, **cps, **cpp; 227 uint_t cpc, i; 228 229 (void) pthread_rwlock_rdlock(&chp->ch_lock); 230 231 
cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP); 232 cpc = chp->ch_count; 233 234 for (i = 0; i < chp->ch_hashlen; i++) { 235 for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) 236 if (cp != (fmd_case_impl_t *)current) 237 *cpp++ = fmd_case_tryhold(cp); 238 else 239 *cpp++ = cp; 240 } 241 242 ASSERT(cpp == cps + cpc); 243 (void) pthread_rwlock_unlock(&chp->ch_lock); 244 245 for (i = 0; i < cpc; i++) { 246 if (cps[i] != NULL && cps[i] != (fmd_case_impl_t *)current) { 247 func((fmd_case_t *)cps[i], arg); 248 fmd_case_rele((fmd_case_t *)cps[i]); 249 } 250 } 251 252 fmd_free(cps, cpc * sizeof (fmd_case_t *)); 253 } 254 255 static void 256 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 257 { 258 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 259 260 cip->ci_code_next = chp->ch_code_hash[h]; 261 chp->ch_code_hash[h] = cip; 262 } 263 264 static void 265 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 266 { 267 fmd_case_impl_t **pp, *cp; 268 269 if (cip->ci_code) { 270 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 271 272 pp = &chp->ch_code_hash[h]; 273 for (cp = *pp; cp != NULL; cp = cp->ci_code_next) { 274 if (cp != cip) 275 pp = &cp->ci_code_next; 276 else 277 break; 278 } 279 if (cp != NULL) { 280 *pp = cp->ci_code_next; 281 cp->ci_code_next = NULL; 282 } 283 } 284 } 285 286 /* 287 * Look up the diagcode for this case and cache it in ci_code. If no suspects 288 * were defined for this case or if the lookup fails, the event dictionary or 289 * module code is broken, and we set the event code to a precomputed default. 
290 */ 291 static const char * 292 fmd_case_mkcode(fmd_case_t *cp) 293 { 294 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 295 fmd_case_susp_t *cis; 296 fmd_case_hash_t *chp = fmd.d_cases; 297 298 char **keys, **keyp; 299 const char *s; 300 301 ASSERT(MUTEX_HELD(&cip->ci_lock)); 302 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 303 304 /* 305 * delete any existing entry from code hash if it is on it 306 */ 307 fmd_case_code_hash_delete(chp, cip); 308 309 fmd_free(cip->ci_code, cip->ci_codelen); 310 cip->ci_codelen = cip->ci_mod->mod_codelen; 311 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 312 keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1)); 313 314 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 315 if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0) 316 keyp++; 317 } 318 319 *keyp = NULL; /* mark end of keys[] array for libdiagcode */ 320 321 if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code( 322 cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) { 323 (void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s); 324 fmd_free(cip->ci_code, cip->ci_codelen); 325 cip->ci_codelen = strlen(s) + 1; 326 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 327 (void) strcpy(cip->ci_code, s); 328 } 329 330 /* 331 * add into hash of solved cases 332 */ 333 fmd_case_code_hash_insert(chp, cip); 334 335 return (cip->ci_code); 336 } 337 338 typedef struct { 339 int *fcl_countp; 340 int fcl_maxcount; 341 uint8_t *fcl_ba; 342 nvlist_t **fcl_nva; 343 int *fcl_msgp; 344 } fmd_case_lst_t; 345 346 static void 347 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg) 348 { 349 fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg; 350 boolean_t b; 351 int state; 352 353 if (*entryp->fcl_countp >= entryp->fcl_maxcount) 354 return; 355 if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE, 356 &b) == 0 && b == B_FALSE) 357 *entryp->fcl_msgp = B_FALSE; 358 entryp->fcl_ba[*entryp->fcl_countp] = 0; 359 state = fmd_asru_al_getstate(alp); 
360 if (state & FMD_ASRU_DEGRADED) 361 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED; 362 if (state & FMD_ASRU_UNUSABLE) 363 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE; 364 if (state & FMD_ASRU_FAULTY) 365 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY; 366 if (!(state & FMD_ASRU_PRESENT)) 367 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT; 368 if (alp->al_reason == FMD_ASRU_REPAIRED) 369 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED; 370 else if (alp->al_reason == FMD_ASRU_REPLACED) 371 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED; 372 else if (alp->al_reason == FMD_ASRU_ACQUITTED) 373 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED; 374 entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event; 375 (*entryp->fcl_countp)++; 376 } 377 378 static void 379 fmd_case_faulty(fmd_asru_link_t *alp, void *arg) 380 { 381 int *faultyp = (int *)arg; 382 383 *faultyp |= (alp->al_flags & FMD_ASRU_FAULTY); 384 } 385 386 static void 387 fmd_case_usable(fmd_asru_link_t *alp, void *arg) 388 { 389 int *usablep = (int *)arg; 390 391 *usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE); 392 } 393 394 static void 395 fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg) 396 { 397 int *not_faultyp = (int *)arg; 398 399 *not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY); 400 } 401 402 /* 403 * Have we got any suspects with an asru that are still unusable and present? 404 */ 405 static void 406 fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg) 407 { 408 int *rvalp = (int *)arg; 409 int state; 410 nvlist_t *asru; 411 412 /* 413 * if this a proxy case and this suspect doesn't have an local asru 414 * then state is unknown so we must assume it may still be unusable. 
415 */ 416 if ((alp->al_flags & FMD_ASRU_PROXY) && 417 !(alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU)) { 418 *rvalp |= B_TRUE; 419 return; 420 } 421 422 state = fmd_asru_al_getstate(alp); 423 if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0) 424 return; 425 *rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT)); 426 } 427 428 nvlist_t * 429 fmd_case_mkevent(fmd_case_t *cp, const char *class) 430 { 431 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 432 nvlist_t **nva, *nvl; 433 uint8_t *ba; 434 int msg = B_TRUE; 435 const char *code; 436 fmd_case_lst_t fcl; 437 int count = 0; 438 439 (void) pthread_mutex_lock(&cip->ci_lock); 440 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 441 442 nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); 443 ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects); 444 445 /* 446 * For each suspect associated with the case, store its fault event 447 * nvlist in 'nva'. We also look to see if any of the suspect faults 448 * have asked not to be messaged. If any of them have made such a 449 * request, propagate that attribute to the composite list.* event. 450 * Finally, store each suspect's faulty status into the bitmap 'ba'. 451 */ 452 fcl.fcl_countp = &count; 453 fcl.fcl_maxcount = cip->ci_nsuspects; 454 fcl.fcl_msgp = &msg; 455 fcl.fcl_ba = ba; 456 fcl.fcl_nva = nva; 457 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl); 458 459 if (cip->ci_code == NULL) 460 (void) fmd_case_mkcode(cp); 461 /* 462 * For repair and updated event, we lookup diagcode from dict using key 463 * "list.repaired" or "list.updated" or "list.resolved". 
464 */ 465 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 466 (void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code); 467 else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) 468 (void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code); 469 else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0) 470 (void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code); 471 else 472 code = cip->ci_code; 473 474 if (msg == B_FALSE) 475 cip->ci_flags |= FMD_CF_INVISIBLE; 476 477 /* 478 * Use the ci_diag_de if one has been saved (eg for an injected fault). 479 * Otherwise use the authority for the current module. 480 */ 481 nvl = fmd_protocol_list(class, cip->ci_diag_de == NULL ? 482 cip->ci_mod->mod_fmri : cip->ci_diag_de, cip->ci_uuid, code, count, 483 nva, ba, msg, &cip->ci_tv); 484 485 (void) pthread_mutex_unlock(&cip->ci_lock); 486 return (nvl); 487 } 488 489 static int fmd_case_match_on_faulty_overlap = 1; 490 static int fmd_case_match_on_acquit_overlap = 1; 491 static int fmd_case_auto_acquit_isolated = 1; 492 static int fmd_case_auto_acquit_non_acquitted = 1; 493 static int fmd_case_too_recent = 10; /* time in seconds */ 494 495 static boolean_t 496 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem) 497 { 498 nvlist_t *new_rsrc; 499 nvlist_t *rsrc; 500 char *new_name = NULL; 501 char *name = NULL; 502 ssize_t new_namelen; 503 ssize_t namelen; 504 int fmri_present = 1; 505 int new_fmri_present = 1; 506 int match = B_FALSE; 507 fmd_topo_t *ftp = fmd_topo_hold(); 508 509 if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0) 510 fmri_present = 0; 511 else { 512 if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1) 513 goto done; 514 name = fmd_alloc(namelen + 1, FMD_SLEEP); 515 if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1) 516 goto done; 517 } 518 if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0) 519 new_fmri_present = 0; 520 else { 521 if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1) 522 goto done; 523 new_name = fmd_alloc(new_namelen + 
1, FMD_SLEEP); 524 if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1) 525 goto done; 526 } 527 match = (fmri_present == new_fmri_present && 528 (fmri_present == 0 || 529 topo_fmri_strcmp(ftp->ft_hdl, name, new_name))); 530 done: 531 if (name != NULL) 532 fmd_free(name, namelen + 1); 533 if (new_name != NULL) 534 fmd_free(new_name, new_namelen + 1); 535 fmd_topo_rele(ftp); 536 return (match); 537 } 538 539 static int 540 fmd_case_match_suspect(nvlist_t *nvl1, nvlist_t *nvl2) 541 { 542 char *class, *new_class; 543 544 if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_ASRU)) 545 return (0); 546 if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_RESOURCE)) 547 return (0); 548 if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_FRU)) 549 return (0); 550 (void) nvlist_lookup_string(nvl2, FM_CLASS, &class); 551 (void) nvlist_lookup_string(nvl1, FM_CLASS, &new_class); 552 return (strcmp(class, new_class) == 0); 553 } 554 555 typedef struct { 556 int *fcms_countp; 557 int fcms_maxcount; 558 fmd_case_impl_t *fcms_cip; 559 uint8_t *fcms_new_susp_state; 560 uint8_t *fcms_old_susp_state; 561 uint8_t *fcms_old_match_state; 562 } fcms_t; 563 #define SUSPECT_STATE_FAULTY 0x1 564 #define SUSPECT_STATE_ISOLATED 0x2 565 #define SUSPECT_STATE_REMOVED 0x4 566 #define SUSPECT_STATE_ACQUITED 0x8 567 #define SUSPECT_STATE_REPAIRED 0x10 568 #define SUSPECT_STATE_REPLACED 0x20 569 #define SUSPECT_STATE_NO_MATCH 0x1 570 571 /* 572 * This is called for each suspect in the old case. Compare it against each 573 * suspect in the new case, setting fcms_old_susp_state and fcms_new_susp_state 574 * as appropriate. fcms_new_susp_state will left as 0 if the suspect is not 575 * found in the old case. 
576 */ 577 static void 578 fmd_case_match_suspects(fmd_asru_link_t *alp, void *arg) 579 { 580 fcms_t *fcmsp = (fcms_t *)arg; 581 fmd_case_impl_t *cip = fcmsp->fcms_cip; 582 fmd_case_susp_t *cis; 583 int i = 0; 584 int state = fmd_asru_al_getstate(alp); 585 586 if (*fcmsp->fcms_countp >= fcmsp->fcms_maxcount) 587 return; 588 589 if (!(state & FMD_ASRU_PRESENT) || (!(state & FMD_ASRU_FAULTY) && 590 alp->al_reason == FMD_ASRU_REMOVED)) 591 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 592 SUSPECT_STATE_REMOVED; 593 else if ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_FAULTY)) 594 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 595 SUSPECT_STATE_ISOLATED; 596 else if (state & FMD_ASRU_FAULTY) 597 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 598 SUSPECT_STATE_FAULTY; 599 else if (alp->al_reason == FMD_ASRU_REPLACED) 600 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 601 SUSPECT_STATE_REPLACED; 602 else if (alp->al_reason == FMD_ASRU_ACQUITTED) 603 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 604 SUSPECT_STATE_ACQUITED; 605 else 606 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 607 SUSPECT_STATE_REPAIRED; 608 609 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next, i++) 610 if (fmd_case_match_suspect(cis->cis_nvl, alp->al_event) == 1) 611 break; 612 if (cis != NULL) 613 fcmsp->fcms_new_susp_state[i] = 614 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp]; 615 else 616 fcmsp->fcms_old_match_state[*fcmsp->fcms_countp] |= 617 SUSPECT_STATE_NO_MATCH; 618 (*fcmsp->fcms_countp)++; 619 } 620 621 typedef struct { 622 int *fca_do_update; 623 fmd_case_impl_t *fca_cip; 624 } fca_t; 625 626 /* 627 * Re-fault all acquitted suspects that are still present in the new list. 
628 */ 629 static void 630 fmd_case_fault_acquitted_matching(fmd_asru_link_t *alp, void *arg) 631 { 632 fca_t *fcap = (fca_t *)arg; 633 fmd_case_impl_t *cip = fcap->fca_cip; 634 fmd_case_susp_t *cis; 635 int state = fmd_asru_al_getstate(alp); 636 637 if (!(state & FMD_ASRU_FAULTY) && 638 alp->al_reason == FMD_ASRU_ACQUITTED) { 639 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) 640 if (fmd_case_match_suspect(cis->cis_nvl, 641 alp->al_event) == 1) 642 break; 643 if (cis != NULL) { 644 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 645 *fcap->fca_do_update = 1; 646 } 647 } 648 } 649 650 /* 651 * Re-fault all suspects that are still present in the new list. 652 */ 653 static void 654 fmd_case_fault_all_matching(fmd_asru_link_t *alp, void *arg) 655 { 656 fca_t *fcap = (fca_t *)arg; 657 fmd_case_impl_t *cip = fcap->fca_cip; 658 fmd_case_susp_t *cis; 659 int state = fmd_asru_al_getstate(alp); 660 661 if (!(state & FMD_ASRU_FAULTY)) { 662 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) 663 if (fmd_case_match_suspect(cis->cis_nvl, 664 alp->al_event) == 1) 665 break; 666 if (cis != NULL) { 667 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 668 *fcap->fca_do_update = 1; 669 } 670 } 671 } 672 673 /* 674 * Acquit all suspects that are no longer present in the new list. 675 */ 676 static void 677 fmd_case_acquit_no_match(fmd_asru_link_t *alp, void *arg) 678 { 679 fca_t *fcap = (fca_t *)arg; 680 fmd_case_impl_t *cip = fcap->fca_cip; 681 fmd_case_susp_t *cis; 682 int state = fmd_asru_al_getstate(alp); 683 684 if (state & FMD_ASRU_FAULTY) { 685 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) 686 if (fmd_case_match_suspect(cis->cis_nvl, 687 alp->al_event) == 1) 688 break; 689 if (cis == NULL) { 690 (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, 691 FMD_ASRU_ACQUITTED); 692 *fcap->fca_do_update = 1; 693 } 694 } 695 } 696 697 /* 698 * Acquit all isolated suspects. 
699 */ 700 static void 701 fmd_case_acquit_isolated(fmd_asru_link_t *alp, void *arg) 702 { 703 int *do_update = (int *)arg; 704 int state = fmd_asru_al_getstate(alp); 705 706 if ((state & FMD_ASRU_PRESENT) && (state & FMD_ASRU_UNUSABLE) && 707 (state & FMD_ASRU_FAULTY)) { 708 (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, 709 FMD_ASRU_ACQUITTED); 710 *do_update = 1; 711 } 712 } 713 714 /* 715 * Acquit suspect which matches specified nvlist 716 */ 717 static void 718 fmd_case_acquit_suspect(fmd_asru_link_t *alp, void *arg) 719 { 720 nvlist_t *nvl = (nvlist_t *)arg; 721 int state = fmd_asru_al_getstate(alp); 722 723 if ((state & FMD_ASRU_FAULTY) && 724 fmd_case_match_suspect(nvl, alp->al_event) == 1) 725 (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, 726 FMD_ASRU_ACQUITTED); 727 } 728 729 typedef struct { 730 fmd_case_impl_t *fccd_cip; 731 uint8_t *fccd_new_susp_state; 732 uint8_t *fccd_new_match_state; 733 int *fccd_discard_new; 734 int *fccd_adjust_new; 735 } fccd_t; 736 737 /* 738 * see if a matching suspect list already exists in the cache 739 */ 740 static void 741 fmd_case_check_for_dups(fmd_case_t *old_cp, void *arg) 742 { 743 fccd_t *fccdp = (fccd_t *)arg; 744 fmd_case_impl_t *new_cip = fccdp->fccd_cip; 745 fmd_case_impl_t *old_cip = (fmd_case_impl_t *)old_cp; 746 int i, count = 0, do_update = 0, got_isolated_overlap = 0; 747 int got_faulty_overlap = 0; 748 int got_acquit_overlap = 0; 749 boolean_t too_recent; 750 uint64_t most_recent = 0; 751 fcms_t fcms; 752 fca_t fca; 753 uint8_t *new_susp_state; 754 uint8_t *old_susp_state; 755 uint8_t *old_match_state; 756 757 new_susp_state = alloca(new_cip->ci_nsuspects * sizeof (uint8_t)); 758 for (i = 0; i < new_cip->ci_nsuspects; i++) 759 new_susp_state[i] = 0; 760 old_susp_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t)); 761 for (i = 0; i < old_cip->ci_nsuspects; i++) 762 old_susp_state[i] = 0; 763 old_match_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t)); 764 for (i = 0; i < 
old_cip->ci_nsuspects; i++) 765 old_match_state[i] = 0; 766 767 /* 768 * Compare with each suspect in the existing case. 769 */ 770 fcms.fcms_countp = &count; 771 fcms.fcms_maxcount = old_cip->ci_nsuspects; 772 fcms.fcms_cip = new_cip; 773 fcms.fcms_new_susp_state = new_susp_state; 774 fcms.fcms_old_susp_state = old_susp_state; 775 fcms.fcms_old_match_state = old_match_state; 776 fmd_asru_hash_apply_by_case(fmd.d_asrus, (fmd_case_t *)old_cip, 777 fmd_case_match_suspects, &fcms); 778 779 /* 780 * If we have some faulty, non-isolated suspects that overlap, then most 781 * likely it is the suspects that overlap in the suspect lists that are 782 * to blame. So we can consider this to be a match. 783 */ 784 for (i = 0; i < new_cip->ci_nsuspects; i++) 785 if (new_susp_state[i] == SUSPECT_STATE_FAULTY) 786 got_faulty_overlap = 1; 787 if (got_faulty_overlap && fmd_case_match_on_faulty_overlap) 788 goto got_match; 789 790 /* 791 * If we have no faulty, non-isolated suspects in the old case, but we 792 * do have some acquitted suspects that overlap, then most likely it is 793 * the acquitted suspects that overlap in the suspect lists that are 794 * to blame. So we can consider this to be a match. 795 */ 796 for (i = 0; i < new_cip->ci_nsuspects; i++) 797 if (new_susp_state[i] == SUSPECT_STATE_ACQUITED) 798 got_acquit_overlap = 1; 799 for (i = 0; i < old_cip->ci_nsuspects; i++) 800 if (old_susp_state[i] == SUSPECT_STATE_FAULTY) 801 got_acquit_overlap = 0; 802 if (got_acquit_overlap && fmd_case_match_on_acquit_overlap) 803 goto got_match; 804 805 /* 806 * Check that all suspects in the new list are present in the old list. 807 * Return if we find one that isn't. 808 */ 809 for (i = 0; i < new_cip->ci_nsuspects; i++) 810 if (new_susp_state[i] == 0) 811 return; 812 813 /* 814 * Check that all suspects in the old list are present in the new list 815 * *or* they are isolated or removed/replaced (which would explain why 816 * they are not present in the new list). 
Return if we find one that is 817 * faulty and unisolated or repaired or acquitted, and that is not 818 * present in the new case. 819 */ 820 for (i = 0; i < old_cip->ci_nsuspects; i++) 821 if (old_match_state[i] == SUSPECT_STATE_NO_MATCH && 822 (old_susp_state[i] == SUSPECT_STATE_FAULTY || 823 old_susp_state[i] == SUSPECT_STATE_ACQUITED || 824 old_susp_state[i] == SUSPECT_STATE_REPAIRED)) 825 return; 826 827 got_match: 828 /* 829 * If the old case is already in repaired/resolved state, we can't 830 * do anything more with it, so keep the new case, but acquit some 831 * of the suspects if appropriate. 832 */ 833 if (old_cip->ci_state >= FMD_CASE_REPAIRED) { 834 if (fmd_case_auto_acquit_non_acquitted) { 835 *fccdp->fccd_adjust_new = 1; 836 for (i = 0; i < new_cip->ci_nsuspects; i++) { 837 fccdp->fccd_new_susp_state[i] |= 838 new_susp_state[i]; 839 if (new_susp_state[i] == 0) 840 fccdp->fccd_new_susp_state[i] = 841 SUSPECT_STATE_NO_MATCH; 842 } 843 } 844 return; 845 } 846 847 /* 848 * Otherwise discard the new case and keep the old, again updating the 849 * state of the suspects as appropriate 850 */ 851 *fccdp->fccd_discard_new = 1; 852 fca.fca_cip = new_cip; 853 fca.fca_do_update = &do_update; 854 855 /* 856 * See if new case occurred within fmd_case_too_recent seconds of the 857 * most recent modification to the old case and if so don't do 858 * auto-acquit. This avoids problems if a flood of ereports come in and 859 * they don't all get diagnosed before the first case causes some of 860 * the devices to be isolated making it appear that an isolated device 861 * was in the suspect list. 862 */ 863 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 864 fmd_asru_most_recent, &most_recent); 865 too_recent = (new_cip->ci_tv.tv_sec - most_recent < 866 fmd_case_too_recent); 867 868 if (got_faulty_overlap) { 869 /* 870 * Acquit any suspects not present in the new list, plus 871 * any that are are present but are isolated. 
872 */ 873 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 874 fmd_case_acquit_no_match, &fca); 875 if (fmd_case_auto_acquit_isolated && !too_recent) 876 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 877 fmd_case_acquit_isolated, &do_update); 878 } else if (got_acquit_overlap) { 879 /* 880 * Re-fault the acquitted matching suspects and acquit all 881 * isolated suspects. 882 */ 883 if (fmd_case_auto_acquit_isolated && !too_recent) { 884 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 885 fmd_case_fault_acquitted_matching, &fca); 886 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 887 fmd_case_acquit_isolated, &do_update); 888 } 889 } else if (fmd_case_auto_acquit_isolated) { 890 /* 891 * To get here, there must be no faulty or acquitted suspects, 892 * but there must be at least one isolated suspect. Just acquit 893 * non-matching isolated suspects. If there are no matching 894 * isolated suspects, then re-fault all matching suspects. 895 */ 896 for (i = 0; i < new_cip->ci_nsuspects; i++) 897 if (new_susp_state[i] == SUSPECT_STATE_ISOLATED) 898 got_isolated_overlap = 1; 899 if (!got_isolated_overlap) 900 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 901 fmd_case_fault_all_matching, &fca); 902 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 903 fmd_case_acquit_no_match, &fca); 904 } 905 906 /* 907 * If we've updated anything in the old case, call fmd_case_update() 908 */ 909 if (do_update) 910 fmd_case_update(old_cp); 911 } 912 913 /* 914 * Convict suspects in a case by applying a conviction policy and updating the 915 * resource cache prior to emitting the list.suspect event for the given case. 916 * At present, our policy is very simple: convict every suspect in the case. 
 * In the future, this policy can be extended and made configurable to permit:
 *
 * - convicting the suspect with the highest FIT rate
 * - convicting the suspect with the cheapest FRU
 * - convicting the suspect with the FRU that is in a depot's inventory
 * - convicting the suspect with the longest lifetime
 *
 * and so forth.  A word to the wise: this problem is significantly harder than
 * it seems at first glance.  Future work should heed the following advice:
 *
 * Hacking the policy into C code here is a very bad idea.  The policy needs to
 * be decided upon very carefully and fundamentally encodes knowledge of what
 * suspect list combinations can be emitted by what diagnosis engines.  As such
 * fmd's code is the wrong location, because that would require fmd itself to
 * be updated for every diagnosis engine change, defeating the entire design.
 * The FMA Event Registry knows the suspect list combinations: policy inputs
 * can be derived from it and used to produce per-module policy configuration.
 *
 * If the policy needs to be dynamic and not statically fixed at either fmd
 * startup or module load time, any implementation of dynamic policy retrieval
 * must employ some kind of caching mechanism or be part of a built-in module.
 * The fmd_case_convict() function is called with locks held inside of fmd and
 * is not a place where unbounded blocking on some inter-process or inter-
 * system communication to another service (e.g. another daemon) can occur.
941 */ 942 static int 943 fmd_case_convict(fmd_case_t *cp) 944 { 945 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 946 fmd_asru_hash_t *ahp = fmd.d_asrus; 947 int discard_new = 0, i; 948 fmd_case_susp_t *cis; 949 fmd_asru_link_t *alp; 950 uint8_t *new_susp_state; 951 uint8_t *new_match_state; 952 int adjust_new = 0; 953 fccd_t fccd; 954 955 (void) pthread_mutex_lock(&cip->ci_lock); 956 if (cip->ci_code == NULL) 957 (void) fmd_case_mkcode(cp); 958 else if (cip->ci_precanned) 959 fmd_case_code_hash_insert(fmd.d_cases, cip); 960 961 /* 962 * First we must see if any matching cases already exist. 963 */ 964 new_susp_state = alloca(cip->ci_nsuspects * sizeof (uint8_t)); 965 for (i = 0; i < cip->ci_nsuspects; i++) 966 new_susp_state[i] = 0; 967 new_match_state = alloca(cip->ci_nsuspects * sizeof (uint8_t)); 968 for (i = 0; i < cip->ci_nsuspects; i++) 969 new_match_state[i] = 0; 970 fccd.fccd_cip = cip; 971 fccd.fccd_adjust_new = &adjust_new; 972 fccd.fccd_new_susp_state = new_susp_state; 973 fccd.fccd_new_match_state = new_match_state; 974 fccd.fccd_discard_new = &discard_new; 975 fmd_case_hash_apply_except_current(fmd.d_cases, fmd_case_check_for_dups, 976 &fccd, cp); 977 978 if (discard_new) { 979 /* 980 * We've found an existing case that is a match and it is not 981 * already in repaired or resolved state. So we can close this 982 * one as a duplicate. 
983 */ 984 (void) pthread_mutex_unlock(&cip->ci_lock); 985 return (1); 986 } 987 988 /* 989 * Allocate new cache entries 990 */ 991 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 992 if ((alp = fmd_asru_hash_create_entry(ahp, 993 cp, cis->cis_nvl)) == NULL) { 994 fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in " 995 "%s: %s\n", cip->ci_uuid, fmd_strerror(errno)); 996 continue; 997 } 998 alp->al_flags |= FMD_ASRU_PRESENT; 999 alp->al_asru->asru_flags |= FMD_ASRU_PRESENT; 1000 (void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0); 1001 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 1002 } 1003 1004 if (adjust_new) { 1005 int some_suspect = 0, some_not_suspect = 0; 1006 1007 /* 1008 * There is one or more matching case but they are already in 1009 * repaired or resolved state. So we need to keep the new 1010 * case, but we can adjust it. Repaired/removed/replaced 1011 * suspects are unlikely to be to blame (unless there are 1012 * actually two separate faults). So if we have a combination of 1013 * repaired/replaced/removed suspects and acquitted suspects in 1014 * the old lists, then we should acquit in the new list those 1015 * that were repaired/replaced/removed in the old. 
1016 */ 1017 for (i = 0; i < cip->ci_nsuspects; i++) { 1018 if ((new_susp_state[i] & SUSPECT_STATE_REPLACED) || 1019 (new_susp_state[i] & SUSPECT_STATE_REPAIRED) || 1020 (new_susp_state[i] & SUSPECT_STATE_REMOVED) || 1021 (new_match_state[i] & SUSPECT_STATE_NO_MATCH)) 1022 some_not_suspect = 1; 1023 else 1024 some_suspect = 1; 1025 } 1026 if (some_suspect && some_not_suspect) { 1027 for (cis = cip->ci_suspects, i = 0; cis != NULL; 1028 cis = cis->cis_next, i++) 1029 if ((new_susp_state[i] & 1030 SUSPECT_STATE_REPLACED) || 1031 (new_susp_state[i] & 1032 SUSPECT_STATE_REPAIRED) || 1033 (new_susp_state[i] & 1034 SUSPECT_STATE_REMOVED) || 1035 (new_match_state[i] & 1036 SUSPECT_STATE_NO_MATCH)) 1037 fmd_asru_hash_apply_by_case(fmd.d_asrus, 1038 cp, fmd_case_acquit_suspect, 1039 cis->cis_nvl); 1040 } 1041 } 1042 1043 (void) pthread_mutex_unlock(&cip->ci_lock); 1044 return (0); 1045 } 1046 1047 void 1048 fmd_case_publish(fmd_case_t *cp, uint_t state) 1049 { 1050 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1051 fmd_event_t *e; 1052 nvlist_t *nvl; 1053 char *class; 1054 1055 if (state == FMD_CASE_CURRENT) 1056 state = cip->ci_state; /* use current state */ 1057 1058 switch (state) { 1059 case FMD_CASE_SOLVED: 1060 (void) pthread_mutex_lock(&cip->ci_lock); 1061 1062 /* 1063 * If we already have a code, then case is already solved. 1064 */ 1065 if (cip->ci_precanned == 0 && cip->ci_xprt == NULL && 1066 cip->ci_code != NULL) { 1067 (void) pthread_mutex_unlock(&cip->ci_lock); 1068 break; 1069 } 1070 1071 if (cip->ci_tv_valid == 0) { 1072 fmd_time_gettimeofday(&cip->ci_tv); 1073 cip->ci_tv_valid = 1; 1074 } 1075 (void) pthread_mutex_unlock(&cip->ci_lock); 1076 1077 if (fmd_case_convict(cp) == 1) { /* dupclose */ 1078 cip->ci_flags &= ~FMD_CF_SOLVED; 1079 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0); 1080 break; 1081 } 1082 if (cip->ci_xprt != NULL) { 1083 /* 1084 * For proxy, save some information about the transport 1085 * in the resource cache. 
1086 */ 1087 int count = 0; 1088 fmd_asru_set_on_proxy_t fasp; 1089 fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)cip->ci_xprt; 1090 1091 fasp.fasp_countp = &count; 1092 fasp.fasp_maxcount = cip->ci_nsuspects; 1093 fasp.fasp_proxy_asru = cip->ci_proxy_asru; 1094 fasp.fasp_proxy_external = xip->xi_flags & 1095 FMD_XPRT_EXTERNAL; 1096 fasp.fasp_proxy_rdonly = ((xip->xi_flags & 1097 FMD_XPRT_RDWR) == FMD_XPRT_RDONLY); 1098 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1099 fmd_asru_set_on_proxy, &fasp); 1100 } 1101 nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS); 1102 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1103 1104 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1105 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 1106 fmd_log_append(fmd.d_fltlog, e, cp); 1107 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 1108 fmd_dispq_dispatch(fmd.d_disp, e, class); 1109 1110 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1111 cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++; 1112 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1113 1114 break; 1115 1116 case FMD_CASE_CLOSE_WAIT: 1117 fmd_case_hold(cp); 1118 e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp); 1119 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 1120 1121 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1122 cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++; 1123 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1124 1125 break; 1126 1127 case FMD_CASE_CLOSED: 1128 nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS); 1129 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1130 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1131 fmd_dispq_dispatch(fmd.d_disp, e, class); 1132 break; 1133 1134 case FMD_CASE_REPAIRED: 1135 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 1136 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1137 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1138 (void) 
pthread_rwlock_rdlock(&fmd.d_log_lock); 1139 fmd_log_append(fmd.d_fltlog, e, cp); 1140 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 1141 fmd_dispq_dispatch(fmd.d_disp, e, class); 1142 break; 1143 1144 case FMD_CASE_RESOLVED: 1145 nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS); 1146 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1147 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1148 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 1149 fmd_log_append(fmd.d_fltlog, e, cp); 1150 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 1151 fmd_dispq_dispatch(fmd.d_disp, e, class); 1152 break; 1153 } 1154 } 1155 1156 fmd_case_t * 1157 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid) 1158 { 1159 fmd_case_impl_t *cip; 1160 uint_t h; 1161 1162 (void) pthread_rwlock_rdlock(&chp->ch_lock); 1163 h = fmd_strhash(uuid) % chp->ch_hashlen; 1164 1165 for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) { 1166 if (strcmp(cip->ci_uuid, uuid) == 0) 1167 break; 1168 } 1169 1170 /* 1171 * If deleting bit is set, treat the case as if it doesn't exist. 
1172 */ 1173 if (cip != NULL) 1174 cip = fmd_case_tryhold(cip); 1175 1176 if (cip == NULL) 1177 (void) fmd_set_errno(EFMD_CASE_INVAL); 1178 1179 (void) pthread_rwlock_unlock(&chp->ch_lock); 1180 return ((fmd_case_t *)cip); 1181 } 1182 1183 static fmd_case_impl_t * 1184 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 1185 { 1186 fmd_case_impl_t *eip; 1187 uint_t h; 1188 1189 (void) pthread_rwlock_wrlock(&chp->ch_lock); 1190 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 1191 1192 for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) { 1193 if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 && 1194 fmd_case_tryhold(eip) != NULL) { 1195 (void) pthread_rwlock_unlock(&chp->ch_lock); 1196 return (eip); /* uuid already present */ 1197 } 1198 } 1199 1200 cip->ci_next = chp->ch_hash[h]; 1201 chp->ch_hash[h] = cip; 1202 1203 chp->ch_count++; 1204 ASSERT(chp->ch_count != 0); 1205 1206 (void) pthread_rwlock_unlock(&chp->ch_lock); 1207 return (cip); 1208 } 1209 1210 static void 1211 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 1212 { 1213 fmd_case_impl_t *cp, **pp; 1214 uint_t h; 1215 1216 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1217 1218 cip->ci_flags |= FMD_CF_DELETING; 1219 (void) pthread_mutex_unlock(&cip->ci_lock); 1220 1221 (void) pthread_rwlock_wrlock(&chp->ch_lock); 1222 1223 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 1224 pp = &chp->ch_hash[h]; 1225 1226 for (cp = *pp; cp != NULL; cp = cp->ci_next) { 1227 if (cp != cip) 1228 pp = &cp->ci_next; 1229 else 1230 break; 1231 } 1232 1233 if (cp == NULL) { 1234 fmd_panic("case %p (%s) not found on hash chain %u\n", 1235 (void *)cip, cip->ci_uuid, h); 1236 } 1237 1238 *pp = cp->ci_next; 1239 cp->ci_next = NULL; 1240 1241 /* 1242 * delete from code hash if it is on it 1243 */ 1244 fmd_case_code_hash_delete(chp, cip); 1245 1246 ASSERT(chp->ch_count != 0); 1247 chp->ch_count--; 1248 1249 (void) pthread_rwlock_unlock(&chp->ch_lock); 1250 1251 (void) pthread_mutex_lock(&cip->ci_lock); 1252 
ASSERT(cip->ci_flags & FMD_CF_DELETING); 1253 } 1254 1255 fmd_case_t * 1256 fmd_case_create(fmd_module_t *mp, void *data) 1257 { 1258 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 1259 fmd_case_impl_t *eip = NULL; 1260 uuid_t uuid; 1261 1262 (void) pthread_mutex_init(&cip->ci_lock, NULL); 1263 fmd_buf_hash_create(&cip->ci_bufs); 1264 1265 fmd_module_hold(mp); 1266 cip->ci_mod = mp; 1267 cip->ci_refs = 1; 1268 cip->ci_state = FMD_CASE_UNSOLVED; 1269 cip->ci_flags = FMD_CF_DIRTY; 1270 cip->ci_data = data; 1271 1272 /* 1273 * Calling libuuid: get a clue. The library interfaces cleverly do not 1274 * define any constant for the length of an unparse string, and do not 1275 * permit the caller to specify a buffer length for safety. The spec 1276 * says it will be 36 bytes, but we make it tunable just in case. 1277 */ 1278 (void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen); 1279 cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP); 1280 1281 /* 1282 * We expect this loop to execute only once, but code it defensively 1283 * against the possibility of libuuid bugs. Keep generating uuids and 1284 * attempting to do a hash insert until we get a unique one. 
1285 */ 1286 do { 1287 if (eip != NULL) 1288 fmd_case_rele((fmd_case_t *)eip); 1289 uuid_generate(uuid); 1290 uuid_unparse(uuid, cip->ci_uuid); 1291 } while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip); 1292 1293 ASSERT(fmd_module_locked(mp)); 1294 fmd_list_append(&mp->mod_cases, cip); 1295 fmd_module_setcdirty(mp); 1296 1297 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1298 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 1299 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1300 1301 return ((fmd_case_t *)cip); 1302 } 1303 1304 static void 1305 fmd_case_destroy_suspects(fmd_case_impl_t *cip) 1306 { 1307 fmd_case_susp_t *cis, *ncis; 1308 1309 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1310 1311 if (cip->ci_proxy_asru) 1312 fmd_free(cip->ci_proxy_asru, sizeof (uint8_t) * 1313 cip->ci_nsuspects); 1314 if (cip->ci_diag_de) 1315 nvlist_free(cip->ci_diag_de); 1316 if (cip->ci_diag_asru) 1317 fmd_free(cip->ci_diag_asru, sizeof (uint8_t) * 1318 cip->ci_nsuspects); 1319 1320 for (cis = cip->ci_suspects; cis != NULL; cis = ncis) { 1321 ncis = cis->cis_next; 1322 nvlist_free(cis->cis_nvl); 1323 fmd_free(cis, sizeof (fmd_case_susp_t)); 1324 } 1325 1326 cip->ci_suspects = NULL; 1327 cip->ci_nsuspects = 0; 1328 } 1329 1330 fmd_case_t * 1331 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp, 1332 uint_t state, const char *uuid, const char *code) 1333 { 1334 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 1335 fmd_case_impl_t *eip; 1336 1337 (void) pthread_mutex_init(&cip->ci_lock, NULL); 1338 fmd_buf_hash_create(&cip->ci_bufs); 1339 1340 fmd_module_hold(mp); 1341 cip->ci_mod = mp; 1342 cip->ci_xprt = xp; 1343 cip->ci_refs = 1; 1344 cip->ci_state = state; 1345 cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP); 1346 cip->ci_uuidlen = strlen(cip->ci_uuid); 1347 cip->ci_code = fmd_strdup(code, FMD_SLEEP); 1348 cip->ci_codelen = cip->ci_code ? 
strlen(cip->ci_code) + 1 : 0; 1349 1350 if (state > FMD_CASE_CLOSE_WAIT) 1351 cip->ci_flags |= FMD_CF_SOLVED; 1352 1353 /* 1354 * Insert the case into the global case hash. If the specified UUID is 1355 * already present, check to see if it is an orphan: if so, reclaim it; 1356 * otherwise if it is owned by a different module then return NULL. 1357 */ 1358 if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) { 1359 (void) pthread_mutex_lock(&cip->ci_lock); 1360 cip->ci_refs--; /* decrement to zero */ 1361 fmd_case_destroy((fmd_case_t *)cip, B_FALSE); 1362 1363 cip = eip; /* switch 'cip' to the existing case */ 1364 (void) pthread_mutex_lock(&cip->ci_lock); 1365 1366 /* 1367 * If the ASRU cache is trying to recreate an orphan, then just 1368 * return the existing case that we found without changing it. 1369 */ 1370 if (mp == fmd.d_rmod) { 1371 /* 1372 * In case the case has already been created from 1373 * a checkpoint file we need to set up code now. 1374 */ 1375 if (cip->ci_state < FMD_CASE_CLOSED) { 1376 if (code != NULL && cip->ci_code == NULL) { 1377 cip->ci_code = fmd_strdup(code, 1378 FMD_SLEEP); 1379 cip->ci_codelen = cip->ci_code ? 1380 strlen(cip->ci_code) + 1 : 0; 1381 fmd_case_code_hash_insert(fmd.d_cases, 1382 cip); 1383 } 1384 } 1385 1386 /* 1387 * When recreating an orphan case, state passed in may 1388 * be CLOSED (faulty) or REPAIRED/RESOLVED (!faulty). If 1389 * any suspects are still CLOSED (faulty) then the 1390 * overall state needs to be CLOSED. 1391 */ 1392 if ((cip->ci_state == FMD_CASE_REPAIRED || 1393 cip->ci_state == FMD_CASE_RESOLVED) && 1394 state == FMD_CASE_CLOSED) 1395 cip->ci_state = FMD_CASE_CLOSED; 1396 (void) pthread_mutex_unlock(&cip->ci_lock); 1397 fmd_case_rele((fmd_case_t *)cip); 1398 return ((fmd_case_t *)cip); 1399 } 1400 1401 /* 1402 * If the existing case isn't an orphan or is being proxied, 1403 * then we have a UUID conflict: return failure to the caller. 
1404 */ 1405 if (cip->ci_mod != fmd.d_rmod || xp != NULL) { 1406 (void) pthread_mutex_unlock(&cip->ci_lock); 1407 fmd_case_rele((fmd_case_t *)cip); 1408 return (NULL); 1409 } 1410 1411 /* 1412 * If the new module is reclaiming an orphaned case, remove 1413 * the case from the root module, switch ci_mod, and then fall 1414 * through to adding the case to the new owner module 'mp'. 1415 */ 1416 fmd_module_lock(cip->ci_mod); 1417 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1418 fmd_module_unlock(cip->ci_mod); 1419 1420 fmd_module_rele(cip->ci_mod); 1421 cip->ci_mod = mp; 1422 fmd_module_hold(mp); 1423 1424 /* 1425 * It's possible that fmd crashed or was restarted during a 1426 * previous solve operation between the asru cache being created 1427 * and the ckpt file being updated to SOLVED. Thus when the DE 1428 * recreates the case here from the checkpoint file, the state 1429 * will be UNSOLVED and yet we are having to reclaim because 1430 * the case was in the asru cache. If this happens, revert the 1431 * case back to the UNSOLVED state and let the DE solve it again 1432 */ 1433 if (state == FMD_CASE_UNSOLVED) { 1434 fmd_asru_hash_delete_case(fmd.d_asrus, 1435 (fmd_case_t *)cip); 1436 fmd_case_destroy_suspects(cip); 1437 fmd_case_code_hash_delete(fmd.d_cases, cip); 1438 fmd_free(cip->ci_code, cip->ci_codelen); 1439 cip->ci_code = NULL; 1440 cip->ci_codelen = 0; 1441 cip->ci_tv_valid = 0; 1442 } 1443 1444 cip->ci_state = state; 1445 1446 (void) pthread_mutex_unlock(&cip->ci_lock); 1447 fmd_case_rele((fmd_case_t *)cip); 1448 } else { 1449 /* 1450 * add into hash of solved cases 1451 */ 1452 if (cip->ci_code) 1453 fmd_case_code_hash_insert(fmd.d_cases, cip); 1454 } 1455 1456 ASSERT(fmd_module_locked(mp)); 1457 fmd_list_append(&mp->mod_cases, cip); 1458 1459 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1460 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 1461 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1462 1463 return ((fmd_case_t 
*)cip); 1464 } 1465 1466 void 1467 fmd_case_destroy(fmd_case_t *cp, int visible) 1468 { 1469 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1470 fmd_case_item_t *cit, *ncit; 1471 1472 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1473 ASSERT(cip->ci_refs == 0); 1474 1475 if (visible) { 1476 TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid)); 1477 fmd_case_hash_delete(fmd.d_cases, cip); 1478 } 1479 1480 for (cit = cip->ci_items; cit != NULL; cit = ncit) { 1481 ncit = cit->cit_next; 1482 fmd_event_rele(cit->cit_event); 1483 fmd_free(cit, sizeof (fmd_case_item_t)); 1484 } 1485 1486 fmd_case_destroy_suspects(cip); 1487 1488 if (cip->ci_principal != NULL) 1489 fmd_event_rele(cip->ci_principal); 1490 1491 fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1); 1492 fmd_free(cip->ci_code, cip->ci_codelen); 1493 (void) fmd_buf_hash_destroy(&cip->ci_bufs); 1494 1495 fmd_module_rele(cip->ci_mod); 1496 fmd_free(cip, sizeof (fmd_case_impl_t)); 1497 } 1498 1499 void 1500 fmd_case_hold(fmd_case_t *cp) 1501 { 1502 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1503 1504 (void) pthread_mutex_lock(&cip->ci_lock); 1505 fmd_case_hold_locked(cp); 1506 (void) pthread_mutex_unlock(&cip->ci_lock); 1507 } 1508 1509 void 1510 fmd_case_hold_locked(fmd_case_t *cp) 1511 { 1512 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1513 1514 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1515 if (cip->ci_flags & FMD_CF_DELETING) 1516 fmd_panic("attempt to hold a deleting case %p (%s)\n", 1517 (void *)cip, cip->ci_uuid); 1518 cip->ci_refs++; 1519 ASSERT(cip->ci_refs != 0); 1520 } 1521 1522 static fmd_case_impl_t * 1523 fmd_case_tryhold(fmd_case_impl_t *cip) 1524 { 1525 /* 1526 * If the case's "deleting" bit is unset, hold and return case, 1527 * otherwise, return NULL. 
1528 */ 1529 (void) pthread_mutex_lock(&cip->ci_lock); 1530 if (cip->ci_flags & FMD_CF_DELETING) { 1531 (void) pthread_mutex_unlock(&cip->ci_lock); 1532 cip = NULL; 1533 } else { 1534 fmd_case_hold_locked((fmd_case_t *)cip); 1535 (void) pthread_mutex_unlock(&cip->ci_lock); 1536 } 1537 return (cip); 1538 } 1539 1540 void 1541 fmd_case_rele(fmd_case_t *cp) 1542 { 1543 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1544 1545 (void) pthread_mutex_lock(&cip->ci_lock); 1546 ASSERT(cip->ci_refs != 0); 1547 1548 if (--cip->ci_refs == 0) 1549 fmd_case_destroy((fmd_case_t *)cip, B_TRUE); 1550 else 1551 (void) pthread_mutex_unlock(&cip->ci_lock); 1552 } 1553 1554 void 1555 fmd_case_rele_locked(fmd_case_t *cp) 1556 { 1557 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1558 1559 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1560 --cip->ci_refs; 1561 ASSERT(cip->ci_refs != 0); 1562 } 1563 1564 int 1565 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep) 1566 { 1567 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1568 fmd_case_item_t *cit; 1569 fmd_event_t *oep; 1570 uint_t state; 1571 int new; 1572 1573 fmd_event_hold(ep); 1574 (void) pthread_mutex_lock(&cip->ci_lock); 1575 1576 if (cip->ci_flags & FMD_CF_SOLVED) 1577 state = FMD_EVS_DIAGNOSED; 1578 else 1579 state = FMD_EVS_ACCEPTED; 1580 1581 oep = cip->ci_principal; 1582 cip->ci_principal = ep; 1583 1584 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1585 if (cit->cit_event == ep) 1586 break; 1587 } 1588 1589 cip->ci_flags |= FMD_CF_DIRTY; 1590 new = cit == NULL && ep != oep; 1591 1592 (void) pthread_mutex_unlock(&cip->ci_lock); 1593 1594 fmd_module_setcdirty(cip->ci_mod); 1595 fmd_event_transition(ep, state); 1596 1597 if (oep != NULL) 1598 fmd_event_rele(oep); 1599 1600 return (new); 1601 } 1602 1603 int 1604 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep) 1605 { 1606 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1607 fmd_case_item_t *cit; 1608 uint_t state; 1609 int new; 1610 1611 (void) 
pthread_mutex_lock(&cip->ci_lock); 1612 1613 if (cip->ci_flags & FMD_CF_SOLVED) 1614 state = FMD_EVS_DIAGNOSED; 1615 else 1616 state = FMD_EVS_ACCEPTED; 1617 1618 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1619 if (cit->cit_event == ep) 1620 break; 1621 } 1622 1623 new = cit == NULL && ep != cip->ci_principal; 1624 1625 /* 1626 * If the event is already in the case or the case is already solved, 1627 * there is no reason to save it: just transition it appropriately. 1628 */ 1629 if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) { 1630 (void) pthread_mutex_unlock(&cip->ci_lock); 1631 fmd_event_transition(ep, state); 1632 return (new); 1633 } 1634 1635 cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP); 1636 fmd_event_hold(ep); 1637 1638 cit->cit_next = cip->ci_items; 1639 cit->cit_event = ep; 1640 1641 cip->ci_items = cit; 1642 cip->ci_nitems++; 1643 1644 cip->ci_flags |= FMD_CF_DIRTY; 1645 (void) pthread_mutex_unlock(&cip->ci_lock); 1646 1647 fmd_module_setcdirty(cip->ci_mod); 1648 fmd_event_transition(ep, state); 1649 1650 return (new); 1651 } 1652 1653 void 1654 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl) 1655 { 1656 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1657 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1658 1659 (void) pthread_mutex_lock(&cip->ci_lock); 1660 ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT); 1661 cip->ci_flags |= FMD_CF_DIRTY; 1662 1663 cis->cis_next = cip->ci_suspects; 1664 cis->cis_nvl = nvl; 1665 1666 cip->ci_suspects = cis; 1667 cip->ci_nsuspects++; 1668 1669 (void) pthread_mutex_unlock(&cip->ci_lock); 1670 if (cip->ci_xprt == NULL) 1671 fmd_module_setcdirty(cip->ci_mod); 1672 } 1673 1674 void 1675 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl) 1676 { 1677 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1678 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1679 boolean_t b; 1680 1681 (void) pthread_mutex_lock(&cip->ci_lock); 1682 1683 cis->cis_next = 
cip->ci_suspects; 1684 cis->cis_nvl = nvl; 1685 1686 if (nvlist_lookup_boolean_value(nvl, 1687 FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE) 1688 cip->ci_flags |= FMD_CF_INVISIBLE; 1689 1690 cip->ci_suspects = cis; 1691 cip->ci_nsuspects++; 1692 1693 (void) pthread_mutex_unlock(&cip->ci_lock); 1694 } 1695 1696 void 1697 fmd_case_reset_suspects(fmd_case_t *cp) 1698 { 1699 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1700 1701 (void) pthread_mutex_lock(&cip->ci_lock); 1702 ASSERT(cip->ci_state < FMD_CASE_SOLVED); 1703 1704 fmd_case_destroy_suspects(cip); 1705 cip->ci_flags |= FMD_CF_DIRTY; 1706 1707 (void) pthread_mutex_unlock(&cip->ci_lock); 1708 fmd_module_setcdirty(cip->ci_mod); 1709 } 1710 1711 /*ARGSUSED*/ 1712 static void 1713 fmd_case_unusable(fmd_asru_link_t *alp, void *arg) 1714 { 1715 (void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE); 1716 } 1717 1718 /* 1719 * Grab ci_lock and update the case state and set the dirty bit. Then perform 1720 * whatever actions and emit whatever events are appropriate for the state. 1721 * Refer to the topmost block comment explaining the state machine for details. 
1722 */ 1723 void 1724 fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) 1725 { 1726 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1727 fmd_case_item_t *cit; 1728 fmd_event_t *e; 1729 int resolved = 0; 1730 int any_unusable_and_present = 0; 1731 1732 ASSERT(state <= FMD_CASE_RESOLVED); 1733 (void) pthread_mutex_lock(&cip->ci_lock); 1734 1735 if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED)) 1736 flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED | FMD_CF_RESOLVED); 1737 1738 cip->ci_flags |= flags; 1739 1740 if (cip->ci_state >= state) { 1741 (void) pthread_mutex_unlock(&cip->ci_lock); 1742 return; /* already in specified state */ 1743 } 1744 1745 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid, 1746 _fmd_case_snames[cip->ci_state], _fmd_case_snames[state])); 1747 1748 cip->ci_state = state; 1749 cip->ci_flags |= FMD_CF_DIRTY; 1750 1751 if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod) 1752 fmd_module_setcdirty(cip->ci_mod); 1753 1754 switch (state) { 1755 case FMD_CASE_SOLVED: 1756 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 1757 fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED); 1758 1759 if (cip->ci_principal != NULL) { 1760 fmd_event_transition(cip->ci_principal, 1761 FMD_EVS_DIAGNOSED); 1762 } 1763 break; 1764 1765 case FMD_CASE_CLOSE_WAIT: 1766 /* 1767 * If the case was never solved, do not change ASRUs. 1768 * If the case was never fmd_case_closed, do not change ASRUs. 1769 * If the case was repaired, do not change ASRUs. 1770 */ 1771 if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED | 1772 FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED)) 1773 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1774 fmd_case_unusable, NULL); 1775 1776 /* 1777 * If an orphaned case transitions to CLOSE_WAIT, the owning 1778 * module is no longer loaded: continue on to CASE_CLOSED. 
1779 */ 1780 if (fmd_case_orphaned(cp)) 1781 state = cip->ci_state = FMD_CASE_CLOSED; 1782 break; 1783 1784 case FMD_CASE_REPAIRED: 1785 ASSERT(cip->ci_xprt != NULL || fmd_case_orphaned(cp)); 1786 1787 /* 1788 * If we've been requested to transition straight on to the 1789 * RESOLVED state (which can happen with fault proxying where a 1790 * list.resolved or a uuresolved is received from the other 1791 * side), or if all suspects are already either usable or not 1792 * present then transition straight to RESOLVED state, 1793 * publishing both the list.repaired and list.resolved. For a 1794 * proxy, if we discover here that all suspects are already 1795 * either usable or not present, notify the diag side instead 1796 * using fmd_xprt_uuresolved(). 1797 */ 1798 if (flags & FMD_CF_RESOLVED) { 1799 if (cip->ci_xprt != NULL) 1800 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1801 } else { 1802 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1803 fmd_case_unusable_and_present, 1804 &any_unusable_and_present); 1805 if (any_unusable_and_present) 1806 break; 1807 if (cip->ci_xprt != NULL) { 1808 fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid); 1809 break; 1810 } 1811 } 1812 1813 cip->ci_state = FMD_CASE_RESOLVED; 1814 (void) pthread_mutex_unlock(&cip->ci_lock); 1815 fmd_case_publish(cp, state); 1816 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid, 1817 _fmd_case_snames[FMD_CASE_REPAIRED], 1818 _fmd_case_snames[FMD_CASE_RESOLVED])); 1819 state = FMD_CASE_RESOLVED; 1820 resolved = 1; 1821 (void) pthread_mutex_lock(&cip->ci_lock); 1822 break; 1823 1824 case FMD_CASE_RESOLVED: 1825 /* 1826 * For a proxy, no need to check that all suspects are already 1827 * either usable or not present - this request has come from 1828 * the diagnosing side which makes the final decision on this. 
1829 */ 1830 if (cip->ci_xprt != NULL) { 1831 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1832 resolved = 1; 1833 break; 1834 } 1835 1836 ASSERT(fmd_case_orphaned(cp)); 1837 1838 /* 1839 * If all suspects are already either usable or not present then 1840 * carry on, publish list.resolved and discard the case. 1841 */ 1842 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1843 fmd_case_unusable_and_present, &any_unusable_and_present); 1844 if (any_unusable_and_present) { 1845 (void) pthread_mutex_unlock(&cip->ci_lock); 1846 return; 1847 } 1848 1849 resolved = 1; 1850 break; 1851 } 1852 1853 (void) pthread_mutex_unlock(&cip->ci_lock); 1854 1855 /* 1856 * If the module has initialized, then publish the appropriate event 1857 * for the new case state. If not, we are being called from the 1858 * checkpoint code during module load, in which case the module's 1859 * _fmd_init() routine hasn't finished yet, and our event dictionaries 1860 * may not be open yet, which will prevent us from computing the event 1861 * code. Defer the call to fmd_case_publish() by enqueuing a PUBLISH 1862 * event in our queue: this won't be processed until _fmd_init is done. 1863 */ 1864 if (cip->ci_mod->mod_flags & FMD_MOD_INIT) 1865 fmd_case_publish(cp, state); 1866 else { 1867 fmd_case_hold(cp); 1868 e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp); 1869 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 1870 } 1871 1872 if (resolved) { 1873 if (cip->ci_xprt != NULL) { 1874 /* 1875 * If we transitioned to RESOLVED, adjust the reference 1876 * count to reflect our removal from 1877 * fmd.d_rmod->mod_cases above. If the caller has not 1878 * placed an additional hold on the case, it will now 1879 * be freed. 
1880 */ 1881 (void) pthread_mutex_lock(&cip->ci_lock); 1882 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 1883 (void) pthread_mutex_unlock(&cip->ci_lock); 1884 fmd_case_rele(cp); 1885 } else { 1886 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1887 fmd_asru_log_resolved, NULL); 1888 (void) pthread_mutex_lock(&cip->ci_lock); 1889 /* mark as "ready to be discarded */ 1890 cip->ci_flags |= FMD_CF_RES_CMPL; 1891 (void) pthread_mutex_unlock(&cip->ci_lock); 1892 } 1893 } 1894 } 1895 1896 /* 1897 * Discard any case if it is in RESOLVED state (and if check_if_aged argument 1898 * is set if all suspects have passed the rsrc.aged time). 1899 */ 1900 void 1901 fmd_case_discard_resolved(fmd_case_t *cp, void *arg) 1902 { 1903 int check_if_aged = *(int *)arg; 1904 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1905 1906 /* 1907 * First check if case has completed transition to resolved. 1908 */ 1909 (void) pthread_mutex_lock(&cip->ci_lock); 1910 if (!(cip->ci_flags & FMD_CF_RES_CMPL)) { 1911 (void) pthread_mutex_unlock(&cip->ci_lock); 1912 return; 1913 } 1914 1915 /* 1916 * Now if check_is_aged is set, see if all suspects have aged. 1917 */ 1918 if (check_if_aged) { 1919 int aged = 1; 1920 1921 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1922 fmd_asru_check_if_aged, &aged); 1923 if (!aged) { 1924 (void) pthread_mutex_unlock(&cip->ci_lock); 1925 return; 1926 } 1927 } 1928 1929 /* 1930 * Finally discard the case, clearing FMD_CF_RES_CMPL so we don't 1931 * do it twice. 1932 */ 1933 fmd_module_lock(cip->ci_mod); 1934 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1935 fmd_module_unlock(cip->ci_mod); 1936 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 1937 cip->ci_flags &= ~FMD_CF_RES_CMPL; 1938 (void) pthread_mutex_unlock(&cip->ci_lock); 1939 fmd_case_rele(cp); 1940 } 1941 1942 /* 1943 * Transition the specified case to *at least* the specified state by first 1944 * re-validating the suspect list using the resource cache. 
This function is 1945 * employed by the checkpoint code when restoring a saved, solved case to see 1946 * if the state of the case has effectively changed while fmd was not running 1947 * or the module was not loaded. 1948 */ 1949 void 1950 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags) 1951 { 1952 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1953 1954 int usable = 0; /* are any suspects usable? */ 1955 1956 ASSERT(state >= FMD_CASE_SOLVED); 1957 (void) pthread_mutex_lock(&cip->ci_lock); 1958 1959 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable); 1960 1961 (void) pthread_mutex_unlock(&cip->ci_lock); 1962 1963 if (!usable) { 1964 state = MAX(state, FMD_CASE_CLOSE_WAIT); 1965 flags |= FMD_CF_ISOLATED; 1966 } 1967 1968 fmd_case_transition(cp, state, flags); 1969 } 1970 1971 void 1972 fmd_case_setdirty(fmd_case_t *cp) 1973 { 1974 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1975 1976 (void) pthread_mutex_lock(&cip->ci_lock); 1977 cip->ci_flags |= FMD_CF_DIRTY; 1978 (void) pthread_mutex_unlock(&cip->ci_lock); 1979 1980 fmd_module_setcdirty(cip->ci_mod); 1981 } 1982 1983 void 1984 fmd_case_clrdirty(fmd_case_t *cp) 1985 { 1986 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1987 1988 (void) pthread_mutex_lock(&cip->ci_lock); 1989 cip->ci_flags &= ~FMD_CF_DIRTY; 1990 (void) pthread_mutex_unlock(&cip->ci_lock); 1991 } 1992 1993 void 1994 fmd_case_commit(fmd_case_t *cp) 1995 { 1996 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1997 fmd_case_item_t *cit; 1998 1999 (void) pthread_mutex_lock(&cip->ci_lock); 2000 2001 if (cip->ci_flags & FMD_CF_DIRTY) { 2002 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 2003 fmd_event_commit(cit->cit_event); 2004 2005 if (cip->ci_principal != NULL) 2006 fmd_event_commit(cip->ci_principal); 2007 2008 fmd_buf_hash_commit(&cip->ci_bufs); 2009 cip->ci_flags &= ~FMD_CF_DIRTY; 2010 } 2011 2012 (void) pthread_mutex_unlock(&cip->ci_lock); 2013 } 2014 2015 /* 2016 * On proxy side, send back 
repair/acquit/etc request to diagnosing side 2017 */ 2018 void 2019 fmd_case_xprt_updated(fmd_case_t *cp) 2020 { 2021 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2022 nvlist_t **nva; 2023 uint8_t *ba; 2024 int msg = B_TRUE; 2025 int count = 0; 2026 fmd_case_lst_t fcl; 2027 2028 ASSERT(cip->ci_xprt != NULL); 2029 (void) pthread_mutex_lock(&cip->ci_lock); 2030 ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects); 2031 nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); 2032 fcl.fcl_countp = &count; 2033 fcl.fcl_maxcount = cip->ci_nsuspects; 2034 fcl.fcl_msgp = &msg; 2035 fcl.fcl_ba = ba; 2036 fcl.fcl_nva = nva; 2037 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl); 2038 (void) pthread_mutex_unlock(&cip->ci_lock); 2039 fmd_xprt_updated(cip->ci_xprt, cip->ci_uuid, ba, cip->ci_proxy_asru, 2040 count); 2041 } 2042 2043 /* 2044 * fmd_case_update_status() can be called on either the proxy side when a 2045 * list.suspect is received, or on the diagnosing side when an update request 2046 * is received from the proxy. It updates the status in the resource cache. 
2047 */ 2048 void 2049 fmd_case_update_status(fmd_case_t *cp, uint8_t *statusp, uint8_t *proxy_asrup, 2050 uint8_t *diag_asrup) 2051 { 2052 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2053 int count = 0; 2054 fmd_asru_update_status_t faus; 2055 2056 /* 2057 * update status of resource cache entries 2058 */ 2059 faus.faus_countp = &count; 2060 faus.faus_maxcount = cip->ci_nsuspects; 2061 faus.faus_ba = statusp; 2062 faus.faus_proxy_asru = proxy_asrup; 2063 faus.faus_diag_asru = diag_asrup; 2064 faus.faus_is_proxy = (cip->ci_xprt != NULL); 2065 (void) pthread_mutex_lock(&cip->ci_lock); 2066 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_update_status, 2067 &faus); 2068 (void) pthread_mutex_unlock(&cip->ci_lock); 2069 } 2070 2071 /* 2072 * Called on either the proxy side or the diag side when a repair has taken 2073 * place on the other side but this side may know the asru "contains" 2074 * relationships. 2075 */ 2076 void 2077 fmd_case_update_containees(fmd_case_t *cp) 2078 { 2079 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2080 2081 (void) pthread_mutex_lock(&cip->ci_lock); 2082 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 2083 fmd_asru_update_containees, NULL); 2084 (void) pthread_mutex_unlock(&cip->ci_lock); 2085 } 2086 2087 /* 2088 * fmd_case_close_status() is called on diagnosing side when proxy side 2089 * has had a uuclose. It updates the status in the resource cache. 
2090 */ 2091 void 2092 fmd_case_close_status(fmd_case_t *cp) 2093 { 2094 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2095 int count = 0; 2096 fmd_asru_close_status_t facs; 2097 2098 /* 2099 * update status of resource cache entries 2100 */ 2101 facs.facs_countp = &count; 2102 facs.facs_maxcount = cip->ci_nsuspects; 2103 (void) pthread_mutex_lock(&cip->ci_lock); 2104 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_close_status, 2105 &facs); 2106 (void) pthread_mutex_unlock(&cip->ci_lock); 2107 } 2108 2109 /* 2110 * Indicate that the case may need to change state because one or more of the 2111 * ASRUs named as a suspect has changed state. We examine all the suspects 2112 * and if none are still faulty, we initiate a case close transition. 2113 */ 2114 void 2115 fmd_case_update(fmd_case_t *cp) 2116 { 2117 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2118 uint_t cstate; 2119 int faulty = 0; 2120 2121 (void) pthread_mutex_lock(&cip->ci_lock); 2122 cstate = cip->ci_state; 2123 2124 if (cip->ci_state < FMD_CASE_SOLVED) { 2125 (void) pthread_mutex_unlock(&cip->ci_lock); 2126 return; /* update is not appropriate */ 2127 } 2128 2129 if (cip->ci_flags & FMD_CF_REPAIRED) { 2130 (void) pthread_mutex_unlock(&cip->ci_lock); 2131 return; /* already repaired */ 2132 } 2133 2134 TRACE((FMD_DBG_CASE, "case update %s", cip->ci_uuid)); 2135 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 2136 (void) pthread_mutex_unlock(&cip->ci_lock); 2137 2138 if (faulty) { 2139 nvlist_t *nvl; 2140 fmd_event_t *e; 2141 char *class; 2142 2143 TRACE((FMD_DBG_CASE, "sending list.updated %s", cip->ci_uuid)); 2144 nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS); 2145 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2146 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 2147 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 2148 fmd_log_append(fmd.d_fltlog, e, cp); 2149 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 2150 fmd_dispq_dispatch(fmd.d_disp, e, 
class); 2151 return; /* one or more suspects are still marked faulty */ 2152 } 2153 2154 if (cstate == FMD_CASE_CLOSED) 2155 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 2156 else 2157 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 2158 } 2159 2160 /* 2161 * Delete a closed case from the module's case list once the fmdo_close() entry 2162 * point has run to completion. If the case is owned by a transport module, 2163 * tell the transport to proxy a case close on the other end of the transport. 2164 * Transition to the appropriate next state based on ci_flags. This 2165 * function represents the end of CLOSE_WAIT and transitions the case to either 2166 * CLOSED or REPAIRED or discards it entirely because it was never solved; 2167 * refer to the topmost block comment explaining the state machine for details. 2168 */ 2169 void 2170 fmd_case_delete(fmd_case_t *cp) 2171 { 2172 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2173 fmd_modstat_t *msp; 2174 size_t buftotal; 2175 2176 TRACE((FMD_DBG_CASE, "case delete %s", cip->ci_uuid)); 2177 ASSERT(fmd_module_locked(cip->ci_mod)); 2178 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 2179 buftotal = fmd_buf_hash_destroy(&cip->ci_bufs); 2180 2181 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 2182 msp = cip->ci_mod->mod_stats; 2183 2184 ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0); 2185 msp->ms_caseopen.fmds_value.ui64--; 2186 2187 ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal); 2188 msp->ms_buftotal.fmds_value.ui64 -= buftotal; 2189 2190 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 2191 2192 if (cip->ci_xprt == NULL) 2193 fmd_module_setcdirty(cip->ci_mod); 2194 2195 fmd_module_rele(cip->ci_mod); 2196 cip->ci_mod = fmd.d_rmod; 2197 fmd_module_hold(cip->ci_mod); 2198 2199 /* 2200 * If the case has been solved, then retain it 2201 * on the root module's case list at least until we're transitioned. 2202 * Otherwise free the case with our final fmd_case_rele() below. 
2203 */ 2204 if (cip->ci_flags & FMD_CF_SOLVED) { 2205 fmd_module_lock(cip->ci_mod); 2206 fmd_list_append(&cip->ci_mod->mod_cases, cip); 2207 fmd_module_unlock(cip->ci_mod); 2208 fmd_case_hold(cp); 2209 } 2210 2211 /* 2212 * Transition onwards to REPAIRED or CLOSED as originally requested. 2213 * Note that for proxy case if we're transitioning to CLOSED it means 2214 * the case was isolated locally, so call fmd_xprt_uuclose() to notify 2215 * the diagnosing side. No need to notify the diagnosing side if we are 2216 * transitioning to REPAIRED as we only do this when requested to do 2217 * so by the diagnosing side anyway. 2218 */ 2219 if (cip->ci_flags & FMD_CF_REPAIRED) 2220 fmd_case_transition(cp, FMD_CASE_REPAIRED, 0); 2221 else if (cip->ci_flags & FMD_CF_ISOLATED) { 2222 fmd_case_transition(cp, FMD_CASE_CLOSED, 0); 2223 if (cip->ci_xprt != NULL) 2224 fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid); 2225 } 2226 2227 fmd_case_rele(cp); 2228 } 2229 2230 void 2231 fmd_case_discard(fmd_case_t *cp, boolean_t delete_from_asru_cache) 2232 { 2233 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2234 2235 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 2236 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--; 2237 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 2238 2239 ASSERT(fmd_module_locked(cip->ci_mod)); 2240 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 2241 if (delete_from_asru_cache) { 2242 (void) pthread_mutex_lock(&cip->ci_lock); 2243 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 2244 (void) pthread_mutex_unlock(&cip->ci_lock); 2245 } 2246 fmd_case_rele(cp); 2247 } 2248 2249 /* 2250 * Indicate that the problem corresponding to a case has been repaired by 2251 * clearing the faulty bit on each ASRU named as a suspect. If the case hasn't 2252 * already been closed, this function initiates the transition to CLOSE_WAIT. 
2253 * The caller must have the case held from fmd_case_hash_lookup(), so we can 2254 * grab and drop ci_lock without the case being able to be freed in between. 2255 */ 2256 int 2257 fmd_case_repair(fmd_case_t *cp) 2258 { 2259 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2260 uint_t cstate; 2261 fmd_asru_rep_arg_t fara; 2262 2263 (void) pthread_mutex_lock(&cip->ci_lock); 2264 cstate = cip->ci_state; 2265 2266 if (cstate < FMD_CASE_SOLVED) { 2267 (void) pthread_mutex_unlock(&cip->ci_lock); 2268 return (fmd_set_errno(EFMD_CASE_STATE)); 2269 } 2270 2271 if (cip->ci_flags & FMD_CF_REPAIRED) { 2272 (void) pthread_mutex_unlock(&cip->ci_lock); 2273 return (0); /* already repaired */ 2274 } 2275 2276 TRACE((FMD_DBG_CASE, "case repair %s", cip->ci_uuid)); 2277 fara.fara_reason = FMD_ASRU_REPAIRED; 2278 fara.fara_bywhat = FARA_BY_CASE; 2279 fara.fara_rval = NULL; 2280 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara); 2281 (void) pthread_mutex_unlock(&cip->ci_lock); 2282 2283 /* 2284 * if this is a proxied case, send the repair across the transport. 2285 * The remote side will then do the repair and send a list.repaired back 2286 * again such that we can finally repair the case on this side. 
2287 */ 2288 if (cip->ci_xprt != NULL) { 2289 fmd_case_xprt_updated(cp); 2290 return (0); 2291 } 2292 2293 if (cstate == FMD_CASE_CLOSED) 2294 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 2295 else 2296 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 2297 2298 return (0); 2299 } 2300 2301 int 2302 fmd_case_acquit(fmd_case_t *cp) 2303 { 2304 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2305 uint_t cstate; 2306 fmd_asru_rep_arg_t fara; 2307 2308 (void) pthread_mutex_lock(&cip->ci_lock); 2309 cstate = cip->ci_state; 2310 2311 if (cstate < FMD_CASE_SOLVED) { 2312 (void) pthread_mutex_unlock(&cip->ci_lock); 2313 return (fmd_set_errno(EFMD_CASE_STATE)); 2314 } 2315 2316 if (cip->ci_flags & FMD_CF_REPAIRED) { 2317 (void) pthread_mutex_unlock(&cip->ci_lock); 2318 return (0); /* already repaired */ 2319 } 2320 2321 TRACE((FMD_DBG_CASE, "case acquit %s", cip->ci_uuid)); 2322 fara.fara_reason = FMD_ASRU_ACQUITTED; 2323 fara.fara_bywhat = FARA_BY_CASE; 2324 fara.fara_rval = NULL; 2325 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara); 2326 (void) pthread_mutex_unlock(&cip->ci_lock); 2327 2328 /* 2329 * if this is a proxied case, send the repair across the transport. 2330 * The remote side will then do the repair and send a list.repaired back 2331 * again such that we can finally repair the case on this side. 
2332 */ 2333 if (cip->ci_xprt != NULL) { 2334 fmd_case_xprt_updated(cp); 2335 return (0); 2336 } 2337 2338 if (cstate == FMD_CASE_CLOSED) 2339 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 2340 else 2341 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 2342 2343 return (0); 2344 } 2345 2346 int 2347 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep) 2348 { 2349 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2350 fmd_case_item_t *cit; 2351 uint_t state; 2352 int rv = 0; 2353 2354 (void) pthread_mutex_lock(&cip->ci_lock); 2355 2356 if (cip->ci_state >= FMD_CASE_SOLVED) 2357 state = FMD_EVS_DIAGNOSED; 2358 else 2359 state = FMD_EVS_ACCEPTED; 2360 2361 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 2362 if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0) 2363 break; 2364 } 2365 2366 if (rv == 0 && cip->ci_principal != NULL) 2367 rv = fmd_event_equal(ep, cip->ci_principal); 2368 2369 (void) pthread_mutex_unlock(&cip->ci_lock); 2370 2371 if (rv != 0) 2372 fmd_event_transition(ep, state); 2373 2374 return (rv); 2375 } 2376 2377 int 2378 fmd_case_orphaned(fmd_case_t *cp) 2379 { 2380 return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod); 2381 } 2382 2383 void 2384 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec) 2385 { 2386 ((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec; 2387 ((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec; 2388 ((fmd_case_impl_t *)cp)->ci_tv_valid = 1; 2389 } 2390 2391 void 2392 fmd_case_set_de_fmri(fmd_case_t *cp, nvlist_t *nvl) 2393 { 2394 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2395 2396 if (cip->ci_diag_de) 2397 nvlist_free(cip->ci_diag_de); 2398 cip->ci_diag_de = nvl; 2399 } 2400 2401 void 2402 fmd_case_setcode(fmd_case_t *cp, char *code) 2403 { 2404 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2405 2406 cip->ci_code = fmd_strdup(code, FMD_SLEEP); 2407 cip->ci_codelen = cip->ci_code ? 
strlen(cip->ci_code) + 1 : 0; 2408 } 2409 2410 /*ARGSUSED*/ 2411 static void 2412 fmd_case_repair_replay_case(fmd_case_t *cp, void *arg) 2413 { 2414 int not_faulty = 0; 2415 int faulty = 0; 2416 nvlist_t *nvl; 2417 fmd_event_t *e; 2418 char *class; 2419 int any_unusable_and_present = 0; 2420 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2421 2422 if (cip->ci_state < FMD_CASE_SOLVED || cip->ci_xprt != NULL) 2423 return; 2424 2425 if (cip->ci_state == FMD_CASE_RESOLVED) { 2426 cip->ci_flags |= FMD_CF_RES_CMPL; 2427 return; 2428 } 2429 2430 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 2431 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty, 2432 ¬_faulty); 2433 2434 if (cip->ci_state >= FMD_CASE_REPAIRED && !faulty) { 2435 /* 2436 * If none of the suspects is faulty, replay the list.repaired. 2437 * If all suspects are already either usable or not present then 2438 * also transition straight to RESOLVED state. 2439 */ 2440 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 2441 fmd_case_unusable_and_present, &any_unusable_and_present); 2442 if (!any_unusable_and_present) { 2443 cip->ci_state = FMD_CASE_RESOLVED; 2444 2445 TRACE((FMD_DBG_CASE, "replay sending list.repaired %s", 2446 cip->ci_uuid)); 2447 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 2448 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2449 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, 2450 class); 2451 fmd_dispq_dispatch(fmd.d_disp, e, class); 2452 2453 TRACE((FMD_DBG_CASE, "replay sending list.resolved %s", 2454 cip->ci_uuid)); 2455 fmd_case_publish(cp, FMD_CASE_RESOLVED); 2456 cip->ci_flags |= FMD_CF_RES_CMPL; 2457 } else { 2458 TRACE((FMD_DBG_CASE, "replay sending list.repaired %s", 2459 cip->ci_uuid)); 2460 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 2461 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2462 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, 2463 class); 2464 fmd_dispq_dispatch(fmd.d_disp, e, class); 2465 } 
2466 } else if (faulty && not_faulty) { 2467 /* 2468 * if some but not all of the suspects are not faulty, replay 2469 * the list.updated. 2470 */ 2471 TRACE((FMD_DBG_CASE, "replay sending list.updated %s", 2472 cip->ci_uuid)); 2473 nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS); 2474 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2475 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 2476 fmd_dispq_dispatch(fmd.d_disp, e, class); 2477 } 2478 } 2479 2480 void 2481 fmd_case_repair_replay() 2482 { 2483 fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL); 2484 } 2485