/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * FMD Case Subsystem
 *
 * Diagnosis engines are expected to group telemetry events related to the
 * diagnosis of a particular problem on the system into a set of cases.  The
 * diagnosis engine may have any number of cases open at a given point in time.
 * Some cases may eventually be *solved* by associating a suspect list of one
 * or more problems with the case, at which point fmd publishes a list.suspect
 * event for the case and it becomes visible to administrators and agents.
 *
 * Every case is named using a UUID, and is globally visible in the case hash.
 * Cases are reference-counted, except for the reference from the case hash
 * itself.  Consumers of case references include modules, which store active
 * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
 *
 * Cases obey the following state machine.
 * In states UNSOLVED, SOLVED, and
 * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
 * or transport) and the case is referenced by the mod_cases list.  Once the
 * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
 * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
 *
 *                      +------------+
 *           +----------|  UNSOLVED  |
 *           |          +------------+
 *           |                1 |
 *           |                  |
 *           |          +-------v----+
 *         2 |          |   SOLVED   |
 *           |          +------------+
 *           |              3 |  5 |
 *           +------------+   |    |
 *                        |   |    |
 *                      +-v---v----v-+
 *                      | CLOSE_WAIT |
 *                      +------------+
 *                        |   |    |
 *            +-----------+   |    +------------+
 *            |             4 |                 |
 *            v         +-----v------+          |
 *         discard      |   CLOSED   |        6 |
 *                      +------------+          |
 *                            |                 |
 *                            |    +------------+
 *                          7 |    |
 *                      +-----v----v-+
 *                      |  REPAIRED  |
 *                      +------------+
 *                            |
 *                          8 |
 *                      +-----v------+
 *                      |  RESOLVED  |
 *                      +------------+
 *                            |
 *                            v
 *                         discard
 *
 * The state machine changes are triggered by calls to fmd_case_transition()
 * from various locations inside of fmd, as described below:
 *
 * [1] Called by: fmd_case_solve()
 *     Actions:   FMD_CF_SOLVED flag is set in ci_flags
 *                conviction policy is applied to suspect list
 *                suspects convicted are marked faulty (F) in R$
 *                list.suspect event logged and dispatched
 *
 * [2] Called by: fmd_case_close(), fmd_case_uuclose()
 *     Actions:   diagnosis engine fmdo_close() entry point scheduled
 *                case discarded upon exit from CLOSE_WAIT
 *
 * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
 *     Actions:   FMD_CF_ISOLATED flag is set in ci_flags
 *                suspects convicted (F) are marked unusable (U) in R$
 *                diagnosis engine fmdo_close() entry point scheduled
 *                case transitions to CLOSED [4] upon exit from CLOSE_WAIT
 *
 * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
 *     Actions:   list.isolated event dispatched
 *                case deleted from
module's list of open cases 104 * 105 * [5] Called by: fmd_case_repair(), fmd_case_update() 106 * Actions: FMD_CF_REPAIR flag is set in ci_flags 107 * diagnosis engine fmdo_close() entry point scheduled 108 * case transitions to REPAIRED [6] upon exit from CLOSE_WAIT 109 * 110 * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns) 111 * Actions: suspects convicted are marked non faulty (!F) in R$ 112 * list.repaired or list.updated event dispatched 113 * 114 * [7] Called by: fmd_case_repair(), fmd_case_update() 115 * Actions: FMD_CF_REPAIR flag is set in ci_flags 116 * suspects convicted are marked non faulty (!F) in R$ 117 * list.repaired or list.updated event dispatched 118 * 119 * [8] Called by: fmd_case_uuresolve() 120 * Actions: list.resolved event dispatched 121 * case is discarded 122 */ 123 124 #include <sys/fm/protocol.h> 125 #include <uuid/uuid.h> 126 #include <alloca.h> 127 128 #include <fmd_alloc.h> 129 #include <fmd_module.h> 130 #include <fmd_error.h> 131 #include <fmd_conf.h> 132 #include <fmd_case.h> 133 #include <fmd_string.h> 134 #include <fmd_subr.h> 135 #include <fmd_protocol.h> 136 #include <fmd_event.h> 137 #include <fmd_eventq.h> 138 #include <fmd_dispq.h> 139 #include <fmd_buf.h> 140 #include <fmd_log.h> 141 #include <fmd_asru.h> 142 #include <fmd_fmri.h> 143 #include <fmd_xprt.h> 144 145 #include <fmd.h> 146 147 static const char *const _fmd_case_snames[] = { 148 "UNSOLVED", /* FMD_CASE_UNSOLVED */ 149 "SOLVED", /* FMD_CASE_SOLVED */ 150 "CLOSE_WAIT", /* FMD_CASE_CLOSE_WAIT */ 151 "CLOSED", /* FMD_CASE_CLOSED */ 152 "REPAIRED", /* FMD_CASE_REPAIRED */ 153 "RESOLVED" /* FMD_CASE_RESOLVED */ 154 }; 155 156 static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *); 157 158 fmd_case_hash_t * 159 fmd_case_hash_create(void) 160 { 161 fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP); 162 163 (void) pthread_rwlock_init(&chp->ch_lock, NULL); 164 chp->ch_hashlen = fmd.d_str_buckets; 165 chp->ch_hash = 
fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP); 166 chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, 167 FMD_SLEEP); 168 chp->ch_count = 0; 169 170 return (chp); 171 } 172 173 /* 174 * Destroy the case hash. Unlike most of our hash tables, no active references 175 * are kept by the case hash itself; all references come from other subsystems. 176 * The hash must be destroyed after all modules are unloaded; if anything was 177 * present in the hash it would be by definition a reference count leak. 178 */ 179 void 180 fmd_case_hash_destroy(fmd_case_hash_t *chp) 181 { 182 fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen); 183 fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen); 184 fmd_free(chp, sizeof (fmd_case_hash_t)); 185 } 186 187 /* 188 * Take a snapshot of the case hash by placing an additional hold on each 189 * member in an auxiliary array, and then call 'func' for each case. 190 */ 191 void 192 fmd_case_hash_apply(fmd_case_hash_t *chp, 193 void (*func)(fmd_case_t *, void *), void *arg) 194 { 195 fmd_case_impl_t *cp, **cps, **cpp; 196 uint_t cpc, i; 197 198 (void) pthread_rwlock_rdlock(&chp->ch_lock); 199 200 cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP); 201 cpc = chp->ch_count; 202 203 for (i = 0; i < chp->ch_hashlen; i++) { 204 for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) 205 *cpp++ = fmd_case_tryhold(cp); 206 } 207 208 ASSERT(cpp == cps + cpc); 209 (void) pthread_rwlock_unlock(&chp->ch_lock); 210 211 for (i = 0; i < cpc; i++) { 212 if (cps[i] != NULL) { 213 func((fmd_case_t *)cps[i], arg); 214 fmd_case_rele((fmd_case_t *)cps[i]); 215 } 216 } 217 218 fmd_free(cps, cpc * sizeof (fmd_case_t *)); 219 } 220 221 static void 222 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 223 { 224 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 225 226 cip->ci_code_next = chp->ch_code_hash[h]; 227 chp->ch_code_hash[h] = cip; 228 } 229 230 static void 231 
fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 232 { 233 fmd_case_impl_t **pp, *cp; 234 235 if (cip->ci_code) { 236 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 237 238 pp = &chp->ch_code_hash[h]; 239 for (cp = *pp; cp != NULL; cp = cp->ci_code_next) { 240 if (cp != cip) 241 pp = &cp->ci_code_next; 242 else 243 break; 244 } 245 if (cp != NULL) { 246 *pp = cp->ci_code_next; 247 cp->ci_code_next = NULL; 248 } 249 } 250 } 251 252 /* 253 * Look up the diagcode for this case and cache it in ci_code. If no suspects 254 * were defined for this case or if the lookup fails, the event dictionary or 255 * module code is broken, and we set the event code to a precomputed default. 256 */ 257 static const char * 258 fmd_case_mkcode(fmd_case_t *cp) 259 { 260 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 261 fmd_case_susp_t *cis; 262 fmd_case_hash_t *chp = fmd.d_cases; 263 264 char **keys, **keyp; 265 const char *s; 266 267 ASSERT(MUTEX_HELD(&cip->ci_lock)); 268 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 269 270 /* 271 * delete any existing entry from code hash if it is on it 272 */ 273 fmd_case_code_hash_delete(chp, cip); 274 275 fmd_free(cip->ci_code, cip->ci_codelen); 276 cip->ci_codelen = cip->ci_mod->mod_codelen; 277 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 278 keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1)); 279 280 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 281 if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0) 282 keyp++; 283 } 284 285 *keyp = NULL; /* mark end of keys[] array for libdiagcode */ 286 287 if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code( 288 cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) { 289 (void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s); 290 fmd_free(cip->ci_code, cip->ci_codelen); 291 cip->ci_codelen = strlen(s) + 1; 292 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 293 (void) strcpy(cip->ci_code, s); 294 } 295 296 /* 297 * add into 
hash of solved cases 298 */ 299 fmd_case_code_hash_insert(chp, cip); 300 301 return (cip->ci_code); 302 } 303 304 typedef struct { 305 int *fcl_countp; 306 int fcl_maxcount; 307 uint8_t *fcl_ba; 308 nvlist_t **fcl_nva; 309 int *fcl_msgp; 310 } fmd_case_lst_t; 311 312 static void 313 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg) 314 { 315 fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg; 316 boolean_t b; 317 int state; 318 319 if (*entryp->fcl_countp >= entryp->fcl_maxcount) 320 return; 321 if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE, 322 &b) == 0 && b == B_FALSE) 323 *entryp->fcl_msgp = B_FALSE; 324 entryp->fcl_ba[*entryp->fcl_countp] = 0; 325 state = fmd_asru_al_getstate(alp); 326 if (state & FMD_ASRU_DEGRADED) 327 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED; 328 if (state & FMD_ASRU_UNUSABLE) 329 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE; 330 if (state & FMD_ASRU_FAULTY) 331 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY; 332 if (!(state & FMD_ASRU_PRESENT)) 333 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT; 334 if (alp->al_reason == FMD_ASRU_REPAIRED) 335 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED; 336 else if (alp->al_reason == FMD_ASRU_REPLACED) 337 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED; 338 else if (alp->al_reason == FMD_ASRU_ACQUITTED) 339 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED; 340 entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event; 341 (*entryp->fcl_countp)++; 342 } 343 344 static void 345 fmd_case_faulty(fmd_asru_link_t *alp, void *arg) 346 { 347 int *faultyp = (int *)arg; 348 349 *faultyp |= (alp->al_flags & FMD_ASRU_FAULTY); 350 } 351 352 static void 353 fmd_case_usable(fmd_asru_link_t *alp, void *arg) 354 { 355 int *usablep = (int *)arg; 356 357 *usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE); 358 } 359 360 static void 361 fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg) 362 { 363 int 
*not_faultyp = (int *)arg; 364 365 *not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY); 366 } 367 368 /* 369 * Have we got any suspects with an asru that are still unusable and present? 370 */ 371 static void 372 fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg) 373 { 374 int *rvalp = (int *)arg; 375 int state; 376 nvlist_t *asru; 377 378 /* 379 * if this a proxy case and this suspect doesn't have an local asru 380 * then state is unknown so we must assume it may still be unusable. 381 */ 382 if ((alp->al_flags & FMD_ASRU_PROXY) && 383 !(alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU)) { 384 *rvalp |= B_TRUE; 385 return; 386 } 387 388 state = fmd_asru_al_getstate(alp); 389 if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0) 390 return; 391 *rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT)); 392 } 393 394 nvlist_t * 395 fmd_case_mkevent(fmd_case_t *cp, const char *class) 396 { 397 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 398 nvlist_t **nva, *nvl; 399 uint8_t *ba; 400 int msg = B_TRUE; 401 const char *code; 402 fmd_case_lst_t fcl; 403 int count = 0; 404 405 (void) pthread_mutex_lock(&cip->ci_lock); 406 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 407 408 nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); 409 ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects); 410 411 /* 412 * For each suspect associated with the case, store its fault event 413 * nvlist in 'nva'. We also look to see if any of the suspect faults 414 * have asked not to be messaged. If any of them have made such a 415 * request, propagate that attribute to the composite list.* event. 416 * Finally, store each suspect's faulty status into the bitmap 'ba'. 
417 */ 418 fcl.fcl_countp = &count; 419 fcl.fcl_maxcount = cip->ci_nsuspects; 420 fcl.fcl_msgp = &msg; 421 fcl.fcl_ba = ba; 422 fcl.fcl_nva = nva; 423 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl); 424 425 if (cip->ci_code == NULL) 426 (void) fmd_case_mkcode(cp); 427 /* 428 * For repair and updated event, we lookup diagcode from dict using key 429 * "list.repaired" or "list.updated" or "list.resolved". 430 */ 431 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 432 (void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code); 433 else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) 434 (void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code); 435 else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0) 436 (void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code); 437 else 438 code = cip->ci_code; 439 440 if (msg == B_FALSE) 441 cip->ci_flags |= FMD_CF_INVISIBLE; 442 443 /* 444 * Use the ci_diag_de if one has been saved (eg for an injected fault). 445 * Otherwise use the authority for the current module. 446 */ 447 nvl = fmd_protocol_list(class, cip->ci_diag_de == NULL ? 
448 cip->ci_mod->mod_fmri : cip->ci_diag_de, cip->ci_uuid, code, count, 449 nva, ba, msg, &cip->ci_tv, cip->ci_injected); 450 451 (void) pthread_mutex_unlock(&cip->ci_lock); 452 return (nvl); 453 } 454 455 static int fmd_case_match_on_faulty_overlap = 1; 456 static int fmd_case_match_on_acquit_overlap = 1; 457 static int fmd_case_auto_acquit_isolated = 1; 458 static int fmd_case_auto_acquit_non_acquitted = 1; 459 static int fmd_case_too_recent = 10; /* time in seconds */ 460 461 static boolean_t 462 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem) 463 { 464 nvlist_t *new_rsrc; 465 nvlist_t *rsrc; 466 char *new_name = NULL; 467 char *name = NULL; 468 ssize_t new_namelen; 469 ssize_t namelen; 470 int fmri_present = 1; 471 int new_fmri_present = 1; 472 int match = B_FALSE; 473 fmd_topo_t *ftp = fmd_topo_hold(); 474 475 if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0) 476 fmri_present = 0; 477 else { 478 if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1) 479 goto done; 480 name = fmd_alloc(namelen + 1, FMD_SLEEP); 481 if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1) 482 goto done; 483 } 484 if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0) 485 new_fmri_present = 0; 486 else { 487 if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1) 488 goto done; 489 new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP); 490 if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1) 491 goto done; 492 } 493 match = (fmri_present == new_fmri_present && 494 (fmri_present == 0 || 495 topo_fmri_strcmp(ftp->ft_hdl, name, new_name))); 496 done: 497 if (name != NULL) 498 fmd_free(name, namelen + 1); 499 if (new_name != NULL) 500 fmd_free(new_name, new_namelen + 1); 501 fmd_topo_rele(ftp); 502 return (match); 503 } 504 505 static int 506 fmd_case_match_suspect(nvlist_t *nvl1, nvlist_t *nvl2) 507 { 508 char *class, *new_class; 509 510 if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_ASRU)) 511 return (0); 512 if (!fmd_case_compare_elem(nvl1, nvl2, 
FM_FAULT_RESOURCE)) 513 return (0); 514 if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_FRU)) 515 return (0); 516 (void) nvlist_lookup_string(nvl2, FM_CLASS, &class); 517 (void) nvlist_lookup_string(nvl1, FM_CLASS, &new_class); 518 return (strcmp(class, new_class) == 0); 519 } 520 521 typedef struct { 522 int *fcms_countp; 523 int fcms_maxcount; 524 fmd_case_impl_t *fcms_cip; 525 uint8_t *fcms_new_susp_state; 526 uint8_t *fcms_old_susp_state; 527 uint8_t *fcms_old_match_state; 528 } fcms_t; 529 #define SUSPECT_STATE_FAULTY 0x1 530 #define SUSPECT_STATE_ISOLATED 0x2 531 #define SUSPECT_STATE_REMOVED 0x4 532 #define SUSPECT_STATE_ACQUITED 0x8 533 #define SUSPECT_STATE_REPAIRED 0x10 534 #define SUSPECT_STATE_REPLACED 0x20 535 #define SUSPECT_STATE_NO_MATCH 0x1 536 537 /* 538 * This is called for each suspect in the old case. Compare it against each 539 * suspect in the new case, setting fcms_old_susp_state and fcms_new_susp_state 540 * as appropriate. fcms_new_susp_state will left as 0 if the suspect is not 541 * found in the old case. 
542 */ 543 static void 544 fmd_case_match_suspects(fmd_asru_link_t *alp, void *arg) 545 { 546 fcms_t *fcmsp = (fcms_t *)arg; 547 fmd_case_impl_t *cip = fcmsp->fcms_cip; 548 fmd_case_susp_t *cis; 549 int i = 0; 550 int state = fmd_asru_al_getstate(alp); 551 552 if (*fcmsp->fcms_countp >= fcmsp->fcms_maxcount) 553 return; 554 555 if (!(state & FMD_ASRU_PRESENT) || (!(state & FMD_ASRU_FAULTY) && 556 alp->al_reason == FMD_ASRU_REMOVED)) 557 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 558 SUSPECT_STATE_REMOVED; 559 else if ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_FAULTY)) 560 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 561 SUSPECT_STATE_ISOLATED; 562 else if (state & FMD_ASRU_FAULTY) 563 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 564 SUSPECT_STATE_FAULTY; 565 else if (alp->al_reason == FMD_ASRU_REPLACED) 566 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 567 SUSPECT_STATE_REPLACED; 568 else if (alp->al_reason == FMD_ASRU_ACQUITTED) 569 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 570 SUSPECT_STATE_ACQUITED; 571 else 572 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] = 573 SUSPECT_STATE_REPAIRED; 574 575 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next, i++) 576 if (fmd_case_match_suspect(cis->cis_nvl, alp->al_event) == 1) 577 break; 578 if (cis != NULL) 579 fcmsp->fcms_new_susp_state[i] = 580 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp]; 581 else 582 fcmsp->fcms_old_match_state[*fcmsp->fcms_countp] |= 583 SUSPECT_STATE_NO_MATCH; 584 (*fcmsp->fcms_countp)++; 585 } 586 587 typedef struct { 588 int *fca_do_update; 589 fmd_case_impl_t *fca_cip; 590 } fca_t; 591 592 /* 593 * Re-fault all acquitted suspects that are still present in the new list. 
594 */ 595 static void 596 fmd_case_fault_acquitted_matching(fmd_asru_link_t *alp, void *arg) 597 { 598 fca_t *fcap = (fca_t *)arg; 599 fmd_case_impl_t *cip = fcap->fca_cip; 600 fmd_case_susp_t *cis; 601 int state = fmd_asru_al_getstate(alp); 602 603 if (!(state & FMD_ASRU_FAULTY) && 604 alp->al_reason == FMD_ASRU_ACQUITTED) { 605 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) 606 if (fmd_case_match_suspect(cis->cis_nvl, 607 alp->al_event) == 1) 608 break; 609 if (cis != NULL) { 610 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 611 *fcap->fca_do_update = 1; 612 } 613 } 614 } 615 616 /* 617 * Re-fault all suspects that are still present in the new list. 618 */ 619 static void 620 fmd_case_fault_all_matching(fmd_asru_link_t *alp, void *arg) 621 { 622 fca_t *fcap = (fca_t *)arg; 623 fmd_case_impl_t *cip = fcap->fca_cip; 624 fmd_case_susp_t *cis; 625 int state = fmd_asru_al_getstate(alp); 626 627 if (!(state & FMD_ASRU_FAULTY)) { 628 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) 629 if (fmd_case_match_suspect(cis->cis_nvl, 630 alp->al_event) == 1) 631 break; 632 if (cis != NULL) { 633 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 634 *fcap->fca_do_update = 1; 635 } 636 } 637 } 638 639 /* 640 * Acquit all suspects that are no longer present in the new list. 641 */ 642 static void 643 fmd_case_acquit_no_match(fmd_asru_link_t *alp, void *arg) 644 { 645 fca_t *fcap = (fca_t *)arg; 646 fmd_case_impl_t *cip = fcap->fca_cip; 647 fmd_case_susp_t *cis; 648 int state = fmd_asru_al_getstate(alp); 649 650 if (state & FMD_ASRU_FAULTY) { 651 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) 652 if (fmd_case_match_suspect(cis->cis_nvl, 653 alp->al_event) == 1) 654 break; 655 if (cis == NULL) { 656 (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, 657 FMD_ASRU_ACQUITTED); 658 *fcap->fca_do_update = 1; 659 } 660 } 661 } 662 663 /* 664 * Acquit all isolated suspects. 
665 */ 666 static void 667 fmd_case_acquit_isolated(fmd_asru_link_t *alp, void *arg) 668 { 669 int *do_update = (int *)arg; 670 int state = fmd_asru_al_getstate(alp); 671 672 if ((state & FMD_ASRU_PRESENT) && (state & FMD_ASRU_UNUSABLE) && 673 (state & FMD_ASRU_FAULTY)) { 674 (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, 675 FMD_ASRU_ACQUITTED); 676 *do_update = 1; 677 } 678 } 679 680 /* 681 * Acquit suspect which matches specified nvlist 682 */ 683 static void 684 fmd_case_acquit_suspect(fmd_asru_link_t *alp, void *arg) 685 { 686 nvlist_t *nvl = (nvlist_t *)arg; 687 int state = fmd_asru_al_getstate(alp); 688 689 if ((state & FMD_ASRU_FAULTY) && 690 fmd_case_match_suspect(nvl, alp->al_event) == 1) 691 (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY, 692 FMD_ASRU_ACQUITTED); 693 } 694 695 typedef struct { 696 fmd_case_impl_t *fccd_cip; 697 uint8_t *fccd_new_susp_state; 698 uint8_t *fccd_new_match_state; 699 int *fccd_discard_new; 700 int *fccd_adjust_new; 701 } fccd_t; 702 703 /* 704 * see if a matching suspect list already exists in the cache 705 */ 706 static void 707 fmd_case_check_for_dups(fmd_case_t *old_cp, void *arg) 708 { 709 fccd_t *fccdp = (fccd_t *)arg; 710 fmd_case_impl_t *new_cip = fccdp->fccd_cip; 711 fmd_case_impl_t *old_cip = (fmd_case_impl_t *)old_cp; 712 int i, count = 0, do_update = 0, got_isolated_overlap = 0; 713 int got_faulty_overlap = 0; 714 int got_acquit_overlap = 0; 715 boolean_t too_recent; 716 uint64_t most_recent = 0; 717 fcms_t fcms; 718 fca_t fca; 719 uint8_t *new_susp_state; 720 uint8_t *old_susp_state; 721 uint8_t *old_match_state; 722 723 new_susp_state = alloca(new_cip->ci_nsuspects * sizeof (uint8_t)); 724 for (i = 0; i < new_cip->ci_nsuspects; i++) 725 new_susp_state[i] = 0; 726 old_susp_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t)); 727 for (i = 0; i < old_cip->ci_nsuspects; i++) 728 old_susp_state[i] = 0; 729 old_match_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t)); 730 for (i = 0; i < 
old_cip->ci_nsuspects; i++) 731 old_match_state[i] = 0; 732 733 /* 734 * Compare with each suspect in the existing case. 735 */ 736 fcms.fcms_countp = &count; 737 fcms.fcms_maxcount = old_cip->ci_nsuspects; 738 fcms.fcms_cip = new_cip; 739 fcms.fcms_new_susp_state = new_susp_state; 740 fcms.fcms_old_susp_state = old_susp_state; 741 fcms.fcms_old_match_state = old_match_state; 742 fmd_asru_hash_apply_by_case(fmd.d_asrus, (fmd_case_t *)old_cip, 743 fmd_case_match_suspects, &fcms); 744 745 /* 746 * If we have some faulty, non-isolated suspects that overlap, then most 747 * likely it is the suspects that overlap in the suspect lists that are 748 * to blame. So we can consider this to be a match. 749 */ 750 for (i = 0; i < new_cip->ci_nsuspects; i++) 751 if (new_susp_state[i] == SUSPECT_STATE_FAULTY) 752 got_faulty_overlap = 1; 753 if (got_faulty_overlap && fmd_case_match_on_faulty_overlap) 754 goto got_match; 755 756 /* 757 * If we have no faulty, non-isolated suspects in the old case, but we 758 * do have some acquitted suspects that overlap, then most likely it is 759 * the acquitted suspects that overlap in the suspect lists that are 760 * to blame. So we can consider this to be a match. 761 */ 762 for (i = 0; i < new_cip->ci_nsuspects; i++) 763 if (new_susp_state[i] == SUSPECT_STATE_ACQUITED) 764 got_acquit_overlap = 1; 765 for (i = 0; i < old_cip->ci_nsuspects; i++) 766 if (old_susp_state[i] == SUSPECT_STATE_FAULTY) 767 got_acquit_overlap = 0; 768 if (got_acquit_overlap && fmd_case_match_on_acquit_overlap) 769 goto got_match; 770 771 /* 772 * Check that all suspects in the new list are present in the old list. 773 * Return if we find one that isn't. 774 */ 775 for (i = 0; i < new_cip->ci_nsuspects; i++) 776 if (new_susp_state[i] == 0) 777 return; 778 779 /* 780 * Check that all suspects in the old list are present in the new list 781 * *or* they are isolated or removed/replaced (which would explain why 782 * they are not present in the new list). 
Return if we find one that is 783 * faulty and unisolated or repaired or acquitted, and that is not 784 * present in the new case. 785 */ 786 for (i = 0; i < old_cip->ci_nsuspects; i++) 787 if (old_match_state[i] == SUSPECT_STATE_NO_MATCH && 788 (old_susp_state[i] == SUSPECT_STATE_FAULTY || 789 old_susp_state[i] == SUSPECT_STATE_ACQUITED || 790 old_susp_state[i] == SUSPECT_STATE_REPAIRED)) 791 return; 792 793 got_match: 794 /* 795 * If the old case is already in repaired/resolved state, we can't 796 * do anything more with it, so keep the new case, but acquit some 797 * of the suspects if appropriate. 798 */ 799 if (old_cip->ci_state >= FMD_CASE_REPAIRED) { 800 if (fmd_case_auto_acquit_non_acquitted) { 801 *fccdp->fccd_adjust_new = 1; 802 for (i = 0; i < new_cip->ci_nsuspects; i++) { 803 fccdp->fccd_new_susp_state[i] |= 804 new_susp_state[i]; 805 if (new_susp_state[i] == 0) 806 fccdp->fccd_new_susp_state[i] = 807 SUSPECT_STATE_NO_MATCH; 808 } 809 } 810 return; 811 } 812 813 /* 814 * Otherwise discard the new case and keep the old, again updating the 815 * state of the suspects as appropriate 816 */ 817 *fccdp->fccd_discard_new = 1; 818 fca.fca_cip = new_cip; 819 fca.fca_do_update = &do_update; 820 821 /* 822 * See if new case occurred within fmd_case_too_recent seconds of the 823 * most recent modification to the old case and if so don't do 824 * auto-acquit. This avoids problems if a flood of ereports come in and 825 * they don't all get diagnosed before the first case causes some of 826 * the devices to be isolated making it appear that an isolated device 827 * was in the suspect list. 828 */ 829 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 830 fmd_asru_most_recent, &most_recent); 831 too_recent = (new_cip->ci_tv.tv_sec - most_recent < 832 fmd_case_too_recent); 833 834 if (got_faulty_overlap) { 835 /* 836 * Acquit any suspects not present in the new list, plus 837 * any that are are present but are isolated. 
838 */ 839 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 840 fmd_case_acquit_no_match, &fca); 841 if (fmd_case_auto_acquit_isolated && !too_recent) 842 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 843 fmd_case_acquit_isolated, &do_update); 844 } else if (got_acquit_overlap) { 845 /* 846 * Re-fault the acquitted matching suspects and acquit all 847 * isolated suspects. 848 */ 849 if (fmd_case_auto_acquit_isolated && !too_recent) { 850 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 851 fmd_case_fault_acquitted_matching, &fca); 852 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 853 fmd_case_acquit_isolated, &do_update); 854 } 855 } else if (fmd_case_auto_acquit_isolated) { 856 /* 857 * To get here, there must be no faulty or acquitted suspects, 858 * but there must be at least one isolated suspect. Just acquit 859 * non-matching isolated suspects. If there are no matching 860 * isolated suspects, then re-fault all matching suspects. 861 */ 862 for (i = 0; i < new_cip->ci_nsuspects; i++) 863 if (new_susp_state[i] == SUSPECT_STATE_ISOLATED) 864 got_isolated_overlap = 1; 865 if (!got_isolated_overlap) 866 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 867 fmd_case_fault_all_matching, &fca); 868 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp, 869 fmd_case_acquit_no_match, &fca); 870 } 871 872 /* 873 * If we've updated anything in the old case, call fmd_case_update() 874 */ 875 if (do_update) 876 fmd_case_update(old_cp); 877 } 878 879 /* 880 * Convict suspects in a case by applying a conviction policy and updating the 881 * resource cache prior to emitting the list.suspect event for the given case. 882 * At present, our policy is very simple: convict every suspect in the case. 
 * In the future, this policy can be extended and made configurable to permit:
 *
 * - convicting the suspect with the highest FIT rate
 * - convicting the suspect with the cheapest FRU
 * - convicting the suspect with the FRU that is in a depot's inventory
 * - convicting the suspect with the longest lifetime
 *
 * and so forth.  A word to the wise: this problem is significantly harder than
 * it seems at first glance.  Future work should heed the following advice:
 *
 * Hacking the policy into C code here is a very bad idea.  The policy needs to
 * be decided upon very carefully and fundamentally encodes knowledge of what
 * suspect list combinations can be emitted by what diagnosis engines.  As such
 * fmd's code is the wrong location, because that would require fmd itself to
 * be updated for every diagnosis engine change, defeating the entire design.
 * The FMA Event Registry knows the suspect list combinations: policy inputs
 * can be derived from it and used to produce per-module policy configuration.
 *
 * If the policy needs to be dynamic and not statically fixed at either fmd
 * startup or module load time, any implementation of dynamic policy retrieval
 * must employ some kind of caching mechanism or be part of a built-in module.
 * The fmd_case_convict() function is called with locks held inside of fmd and
 * is not a place where unbounded blocking on some inter-process or inter-
 * system communication to another service (e.g. another daemon) can occur.
907 */ 908 static int 909 fmd_case_convict(fmd_case_t *cp) 910 { 911 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 912 fmd_asru_hash_t *ahp = fmd.d_asrus; 913 int discard_new = 0, i; 914 fmd_case_susp_t *cis; 915 fmd_asru_link_t *alp; 916 uint8_t *new_susp_state; 917 uint8_t *new_match_state; 918 int adjust_new = 0; 919 fccd_t fccd; 920 fmd_case_impl_t *ncp, **cps, **cpp; 921 uint_t cpc; 922 fmd_case_hash_t *chp; 923 924 /* 925 * First we must see if any matching cases already exist. 926 */ 927 new_susp_state = alloca(cip->ci_nsuspects * sizeof (uint8_t)); 928 for (i = 0; i < cip->ci_nsuspects; i++) 929 new_susp_state[i] = 0; 930 new_match_state = alloca(cip->ci_nsuspects * sizeof (uint8_t)); 931 for (i = 0; i < cip->ci_nsuspects; i++) 932 new_match_state[i] = 0; 933 fccd.fccd_cip = cip; 934 fccd.fccd_adjust_new = &adjust_new; 935 fccd.fccd_new_susp_state = new_susp_state; 936 fccd.fccd_new_match_state = new_match_state; 937 fccd.fccd_discard_new = &discard_new; 938 939 /* 940 * Hold all cases 941 */ 942 chp = fmd.d_cases; 943 (void) pthread_rwlock_rdlock(&chp->ch_lock); 944 cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP); 945 cpc = chp->ch_count; 946 for (i = 0; i < chp->ch_hashlen; i++) 947 for (ncp = chp->ch_hash[i]; ncp != NULL; ncp = ncp->ci_next) 948 *cpp++ = fmd_case_tryhold(ncp); 949 ASSERT(cpp == cps + cpc); 950 (void) pthread_rwlock_unlock(&chp->ch_lock); 951 952 /* 953 * Run fmd_case_check_for_dups() on all cases except the current one. 
954 */ 955 for (i = 0; i < cpc; i++) { 956 if (cps[i] != NULL) { 957 if (cps[i] != (fmd_case_impl_t *)cp) 958 fmd_case_check_for_dups((fmd_case_t *)cps[i], 959 &fccd); 960 fmd_case_rele((fmd_case_t *)cps[i]); 961 } 962 } 963 fmd_free(cps, cpc * sizeof (fmd_case_t *)); 964 965 (void) pthread_mutex_lock(&cip->ci_lock); 966 if (cip->ci_code == NULL) 967 (void) fmd_case_mkcode(cp); 968 else if (cip->ci_precanned) 969 fmd_case_code_hash_insert(fmd.d_cases, cip); 970 971 if (discard_new) { 972 /* 973 * We've found an existing case that is a match and it is not 974 * already in repaired or resolved state. So we can close this 975 * one as a duplicate. 976 */ 977 (void) pthread_mutex_unlock(&cip->ci_lock); 978 return (1); 979 } 980 981 /* 982 * Allocate new cache entries 983 */ 984 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 985 if ((alp = fmd_asru_hash_create_entry(ahp, 986 cp, cis->cis_nvl)) == NULL) { 987 fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in " 988 "%s: %s\n", cip->ci_uuid, fmd_strerror(errno)); 989 continue; 990 } 991 alp->al_flags |= FMD_ASRU_PRESENT; 992 alp->al_asru->asru_flags |= FMD_ASRU_PRESENT; 993 (void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0); 994 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 995 } 996 997 if (adjust_new) { 998 int some_suspect = 0, some_not_suspect = 0; 999 1000 /* 1001 * There is one or more matching case but they are already in 1002 * repaired or resolved state. So we need to keep the new 1003 * case, but we can adjust it. Repaired/removed/replaced 1004 * suspects are unlikely to be to blame (unless there are 1005 * actually two separate faults). So if we have a combination of 1006 * repaired/replaced/removed suspects and acquitted suspects in 1007 * the old lists, then we should acquit in the new list those 1008 * that were repaired/replaced/removed in the old. 
1009 */ 1010 for (i = 0; i < cip->ci_nsuspects; i++) { 1011 if ((new_susp_state[i] & SUSPECT_STATE_REPLACED) || 1012 (new_susp_state[i] & SUSPECT_STATE_REPAIRED) || 1013 (new_susp_state[i] & SUSPECT_STATE_REMOVED) || 1014 (new_match_state[i] & SUSPECT_STATE_NO_MATCH)) 1015 some_not_suspect = 1; 1016 else 1017 some_suspect = 1; 1018 } 1019 if (some_suspect && some_not_suspect) { 1020 for (cis = cip->ci_suspects, i = 0; cis != NULL; 1021 cis = cis->cis_next, i++) 1022 if ((new_susp_state[i] & 1023 SUSPECT_STATE_REPLACED) || 1024 (new_susp_state[i] & 1025 SUSPECT_STATE_REPAIRED) || 1026 (new_susp_state[i] & 1027 SUSPECT_STATE_REMOVED) || 1028 (new_match_state[i] & 1029 SUSPECT_STATE_NO_MATCH)) 1030 fmd_asru_hash_apply_by_case(fmd.d_asrus, 1031 cp, fmd_case_acquit_suspect, 1032 cis->cis_nvl); 1033 } 1034 } 1035 1036 (void) pthread_mutex_unlock(&cip->ci_lock); 1037 return (0); 1038 } 1039 1040 void 1041 fmd_case_publish(fmd_case_t *cp, uint_t state) 1042 { 1043 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1044 fmd_event_t *e; 1045 nvlist_t *nvl; 1046 char *class; 1047 1048 if (state == FMD_CASE_CURRENT) 1049 state = cip->ci_state; /* use current state */ 1050 1051 switch (state) { 1052 case FMD_CASE_SOLVED: 1053 (void) pthread_mutex_lock(&cip->ci_lock); 1054 1055 /* 1056 * If we already have a code, then case is already solved. 1057 */ 1058 if (cip->ci_precanned == 0 && cip->ci_xprt == NULL && 1059 cip->ci_code != NULL) { 1060 (void) pthread_mutex_unlock(&cip->ci_lock); 1061 break; 1062 } 1063 1064 if (cip->ci_tv_valid == 0) { 1065 fmd_time_gettimeofday(&cip->ci_tv); 1066 cip->ci_tv_valid = 1; 1067 } 1068 (void) pthread_mutex_unlock(&cip->ci_lock); 1069 1070 if (fmd_case_convict(cp) == 1) { /* dupclose */ 1071 cip->ci_flags &= ~FMD_CF_SOLVED; 1072 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0); 1073 break; 1074 } 1075 if (cip->ci_xprt != NULL) { 1076 /* 1077 * For proxy, save some information about the transport 1078 * in the resource cache. 
1079 */ 1080 int count = 0; 1081 fmd_asru_set_on_proxy_t fasp; 1082 fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)cip->ci_xprt; 1083 1084 fasp.fasp_countp = &count; 1085 fasp.fasp_maxcount = cip->ci_nsuspects; 1086 fasp.fasp_proxy_asru = cip->ci_proxy_asru; 1087 fasp.fasp_proxy_external = xip->xi_flags & 1088 FMD_XPRT_EXTERNAL; 1089 fasp.fasp_proxy_rdonly = ((xip->xi_flags & 1090 FMD_XPRT_RDWR) == FMD_XPRT_RDONLY); 1091 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1092 fmd_asru_set_on_proxy, &fasp); 1093 } 1094 nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS); 1095 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1096 1097 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1098 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 1099 fmd_log_append(fmd.d_fltlog, e, cp); 1100 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 1101 fmd_dispq_dispatch(fmd.d_disp, e, class); 1102 1103 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1104 cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++; 1105 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1106 1107 break; 1108 1109 case FMD_CASE_CLOSE_WAIT: 1110 fmd_case_hold(cp); 1111 e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp); 1112 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 1113 1114 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1115 cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++; 1116 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1117 1118 break; 1119 1120 case FMD_CASE_CLOSED: 1121 nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS); 1122 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1123 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1124 fmd_dispq_dispatch(fmd.d_disp, e, class); 1125 break; 1126 1127 case FMD_CASE_REPAIRED: 1128 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 1129 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1130 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1131 (void) 
pthread_rwlock_rdlock(&fmd.d_log_lock); 1132 fmd_log_append(fmd.d_fltlog, e, cp); 1133 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 1134 fmd_dispq_dispatch(fmd.d_disp, e, class); 1135 break; 1136 1137 case FMD_CASE_RESOLVED: 1138 nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS); 1139 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1140 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1141 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 1142 fmd_log_append(fmd.d_fltlog, e, cp); 1143 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 1144 fmd_dispq_dispatch(fmd.d_disp, e, class); 1145 break; 1146 } 1147 } 1148 1149 fmd_case_t * 1150 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid) 1151 { 1152 fmd_case_impl_t *cip; 1153 uint_t h; 1154 1155 (void) pthread_rwlock_rdlock(&chp->ch_lock); 1156 h = fmd_strhash(uuid) % chp->ch_hashlen; 1157 1158 for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) { 1159 if (strcmp(cip->ci_uuid, uuid) == 0) 1160 break; 1161 } 1162 1163 /* 1164 * If deleting bit is set, treat the case as if it doesn't exist. 
1165 */ 1166 if (cip != NULL) 1167 cip = fmd_case_tryhold(cip); 1168 1169 if (cip == NULL) 1170 (void) fmd_set_errno(EFMD_CASE_INVAL); 1171 1172 (void) pthread_rwlock_unlock(&chp->ch_lock); 1173 return ((fmd_case_t *)cip); 1174 } 1175 1176 static fmd_case_impl_t * 1177 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 1178 { 1179 fmd_case_impl_t *eip; 1180 uint_t h; 1181 1182 (void) pthread_rwlock_wrlock(&chp->ch_lock); 1183 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 1184 1185 for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) { 1186 if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 && 1187 fmd_case_tryhold(eip) != NULL) { 1188 (void) pthread_rwlock_unlock(&chp->ch_lock); 1189 return (eip); /* uuid already present */ 1190 } 1191 } 1192 1193 cip->ci_next = chp->ch_hash[h]; 1194 chp->ch_hash[h] = cip; 1195 1196 chp->ch_count++; 1197 ASSERT(chp->ch_count != 0); 1198 1199 (void) pthread_rwlock_unlock(&chp->ch_lock); 1200 return (cip); 1201 } 1202 1203 static void 1204 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 1205 { 1206 fmd_case_impl_t *cp, **pp; 1207 uint_t h; 1208 1209 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1210 1211 cip->ci_flags |= FMD_CF_DELETING; 1212 (void) pthread_mutex_unlock(&cip->ci_lock); 1213 1214 (void) pthread_rwlock_wrlock(&chp->ch_lock); 1215 1216 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 1217 pp = &chp->ch_hash[h]; 1218 1219 for (cp = *pp; cp != NULL; cp = cp->ci_next) { 1220 if (cp != cip) 1221 pp = &cp->ci_next; 1222 else 1223 break; 1224 } 1225 1226 if (cp == NULL) { 1227 fmd_panic("case %p (%s) not found on hash chain %u\n", 1228 (void *)cip, cip->ci_uuid, h); 1229 } 1230 1231 *pp = cp->ci_next; 1232 cp->ci_next = NULL; 1233 1234 /* 1235 * delete from code hash if it is on it 1236 */ 1237 fmd_case_code_hash_delete(chp, cip); 1238 1239 ASSERT(chp->ch_count != 0); 1240 chp->ch_count--; 1241 1242 (void) pthread_rwlock_unlock(&chp->ch_lock); 1243 1244 (void) pthread_mutex_lock(&cip->ci_lock); 1245 
ASSERT(cip->ci_flags & FMD_CF_DELETING); 1246 } 1247 1248 fmd_case_t * 1249 fmd_case_create(fmd_module_t *mp, const char *uuidstr, void *data) 1250 { 1251 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 1252 fmd_case_impl_t *eip = NULL; 1253 uuid_t uuid; 1254 1255 (void) pthread_mutex_init(&cip->ci_lock, NULL); 1256 fmd_buf_hash_create(&cip->ci_bufs); 1257 1258 fmd_module_hold(mp); 1259 cip->ci_mod = mp; 1260 cip->ci_refs = 1; 1261 cip->ci_state = FMD_CASE_UNSOLVED; 1262 cip->ci_flags = FMD_CF_DIRTY; 1263 cip->ci_data = data; 1264 1265 /* 1266 * Calling libuuid: get a clue. The library interfaces cleverly do not 1267 * define any constant for the length of an unparse string, and do not 1268 * permit the caller to specify a buffer length for safety. The spec 1269 * says it will be 36 bytes, but we make it tunable just in case. 1270 */ 1271 (void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen); 1272 cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP); 1273 1274 if (uuidstr == NULL) { 1275 /* 1276 * We expect this loop to execute only once, but code it 1277 * defensively against the possibility of libuuid bugs. 1278 * Keep generating uuids and attempting to do a hash insert 1279 * until we get a unique one. 1280 */ 1281 do { 1282 if (eip != NULL) 1283 fmd_case_rele((fmd_case_t *)eip); 1284 uuid_generate(uuid); 1285 uuid_unparse(uuid, cip->ci_uuid); 1286 } while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip); 1287 } else { 1288 /* 1289 * If a uuid was specified we must succeed with that uuid, 1290 * or return NULL indicating a case with that uuid already 1291 * exists. 
1292 */ 1293 (void) strncpy(cip->ci_uuid, uuidstr, cip->ci_uuidlen + 1); 1294 if (fmd_case_hash_insert(fmd.d_cases, cip) != cip) { 1295 fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1); 1296 (void) fmd_buf_hash_destroy(&cip->ci_bufs); 1297 fmd_module_rele(mp); 1298 pthread_mutex_destroy(&cip->ci_lock); 1299 fmd_free(cip, sizeof (*cip)); 1300 return (NULL); 1301 } 1302 } 1303 1304 ASSERT(fmd_module_locked(mp)); 1305 fmd_list_append(&mp->mod_cases, cip); 1306 fmd_module_setcdirty(mp); 1307 1308 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1309 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 1310 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1311 1312 return ((fmd_case_t *)cip); 1313 } 1314 1315 static void 1316 fmd_case_destroy_suspects(fmd_case_impl_t *cip) 1317 { 1318 fmd_case_susp_t *cis, *ncis; 1319 1320 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1321 1322 if (cip->ci_proxy_asru) 1323 fmd_free(cip->ci_proxy_asru, sizeof (uint8_t) * 1324 cip->ci_nsuspects); 1325 if (cip->ci_diag_de) 1326 nvlist_free(cip->ci_diag_de); 1327 if (cip->ci_diag_asru) 1328 fmd_free(cip->ci_diag_asru, sizeof (uint8_t) * 1329 cip->ci_nsuspects); 1330 1331 for (cis = cip->ci_suspects; cis != NULL; cis = ncis) { 1332 ncis = cis->cis_next; 1333 nvlist_free(cis->cis_nvl); 1334 fmd_free(cis, sizeof (fmd_case_susp_t)); 1335 } 1336 1337 cip->ci_suspects = NULL; 1338 cip->ci_nsuspects = 0; 1339 } 1340 1341 fmd_case_t * 1342 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp, 1343 uint_t state, const char *uuid, const char *code) 1344 { 1345 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 1346 fmd_case_impl_t *eip; 1347 1348 (void) pthread_mutex_init(&cip->ci_lock, NULL); 1349 fmd_buf_hash_create(&cip->ci_bufs); 1350 1351 fmd_module_hold(mp); 1352 cip->ci_mod = mp; 1353 cip->ci_xprt = xp; 1354 cip->ci_refs = 1; 1355 cip->ci_state = state; 1356 cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP); 1357 cip->ci_uuidlen = strlen(cip->ci_uuid); 1358 cip->ci_code = 
fmd_strdup(code, FMD_SLEEP); 1359 cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0; 1360 1361 if (state > FMD_CASE_CLOSE_WAIT) 1362 cip->ci_flags |= FMD_CF_SOLVED; 1363 1364 /* 1365 * Insert the case into the global case hash. If the specified UUID is 1366 * already present, check to see if it is an orphan: if so, reclaim it; 1367 * otherwise if it is owned by a different module then return NULL. 1368 */ 1369 if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) { 1370 (void) pthread_mutex_lock(&cip->ci_lock); 1371 cip->ci_refs--; /* decrement to zero */ 1372 fmd_case_destroy((fmd_case_t *)cip, B_FALSE); 1373 1374 cip = eip; /* switch 'cip' to the existing case */ 1375 (void) pthread_mutex_lock(&cip->ci_lock); 1376 1377 /* 1378 * If the ASRU cache is trying to recreate an orphan, then just 1379 * return the existing case that we found without changing it. 1380 */ 1381 if (mp == fmd.d_rmod) { 1382 /* 1383 * In case the case has already been created from 1384 * a checkpoint file we need to set up code now. 1385 */ 1386 if (cip->ci_state < FMD_CASE_CLOSED) { 1387 if (code != NULL && cip->ci_code == NULL) { 1388 cip->ci_code = fmd_strdup(code, 1389 FMD_SLEEP); 1390 cip->ci_codelen = cip->ci_code ? 1391 strlen(cip->ci_code) + 1 : 0; 1392 fmd_case_code_hash_insert(fmd.d_cases, 1393 cip); 1394 } 1395 } 1396 1397 /* 1398 * When recreating an orphan case, state passed in may 1399 * be CLOSED (faulty) or REPAIRED/RESOLVED (!faulty). If 1400 * any suspects are still CLOSED (faulty) then the 1401 * overall state needs to be CLOSED. 
1402 */ 1403 if ((cip->ci_state == FMD_CASE_REPAIRED || 1404 cip->ci_state == FMD_CASE_RESOLVED) && 1405 state == FMD_CASE_CLOSED) 1406 cip->ci_state = FMD_CASE_CLOSED; 1407 (void) pthread_mutex_unlock(&cip->ci_lock); 1408 fmd_case_rele((fmd_case_t *)cip); 1409 return ((fmd_case_t *)cip); 1410 } 1411 1412 /* 1413 * If the existing case isn't an orphan or is being proxied, 1414 * then we have a UUID conflict: return failure to the caller. 1415 */ 1416 if (cip->ci_mod != fmd.d_rmod || xp != NULL) { 1417 (void) pthread_mutex_unlock(&cip->ci_lock); 1418 fmd_case_rele((fmd_case_t *)cip); 1419 return (NULL); 1420 } 1421 1422 /* 1423 * If the new module is reclaiming an orphaned case, remove 1424 * the case from the root module, switch ci_mod, and then fall 1425 * through to adding the case to the new owner module 'mp'. 1426 */ 1427 fmd_module_lock(cip->ci_mod); 1428 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1429 fmd_module_unlock(cip->ci_mod); 1430 1431 fmd_module_rele(cip->ci_mod); 1432 cip->ci_mod = mp; 1433 fmd_module_hold(mp); 1434 1435 /* 1436 * It's possible that fmd crashed or was restarted during a 1437 * previous solve operation between the asru cache being created 1438 * and the ckpt file being updated to SOLVED. Thus when the DE 1439 * recreates the case here from the checkpoint file, the state 1440 * will be UNSOLVED and yet we are having to reclaim because 1441 * the case was in the asru cache. 
If this happens, revert the 1442 * case back to the UNSOLVED state and let the DE solve it again 1443 */ 1444 if (state == FMD_CASE_UNSOLVED) { 1445 fmd_asru_hash_delete_case(fmd.d_asrus, 1446 (fmd_case_t *)cip); 1447 fmd_case_destroy_suspects(cip); 1448 fmd_case_code_hash_delete(fmd.d_cases, cip); 1449 fmd_free(cip->ci_code, cip->ci_codelen); 1450 cip->ci_code = NULL; 1451 cip->ci_codelen = 0; 1452 cip->ci_tv_valid = 0; 1453 } 1454 1455 cip->ci_state = state; 1456 1457 (void) pthread_mutex_unlock(&cip->ci_lock); 1458 fmd_case_rele((fmd_case_t *)cip); 1459 } else { 1460 /* 1461 * add into hash of solved cases 1462 */ 1463 if (cip->ci_code) 1464 fmd_case_code_hash_insert(fmd.d_cases, cip); 1465 } 1466 1467 ASSERT(fmd_module_locked(mp)); 1468 fmd_list_append(&mp->mod_cases, cip); 1469 1470 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1471 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 1472 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1473 1474 return ((fmd_case_t *)cip); 1475 } 1476 1477 void 1478 fmd_case_destroy(fmd_case_t *cp, int visible) 1479 { 1480 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1481 fmd_case_item_t *cit, *ncit; 1482 1483 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1484 ASSERT(cip->ci_refs == 0); 1485 1486 if (visible) { 1487 TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid)); 1488 fmd_case_hash_delete(fmd.d_cases, cip); 1489 } 1490 1491 for (cit = cip->ci_items; cit != NULL; cit = ncit) { 1492 ncit = cit->cit_next; 1493 fmd_event_rele(cit->cit_event); 1494 fmd_free(cit, sizeof (fmd_case_item_t)); 1495 } 1496 1497 fmd_case_destroy_suspects(cip); 1498 1499 if (cip->ci_principal != NULL) 1500 fmd_event_rele(cip->ci_principal); 1501 1502 fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1); 1503 fmd_free(cip->ci_code, cip->ci_codelen); 1504 (void) fmd_buf_hash_destroy(&cip->ci_bufs); 1505 1506 fmd_module_rele(cip->ci_mod); 1507 fmd_free(cip, sizeof (fmd_case_impl_t)); 1508 } 1509 1510 void 1511 fmd_case_hold(fmd_case_t *cp) 1512 
{ 1513 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1514 1515 (void) pthread_mutex_lock(&cip->ci_lock); 1516 fmd_case_hold_locked(cp); 1517 (void) pthread_mutex_unlock(&cip->ci_lock); 1518 } 1519 1520 void 1521 fmd_case_hold_locked(fmd_case_t *cp) 1522 { 1523 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1524 1525 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1526 if (cip->ci_flags & FMD_CF_DELETING) 1527 fmd_panic("attempt to hold a deleting case %p (%s)\n", 1528 (void *)cip, cip->ci_uuid); 1529 cip->ci_refs++; 1530 ASSERT(cip->ci_refs != 0); 1531 } 1532 1533 static fmd_case_impl_t * 1534 fmd_case_tryhold(fmd_case_impl_t *cip) 1535 { 1536 /* 1537 * If the case's "deleting" bit is unset, hold and return case, 1538 * otherwise, return NULL. 1539 */ 1540 (void) pthread_mutex_lock(&cip->ci_lock); 1541 if (cip->ci_flags & FMD_CF_DELETING) { 1542 (void) pthread_mutex_unlock(&cip->ci_lock); 1543 cip = NULL; 1544 } else { 1545 fmd_case_hold_locked((fmd_case_t *)cip); 1546 (void) pthread_mutex_unlock(&cip->ci_lock); 1547 } 1548 return (cip); 1549 } 1550 1551 void 1552 fmd_case_rele(fmd_case_t *cp) 1553 { 1554 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1555 1556 (void) pthread_mutex_lock(&cip->ci_lock); 1557 ASSERT(cip->ci_refs != 0); 1558 1559 if (--cip->ci_refs == 0) 1560 fmd_case_destroy((fmd_case_t *)cip, B_TRUE); 1561 else 1562 (void) pthread_mutex_unlock(&cip->ci_lock); 1563 } 1564 1565 void 1566 fmd_case_rele_locked(fmd_case_t *cp) 1567 { 1568 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1569 1570 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1571 --cip->ci_refs; 1572 ASSERT(cip->ci_refs != 0); 1573 } 1574 1575 int 1576 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep) 1577 { 1578 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1579 fmd_case_item_t *cit; 1580 fmd_event_t *oep; 1581 uint_t state; 1582 int new; 1583 1584 fmd_event_hold(ep); 1585 (void) pthread_mutex_lock(&cip->ci_lock); 1586 1587 if (cip->ci_flags & FMD_CF_SOLVED) 1588 state = FMD_EVS_DIAGNOSED; 1589 else 1590 
state = FMD_EVS_ACCEPTED; 1591 1592 oep = cip->ci_principal; 1593 cip->ci_principal = ep; 1594 1595 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1596 if (cit->cit_event == ep) 1597 break; 1598 } 1599 1600 cip->ci_flags |= FMD_CF_DIRTY; 1601 new = cit == NULL && ep != oep; 1602 1603 (void) pthread_mutex_unlock(&cip->ci_lock); 1604 1605 fmd_module_setcdirty(cip->ci_mod); 1606 fmd_event_transition(ep, state); 1607 1608 if (oep != NULL) 1609 fmd_event_rele(oep); 1610 1611 return (new); 1612 } 1613 1614 int 1615 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep) 1616 { 1617 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1618 fmd_case_item_t *cit; 1619 uint_t state; 1620 int new; 1621 boolean_t injected; 1622 1623 (void) pthread_mutex_lock(&cip->ci_lock); 1624 1625 if (cip->ci_flags & FMD_CF_SOLVED) 1626 state = FMD_EVS_DIAGNOSED; 1627 else 1628 state = FMD_EVS_ACCEPTED; 1629 1630 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1631 if (cit->cit_event == ep) 1632 break; 1633 } 1634 1635 new = cit == NULL && ep != cip->ci_principal; 1636 1637 /* 1638 * If the event is already in the case or the case is already solved, 1639 * there is no reason to save it: just transition it appropriately. 
1640 */ 1641 if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) { 1642 (void) pthread_mutex_unlock(&cip->ci_lock); 1643 fmd_event_transition(ep, state); 1644 return (new); 1645 } 1646 1647 cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP); 1648 fmd_event_hold(ep); 1649 1650 if (nvlist_lookup_boolean_value(((fmd_event_impl_t *)ep)->ev_nvl, 1651 "__injected", &injected) == 0 && injected) 1652 fmd_case_set_injected(cp); 1653 1654 cit->cit_next = cip->ci_items; 1655 cit->cit_event = ep; 1656 1657 cip->ci_items = cit; 1658 cip->ci_nitems++; 1659 1660 cip->ci_flags |= FMD_CF_DIRTY; 1661 (void) pthread_mutex_unlock(&cip->ci_lock); 1662 1663 fmd_module_setcdirty(cip->ci_mod); 1664 fmd_event_transition(ep, state); 1665 1666 return (new); 1667 } 1668 1669 void 1670 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl) 1671 { 1672 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1673 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1674 1675 (void) pthread_mutex_lock(&cip->ci_lock); 1676 ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT); 1677 cip->ci_flags |= FMD_CF_DIRTY; 1678 1679 cis->cis_next = cip->ci_suspects; 1680 cis->cis_nvl = nvl; 1681 1682 cip->ci_suspects = cis; 1683 cip->ci_nsuspects++; 1684 1685 (void) pthread_mutex_unlock(&cip->ci_lock); 1686 if (cip->ci_xprt == NULL) 1687 fmd_module_setcdirty(cip->ci_mod); 1688 } 1689 1690 void 1691 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl) 1692 { 1693 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1694 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1695 boolean_t b; 1696 1697 (void) pthread_mutex_lock(&cip->ci_lock); 1698 1699 cis->cis_next = cip->ci_suspects; 1700 cis->cis_nvl = nvl; 1701 1702 if (nvlist_lookup_boolean_value(nvl, 1703 FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE) 1704 cip->ci_flags |= FMD_CF_INVISIBLE; 1705 1706 cip->ci_suspects = cis; 1707 cip->ci_nsuspects++; 1708 1709 (void) pthread_mutex_unlock(&cip->ci_lock); 1710 } 1711 1712 void 1713 
fmd_case_reset_suspects(fmd_case_t *cp) 1714 { 1715 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1716 1717 (void) pthread_mutex_lock(&cip->ci_lock); 1718 ASSERT(cip->ci_state < FMD_CASE_SOLVED); 1719 1720 fmd_case_destroy_suspects(cip); 1721 cip->ci_flags |= FMD_CF_DIRTY; 1722 1723 (void) pthread_mutex_unlock(&cip->ci_lock); 1724 fmd_module_setcdirty(cip->ci_mod); 1725 } 1726 1727 /*ARGSUSED*/ 1728 static void 1729 fmd_case_unusable(fmd_asru_link_t *alp, void *arg) 1730 { 1731 (void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE); 1732 } 1733 1734 /* 1735 * Grab ci_lock and update the case state and set the dirty bit. Then perform 1736 * whatever actions and emit whatever events are appropriate for the state. 1737 * Refer to the topmost block comment explaining the state machine for details. 1738 */ 1739 void 1740 fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) 1741 { 1742 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1743 fmd_case_item_t *cit; 1744 fmd_event_t *e; 1745 int resolved = 0; 1746 int any_unusable_and_present = 0; 1747 1748 ASSERT(state <= FMD_CASE_RESOLVED); 1749 (void) pthread_mutex_lock(&cip->ci_lock); 1750 1751 if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED)) 1752 flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED | FMD_CF_RESOLVED); 1753 1754 cip->ci_flags |= flags; 1755 1756 if (cip->ci_state >= state) { 1757 (void) pthread_mutex_unlock(&cip->ci_lock); 1758 return; /* already in specified state */ 1759 } 1760 1761 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid, 1762 _fmd_case_snames[cip->ci_state], _fmd_case_snames[state])); 1763 1764 cip->ci_state = state; 1765 cip->ci_flags |= FMD_CF_DIRTY; 1766 1767 if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod) 1768 fmd_module_setcdirty(cip->ci_mod); 1769 1770 switch (state) { 1771 case FMD_CASE_SOLVED: 1772 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 1773 fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED); 1774 1775 if (cip->ci_principal != NULL) { 1776 
fmd_event_transition(cip->ci_principal, 1777 FMD_EVS_DIAGNOSED); 1778 } 1779 break; 1780 1781 case FMD_CASE_CLOSE_WAIT: 1782 /* 1783 * If the case was never solved, do not change ASRUs. 1784 * If the case was never fmd_case_closed, do not change ASRUs. 1785 * If the case was repaired, do not change ASRUs. 1786 */ 1787 if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED | 1788 FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED)) 1789 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1790 fmd_case_unusable, NULL); 1791 1792 /* 1793 * If an orphaned case transitions to CLOSE_WAIT, the owning 1794 * module is no longer loaded: continue on to CASE_CLOSED or 1795 * CASE_REPAIRED as appropriate. 1796 */ 1797 if (fmd_case_orphaned(cp)) { 1798 if (cip->ci_flags & FMD_CF_REPAIRED) { 1799 state = cip->ci_state = FMD_CASE_REPAIRED; 1800 TRACE((FMD_DBG_CASE, "case %s %s->%s", 1801 cip->ci_uuid, 1802 _fmd_case_snames[FMD_CASE_CLOSE_WAIT], 1803 _fmd_case_snames[FMD_CASE_REPAIRED])); 1804 goto do_repair; 1805 } else { 1806 state = cip->ci_state = FMD_CASE_CLOSED; 1807 TRACE((FMD_DBG_CASE, "case %s %s->%s", 1808 cip->ci_uuid, 1809 _fmd_case_snames[FMD_CASE_CLOSE_WAIT], 1810 _fmd_case_snames[FMD_CASE_CLOSED])); 1811 } 1812 } 1813 break; 1814 1815 case FMD_CASE_REPAIRED: 1816 do_repair: 1817 ASSERT(cip->ci_xprt != NULL || fmd_case_orphaned(cp)); 1818 1819 /* 1820 * If we've been requested to transition straight on to the 1821 * RESOLVED state (which can happen with fault proxying where a 1822 * list.resolved or a uuresolved is received from the other 1823 * side), or if all suspects are already either usable or not 1824 * present then transition straight to RESOLVED state, 1825 * publishing both the list.repaired and list.resolved. For a 1826 * proxy, if we discover here that all suspects are already 1827 * either usable or not present, notify the diag side instead 1828 * using fmd_xprt_uuresolved(). 
1829 */ 1830 if (flags & FMD_CF_RESOLVED) { 1831 if (cip->ci_xprt != NULL) 1832 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1833 } else { 1834 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1835 fmd_case_unusable_and_present, 1836 &any_unusable_and_present); 1837 if (any_unusable_and_present) 1838 break; 1839 if (cip->ci_xprt != NULL) { 1840 fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid); 1841 break; 1842 } 1843 } 1844 1845 cip->ci_state = FMD_CASE_RESOLVED; 1846 (void) pthread_mutex_unlock(&cip->ci_lock); 1847 fmd_case_publish(cp, state); 1848 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid, 1849 _fmd_case_snames[FMD_CASE_REPAIRED], 1850 _fmd_case_snames[FMD_CASE_RESOLVED])); 1851 state = FMD_CASE_RESOLVED; 1852 resolved = 1; 1853 (void) pthread_mutex_lock(&cip->ci_lock); 1854 break; 1855 1856 case FMD_CASE_RESOLVED: 1857 /* 1858 * For a proxy, no need to check that all suspects are already 1859 * either usable or not present - this request has come from 1860 * the diagnosing side which makes the final decision on this. 1861 */ 1862 if (cip->ci_xprt != NULL) { 1863 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1864 resolved = 1; 1865 break; 1866 } 1867 1868 ASSERT(fmd_case_orphaned(cp)); 1869 1870 /* 1871 * If all suspects are already either usable or not present then 1872 * carry on, publish list.resolved and discard the case. 1873 */ 1874 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1875 fmd_case_unusable_and_present, &any_unusable_and_present); 1876 if (any_unusable_and_present) { 1877 (void) pthread_mutex_unlock(&cip->ci_lock); 1878 return; 1879 } 1880 1881 resolved = 1; 1882 break; 1883 } 1884 1885 (void) pthread_mutex_unlock(&cip->ci_lock); 1886 1887 /* 1888 * If the module has initialized, then publish the appropriate event 1889 * for the new case state. 
If not, we are being called from the 1890 * checkpoint code during module load, in which case the module's 1891 * _fmd_init() routine hasn't finished yet, and our event dictionaries 1892 * may not be open yet, which will prevent us from computing the event 1893 * code. Defer the call to fmd_case_publish() by enqueuing a PUBLISH 1894 * event in our queue: this won't be processed until _fmd_init is done. 1895 */ 1896 if (cip->ci_mod->mod_flags & FMD_MOD_INIT) 1897 fmd_case_publish(cp, state); 1898 else { 1899 fmd_case_hold(cp); 1900 e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp); 1901 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 1902 } 1903 1904 if (resolved) { 1905 if (cip->ci_xprt != NULL) { 1906 /* 1907 * If we transitioned to RESOLVED, adjust the reference 1908 * count to reflect our removal from 1909 * fmd.d_rmod->mod_cases above. If the caller has not 1910 * placed an additional hold on the case, it will now 1911 * be freed. 1912 */ 1913 (void) pthread_mutex_lock(&cip->ci_lock); 1914 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 1915 (void) pthread_mutex_unlock(&cip->ci_lock); 1916 fmd_case_rele(cp); 1917 } else { 1918 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1919 fmd_asru_log_resolved, NULL); 1920 (void) pthread_mutex_lock(&cip->ci_lock); 1921 /* mark as "ready to be discarded */ 1922 cip->ci_flags |= FMD_CF_RES_CMPL; 1923 (void) pthread_mutex_unlock(&cip->ci_lock); 1924 } 1925 } 1926 } 1927 1928 /* 1929 * Discard any case if it is in RESOLVED state (and if check_if_aged argument 1930 * is set if all suspects have passed the rsrc.aged time). 1931 */ 1932 void 1933 fmd_case_discard_resolved(fmd_case_t *cp, void *arg) 1934 { 1935 int check_if_aged = *(int *)arg; 1936 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1937 1938 /* 1939 * First check if case has completed transition to resolved. 
1940 */ 1941 (void) pthread_mutex_lock(&cip->ci_lock); 1942 if (!(cip->ci_flags & FMD_CF_RES_CMPL)) { 1943 (void) pthread_mutex_unlock(&cip->ci_lock); 1944 return; 1945 } 1946 1947 /* 1948 * Now if check_is_aged is set, see if all suspects have aged. 1949 */ 1950 if (check_if_aged) { 1951 int aged = 1; 1952 1953 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1954 fmd_asru_check_if_aged, &aged); 1955 if (!aged) { 1956 (void) pthread_mutex_unlock(&cip->ci_lock); 1957 return; 1958 } 1959 } 1960 1961 /* 1962 * Finally discard the case, clearing FMD_CF_RES_CMPL so we don't 1963 * do it twice. 1964 */ 1965 fmd_module_lock(cip->ci_mod); 1966 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1967 fmd_module_unlock(cip->ci_mod); 1968 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 1969 cip->ci_flags &= ~FMD_CF_RES_CMPL; 1970 (void) pthread_mutex_unlock(&cip->ci_lock); 1971 fmd_case_rele(cp); 1972 } 1973 1974 /* 1975 * Transition the specified case to *at least* the specified state by first 1976 * re-validating the suspect list using the resource cache. This function is 1977 * employed by the checkpoint code when restoring a saved, solved case to see 1978 * if the state of the case has effectively changed while fmd was not running 1979 * or the module was not loaded. 1980 */ 1981 void 1982 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags) 1983 { 1984 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1985 1986 int usable = 0; /* are any suspects usable? 
*/ 1987 1988 ASSERT(state >= FMD_CASE_SOLVED); 1989 (void) pthread_mutex_lock(&cip->ci_lock); 1990 1991 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable); 1992 1993 (void) pthread_mutex_unlock(&cip->ci_lock); 1994 1995 if (!usable) { 1996 state = MAX(state, FMD_CASE_CLOSE_WAIT); 1997 flags |= FMD_CF_ISOLATED; 1998 } 1999 2000 fmd_case_transition(cp, state, flags); 2001 } 2002 2003 void 2004 fmd_case_setdirty(fmd_case_t *cp) 2005 { 2006 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2007 2008 (void) pthread_mutex_lock(&cip->ci_lock); 2009 cip->ci_flags |= FMD_CF_DIRTY; 2010 (void) pthread_mutex_unlock(&cip->ci_lock); 2011 2012 fmd_module_setcdirty(cip->ci_mod); 2013 } 2014 2015 void 2016 fmd_case_clrdirty(fmd_case_t *cp) 2017 { 2018 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2019 2020 (void) pthread_mutex_lock(&cip->ci_lock); 2021 cip->ci_flags &= ~FMD_CF_DIRTY; 2022 (void) pthread_mutex_unlock(&cip->ci_lock); 2023 } 2024 2025 void 2026 fmd_case_commit(fmd_case_t *cp) 2027 { 2028 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2029 fmd_case_item_t *cit; 2030 2031 (void) pthread_mutex_lock(&cip->ci_lock); 2032 2033 if (cip->ci_flags & FMD_CF_DIRTY) { 2034 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 2035 fmd_event_commit(cit->cit_event); 2036 2037 if (cip->ci_principal != NULL) 2038 fmd_event_commit(cip->ci_principal); 2039 2040 fmd_buf_hash_commit(&cip->ci_bufs); 2041 cip->ci_flags &= ~FMD_CF_DIRTY; 2042 } 2043 2044 (void) pthread_mutex_unlock(&cip->ci_lock); 2045 } 2046 2047 /* 2048 * On proxy side, send back repair/acquit/etc request to diagnosing side 2049 */ 2050 void 2051 fmd_case_xprt_updated(fmd_case_t *cp) 2052 { 2053 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2054 nvlist_t **nva; 2055 uint8_t *ba; 2056 int msg = B_TRUE; 2057 int count = 0; 2058 fmd_case_lst_t fcl; 2059 2060 ASSERT(cip->ci_xprt != NULL); 2061 (void) pthread_mutex_lock(&cip->ci_lock); 2062 ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects); 2063 nva = 
alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); 2064 fcl.fcl_countp = &count; 2065 fcl.fcl_maxcount = cip->ci_nsuspects; 2066 fcl.fcl_msgp = &msg; 2067 fcl.fcl_ba = ba; 2068 fcl.fcl_nva = nva; 2069 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl); 2070 (void) pthread_mutex_unlock(&cip->ci_lock); 2071 fmd_xprt_updated(cip->ci_xprt, cip->ci_uuid, ba, cip->ci_proxy_asru, 2072 count); 2073 } 2074 2075 /* 2076 * fmd_case_update_status() can be called on either the proxy side when a 2077 * list.suspect is received, or on the diagnosing side when an update request 2078 * is received from the proxy. It updates the status in the resource cache. 2079 */ 2080 void 2081 fmd_case_update_status(fmd_case_t *cp, uint8_t *statusp, uint8_t *proxy_asrup, 2082 uint8_t *diag_asrup) 2083 { 2084 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2085 int count = 0; 2086 fmd_asru_update_status_t faus; 2087 2088 /* 2089 * update status of resource cache entries 2090 */ 2091 faus.faus_countp = &count; 2092 faus.faus_maxcount = cip->ci_nsuspects; 2093 faus.faus_ba = statusp; 2094 faus.faus_proxy_asru = proxy_asrup; 2095 faus.faus_diag_asru = diag_asrup; 2096 faus.faus_is_proxy = (cip->ci_xprt != NULL); 2097 (void) pthread_mutex_lock(&cip->ci_lock); 2098 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_update_status, 2099 &faus); 2100 (void) pthread_mutex_unlock(&cip->ci_lock); 2101 } 2102 2103 /* 2104 * Called on either the proxy side or the diag side when a repair has taken 2105 * place on the other side but this side may know the asru "contains" 2106 * relationships. 
2107 */ 2108 void 2109 fmd_case_update_containees(fmd_case_t *cp) 2110 { 2111 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2112 2113 (void) pthread_mutex_lock(&cip->ci_lock); 2114 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 2115 fmd_asru_update_containees, NULL); 2116 (void) pthread_mutex_unlock(&cip->ci_lock); 2117 } 2118 2119 /* 2120 * fmd_case_close_status() is called on diagnosing side when proxy side 2121 * has had a uuclose. It updates the status in the resource cache. 2122 */ 2123 void 2124 fmd_case_close_status(fmd_case_t *cp) 2125 { 2126 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2127 int count = 0; 2128 fmd_asru_close_status_t facs; 2129 2130 /* 2131 * update status of resource cache entries 2132 */ 2133 facs.facs_countp = &count; 2134 facs.facs_maxcount = cip->ci_nsuspects; 2135 (void) pthread_mutex_lock(&cip->ci_lock); 2136 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_close_status, 2137 &facs); 2138 (void) pthread_mutex_unlock(&cip->ci_lock); 2139 } 2140 2141 /* 2142 * Indicate that the case may need to change state because one or more of the 2143 * ASRUs named as a suspect has changed state. We examine all the suspects 2144 * and if none are still faulty, we initiate a case close transition. 
2145 */ 2146 void 2147 fmd_case_update(fmd_case_t *cp) 2148 { 2149 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2150 uint_t cstate; 2151 int faulty = 0; 2152 2153 (void) pthread_mutex_lock(&cip->ci_lock); 2154 cstate = cip->ci_state; 2155 2156 if (cip->ci_state < FMD_CASE_SOLVED) { 2157 (void) pthread_mutex_unlock(&cip->ci_lock); 2158 return; /* update is not appropriate */ 2159 } 2160 2161 if (cip->ci_flags & FMD_CF_REPAIRED) { 2162 (void) pthread_mutex_unlock(&cip->ci_lock); 2163 return; /* already repaired */ 2164 } 2165 2166 TRACE((FMD_DBG_CASE, "case update %s", cip->ci_uuid)); 2167 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 2168 (void) pthread_mutex_unlock(&cip->ci_lock); 2169 2170 if (faulty) { 2171 nvlist_t *nvl; 2172 fmd_event_t *e; 2173 char *class; 2174 2175 TRACE((FMD_DBG_CASE, "sending list.updated %s", cip->ci_uuid)); 2176 nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS); 2177 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2178 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 2179 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 2180 fmd_log_append(fmd.d_fltlog, e, cp); 2181 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 2182 fmd_dispq_dispatch(fmd.d_disp, e, class); 2183 return; /* one or more suspects are still marked faulty */ 2184 } 2185 2186 if (cstate == FMD_CASE_CLOSED) 2187 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 2188 else 2189 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 2190 } 2191 2192 /* 2193 * Delete a closed case from the module's case list once the fmdo_close() entry 2194 * point has run to completion. If the case is owned by a transport module, 2195 * tell the transport to proxy a case close on the other end of the transport. 2196 * Transition to the appropriate next state based on ci_flags. 
This 2197 * function represents the end of CLOSE_WAIT and transitions the case to either 2198 * CLOSED or REPAIRED or discards it entirely because it was never solved; 2199 * refer to the topmost block comment explaining the state machine for details. 2200 */ 2201 void 2202 fmd_case_delete(fmd_case_t *cp) 2203 { 2204 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2205 fmd_modstat_t *msp; 2206 size_t buftotal; 2207 2208 TRACE((FMD_DBG_CASE, "case delete %s", cip->ci_uuid)); 2209 ASSERT(fmd_module_locked(cip->ci_mod)); 2210 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 2211 buftotal = fmd_buf_hash_destroy(&cip->ci_bufs); 2212 2213 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 2214 msp = cip->ci_mod->mod_stats; 2215 2216 ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0); 2217 msp->ms_caseopen.fmds_value.ui64--; 2218 2219 ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal); 2220 msp->ms_buftotal.fmds_value.ui64 -= buftotal; 2221 2222 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 2223 2224 if (cip->ci_xprt == NULL) 2225 fmd_module_setcdirty(cip->ci_mod); 2226 2227 fmd_module_rele(cip->ci_mod); 2228 cip->ci_mod = fmd.d_rmod; 2229 fmd_module_hold(cip->ci_mod); 2230 2231 /* 2232 * If the case has been solved, then retain it 2233 * on the root module's case list at least until we're transitioned. 2234 * Otherwise free the case with our final fmd_case_rele() below. 2235 */ 2236 if (cip->ci_flags & FMD_CF_SOLVED) { 2237 fmd_module_lock(cip->ci_mod); 2238 fmd_list_append(&cip->ci_mod->mod_cases, cip); 2239 fmd_module_unlock(cip->ci_mod); 2240 fmd_case_hold(cp); 2241 } 2242 2243 /* 2244 * Transition onwards to REPAIRED or CLOSED as originally requested. 2245 * Note that for proxy case if we're transitioning to CLOSED it means 2246 * the case was isolated locally, so call fmd_xprt_uuclose() to notify 2247 * the diagnosing side. 
No need to notify the diagnosing side if we are 2248 * transitioning to REPAIRED as we only do this when requested to do 2249 * so by the diagnosing side anyway. 2250 */ 2251 if (cip->ci_flags & FMD_CF_REPAIRED) 2252 fmd_case_transition(cp, FMD_CASE_REPAIRED, 0); 2253 else if (cip->ci_flags & FMD_CF_ISOLATED) { 2254 fmd_case_transition(cp, FMD_CASE_CLOSED, 0); 2255 if (cip->ci_xprt != NULL) 2256 fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid); 2257 } 2258 2259 fmd_case_rele(cp); 2260 } 2261 2262 void 2263 fmd_case_discard(fmd_case_t *cp, boolean_t delete_from_asru_cache) 2264 { 2265 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2266 2267 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 2268 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--; 2269 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 2270 2271 ASSERT(fmd_module_locked(cip->ci_mod)); 2272 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 2273 if (delete_from_asru_cache) { 2274 (void) pthread_mutex_lock(&cip->ci_lock); 2275 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 2276 (void) pthread_mutex_unlock(&cip->ci_lock); 2277 } 2278 fmd_case_rele(cp); 2279 } 2280 2281 /* 2282 * Indicate that the problem corresponding to a case has been repaired by 2283 * clearing the faulty bit on each ASRU named as a suspect. If the case hasn't 2284 * already been closed, this function initiates the transition to CLOSE_WAIT. 2285 * The caller must have the case held from fmd_case_hash_lookup(), so we can 2286 * grab and drop ci_lock without the case being able to be freed in between. 
2287 */ 2288 int 2289 fmd_case_repair(fmd_case_t *cp) 2290 { 2291 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2292 uint_t cstate; 2293 fmd_asru_rep_arg_t fara; 2294 2295 (void) pthread_mutex_lock(&cip->ci_lock); 2296 cstate = cip->ci_state; 2297 2298 if (cstate < FMD_CASE_SOLVED) { 2299 (void) pthread_mutex_unlock(&cip->ci_lock); 2300 return (fmd_set_errno(EFMD_CASE_STATE)); 2301 } 2302 2303 if (cip->ci_flags & FMD_CF_REPAIRED) { 2304 (void) pthread_mutex_unlock(&cip->ci_lock); 2305 return (0); /* already repaired */ 2306 } 2307 2308 TRACE((FMD_DBG_CASE, "case repair %s", cip->ci_uuid)); 2309 fara.fara_reason = FMD_ASRU_REPAIRED; 2310 fara.fara_bywhat = FARA_BY_CASE; 2311 fara.fara_rval = NULL; 2312 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara); 2313 (void) pthread_mutex_unlock(&cip->ci_lock); 2314 2315 /* 2316 * if this is a proxied case, send the repair across the transport. 2317 * The remote side will then do the repair and send a list.repaired back 2318 * again such that we can finally repair the case on this side. 
2319 */ 2320 if (cip->ci_xprt != NULL) { 2321 fmd_case_xprt_updated(cp); 2322 return (0); 2323 } 2324 2325 if (cstate == FMD_CASE_CLOSED) 2326 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 2327 else 2328 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 2329 2330 return (0); 2331 } 2332 2333 int 2334 fmd_case_acquit(fmd_case_t *cp) 2335 { 2336 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2337 uint_t cstate; 2338 fmd_asru_rep_arg_t fara; 2339 2340 (void) pthread_mutex_lock(&cip->ci_lock); 2341 cstate = cip->ci_state; 2342 2343 if (cstate < FMD_CASE_SOLVED) { 2344 (void) pthread_mutex_unlock(&cip->ci_lock); 2345 return (fmd_set_errno(EFMD_CASE_STATE)); 2346 } 2347 2348 if (cip->ci_flags & FMD_CF_REPAIRED) { 2349 (void) pthread_mutex_unlock(&cip->ci_lock); 2350 return (0); /* already repaired */ 2351 } 2352 2353 TRACE((FMD_DBG_CASE, "case acquit %s", cip->ci_uuid)); 2354 fara.fara_reason = FMD_ASRU_ACQUITTED; 2355 fara.fara_bywhat = FARA_BY_CASE; 2356 fara.fara_rval = NULL; 2357 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara); 2358 (void) pthread_mutex_unlock(&cip->ci_lock); 2359 2360 /* 2361 * if this is a proxied case, send the repair across the transport. 2362 * The remote side will then do the repair and send a list.repaired back 2363 * again such that we can finally repair the case on this side. 
2364 */ 2365 if (cip->ci_xprt != NULL) { 2366 fmd_case_xprt_updated(cp); 2367 return (0); 2368 } 2369 2370 if (cstate == FMD_CASE_CLOSED) 2371 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 2372 else 2373 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 2374 2375 return (0); 2376 } 2377 2378 int 2379 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep) 2380 { 2381 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2382 fmd_case_item_t *cit; 2383 uint_t state; 2384 int rv = 0; 2385 2386 (void) pthread_mutex_lock(&cip->ci_lock); 2387 2388 if (cip->ci_state >= FMD_CASE_SOLVED) 2389 state = FMD_EVS_DIAGNOSED; 2390 else 2391 state = FMD_EVS_ACCEPTED; 2392 2393 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 2394 if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0) 2395 break; 2396 } 2397 2398 if (rv == 0 && cip->ci_principal != NULL) 2399 rv = fmd_event_equal(ep, cip->ci_principal); 2400 2401 (void) pthread_mutex_unlock(&cip->ci_lock); 2402 2403 if (rv != 0) 2404 fmd_event_transition(ep, state); 2405 2406 return (rv); 2407 } 2408 2409 int 2410 fmd_case_orphaned(fmd_case_t *cp) 2411 { 2412 return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod); 2413 } 2414 2415 void 2416 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec) 2417 { 2418 ((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec; 2419 ((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec; 2420 ((fmd_case_impl_t *)cp)->ci_tv_valid = 1; 2421 } 2422 2423 void 2424 fmd_case_set_injected(fmd_case_t *cp) 2425 { 2426 ((fmd_case_impl_t *)cp)->ci_injected = 1; 2427 } 2428 2429 void 2430 fmd_case_set_de_fmri(fmd_case_t *cp, nvlist_t *nvl) 2431 { 2432 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2433 2434 if (cip->ci_diag_de) 2435 nvlist_free(cip->ci_diag_de); 2436 cip->ci_diag_de = nvl; 2437 } 2438 2439 void 2440 fmd_case_setcode(fmd_case_t *cp, char *code) 2441 { 2442 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2443 2444 cip->ci_code = fmd_strdup(code, FMD_SLEEP); 2445 
cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0; 2446 } 2447 2448 /*ARGSUSED*/ 2449 static void 2450 fmd_case_repair_replay_case(fmd_case_t *cp, void *arg) 2451 { 2452 int not_faulty = 0; 2453 int faulty = 0; 2454 nvlist_t *nvl; 2455 fmd_event_t *e; 2456 char *class; 2457 int any_unusable_and_present = 0; 2458 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 2459 2460 if (cip->ci_state < FMD_CASE_SOLVED || cip->ci_xprt != NULL) 2461 return; 2462 2463 if (cip->ci_state == FMD_CASE_RESOLVED) { 2464 cip->ci_flags |= FMD_CF_RES_CMPL; 2465 return; 2466 } 2467 2468 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 2469 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty, 2470 ¬_faulty); 2471 2472 if (cip->ci_state >= FMD_CASE_REPAIRED && !faulty) { 2473 /* 2474 * If none of the suspects is faulty, replay the list.repaired. 2475 * If all suspects are already either usable or not present then 2476 * also transition straight to RESOLVED state. 2477 */ 2478 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 2479 fmd_case_unusable_and_present, &any_unusable_and_present); 2480 if (!any_unusable_and_present) { 2481 cip->ci_state = FMD_CASE_RESOLVED; 2482 2483 TRACE((FMD_DBG_CASE, "replay sending list.repaired %s", 2484 cip->ci_uuid)); 2485 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 2486 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2487 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, 2488 class); 2489 fmd_dispq_dispatch(fmd.d_disp, e, class); 2490 2491 TRACE((FMD_DBG_CASE, "replay sending list.resolved %s", 2492 cip->ci_uuid)); 2493 fmd_case_publish(cp, FMD_CASE_RESOLVED); 2494 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 2495 fmd_asru_log_resolved, NULL); 2496 cip->ci_flags |= FMD_CF_RES_CMPL; 2497 } else { 2498 TRACE((FMD_DBG_CASE, "replay sending list.repaired %s", 2499 cip->ci_uuid)); 2500 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 2501 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2502 e = 
fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, 2503 class); 2504 fmd_dispq_dispatch(fmd.d_disp, e, class); 2505 } 2506 } else if (faulty && not_faulty) { 2507 /* 2508 * if some but not all of the suspects are not faulty, replay 2509 * the list.updated. 2510 */ 2511 TRACE((FMD_DBG_CASE, "replay sending list.updated %s", 2512 cip->ci_uuid)); 2513 nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS); 2514 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 2515 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 2516 fmd_dispq_dispatch(fmd.d_disp, e, class); 2517 } 2518 } 2519 2520 void 2521 fmd_case_repair_replay() 2522 { 2523 fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL); 2524 } 2525