1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * FMD Case Subsystem 29 * 30 * Diagnosis engines are expected to group telemetry events related to the 31 * diagnosis of a particular problem on the system into a set of cases. The 32 * diagnosis engine may have any number of cases open at a given point in time. 33 * Some cases may eventually be *solved* by associating a suspect list of one 34 * or more problems with the case, at which point fmd publishes a list.suspect 35 * event for the case and it becomes visible to administrators and agents. 36 * 37 * Every case is named using a UUID, and is globally visible in the case hash. 38 * Cases are reference-counted, except for the reference from the case hash 39 * itself. Consumers of case references include modules, which store active 40 * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code. 41 * 42 * Cases obey the following state machine. In states UNSOLVED, SOLVED, and 43 * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine 44 * or transport) and the case is referenced by the mod_cases list. Once the 45 * case reaches the CLOSED or REPAIRED states, a case's module changes to refer 46 * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases. 47 * 48 * +------------+ 49 * +----------| UNSOLVED | 50 * | +------------+ 51 * | 1 | 52 * | | 53 * | +-------v----+ 54 * 2 | | SOLVED | 55 * | +------------+ 56 * | 3 | 5 | 57 * +------------+ | | 58 * | | | 59 * +-v---v----v-+ 60 * | CLOSE_WAIT | 61 * +------------+ 62 * | | | 63 * +-----------+ | +------------+ 64 * | 4 | | 65 * v +-----v------+ | 66 * discard | CLOSED | 6 | 67 * +------------+ | 68 * | | 69 * | +------------+ 70 * 7 | | 71 * +-----v----v-+ 72 * | REPAIRED | 73 * +------------+ 74 * | 75 * 8 | 76 * +-----v------+ 77 * | RESOLVED | 78 * +------------+ 79 * | 80 * v 81 * discard 82 * 83 * The state machine changes are triggered by calls to fmd_case_transition() 84 * from various locations inside of fmd, as described below: 85 * 86 * [1] Called by: fmd_case_solve() 87 * Actions: FMD_CF_SOLVED flag is set in ci_flags 88 * conviction policy is applied to suspect list 89 * suspects convicted are marked faulty (F) in R$ 90 * list.suspect event logged and dispatched 91 * 92 * [2] Called by: fmd_case_close(), fmd_case_uuclose() 93 * Actions: diagnosis engine fmdo_close() entry point scheduled 94 * case discarded upon exit from CLOSE_WAIT 95 * 96 * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose() 97 * Actions: FMD_CF_ISOLATED flag is set in ci_flags 98 * suspects convicted (F) are marked unusable (U) in R$ 99 * diagnosis engine fmdo_close() entry point scheduled 100 * case transitions to CLOSED [4] upon exit from CLOSE_WAIT 101 * 102 * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns) 103 * Actions: list.isolated event dispatched 104 * case deleted from module's list of open cases 105 * 106 * [5] Called by: fmd_case_repair(), fmd_case_update() 107 * Actions: FMD_CF_REPAIR flag is set in ci_flags 108 * diagnosis engine fmdo_close() entry point scheduled 109 * case transitions to REPAIRED [6] upon exit from CLOSE_WAIT 110 * 111 * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns) 112 * Actions: suspects convicted are marked non faulty (!F) in R$ 113 * list.repaired or list.updated event dispatched 114 * 115 * [7] Called by: fmd_case_repair(), fmd_case_update() 116 * Actions: FMD_CF_REPAIR flag is set in ci_flags 117 * suspects convicted are marked non faulty (!F) in R$ 118 * list.repaired or list.updated event dispatched 119 * 120 * [8] Called by: fmd_case_uuresolve() 121 * Actions: list.resolved event dispatched 122 * case is discarded 123 */ 124 125 #include <sys/fm/protocol.h> 126 #include <uuid/uuid.h> 127 #include <alloca.h> 128 129 #include <fmd_alloc.h> 130 #include <fmd_module.h> 131 #include <fmd_error.h> 132 #include <fmd_conf.h> 133 #include <fmd_case.h> 134 #include <fmd_string.h> 135 #include <fmd_subr.h> 136 #include <fmd_protocol.h> 137 #include <fmd_event.h> 138 #include <fmd_eventq.h> 139 #include <fmd_dispq.h> 140 #include <fmd_buf.h> 141 #include <fmd_log.h> 142 #include <fmd_asru.h> 143 #include <fmd_fmri.h> 144 #include <fmd_xprt.h> 145 146 #include <fmd.h> 147 148 static const char *const _fmd_case_snames[] = { 149 "UNSOLVED", /* FMD_CASE_UNSOLVED */ 150 "SOLVED", /* FMD_CASE_SOLVED */ 151 "CLOSE_WAIT", /* FMD_CASE_CLOSE_WAIT */ 152 "CLOSED", /* FMD_CASE_CLOSED */ 153 "REPAIRED", /* FMD_CASE_REPAIRED */ 154 "RESOLVED" /* FMD_CASE_RESOLVED */ 155 }; 156 157 static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *); 158 159 fmd_case_hash_t * 160 fmd_case_hash_create(void) 161 { 162 fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP); 163 164 (void) pthread_rwlock_init(&chp->ch_lock, NULL); 165 chp->ch_hashlen = fmd.d_str_buckets; 166 chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP); 167 chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, 168 FMD_SLEEP); 169 chp->ch_count = 0; 170 171 return (chp); 172 } 173 174 /* 175 * Destroy the case hash. Unlike most of our hash tables, no active references 176 * are kept by the case hash itself; all references come from other subsystems. 177 * The hash must be destroyed after all modules are unloaded; if anything was 178 * present in the hash it would be by definition a reference count leak. 179 */ 180 void 181 fmd_case_hash_destroy(fmd_case_hash_t *chp) 182 { 183 fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen); 184 fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen); 185 fmd_free(chp, sizeof (fmd_case_hash_t)); 186 } 187 188 /* 189 * Take a snapshot of the case hash by placing an additional hold on each 190 * member in an auxiliary array, and then call 'func' for each case. 191 */ 192 void 193 fmd_case_hash_apply(fmd_case_hash_t *chp, 194 void (*func)(fmd_case_t *, void *), void *arg) 195 { 196 fmd_case_impl_t *cp, **cps, **cpp; 197 uint_t cpc, i; 198 199 (void) pthread_rwlock_rdlock(&chp->ch_lock); 200 201 cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP); 202 cpc = chp->ch_count; 203 204 for (i = 0; i < chp->ch_hashlen; i++) { 205 for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) 206 *cpp++ = fmd_case_tryhold(cp); 207 } 208 209 ASSERT(cpp == cps + cpc); 210 (void) pthread_rwlock_unlock(&chp->ch_lock); 211 212 for (i = 0; i < cpc; i++) { 213 if (cps[i] != NULL) { 214 func((fmd_case_t *)cps[i], arg); 215 fmd_case_rele((fmd_case_t *)cps[i]); 216 } 217 } 218 219 fmd_free(cps, cpc * sizeof (fmd_case_t *)); 220 } 221 222 static void 223 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 224 { 225 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 226 227 cip->ci_code_next = chp->ch_code_hash[h]; 228 chp->ch_code_hash[h] = cip; 229 } 230 231 static void 232 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 233 { 234 fmd_case_impl_t **pp, *cp; 235 236 if (cip->ci_code) { 237 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 238 239 pp = &chp->ch_code_hash[h]; 240 for (cp = *pp; cp != NULL; cp = cp->ci_code_next) { 241 if (cp != cip) 242 pp = &cp->ci_code_next; 243 else 244 break; 245 } 246 if (cp != NULL) { 247 *pp = cp->ci_code_next; 248 cp->ci_code_next = NULL; 249 } 250 } 251 } 252 253 /* 254 * Look up the diagcode for this case and cache it in ci_code. If no suspects 255 * were defined for this case or if the lookup fails, the event dictionary or 256 * module code is broken, and we set the event code to a precomputed default. 257 */ 258 static const char * 259 fmd_case_mkcode(fmd_case_t *cp) 260 { 261 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 262 fmd_case_susp_t *cis; 263 fmd_case_hash_t *chp = fmd.d_cases; 264 265 char **keys, **keyp; 266 const char *s; 267 268 ASSERT(MUTEX_HELD(&cip->ci_lock)); 269 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 270 271 /* 272 * delete any existing entry from code hash if it is on it 273 */ 274 fmd_case_code_hash_delete(chp, cip); 275 276 fmd_free(cip->ci_code, cip->ci_codelen); 277 cip->ci_codelen = cip->ci_mod->mod_codelen; 278 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 279 keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1)); 280 281 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 282 if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0) 283 keyp++; 284 } 285 286 *keyp = NULL; /* mark end of keys[] array for libdiagcode */ 287 288 if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code( 289 cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) { 290 (void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s); 291 fmd_free(cip->ci_code, cip->ci_codelen); 292 cip->ci_codelen = strlen(s) + 1; 293 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 294 (void) strcpy(cip->ci_code, s); 295 } 296 297 /* 298 * add into hash of solved cases 299 */ 300 fmd_case_code_hash_insert(chp, cip); 301 302 return (cip->ci_code); 303 } 304 305 typedef struct { 306 int *fcl_countp; 307 uint8_t *fcl_ba; 308 nvlist_t **fcl_nva; 309 int *fcl_msgp; 310 } fmd_case_lst_t; 311 312 static void 313 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg) 314 { 315 fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg; 316 boolean_t b; 317 int state; 318 319 if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE, 320 &b) == 0 && b == B_FALSE) 321 *entryp->fcl_msgp = B_FALSE; 322 entryp->fcl_ba[*entryp->fcl_countp] = 0; 323 state = fmd_asru_al_getstate(alp); 324 if (state & FMD_ASRU_DEGRADED) 325 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED; 326 if (state & FMD_ASRU_UNUSABLE) 327 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE; 328 if (state & FMD_ASRU_FAULTY) 329 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY; 330 if (!(state & FMD_ASRU_PRESENT)) 331 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT; 332 if (alp->al_reason == FMD_ASRU_REPAIRED) 333 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED; 334 else if (alp->al_reason == FMD_ASRU_REPLACED) 335 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED; 336 else if (alp->al_reason == FMD_ASRU_ACQUITTED) 337 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED; 338 entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event; 339 (*entryp->fcl_countp)++; 340 } 341 342 static void 343 fmd_case_faulty(fmd_asru_link_t *alp, void *arg) 344 { 345 int *faultyp = (int *)arg; 346 347 *faultyp |= (alp->al_flags & FMD_ASRU_FAULTY); 348 } 349 350 static void 351 fmd_case_usable(fmd_asru_link_t *alp, void *arg) 352 { 353 int *usablep = (int *)arg; 354 355 *usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE); 356 } 357 358 static void 359 fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg) 360 { 361 int *not_faultyp = (int *)arg; 362 363 *not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY); 364 } 365 366 /* 367 * Have we got any suspects with an asru that are still unusable and present? 368 */ 369 static void 370 fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg) 371 { 372 int *rvalp = (int *)arg; 373 int state = fmd_asru_al_getstate(alp); 374 nvlist_t *asru; 375 376 if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0) 377 return; 378 *rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT)); 379 } 380 381 nvlist_t * 382 fmd_case_mkevent(fmd_case_t *cp, const char *class) 383 { 384 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 385 nvlist_t **nva, *nvl; 386 uint8_t *ba; 387 int msg = B_TRUE; 388 const char *code; 389 fmd_case_lst_t fcl; 390 int count = 0; 391 392 (void) pthread_mutex_lock(&cip->ci_lock); 393 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 394 395 nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); 396 ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects); 397 398 /* 399 * For each suspect associated with the case, store its fault event 400 * nvlist in 'nva'. We also look to see if any of the suspect faults 401 * have asked not to be messaged. If any of them have made such a 402 * request, propagate that attribute to the composite list.* event. 403 * Finally, store each suspect's faulty status into the bitmap 'ba'. 404 */ 405 fcl.fcl_countp = &count; 406 fcl.fcl_msgp = &msg; 407 fcl.fcl_ba = ba; 408 fcl.fcl_nva = nva; 409 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl); 410 411 if (cip->ci_code == NULL) 412 (void) fmd_case_mkcode(cp); 413 /* 414 * For repair and updated event, we lookup diagcode from dict using key 415 * "list.repaired" or "list.updated" or "list.resolved". 416 */ 417 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 418 (void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code); 419 else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) 420 (void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code); 421 else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0) 422 (void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code); 423 else 424 code = cip->ci_code; 425 426 if (msg == B_FALSE) 427 cip->ci_flags |= FMD_CF_INVISIBLE; 428 429 nvl = fmd_protocol_list(class, cip->ci_mod->mod_fmri, cip->ci_uuid, 430 code, count, nva, ba, msg, &cip->ci_tv); 431 432 (void) pthread_mutex_unlock(&cip->ci_lock); 433 return (nvl); 434 } 435 436 static boolean_t 437 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem) 438 { 439 nvlist_t *new_rsrc; 440 nvlist_t *rsrc; 441 char *new_name = NULL; 442 char *name = NULL; 443 ssize_t new_namelen; 444 ssize_t namelen; 445 int fmri_present = 1; 446 int new_fmri_present = 1; 447 int match = B_FALSE; 448 fmd_topo_t *ftp = fmd_topo_hold(); 449 450 if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0) 451 fmri_present = 0; 452 else { 453 if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1) 454 goto done; 455 name = fmd_alloc(namelen + 1, FMD_SLEEP); 456 if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1) 457 goto done; 458 } 459 if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0) 460 new_fmri_present = 0; 461 else { 462 if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1) 463 goto done; 464 new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP); 465 if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1) 466 goto done; 467 } 468 match = (fmri_present == new_fmri_present && 469 (fmri_present == 0 || 470 topo_fmri_strcmp(ftp->ft_hdl, name, new_name))); 471 done: 472 if (name != NULL) 473 fmd_free(name, namelen + 1); 474 if (new_name != NULL) 475 fmd_free(new_name, new_namelen + 1); 476 fmd_topo_rele(ftp); 477 return (match); 478 } 479 480 static int 481 fmd_case_match_suspect(fmd_case_susp_t *cis, fmd_case_susp_t *xcis) 482 { 483 char *class, *new_class; 484 485 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_ASRU)) 486 return (0); 487 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, 488 FM_FAULT_RESOURCE)) 489 return (0); 490 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_FRU)) 491 return (0); 492 (void) nvlist_lookup_string(xcis->cis_nvl, FM_CLASS, &class); 493 (void) nvlist_lookup_string(cis->cis_nvl, FM_CLASS, &new_class); 494 return (strcmp(class, new_class) == 0); 495 } 496 497 /* 498 * see if an identical suspect list already exists in the cache 499 */ 500 static int 501 fmd_case_check_for_dups(fmd_case_t *cp) 502 { 503 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp, *xcip; 504 fmd_case_hash_t *chp = fmd.d_cases; 505 fmd_case_susp_t *xcis, *cis; 506 int match = 0, match_susp; 507 uint_t h; 508 509 (void) pthread_rwlock_rdlock(&chp->ch_lock); 510 511 /* 512 * Find all cases with this code 513 */ 514 h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 515 for (xcip = chp->ch_code_hash[h]; xcip != NULL; 516 xcip = xcip->ci_code_next) { 517 /* 518 * only look for any cases (apart from this one) 519 * whose code and number of suspects match 520 */ 521 if (xcip == cip || fmd_case_tryhold(xcip) == NULL) 522 continue; 523 if (strcmp(xcip->ci_code, cip->ci_code) != 0 || 524 xcip->ci_nsuspects != cip->ci_nsuspects) { 525 fmd_case_rele((fmd_case_t *)xcip); 526 continue; 527 } 528 529 /* 530 * For each suspect in one list, check if there 531 * is an identical suspect in the other list 532 */ 533 match = 1; 534 for (xcis = xcip->ci_suspects; xcis != NULL; 535 xcis = xcis->cis_next) { 536 match_susp = 0; 537 for (cis = cip->ci_suspects; cis != NULL; 538 cis = cis->cis_next) { 539 if (fmd_case_match_suspect(cis, xcis) == 1) { 540 match_susp = 1; 541 break; 542 } 543 } 544 if (match_susp == 0) { 545 match = 0; 546 break; 547 } 548 } 549 fmd_case_rele((fmd_case_t *)xcip); 550 if (match) { 551 (void) pthread_rwlock_unlock(&chp->ch_lock); 552 return (1); 553 } 554 } 555 (void) pthread_rwlock_unlock(&chp->ch_lock); 556 return (0); 557 } 558 559 /* 560 * Convict suspects in a case by applying a conviction policy and updating the 561 * resource cache prior to emitting the list.suspect event for the given case. 562 * At present, our policy is very simple: convict every suspect in the case. 563 * In the future, this policy can be extended and made configurable to permit: 564 * 565 * - convicting the suspect with the highest FIT rate 566 * - convicting the suspect with the cheapest FRU 567 * - convicting the suspect with the FRU that is in a depot's inventory 568 * - convicting the suspect with the longest lifetime 569 * 570 * and so forth. A word to the wise: this problem is significantly harder that 571 * it seems at first glance. Future work should heed the following advice: 572 * 573 * Hacking the policy into C code here is a very bad idea. The policy needs to 574 * be decided upon very carefully and fundamentally encodes knowledge of what 575 * suspect list combinations can be emitted by what diagnosis engines. As such 576 * fmd's code is the wrong location, because that would require fmd itself to 577 * be updated for every diagnosis engine change, defeating the entire design. 578 * The FMA Event Registry knows the suspect list combinations: policy inputs 579 * can be derived from it and used to produce per-module policy configuration. 580 * 581 * If the policy needs to be dynamic and not statically fixed at either fmd 582 * startup or module load time, any implementation of dynamic policy retrieval 583 * must employ some kind of caching mechanism or be part of a built-in module. 584 * The fmd_case_convict() function is called with locks held inside of fmd and 585 * is not a place where unbounded blocking on some inter-process or inter- 586 * system communication to another service (e.g. another daemon) can occur. 587 */ 588 static int 589 fmd_case_convict(fmd_case_t *cp) 590 { 591 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 592 fmd_asru_hash_t *ahp = fmd.d_asrus; 593 594 fmd_case_susp_t *cis; 595 fmd_asru_link_t *alp; 596 597 (void) pthread_mutex_lock(&cip->ci_lock); 598 (void) fmd_case_mkcode(cp); 599 if (fmd_case_check_for_dups(cp) == 1) { 600 (void) pthread_mutex_unlock(&cip->ci_lock); 601 return (1); 602 } 603 604 /* 605 * no suspect list already exists - allocate new cache entries 606 */ 607 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 608 if ((alp = fmd_asru_hash_create_entry(ahp, 609 cp, cis->cis_nvl)) == NULL) { 610 fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in " 611 "%s: %s\n", cip->ci_uuid, fmd_strerror(errno)); 612 continue; 613 } 614 (void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0); 615 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 616 } 617 618 (void) pthread_mutex_unlock(&cip->ci_lock); 619 return (0); 620 } 621 622 void 623 fmd_case_publish(fmd_case_t *cp, uint_t state) 624 { 625 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 626 fmd_event_t *e; 627 nvlist_t *nvl; 628 char *class; 629 630 if (state == FMD_CASE_CURRENT) 631 state = cip->ci_state; /* use current state */ 632 633 switch (state) { 634 case FMD_CASE_SOLVED: 635 (void) pthread_mutex_lock(&cip->ci_lock); 636 if (cip->ci_tv_valid == 0) { 637 fmd_time_gettimeofday(&cip->ci_tv); 638 cip->ci_tv_valid = 1; 639 } 640 (void) pthread_mutex_unlock(&cip->ci_lock); 641 642 if (fmd_case_convict(cp) == 1) { /* dupclose */ 643 cip->ci_flags &= ~FMD_CF_SOLVED; 644 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0); 645 break; 646 } 647 nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS); 648 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 649 650 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 651 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 652 fmd_log_append(fmd.d_fltlog, e, cp); 653 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 654 fmd_dispq_dispatch(fmd.d_disp, e, class); 655 656 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 657 cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++; 658 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 659 660 break; 661 662 case FMD_CASE_CLOSE_WAIT: 663 fmd_case_hold(cp); 664 e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp); 665 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 666 667 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 668 cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++; 669 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 670 671 break; 672 673 case FMD_CASE_CLOSED: 674 nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS); 675 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 676 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 677 fmd_dispq_dispatch(fmd.d_disp, e, class); 678 break; 679 680 case FMD_CASE_REPAIRED: 681 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 682 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 683 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 684 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 685 fmd_log_append(fmd.d_fltlog, e, cp); 686 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 687 fmd_dispq_dispatch(fmd.d_disp, e, class); 688 break; 689 690 case FMD_CASE_RESOLVED: 691 nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS); 692 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 693 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 694 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 695 fmd_log_append(fmd.d_fltlog, e, cp); 696 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 697 fmd_dispq_dispatch(fmd.d_disp, e, class); 698 break; 699 } 700 } 701 702 fmd_case_t * 703 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid) 704 { 705 fmd_case_impl_t *cip; 706 uint_t h; 707 708 (void) pthread_rwlock_rdlock(&chp->ch_lock); 709 h = fmd_strhash(uuid) % chp->ch_hashlen; 710 711 for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) { 712 if (strcmp(cip->ci_uuid, uuid) == 0) 713 break; 714 } 715 716 /* 717 * If deleting bit is set, treat the case as if it doesn't exist. 718 */ 719 if (cip != NULL) 720 cip = fmd_case_tryhold(cip); 721 722 if (cip == NULL) 723 (void) fmd_set_errno(EFMD_CASE_INVAL); 724 725 (void) pthread_rwlock_unlock(&chp->ch_lock); 726 return ((fmd_case_t *)cip); 727 } 728 729 static fmd_case_impl_t * 730 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 731 { 732 fmd_case_impl_t *eip; 733 uint_t h; 734 735 (void) pthread_rwlock_wrlock(&chp->ch_lock); 736 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 737 738 for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) { 739 if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 && 740 fmd_case_tryhold(eip) != NULL) { 741 (void) pthread_rwlock_unlock(&chp->ch_lock); 742 return (eip); /* uuid already present */ 743 } 744 } 745 746 cip->ci_next = chp->ch_hash[h]; 747 chp->ch_hash[h] = cip; 748 749 chp->ch_count++; 750 ASSERT(chp->ch_count != 0); 751 752 (void) pthread_rwlock_unlock(&chp->ch_lock); 753 return (cip); 754 } 755 756 static void 757 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 758 { 759 fmd_case_impl_t *cp, **pp; 760 uint_t h; 761 762 ASSERT(MUTEX_HELD(&cip->ci_lock)); 763 764 cip->ci_flags |= FMD_CF_DELETING; 765 (void) pthread_mutex_unlock(&cip->ci_lock); 766 767 (void) pthread_rwlock_wrlock(&chp->ch_lock); 768 769 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 770 pp = &chp->ch_hash[h]; 771 772 for (cp = *pp; cp != NULL; cp = cp->ci_next) { 773 if (cp != cip) 774 pp = &cp->ci_next; 775 else 776 break; 777 } 778 779 if (cp == NULL) { 780 fmd_panic("case %p (%s) not found on hash chain %u\n", 781 (void *)cip, cip->ci_uuid, h); 782 } 783 784 *pp = cp->ci_next; 785 cp->ci_next = NULL; 786 787 /* 788 * delete from code hash if it is on it 789 */ 790 fmd_case_code_hash_delete(chp, cip); 791 792 ASSERT(chp->ch_count != 0); 793 chp->ch_count--; 794 795 (void) pthread_rwlock_unlock(&chp->ch_lock); 796 797 (void) pthread_mutex_lock(&cip->ci_lock); 798 ASSERT(cip->ci_flags & FMD_CF_DELETING); 799 } 800 801 fmd_case_t * 802 fmd_case_create(fmd_module_t *mp, void *data) 803 { 804 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 805 fmd_case_impl_t *eip = NULL; 806 uuid_t uuid; 807 808 (void) pthread_mutex_init(&cip->ci_lock, NULL); 809 fmd_buf_hash_create(&cip->ci_bufs); 810 811 fmd_module_hold(mp); 812 cip->ci_mod = mp; 813 cip->ci_refs = 1; 814 cip->ci_state = FMD_CASE_UNSOLVED; 815 cip->ci_flags = FMD_CF_DIRTY; 816 cip->ci_data = data; 817 818 /* 819 * Calling libuuid: get a clue. The library interfaces cleverly do not 820 * define any constant for the length of an unparse string, and do not 821 * permit the caller to specify a buffer length for safety. The spec 822 * says it will be 36 bytes, but we make it tunable just in case. 823 */ 824 (void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen); 825 cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP); 826 827 /* 828 * We expect this loop to execute only once, but code it defensively 829 * against the possibility of libuuid bugs. Keep generating uuids and 830 * attempting to do a hash insert until we get a unique one. 831 */ 832 do { 833 if (eip != NULL) 834 fmd_case_rele((fmd_case_t *)eip); 835 uuid_generate(uuid); 836 uuid_unparse(uuid, cip->ci_uuid); 837 } while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip); 838 839 ASSERT(fmd_module_locked(mp)); 840 fmd_list_append(&mp->mod_cases, cip); 841 fmd_module_setcdirty(mp); 842 843 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 844 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 845 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 846 847 return ((fmd_case_t *)cip); 848 } 849 850 static void 851 fmd_case_destroy_suspects(fmd_case_impl_t *cip) 852 { 853 fmd_case_susp_t *cis, *ncis; 854 855 ASSERT(MUTEX_HELD(&cip->ci_lock)); 856 857 for (cis = cip->ci_suspects; cis != NULL; cis = ncis) { 858 ncis = cis->cis_next; 859 nvlist_free(cis->cis_nvl); 860 fmd_free(cis, sizeof (fmd_case_susp_t)); 861 } 862 863 cip->ci_suspects = NULL; 864 cip->ci_nsuspects = 0; 865 } 866 867 fmd_case_t * 868 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp, 869 uint_t state, const char *uuid, const char *code) 870 { 871 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 872 fmd_case_impl_t *eip; 873 874 ASSERT(state < FMD_CASE_RESOLVED); 875 876 (void) pthread_mutex_init(&cip->ci_lock, NULL); 877 fmd_buf_hash_create(&cip->ci_bufs); 878 879 fmd_module_hold(mp); 880 cip->ci_mod = mp; 881 cip->ci_xprt = xp; 882 cip->ci_refs = 1; 883 cip->ci_state = state; 884 cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP); 885 cip->ci_uuidlen = strlen(cip->ci_uuid); 886 cip->ci_code = fmd_strdup(code, FMD_SLEEP); 887 cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0; 888 889 if (state > FMD_CASE_CLOSE_WAIT) 890 cip->ci_flags |= FMD_CF_SOLVED; 891 892 /* 893 * Insert the case into the global case hash. If the specified UUID is 894 * already present, check to see if it is an orphan: if so, reclaim it; 895 * otherwise if it is owned by a different module then return NULL. 896 */ 897 if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) { 898 (void) pthread_mutex_lock(&cip->ci_lock); 899 cip->ci_refs--; /* decrement to zero */ 900 fmd_case_destroy((fmd_case_t *)cip, B_FALSE); 901 902 cip = eip; /* switch 'cip' to the existing case */ 903 (void) pthread_mutex_lock(&cip->ci_lock); 904 905 /* 906 * If the ASRU cache is trying to recreate an orphan, then just 907 * return the existing case that we found without changing it. 908 */ 909 if (mp == fmd.d_rmod) { 910 /* 911 * When recreating an orphan case, state passed in may 912 * either be CLOSED (faulty) or REPAIRED (!faulty). If 913 * any suspects are still CLOSED (faulty) then the 914 * overall state needs to be CLOSED. 915 */ 916 if (state == FMD_CASE_CLOSED) 917 cip->ci_state = FMD_CASE_CLOSED; 918 (void) pthread_mutex_unlock(&cip->ci_lock); 919 fmd_case_rele((fmd_case_t *)cip); 920 return ((fmd_case_t *)cip); 921 } 922 923 /* 924 * If the existing case isn't an orphan or is being proxied, 925 * then we have a UUID conflict: return failure to the caller. 926 */ 927 if (cip->ci_mod != fmd.d_rmod || xp != NULL) { 928 (void) pthread_mutex_unlock(&cip->ci_lock); 929 fmd_case_rele((fmd_case_t *)cip); 930 return (NULL); 931 } 932 933 /* 934 * If the new module is reclaiming an orphaned case, remove 935 * the case from the root module, switch ci_mod, and then fall 936 * through to adding the case to the new owner module 'mp'. 937 */ 938 fmd_module_lock(cip->ci_mod); 939 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 940 fmd_module_unlock(cip->ci_mod); 941 942 fmd_module_rele(cip->ci_mod); 943 cip->ci_mod = mp; 944 fmd_module_hold(mp); 945 946 fmd_case_destroy_suspects(cip); 947 cip->ci_state = state; 948 949 (void) pthread_mutex_unlock(&cip->ci_lock); 950 fmd_case_rele((fmd_case_t *)cip); 951 } else { 952 /* 953 * add into hash of solved cases 954 */ 955 if (cip->ci_code) 956 fmd_case_code_hash_insert(fmd.d_cases, cip); 957 } 958 959 ASSERT(fmd_module_locked(mp)); 960 fmd_list_append(&mp->mod_cases, cip); 961 962 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 963 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 964 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 965 966 return ((fmd_case_t *)cip); 967 } 968 969 void 970 fmd_case_destroy(fmd_case_t *cp, int visible) 971 { 972 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 973 fmd_case_item_t *cit, *ncit; 974 975 ASSERT(MUTEX_HELD(&cip->ci_lock)); 976 ASSERT(cip->ci_refs == 0); 977 978 if (visible) { 979 TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid)); 980 fmd_case_hash_delete(fmd.d_cases, cip); 981 } 982 983 for (cit = cip->ci_items; cit != NULL; cit = ncit) { 984 ncit = cit->cit_next; 985 fmd_event_rele(cit->cit_event); 986 fmd_free(cit, sizeof (fmd_case_item_t)); 987 } 988 989 fmd_case_destroy_suspects(cip); 990 991 if (cip->ci_principal != NULL) 992 fmd_event_rele(cip->ci_principal); 993 994 fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1); 995 fmd_free(cip->ci_code, cip->ci_codelen); 996 (void) fmd_buf_hash_destroy(&cip->ci_bufs); 997 998 fmd_module_rele(cip->ci_mod); 999 fmd_free(cip, sizeof (fmd_case_impl_t)); 1000 } 1001 1002 void 1003 fmd_case_hold(fmd_case_t *cp) 1004 { 1005 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1006 1007 (void) pthread_mutex_lock(&cip->ci_lock); 1008 fmd_case_hold_locked(cp); 1009 (void) pthread_mutex_unlock(&cip->ci_lock); 1010 } 1011 1012 void 1013 fmd_case_hold_locked(fmd_case_t *cp) 1014 { 1015 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1016 1017 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1018 if (cip->ci_flags & FMD_CF_DELETING) 1019 fmd_panic("attempt to hold a deleting case %p (%s)\n", 1020 (void *)cip, cip->ci_uuid); 1021 cip->ci_refs++; 1022 ASSERT(cip->ci_refs != 0); 1023 } 1024 1025 static fmd_case_impl_t * 1026 fmd_case_tryhold(fmd_case_impl_t *cip) 1027 { 1028 /* 1029 * If the case's "deleting" bit is unset, hold and return case, 1030 * otherwise, return NULL. 1031 */ 1032 (void) pthread_mutex_lock(&cip->ci_lock); 1033 if (cip->ci_flags & FMD_CF_DELETING) { 1034 (void) pthread_mutex_unlock(&cip->ci_lock); 1035 cip = NULL; 1036 } else { 1037 fmd_case_hold_locked((fmd_case_t *)cip); 1038 (void) pthread_mutex_unlock(&cip->ci_lock); 1039 } 1040 return (cip); 1041 } 1042 1043 void 1044 fmd_case_rele(fmd_case_t *cp) 1045 { 1046 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1047 1048 (void) pthread_mutex_lock(&cip->ci_lock); 1049 ASSERT(cip->ci_refs != 0); 1050 1051 if (--cip->ci_refs == 0) 1052 fmd_case_destroy((fmd_case_t *)cip, B_TRUE); 1053 else 1054 (void) pthread_mutex_unlock(&cip->ci_lock); 1055 } 1056 1057 void 1058 fmd_case_rele_locked(fmd_case_t *cp) 1059 { 1060 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1061 1062 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1063 --cip->ci_refs; 1064 ASSERT(cip->ci_refs != 0); 1065 } 1066 1067 int 1068 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep) 1069 { 1070 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1071 fmd_case_item_t *cit; 1072 fmd_event_t *oep; 1073 uint_t state; 1074 int new; 1075 1076 fmd_event_hold(ep); 1077 (void) pthread_mutex_lock(&cip->ci_lock); 1078 1079 if (cip->ci_flags & FMD_CF_SOLVED) 1080 state = FMD_EVS_DIAGNOSED; 1081 else 1082 state = FMD_EVS_ACCEPTED; 1083 1084 oep = cip->ci_principal; 1085 cip->ci_principal = ep; 1086 1087 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1088 if (cit->cit_event == ep) 1089 break; 1090 } 1091 1092 cip->ci_flags |= FMD_CF_DIRTY; 1093 new = cit == NULL && ep != oep; 1094 1095 (void) pthread_mutex_unlock(&cip->ci_lock); 1096 1097 fmd_module_setcdirty(cip->ci_mod); 1098 fmd_event_transition(ep, state); 1099 1100 if (oep != NULL) 1101 fmd_event_rele(oep); 1102 1103 return (new); 1104 } 1105 1106 int 1107 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep) 1108 { 1109 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1110 fmd_case_item_t *cit; 1111 uint_t state; 1112 int new; 1113 1114 (void) pthread_mutex_lock(&cip->ci_lock); 1115 1116 if (cip->ci_flags & FMD_CF_SOLVED) 1117 state = FMD_EVS_DIAGNOSED; 1118 else 1119 state = FMD_EVS_ACCEPTED; 1120 1121 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1122 if (cit->cit_event == ep) 1123 break; 1124 } 1125 1126 new = cit == NULL && ep != cip->ci_principal; 1127 1128 /* 1129 * If the event is already in the case or the case is already solved, 1130 * there is no reason to save it: just transition it appropriately. 1131 */ 1132 if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) { 1133 (void) pthread_mutex_unlock(&cip->ci_lock); 1134 fmd_event_transition(ep, state); 1135 return (new); 1136 } 1137 1138 cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP); 1139 fmd_event_hold(ep); 1140 1141 cit->cit_next = cip->ci_items; 1142 cit->cit_event = ep; 1143 1144 cip->ci_items = cit; 1145 cip->ci_nitems++; 1146 1147 cip->ci_flags |= FMD_CF_DIRTY; 1148 (void) pthread_mutex_unlock(&cip->ci_lock); 1149 1150 fmd_module_setcdirty(cip->ci_mod); 1151 fmd_event_transition(ep, state); 1152 1153 return (new); 1154 } 1155 1156 void 1157 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl) 1158 { 1159 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1160 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1161 1162 (void) pthread_mutex_lock(&cip->ci_lock); 1163 ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT); 1164 cip->ci_flags |= FMD_CF_DIRTY; 1165 1166 cis->cis_next = cip->ci_suspects; 1167 cis->cis_nvl = nvl; 1168 1169 cip->ci_suspects = cis; 1170 cip->ci_nsuspects++; 1171 1172 (void) pthread_mutex_unlock(&cip->ci_lock); 1173 fmd_module_setcdirty(cip->ci_mod); 1174 } 1175 1176 void 1177 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl) 1178 { 1179 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1180 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1181 boolean_t b; 1182 1183 (void) pthread_mutex_lock(&cip->ci_lock); 1184 ASSERT(cip->ci_state == FMD_CASE_CLOSED || 1185 cip->ci_state == FMD_CASE_REPAIRED); 1186 ASSERT(cip->ci_mod == fmd.d_rmod); 1187 1188 cis->cis_next = cip->ci_suspects; 1189 cis->cis_nvl = nvl; 1190 1191 if (nvlist_lookup_boolean_value(nvl, 1192 FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE) 1193 cip->ci_flags |= FMD_CF_INVISIBLE; 1194 1195 cip->ci_suspects = cis; 1196 cip->ci_nsuspects++; 1197 1198 (void) pthread_mutex_unlock(&cip->ci_lock); 1199 } 1200 1201 void 1202 fmd_case_reset_suspects(fmd_case_t *cp) 1203 { 1204 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1205 1206 (void) pthread_mutex_lock(&cip->ci_lock); 1207 ASSERT(cip->ci_state < FMD_CASE_SOLVED); 1208 1209 fmd_case_destroy_suspects(cip); 1210 cip->ci_flags |= FMD_CF_DIRTY; 1211 1212 (void) pthread_mutex_unlock(&cip->ci_lock); 1213 fmd_module_setcdirty(cip->ci_mod); 1214 } 1215 1216 /*ARGSUSED*/ 1217 static void 1218 fmd_case_unusable(fmd_asru_link_t *alp, void *arg) 1219 { 1220 (void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE); 1221 } 1222 1223 /* 1224 * Grab ci_lock and update the case state and set the dirty bit. Then perform 1225 * whatever actions and emit whatever events are appropriate for the state. 1226 * Refer to the topmost block comment explaining the state machine for details. 1227 */ 1228 void 1229 fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) 1230 { 1231 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1232 fmd_case_item_t *cit; 1233 fmd_event_t *e; 1234 int resolved = 0; 1235 int any_unusable_and_present = 0; 1236 1237 ASSERT(state <= FMD_CASE_RESOLVED); 1238 (void) pthread_mutex_lock(&cip->ci_lock); 1239 1240 if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED)) 1241 flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED); 1242 1243 cip->ci_flags |= flags; 1244 1245 if (cip->ci_state >= state) { 1246 (void) pthread_mutex_unlock(&cip->ci_lock); 1247 return; /* already in specified state */ 1248 } 1249 1250 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid, 1251 _fmd_case_snames[cip->ci_state], _fmd_case_snames[state])); 1252 1253 cip->ci_state = state; 1254 cip->ci_flags |= FMD_CF_DIRTY; 1255 1256 if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod) 1257 fmd_module_setcdirty(cip->ci_mod); 1258 1259 switch (state) { 1260 case FMD_CASE_SOLVED: 1261 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 1262 fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED); 1263 1264 if (cip->ci_principal != NULL) { 1265 fmd_event_transition(cip->ci_principal, 1266 FMD_EVS_DIAGNOSED); 1267 } 1268 break; 1269 1270 case FMD_CASE_CLOSE_WAIT: 1271 /* 1272 * If the case was never solved, do not change ASRUs. 1273 * If the case was never fmd_case_closed, do not change ASRUs. 1274 * If the case was repaired, do not change ASRUs. 1275 */ 1276 if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED | 1277 FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED)) 1278 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1279 fmd_case_unusable, NULL); 1280 1281 /* 1282 * If an orphaned case transitions to CLOSE_WAIT, the owning 1283 * module is no longer loaded: continue on to CASE_CLOSED. 1284 */ 1285 if (fmd_case_orphaned(cp)) 1286 state = cip->ci_state = FMD_CASE_CLOSED; 1287 break; 1288 1289 case FMD_CASE_REPAIRED: 1290 ASSERT(fmd_case_orphaned(cp)); 1291 1292 /* 1293 * If all suspects are already either usable or not present then 1294 * transition straight to RESOLVED state, publishing both the 1295 * list.repaired and list.resolved. 1296 */ 1297 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1298 fmd_case_unusable_and_present, &any_unusable_and_present); 1299 if (any_unusable_and_present) 1300 break; 1301 1302 fmd_module_lock(cip->ci_mod); 1303 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1304 fmd_module_unlock(cip->ci_mod); 1305 cip->ci_state = FMD_CASE_RESOLVED; 1306 (void) pthread_mutex_unlock(&cip->ci_lock); 1307 fmd_case_publish(cp, state); 1308 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid, 1309 _fmd_case_snames[FMD_CASE_REPAIRED], 1310 _fmd_case_snames[FMD_CASE_RESOLVED])); 1311 state = FMD_CASE_RESOLVED; 1312 resolved = 1; 1313 (void) pthread_mutex_lock(&cip->ci_lock); 1314 break; 1315 1316 case FMD_CASE_RESOLVED: 1317 ASSERT(fmd_case_orphaned(cp)); 1318 1319 /* 1320 * If all suspects are already either usable or not present then 1321 * carry on, publish list.resolved and discard the case. 1322 */ 1323 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1324 fmd_case_unusable_and_present, &any_unusable_and_present); 1325 if (any_unusable_and_present) { 1326 (void) pthread_mutex_unlock(&cip->ci_lock); 1327 return; 1328 } 1329 1330 fmd_module_lock(cip->ci_mod); 1331 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1332 fmd_module_unlock(cip->ci_mod); 1333 resolved = 1; 1334 break; 1335 } 1336 1337 (void) pthread_mutex_unlock(&cip->ci_lock); 1338 1339 /* 1340 * If the module has initialized, then publish the appropriate event 1341 * for the new case state. If not, we are being called from the 1342 * checkpoint code during module load, in which case the module's 1343 * _fmd_init() routine hasn't finished yet, and our event dictionaries 1344 * may not be open yet, which will prevent us from computing the event 1345 * code. Defer the call to fmd_case_publish() by enqueuing a PUBLISH 1346 * event in our queue: this won't be processed until _fmd_init is done. 1347 */ 1348 if (cip->ci_mod->mod_flags & FMD_MOD_INIT) 1349 fmd_case_publish(cp, state); 1350 else { 1351 fmd_case_hold(cp); 1352 e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp); 1353 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 1354 } 1355 1356 if (resolved) { 1357 /* 1358 * If we transitioned to RESOLVED, adjust the reference count to 1359 * reflect our removal from fmd.d_rmod->mod_cases above. If the 1360 * caller has not placed an additional hold on the case, it 1361 * will now be freed. 1362 */ 1363 (void) pthread_mutex_lock(&cip->ci_lock); 1364 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 1365 (void) pthread_mutex_unlock(&cip->ci_lock); 1366 fmd_case_rele(cp); 1367 } 1368 } 1369 1370 /* 1371 * Transition the specified case to *at least* the specified state by first 1372 * re-validating the suspect list using the resource cache. This function is 1373 * employed by the checkpoint code when restoring a saved, solved case to see 1374 * if the state of the case has effectively changed while fmd was not running 1375 * or the module was not loaded. 1376 */ 1377 void 1378 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags) 1379 { 1380 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1381 1382 int usable = 0; /* are any suspects usable? */ 1383 1384 ASSERT(state >= FMD_CASE_SOLVED); 1385 (void) pthread_mutex_lock(&cip->ci_lock); 1386 1387 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable); 1388 1389 (void) pthread_mutex_unlock(&cip->ci_lock); 1390 1391 if (!usable) { 1392 state = MAX(state, FMD_CASE_CLOSE_WAIT); 1393 flags |= FMD_CF_ISOLATED; 1394 } 1395 1396 fmd_case_transition(cp, state, flags); 1397 } 1398 1399 void 1400 fmd_case_setdirty(fmd_case_t *cp) 1401 { 1402 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1403 1404 (void) pthread_mutex_lock(&cip->ci_lock); 1405 cip->ci_flags |= FMD_CF_DIRTY; 1406 (void) pthread_mutex_unlock(&cip->ci_lock); 1407 1408 fmd_module_setcdirty(cip->ci_mod); 1409 } 1410 1411 void 1412 fmd_case_clrdirty(fmd_case_t *cp) 1413 { 1414 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1415 1416 (void) pthread_mutex_lock(&cip->ci_lock); 1417 cip->ci_flags &= ~FMD_CF_DIRTY; 1418 (void) pthread_mutex_unlock(&cip->ci_lock); 1419 } 1420 1421 void 1422 fmd_case_commit(fmd_case_t *cp) 1423 { 1424 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1425 fmd_case_item_t *cit; 1426 1427 (void) pthread_mutex_lock(&cip->ci_lock); 1428 1429 if (cip->ci_flags & FMD_CF_DIRTY) { 1430 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 1431 fmd_event_commit(cit->cit_event); 1432 1433 if (cip->ci_principal != NULL) 1434 fmd_event_commit(cip->ci_principal); 1435 1436 fmd_buf_hash_commit(&cip->ci_bufs); 1437 cip->ci_flags &= ~FMD_CF_DIRTY; 1438 } 1439 1440 (void) pthread_mutex_unlock(&cip->ci_lock); 1441 } 1442 1443 /* 1444 * Indicate that the case may need to change state because one or more of the 1445 * ASRUs named as a suspect has changed state. We examine all the suspects 1446 * and if none are still faulty, we initiate a case close transition. 1447 */ 1448 void 1449 fmd_case_update(fmd_case_t *cp) 1450 { 1451 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1452 uint_t cstate; 1453 int faulty = 0; 1454 1455 (void) pthread_mutex_lock(&cip->ci_lock); 1456 cstate = cip->ci_state; 1457 1458 if (cip->ci_xprt != NULL || cip->ci_state < FMD_CASE_SOLVED) { 1459 (void) pthread_mutex_unlock(&cip->ci_lock); 1460 return; /* update is not appropriate */ 1461 } 1462 1463 if (cip->ci_flags & FMD_CF_REPAIRED) { 1464 (void) pthread_mutex_unlock(&cip->ci_lock); 1465 return; /* already repaired */ 1466 } 1467 1468 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 1469 (void) pthread_mutex_unlock(&cip->ci_lock); 1470 1471 if (faulty) { 1472 nvlist_t *nvl; 1473 fmd_event_t *e; 1474 char *class; 1475 1476 nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS); 1477 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1478 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1479 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 1480 fmd_log_append(fmd.d_fltlog, e, cp); 1481 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 1482 fmd_dispq_dispatch(fmd.d_disp, e, class); 1483 return; /* one or more suspects are still marked faulty */ 1484 } 1485 1486 if (cstate == FMD_CASE_CLOSED) 1487 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1488 else 1489 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1490 } 1491 1492 /* 1493 * Delete a closed case from the module's case list once the fmdo_close() entry 1494 * point has run to completion. If the case is owned by a transport module, 1495 * tell the transport to proxy a case close on the other end of the transport. 1496 * If not, transition to the appropriate next state based on ci_flags. This 1497 * function represents the end of CLOSE_WAIT and transitions the case to either 1498 * CLOSED or REPAIRED or discards it entirely because it was never solved; 1499 * refer to the topmost block comment explaining the state machine for details. 1500 */ 1501 void 1502 fmd_case_delete(fmd_case_t *cp) 1503 { 1504 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1505 fmd_modstat_t *msp; 1506 size_t buftotal; 1507 1508 ASSERT(fmd_module_locked(cip->ci_mod)); 1509 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1510 buftotal = fmd_buf_hash_destroy(&cip->ci_bufs); 1511 1512 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1513 msp = cip->ci_mod->mod_stats; 1514 1515 ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0); 1516 msp->ms_caseopen.fmds_value.ui64--; 1517 1518 ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal); 1519 msp->ms_buftotal.fmds_value.ui64 -= buftotal; 1520 1521 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1522 1523 if (cip->ci_xprt == NULL) 1524 fmd_module_setcdirty(cip->ci_mod); 1525 1526 fmd_module_rele(cip->ci_mod); 1527 cip->ci_mod = fmd.d_rmod; 1528 fmd_module_hold(cip->ci_mod); 1529 1530 /* 1531 * If the case is not proxied and it has been solved, then retain it 1532 * on the root module's case list at least until we're transitioned. 1533 * Otherwise free the case with our final fmd_case_rele() below. 1534 */ 1535 if (cip->ci_xprt == NULL && (cip->ci_flags & FMD_CF_SOLVED)) { 1536 fmd_module_lock(cip->ci_mod); 1537 fmd_list_append(&cip->ci_mod->mod_cases, cip); 1538 fmd_module_unlock(cip->ci_mod); 1539 fmd_case_hold(cp); 1540 } 1541 1542 /* 1543 * If a proxied case finishes CLOSE_WAIT, then it can be discarded 1544 * rather than orphaned because by definition it can have no entries 1545 * in the resource cache of the current fault manager. 1546 */ 1547 if (cip->ci_xprt != NULL) 1548 fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid); 1549 else if (cip->ci_flags & FMD_CF_REPAIRED) 1550 fmd_case_transition(cp, FMD_CASE_REPAIRED, 0); 1551 else if (cip->ci_flags & FMD_CF_ISOLATED) 1552 fmd_case_transition(cp, FMD_CASE_CLOSED, 0); 1553 1554 fmd_case_rele(cp); 1555 } 1556 1557 void 1558 fmd_case_discard(fmd_case_t *cp) 1559 { 1560 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1561 1562 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1563 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--; 1564 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1565 1566 ASSERT(fmd_module_locked(cip->ci_mod)); 1567 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1568 fmd_case_rele(cp); 1569 } 1570 1571 /* 1572 * Indicate that the problem corresponding to a case has been repaired by 1573 * clearing the faulty bit on each ASRU named as a suspect. If the case hasn't 1574 * already been closed, this function initiates the transition to CLOSE_WAIT. 1575 * The caller must have the case held from fmd_case_hash_lookup(), so we can 1576 * grab and drop ci_lock without the case being able to be freed in between. 1577 */ 1578 int 1579 fmd_case_repair(fmd_case_t *cp) 1580 { 1581 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1582 uint_t cstate; 1583 1584 (void) pthread_mutex_lock(&cip->ci_lock); 1585 cstate = cip->ci_state; 1586 1587 if (cip->ci_xprt != NULL) { 1588 (void) pthread_mutex_unlock(&cip->ci_lock); 1589 return (fmd_set_errno(EFMD_CASE_OWNER)); 1590 } 1591 1592 if (cstate < FMD_CASE_SOLVED) { 1593 (void) pthread_mutex_unlock(&cip->ci_lock); 1594 return (fmd_set_errno(EFMD_CASE_STATE)); 1595 } 1596 1597 if (cip->ci_flags & FMD_CF_REPAIRED) { 1598 (void) pthread_mutex_unlock(&cip->ci_lock); 1599 return (0); /* already repaired */ 1600 } 1601 1602 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, NULL); 1603 (void) pthread_mutex_unlock(&cip->ci_lock); 1604 1605 if (cstate == FMD_CASE_CLOSED) 1606 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1607 else 1608 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1609 1610 return (0); 1611 } 1612 1613 int 1614 fmd_case_acquit(fmd_case_t *cp) 1615 { 1616 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1617 uint_t cstate; 1618 1619 (void) pthread_mutex_lock(&cip->ci_lock); 1620 cstate = cip->ci_state; 1621 1622 if (cip->ci_xprt != NULL) { 1623 (void) pthread_mutex_unlock(&cip->ci_lock); 1624 return (fmd_set_errno(EFMD_CASE_OWNER)); 1625 } 1626 1627 if (cstate < FMD_CASE_SOLVED) { 1628 (void) pthread_mutex_unlock(&cip->ci_lock); 1629 return (fmd_set_errno(EFMD_CASE_STATE)); 1630 } 1631 1632 if (cip->ci_flags & FMD_CF_REPAIRED) { 1633 (void) pthread_mutex_unlock(&cip->ci_lock); 1634 return (0); /* already repaired */ 1635 } 1636 1637 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_acquit, NULL); 1638 (void) pthread_mutex_unlock(&cip->ci_lock); 1639 1640 if (cstate == FMD_CASE_CLOSED) 1641 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1642 else 1643 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1644 1645 return (0); 1646 } 1647 1648 int 1649 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep) 1650 { 1651 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1652 fmd_case_item_t *cit; 1653 uint_t state; 1654 int rv = 0; 1655 1656 (void) pthread_mutex_lock(&cip->ci_lock); 1657 1658 if (cip->ci_state >= FMD_CASE_SOLVED) 1659 state = FMD_EVS_DIAGNOSED; 1660 else 1661 state = FMD_EVS_ACCEPTED; 1662 1663 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1664 if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0) 1665 break; 1666 } 1667 1668 if (rv == 0 && cip->ci_principal != NULL) 1669 rv = fmd_event_equal(ep, cip->ci_principal); 1670 1671 (void) pthread_mutex_unlock(&cip->ci_lock); 1672 1673 if (rv != 0) 1674 fmd_event_transition(ep, state); 1675 1676 return (rv); 1677 } 1678 1679 int 1680 fmd_case_orphaned(fmd_case_t *cp) 1681 { 1682 return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod); 1683 } 1684 1685 void 1686 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec) 1687 { 1688 ((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec; 1689 ((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec; 1690 ((fmd_case_impl_t *)cp)->ci_tv_valid = 1; 1691 } 1692 1693 /*ARGSUSED*/ 1694 void 1695 fmd_case_repair_replay_case(fmd_case_t *cp, void *arg) 1696 { 1697 int not_faulty = 0; 1698 int faulty = 0; 1699 nvlist_t *nvl; 1700 fmd_event_t *e; 1701 char *class; 1702 int any_unusable_and_present = 0; 1703 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1704 1705 if (cip->ci_state < FMD_CASE_SOLVED) 1706 return; 1707 1708 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 1709 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty, 1710 ¬_faulty); 1711 1712 if (!faulty) { 1713 /* 1714 * If none of the suspects is faulty, replay the list.repaired. 1715 * If all suspects are already either usable or not present then 1716 * also transition straight to RESOLVED state. 1717 */ 1718 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1719 fmd_case_unusable_and_present, &any_unusable_and_present); 1720 if (!any_unusable_and_present) { 1721 fmd_module_lock(cip->ci_mod); 1722 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1723 fmd_module_unlock(cip->ci_mod); 1724 cip->ci_state = FMD_CASE_RESOLVED; 1725 1726 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 1727 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1728 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, 1729 class); 1730 fmd_dispq_dispatch(fmd.d_disp, e, class); 1731 1732 fmd_case_publish(cp, FMD_CASE_RESOLVED); 1733 (void) pthread_mutex_lock(&cip->ci_lock); 1734 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 1735 (void) pthread_mutex_unlock(&cip->ci_lock); 1736 fmd_case_rele(cp); 1737 } else { 1738 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 1739 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1740 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, 1741 class); 1742 fmd_dispq_dispatch(fmd.d_disp, e, class); 1743 } 1744 } else if (not_faulty) { 1745 /* 1746 * if some but not all of the suspects are not faulty, replay 1747 * the list.updated. 1748 */ 1749 nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS); 1750 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1751 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1752 fmd_dispq_dispatch(fmd.d_disp, e, class); 1753 } 1754 } 1755 1756 void 1757 fmd_case_repair_replay() 1758 { 1759 fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL); 1760 } 1761