/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * FMD Case Subsystem
 *
 * Diagnosis engines are expected to group telemetry events related to the
 * diagnosis of a particular problem on the system into a set of cases.  The
 * diagnosis engine may have any number of cases open at a given point in time.
 * Some cases may eventually be *solved* by associating a suspect list of one
 * or more problems with the case, at which point fmd publishes a list.suspect
 * event for the case and it becomes visible to administrators and agents.
 *
 * Every case is named using a UUID, and is globally visible in the case hash.
 * Cases are reference-counted, except for the reference from the case hash
 * itself.  Consumers of case references include modules, which store active
 * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
 *
 * Cases obey the following state machine.  In states UNSOLVED, SOLVED, and
 * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
 * or transport) and the case is referenced by the mod_cases list.  Once the
 * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
 * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
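 *
 * As an illustrative sketch only (the module-facing wrappers such as
 * fmd_case_open() and fmd_case_add_suspect() belong to the fmd module API
 * and are not defined in this file), a typical diagnosis engine might drive
 * a case through its lifetime roughly as follows:
 *
 *	cp = fmd_case_open(hdl, NULL);		case begins life UNSOLVED
 *	fmd_case_add_ereport(hdl, cp, ep);	telemetry is added to the case
 *	fmd_case_add_suspect(hdl, cp, flt);	a suspect list is accumulated
 *	fmd_case_solve(hdl, cp);		transition [1] to SOLVED
 *	fmd_case_close(hdl, cp);		transition [2] to CLOSE_WAIT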
 *
 *                        +------------+
 *             +----------|  UNSOLVED  |
 *             |          +------------+
 *           1 |               4 |
 *             |                 |
 *        +----v---+ /-2->+------v-----+    3     +--------+
 *        | SOLVED |<     | CLOSE_WAIT |--------->| CLOSED |
 *        +--------+ \-5->+------------+          +--------+
 *                               |                    |
 *                             6 |                    | 7
 *                        +------v-----+              |
 *                        |  REPAIRED  |<-------------+
 *                        +------------+
 *
 * The state machine changes are triggered by calls to fmd_case_transition()
 * from various locations inside of fmd, as described below:
 *
 * [1] Called by: fmd_case_solve()
 *       Actions: FMD_CF_SOLVED flag is set in ci_flags
 *                conviction policy is applied to suspect list
 *                suspects convicted are marked faulty (F) in R$
 *                list.suspect event logged and dispatched
 *
 * [2] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
 *       Actions: FMD_CF_ISOLATED flag is set in ci_flags
 *                suspects convicted (F) are marked unusable (U) in R$
 *                diagnosis engine fmdo_close() entry point scheduled
 *                case transitions to CLOSED [3] upon exit from CLOSE_WAIT
 *
 * [3] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
 *       Actions: list.isolated event dispatched
 *                case deleted from module's list of open cases
 *
 * [4] Called by: fmd_case_close(), fmd_case_uuclose()
 *       Actions: diagnosis engine fmdo_close() entry point scheduled
 *                case is subsequently discarded by fmd_case_delete()
 *
 * [5] Called by: fmd_case_repair(), fmd_case_update()
 *       Actions: FMD_CF_REPAIRED flag is set in ci_flags
 *                diagnosis engine fmdo_close() entry point scheduled
 *                case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
 *
 * [6] Called by: fmd_case_repair(), fmd_case_update()
 *       Actions: FMD_CF_REPAIRED flag is set in ci_flags
 *                suspects convicted are marked non-faulty (!F) in R$
 *                list.repaired event dispatched
 *
 * [7] Called by: fmd_case_repair(), fmd_case_update()
 *       Actions: FMD_CF_REPAIRED flag is set in ci_flags
 *                suspects convicted are marked non-faulty (!F) in R$
 *                list.repaired event dispatched
 */

#include <sys/fm/protocol.h>
#include <uuid/uuid.h>
#include <alloca.h>

#include <fmd_alloc.h>
#include <fmd_module.h>
#include <fmd_error.h>
#include <fmd_conf.h>
#include <fmd_case.h>
#include <fmd_string.h>
#include <fmd_subr.h>
#include <fmd_protocol.h>
#include <fmd_event.h>
#include <fmd_eventq.h>
#include <fmd_dispq.h>
#include <fmd_buf.h>
#include <fmd_log.h>
#include <fmd_asru.h>
#include <fmd_fmri.h>
#include <fmd_xprt.h>

#include <fmd.h>

static const char *const _fmd_case_snames[] = {
	"UNSOLVED",	/* FMD_CASE_UNSOLVED */
	"SOLVED",	/* FMD_CASE_SOLVED */
	"CLOSE_WAIT",	/* FMD_CASE_CLOSE_WAIT */
	"CLOSED",	/* FMD_CASE_CLOSED */
	"REPAIRED"	/* FMD_CASE_REPAIRED */
};

fmd_case_hash_t *
fmd_case_hash_create(void)
{
	fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP);

	(void) pthread_rwlock_init(&chp->ch_lock, NULL);
	chp->ch_hashlen = fmd.d_str_buckets;
	chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP);
	chp->ch_count = 0;

	return (chp);
}

/*
 * Destroy the case hash.  Unlike most of our hash tables, no active references
 * are kept by the case hash itself; all references come from other subsystems.
 * The hash must be destroyed after all modules are unloaded; if anything was
 * present in the hash it would be by definition a reference count leak.
 */
void
fmd_case_hash_destroy(fmd_case_hash_t *chp)
{
	fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen);
	fmd_free(chp, sizeof (fmd_case_hash_t));
}

/*
 * Take a snapshot of the case hash by placing an additional hold on each
 * member in an auxiliary array, and then call 'func' for each case.
 */
void
fmd_case_hash_apply(fmd_case_hash_t *chp,
    void (*func)(fmd_case_t *, void *), void *arg)
{
	fmd_case_impl_t *cp, **cps, **cpp;
	uint_t cpc, i;

	(void) pthread_rwlock_rdlock(&chp->ch_lock);

	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
	cpc = chp->ch_count;

	for (i = 0; i < chp->ch_hashlen; i++) {
		for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) {
			fmd_case_hold((fmd_case_t *)cp);
			*cpp++ = cp;
		}
	}

	ASSERT(cpp == cps + cpc);
	(void) pthread_rwlock_unlock(&chp->ch_lock);

	for (i = 0; i < cpc; i++) {
		func((fmd_case_t *)cps[i], arg);
		fmd_case_rele((fmd_case_t *)cps[i]);
	}

	fmd_free(cps, cpc * sizeof (fmd_case_t *));
}

/*
 * Look up the diagcode for this case and cache it in ci_code.  If no suspects
 * were defined for this case or if the lookup fails, the event dictionary or
 * module code is broken, and we set the event code to a precomputed default.
 */
static const char *
fmd_case_mkcode(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_susp_t *cis;

	char **keys, **keyp;
	const char *s;

	ASSERT(MUTEX_HELD(&cip->ci_lock));
	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);

	fmd_free(cip->ci_code, cip->ci_codelen);
	cip->ci_codelen = cip->ci_mod->mod_codelen;
	cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
	keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1));

	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
		if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0)
			keyp++;
	}

	*keyp = NULL; /* mark end of keys[] array for libdiagcode */

	if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code(
	    cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) {
		(void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s);
		fmd_free(cip->ci_code, cip->ci_codelen);
		cip->ci_codelen = strlen(s) + 1;
		cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
		(void) strcpy(cip->ci_code, s);
	}

	return (cip->ci_code);
}

nvlist_t *
fmd_case_mkevent(fmd_case_t *cp, const char *class)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_susp_t *cis;

	fmd_asru_hash_t *ahp = fmd.d_asrus;
	fmd_asru_t *asru;

	nvlist_t **nva, **nvp, *nvl, *fmri;
	uint8_t *ba, *bp;

	int msg = B_TRUE;
	boolean_t b;

	(void) pthread_mutex_lock(&cip->ci_lock);
	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);

	nva = nvp = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
	ba = bp = alloca(sizeof (uint8_t) * cip->ci_nsuspects);

	/*
	 * For each suspect associated with the case, store its fault event
	 * nvlist in 'nva'.  We also look to see if any of the suspect faults
	 * have asked not to be messaged.  If any of them have made such a
	 * request, propagate that attribute to the composite list.* event.
	 * Finally, store each suspect's faulty status into the bitmap 'ba'.
	 */
	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
		if (nvlist_lookup_boolean_value(cis->cis_nvl,
		    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
			msg = B_FALSE;

		if (nvlist_lookup_nvlist(cis->cis_nvl,
		    FM_FAULT_ASRU, &fmri) == 0 && (asru =
		    fmd_asru_hash_lookup_nvl(ahp, fmri, FMD_B_FALSE)) != NULL) {
			*bp++ = (asru->asru_flags & FMD_ASRU_FAULTY) != 0;
			fmd_asru_hash_release(ahp, asru);
		} else
			*bp++ = 0;

		*nvp++ = cis->cis_nvl;
	}

	if (cip->ci_code == NULL)
		(void) fmd_case_mkcode(cp);

	nvl = fmd_protocol_list(class, cip->ci_mod->mod_fmri,
	    cip->ci_uuid, cip->ci_code, cip->ci_nsuspects, nva, ba, msg);

	(void) pthread_mutex_unlock(&cip->ci_lock);
	return (nvl);
}

/*
 * Convict suspects in a case by applying a conviction policy and updating the
 * resource cache prior to emitting the list.suspect event for the given case.
 * At present, our policy is very simple: convict every suspect in the case.
 * In the future, this policy can be extended and made configurable to permit:
 *
 * - convicting the suspect with the highest FIT rate
 * - convicting the suspect with the cheapest FRU
 * - convicting the suspect with the FRU that is in a depot's inventory
 * - convicting the suspect with the longest lifetime
 *
 * and so forth.  A word to the wise: this problem is significantly harder than
 * it seems at first glance.  Future work should heed the following advice:
 *
 * Hacking the policy into C code here is a very bad idea.  The policy needs to
 * be decided upon very carefully and fundamentally encodes knowledge of what
 * suspect list combinations can be emitted by what diagnosis engines.  As such
 * fmd's code is the wrong location, because that would require fmd itself to
 * be updated for every diagnosis engine change, defeating the entire design.
 * The FMA Event Registry knows the suspect list combinations: policy inputs
 * can be derived from it and used to produce per-module policy configuration.
 *
 * If the policy needs to be dynamic and not statically fixed at either fmd
 * startup or module load time, any implementation of dynamic policy retrieval
 * must employ some kind of caching mechanism or be part of a built-in module.
 * The fmd_case_convict() function is called with locks held inside of fmd and
 * is not a place where unbounded blocking on some inter-process or inter-
 * system communication to another service (e.g. another daemon) can occur.
 */
static void
fmd_case_convict(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_asru_hash_t *ahp = fmd.d_asrus;

	fmd_case_susp_t *cis;
	fmd_asru_t *asru;
	nvlist_t *fmri;

	(void) pthread_mutex_lock(&cip->ci_lock);
	(void) fmd_case_mkcode(cp);

	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
		if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU, &fmri))
			continue; /* no ASRU provided by diagnosis engine */

		if ((asru = fmd_asru_hash_lookup_nvl(ahp,
		    fmri, FMD_B_TRUE)) == NULL) {
			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
			continue;
		}

		(void) fmd_asru_clrflags(asru,
		    FMD_ASRU_UNUSABLE, cp, cis->cis_nvl);
		(void) fmd_asru_setflags(asru,
		    FMD_ASRU_FAULTY, cp, cis->cis_nvl);

		fmd_asru_hash_release(ahp, asru);
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);
}

void
fmd_case_publish(fmd_case_t *cp, uint_t state)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_event_t *e;
	nvlist_t *nvl;
	char *class;

	if (state == FMD_CASE_CURRENT)
		state = cip->ci_state; /* use current state */

	switch (state) {
	case FMD_CASE_SOLVED:
		fmd_case_convict(cp);
		nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
		(void) pthread_mutex_lock(&cip->ci_lock);
		if (cip->ci_diag == NULL)
			(void) nvlist_xdup(nvl, &cip->ci_diag, &fmd.d_nva);
		(void) pthread_mutex_unlock(&cip->ci_lock);
		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);

		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
		fmd_log_append(fmd.d_fltlog, e, cp);
		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
		fmd_dispq_dispatch(fmd.d_disp, e, class);

		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
		cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

		break;

	case FMD_CASE_CLOSE_WAIT:
		fmd_case_hold(cp);
		e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);

		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
		cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

		break;

	case FMD_CASE_CLOSED:
		nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
		fmd_dispq_dispatch(fmd.d_disp, e, class);
		break;

	case FMD_CASE_REPAIRED:
		nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
		fmd_dispq_dispatch(fmd.d_disp, e, class);
		break;
	}
}

fmd_case_t *
fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
{
	fmd_case_impl_t *cip;
	uint_t h;

	(void) pthread_rwlock_rdlock(&chp->ch_lock);
	h = fmd_strhash(uuid) % chp->ch_hashlen;

	for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
		if (strcmp(cip->ci_uuid, uuid) == 0)
			break;
	}

	if (cip != NULL)
		fmd_case_hold((fmd_case_t *)cip);
	else
		(void) fmd_set_errno(EFMD_CASE_INVAL);

	(void) pthread_rwlock_unlock(&chp->ch_lock);
	return ((fmd_case_t *)cip);
}
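
/*
 * Insert a case into the case hash by UUID.  If a case with the same UUID is
 * already present in the hash, a hold is placed on the existing case and it
 * is returned to the caller in place of the new case.
 */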
static fmd_case_impl_t *
fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
{
	fmd_case_impl_t *eip;
	uint_t h;

	(void) pthread_rwlock_wrlock(&chp->ch_lock);
	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;

	for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
		if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0) {
			fmd_case_hold((fmd_case_t *)eip);
			(void) pthread_rwlock_unlock(&chp->ch_lock);
			return (eip); /* uuid already present */
		}
	}

	cip->ci_next = chp->ch_hash[h];
	chp->ch_hash[h] = cip;

	chp->ch_count++;
	ASSERT(chp->ch_count != 0);

	(void) pthread_rwlock_unlock(&chp->ch_lock);
	return (cip);
}

static void
fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
{
	fmd_case_impl_t *cp, **pp;
	uint_t h;

	(void) pthread_rwlock_wrlock(&chp->ch_lock);

	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
	pp = &chp->ch_hash[h];

	for (cp = *pp; cp != NULL; cp = cp->ci_next) {
		if (cp != cip)
			pp = &cp->ci_next;
		else
			break;
	}

	if (cp == NULL) {
		fmd_panic("case %p (%s) not found on hash chain %u\n",
		    (void *)cip, cip->ci_uuid, h);
	}

	*pp = cp->ci_next;
	cp->ci_next = NULL;

	ASSERT(chp->ch_count != 0);
	chp->ch_count--;

	(void) pthread_rwlock_unlock(&chp->ch_lock);
}

fmd_case_t *
fmd_case_create(fmd_module_t *mp, void *data)
{
	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
	fmd_case_impl_t *eip = NULL;
	uuid_t uuid;

	(void) pthread_mutex_init(&cip->ci_lock, NULL);
	fmd_buf_hash_create(&cip->ci_bufs);

	fmd_module_hold(mp);
	cip->ci_mod = mp;
	cip->ci_refs = 1;
	cip->ci_state = FMD_CASE_UNSOLVED;
	cip->ci_flags = FMD_CF_DIRTY;
	cip->ci_data = data;

	/*
	 * Calling libuuid: get a clue.  The library interfaces cleverly do not
	 * define any constant for the length of an unparse string, and do not
	 * permit the caller to specify a buffer length for safety.  The spec
	 * says it will be 36 bytes, but we make it tunable just in case.
	 */
	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
	cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);

	/*
	 * We expect this loop to execute only once, but code it defensively
	 * against the possibility of libuuid bugs.  Keep generating uuids and
	 * attempting to do a hash insert until we get a unique one.
	 */
	do {
		if (eip != NULL)
			fmd_case_rele((fmd_case_t *)eip);
		uuid_generate(uuid);
		uuid_unparse(uuid, cip->ci_uuid);
	} while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);

	ASSERT(fmd_module_locked(mp));
	fmd_list_append(&mp->mod_cases, cip);
	fmd_module_setcdirty(mp);

	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

	return ((fmd_case_t *)cip);
}

static void
fmd_case_destroy_suspects(fmd_case_impl_t *cip)
{
	fmd_case_susp_t *cis, *ncis;

	ASSERT(MUTEX_HELD(&cip->ci_lock));

	for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
		ncis = cis->cis_next;
		nvlist_free(cis->cis_nvl);
		fmd_free(cis, sizeof (fmd_case_susp_t));
	}

	cip->ci_suspects = NULL;
	cip->ci_nsuspects = 0;
}

fmd_case_t *
fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
    uint_t state, const char *uuid, const char *code)
{
	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
	fmd_case_impl_t *eip;

	ASSERT(state < FMD_CASE_REPAIRED);

	(void) pthread_mutex_init(&cip->ci_lock, NULL);
	fmd_buf_hash_create(&cip->ci_bufs);

	fmd_module_hold(mp);
	cip->ci_mod = mp;
	cip->ci_xprt = xp;
	cip->ci_refs = 1;
	cip->ci_state = state;
	cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
	cip->ci_uuidlen = strlen(cip->ci_uuid);
	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;

	if (state > FMD_CASE_CLOSE_WAIT)
		cip->ci_flags |= FMD_CF_SOLVED;

	/*
	 * Insert the case into the global case hash.  If the specified UUID is
	 * already present, check to see if it is an orphan: if so, reclaim it;
	 * otherwise if it is owned by a different module then return NULL.
	 */
	if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
		(void) pthread_mutex_lock(&cip->ci_lock);
		cip->ci_refs--; /* decrement to zero */
		fmd_case_destroy((fmd_case_t *)cip, B_FALSE);

		cip = eip; /* switch 'cip' to the existing case */
		(void) pthread_mutex_lock(&cip->ci_lock);

		/*
		 * If the ASRU cache is trying to recreate an orphan, then just
		 * return the existing case that we found without changing it.
		 */
		if (mp == fmd.d_rmod) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele((fmd_case_t *)cip);
			return ((fmd_case_t *)cip);
		}

		/*
		 * If the existing case isn't an orphan or is being proxied,
		 * then we have a UUID conflict: return failure to the caller.
		 */
		if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele((fmd_case_t *)cip);
			return (NULL);
		}

		/*
		 * If the new module is reclaiming an orphaned case, remove
		 * the case from the root module, switch ci_mod, and then fall
		 * through to adding the case to the new owner module 'mp'.
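		 * Any suspect list attached to the orphan is discarded here
		 * and the case state is reset to the caller's 'state'.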
		 */
		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);

		fmd_module_rele(cip->ci_mod);
		cip->ci_mod = mp;
		fmd_module_hold(mp);

		fmd_case_destroy_suspects(cip);
		cip->ci_state = state;

		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_rele((fmd_case_t *)cip);
	}

	ASSERT(fmd_module_locked(mp));
	fmd_list_append(&mp->mod_cases, cip);

	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

	return ((fmd_case_t *)cip);
}

void
fmd_case_destroy(fmd_case_t *cp, int visible)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit, *ncit;

	ASSERT(MUTEX_HELD(&cip->ci_lock));
	ASSERT(cip->ci_refs == 0);

	if (visible) {
		TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
		fmd_case_hash_delete(fmd.d_cases, cip);
	}

	for (cit = cip->ci_items; cit != NULL; cit = ncit) {
		ncit = cit->cit_next;
		fmd_event_rele(cit->cit_event);
		fmd_free(cit, sizeof (fmd_case_item_t));
	}

	fmd_case_destroy_suspects(cip);

	if (cip->ci_principal != NULL)
		fmd_event_rele(cip->ci_principal);

	fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
	fmd_free(cip->ci_code, cip->ci_codelen);
	(void) fmd_buf_hash_destroy(&cip->ci_bufs);

	if (cip->ci_diag != NULL)
		nvlist_free(cip->ci_diag);

	fmd_module_rele(cip->ci_mod);
	fmd_free(cip, sizeof (fmd_case_impl_t));
}

void
fmd_case_hold(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	(void) pthread_mutex_lock(&cip->ci_lock);
	cip->ci_refs++;
	ASSERT(cip->ci_refs != 0);
	(void) pthread_mutex_unlock(&cip->ci_lock);
}

void
fmd_case_hold_locked(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	ASSERT(MUTEX_HELD(&cip->ci_lock));
	cip->ci_refs++;
	ASSERT(cip->ci_refs != 0);
}

void
fmd_case_rele(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	(void) pthread_mutex_lock(&cip->ci_lock);
	ASSERT(cip->ci_refs != 0);

	if (--cip->ci_refs == 0)
		fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
	else
		(void) pthread_mutex_unlock(&cip->ci_lock);
}

int
fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit;
	fmd_event_t *oep;
	uint_t state;
	int new;

	fmd_event_hold(ep);
	(void) pthread_mutex_lock(&cip->ci_lock);

	if (cip->ci_flags & FMD_CF_SOLVED)
		state = FMD_EVS_DIAGNOSED;
	else
		state = FMD_EVS_ACCEPTED;

	oep = cip->ci_principal;
	cip->ci_principal = ep;

	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
		if (cit->cit_event == ep)
			break;
	}

	cip->ci_flags |= FMD_CF_DIRTY;
	new = cit == NULL && ep != oep;

	(void) pthread_mutex_unlock(&cip->ci_lock);

	fmd_module_setcdirty(cip->ci_mod);
	fmd_event_transition(ep, state);

	if (oep != NULL)
		fmd_event_rele(oep);

	return (new);
}

int
fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit;
	uint_t state;
	int new;

	(void) pthread_mutex_lock(&cip->ci_lock);

	if (cip->ci_flags & FMD_CF_SOLVED)
		state = FMD_EVS_DIAGNOSED;
	else
		state = FMD_EVS_ACCEPTED;

	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
		if (cit->cit_event == ep)
			break;
	}

	new = cit == NULL && ep != cip->ci_principal;

	/*
	 * If the event is already in the case or the case is already solved,
	 * there is no reason to save it: just transition it appropriately.
	 */
	if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_event_transition(ep, state);
		return (new);
	}

	cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
	fmd_event_hold(ep);

	cit->cit_next = cip->ci_items;
	cit->cit_event = ep;

	cip->ci_items = cit;
	cip->ci_nitems++;

	cip->ci_flags |= FMD_CF_DIRTY;
	(void) pthread_mutex_unlock(&cip->ci_lock);

	fmd_module_setcdirty(cip->ci_mod);
	fmd_event_transition(ep, state);

	return (new);
}

void
fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);

	(void) pthread_mutex_lock(&cip->ci_lock);
	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
	cip->ci_flags |= FMD_CF_DIRTY;

	cis->cis_next = cip->ci_suspects;
	cis->cis_nvl = nvl;

	cip->ci_suspects = cis;
	cip->ci_nsuspects++;

	(void) pthread_mutex_unlock(&cip->ci_lock);
	fmd_module_setcdirty(cip->ci_mod);
}

void
fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);

	(void) pthread_mutex_lock(&cip->ci_lock);
	ASSERT(cip->ci_state == FMD_CASE_CLOSED);
	ASSERT(cip->ci_mod == fmd.d_rmod);

	cis->cis_next = cip->ci_suspects;
	cis->cis_nvl = nvl;

	cip->ci_suspects = cis;
	cip->ci_nsuspects++;

	(void) pthread_mutex_unlock(&cip->ci_lock);
}

void
fmd_case_reset_suspects(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	(void) pthread_mutex_lock(&cip->ci_lock);
	ASSERT(cip->ci_state < FMD_CASE_SOLVED);

	fmd_case_destroy_suspects(cip);
	cip->ci_flags |= FMD_CF_DIRTY;

	(void) pthread_mutex_unlock(&cip->ci_lock);
	fmd_module_setcdirty(cip->ci_mod);
}

/*
 * Grab ci_lock and update the case state and set the dirty bit.  Then perform
 * whatever actions and emit whatever events are appropriate for the state.
 * Refer to the topmost block comment explaining the state machine for details.
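 * Note that this function only moves a case forward through the state
 * machine: a request to enter a state at or below the current ci_state is
 * silently ignored.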
 */
void
fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	fmd_case_susp_t *cis;
	fmd_case_item_t *cit;
	fmd_asru_t *asru;
	fmd_event_t *e;
	nvlist_t *nvl;

	ASSERT(state <= FMD_CASE_REPAIRED);
	(void) pthread_mutex_lock(&cip->ci_lock);

	if (!(cip->ci_flags & FMD_CF_SOLVED))
		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED);

	cip->ci_flags |= flags;

	if (cip->ci_state >= state) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return; /* already in specified state */
	}

	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));

	cip->ci_state = state;
	cip->ci_flags |= FMD_CF_DIRTY;

	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
		fmd_module_setcdirty(cip->ci_mod);

	switch (state) {
	case FMD_CASE_SOLVED:
		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);

		if (cip->ci_principal != NULL) {
			fmd_event_transition(cip->ci_principal,
			    FMD_EVS_DIAGNOSED);
		}
		break;

	case FMD_CASE_CLOSE_WAIT:
		/*
		 * If the case was never solved, do not change ASRUs.
		 * If the case was never fmd_case_closed, do not change ASRUs.
		 * If the case was repaired, do not change ASRUs.
		 */
		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
		    FMD_CF_REPAIRED)) != (FMD_CF_SOLVED | FMD_CF_ISOLATED))
			goto close_wait_finish;

		/*
		 * For each fault event in the suspect list, attempt to look up
		 * the corresponding ASRU in the ASRU dictionary.  If the ASRU
		 * is found there and is marked faulty, we now mark it unusable
		 * and record the case meta-data and fault event with the ASRU.
		 */
		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
			if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU,
			    &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl(
			    fmd.d_asrus, nvl, FMD_B_FALSE)) != NULL) {
				(void) fmd_asru_setflags(asru,
				    FMD_ASRU_UNUSABLE, cp, cis->cis_nvl);
				fmd_asru_hash_release(fmd.d_asrus, asru);
			}
		}

close_wait_finish:
		/*
		 * If an orphaned case transitions to CLOSE_WAIT, the owning
		 * module is no longer loaded: continue on to CASE_CLOSED.
		 */
		if (fmd_case_orphaned(cp))
			state = cip->ci_state = FMD_CASE_CLOSED;
		break;

	case FMD_CASE_REPAIRED:
		ASSERT(fmd_case_orphaned(cp));
		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);
		break;
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);

	/*
	 * If the module has initialized, then publish the appropriate event
	 * for the new case state.  If not, we are being called from the
	 * checkpoint code during module load, in which case the module's
	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
	 * may not be open yet, which will prevent us from computing the event
	 * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
	 * event in our queue: this won't be processed until _fmd_init is done.
	 */
	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
		fmd_case_publish(cp, state);
	else {
		fmd_case_hold(cp);
		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
	}

	/*
	 * If we transitioned to REPAIRED, adjust the reference count to
	 * reflect our removal from fmd.d_rmod->mod_cases.  If the caller has
	 * not placed an additional hold on the case, it will now be freed.
	 */
	if (state == FMD_CASE_REPAIRED)
		fmd_case_rele(cp);
}

/*
 * Transition the specified case to *at least* the specified state by first
 * re-validating the suspect list using the resource cache.  This function is
 * employed by the checkpoint code when restoring a saved, solved case to see
 * if the state of the case has effectively changed while fmd was not running
 * or the module was not loaded.  If none of the suspects are present anymore,
 * advance the state to REPAIRED.  If none are usable, advance to CLOSE_WAIT.
 */
void
fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_susp_t *cis;
	fmd_asru_t *asru;
	nvlist_t *nvl;

	int present = 0; /* are any suspects present? */
	int usable = 0; /* are any suspects usable? */

	ASSERT(state >= FMD_CASE_SOLVED);
	(void) pthread_mutex_lock(&cip->ci_lock);

	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
		if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU,
		    &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl(
		    fmd.d_asrus, nvl, FMD_B_TRUE)) != NULL) {

			if ((asru->asru_flags & FMD_ASRU_INTERNAL) ||
			    fmd_fmri_present(asru->asru_fmri) > 0)
				present++;

			if (fmd_fmri_unusable(asru->asru_fmri) <= 0)
				usable++;

			fmd_asru_hash_release(fmd.d_asrus, asru);
		}
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);

	if (!present) {
		state = MAX(state, FMD_CASE_CLOSE_WAIT);
		flags |= FMD_CF_REPAIRED;
	} else if (!usable) {
		state = MAX(state, FMD_CASE_CLOSE_WAIT);
		flags |= FMD_CF_ISOLATED;
	}

	fmd_case_transition(cp, state, flags);
}

void
fmd_case_setdirty(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	(void) pthread_mutex_lock(&cip->ci_lock);
	cip->ci_flags |= FMD_CF_DIRTY;
	(void) pthread_mutex_unlock(&cip->ci_lock);

	fmd_module_setcdirty(cip->ci_mod);
}

void
fmd_case_clrdirty(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	(void) pthread_mutex_lock(&cip->ci_lock);
	cip->ci_flags &= ~FMD_CF_DIRTY;
	(void) pthread_mutex_unlock(&cip->ci_lock);
}

void
fmd_case_commit(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit;

	(void) pthread_mutex_lock(&cip->ci_lock);

	if (cip->ci_flags & FMD_CF_DIRTY) {
		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
			fmd_event_commit(cit->cit_event);

		if (cip->ci_principal != NULL)
			fmd_event_commit(cip->ci_principal);

		fmd_buf_hash_commit(&cip->ci_bufs);
		cip->ci_flags &= ~FMD_CF_DIRTY;
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);
}

/*
 * Indicate that the case may need to change state because one or more of the
 * ASRUs named as a suspect has changed state.
 * We examine all the suspects and if none are still faulty, we initiate a
 * case close transition.
 */
void
fmd_case_update(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_susp_t *cis;
	fmd_asru_t *asru;
	nvlist_t *nvl;

	int astate = 0;
	uint_t cstate;

	(void) pthread_mutex_lock(&cip->ci_lock);
	cstate = cip->ci_state;

	if ((cip->ci_flags & FMD_CF_REPAIRING) ||
	    cip->ci_xprt != NULL || cip->ci_state < FMD_CASE_SOLVED) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return; /* update is not appropriate */
	}

	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
		if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU,
		    &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl(
		    fmd.d_asrus, nvl, FMD_B_FALSE)) != NULL) {
			astate |= fmd_asru_getstate(asru);
			fmd_asru_hash_release(fmd.d_asrus, asru);
		}
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);

	if (astate & FMD_ASRU_FAULTY)
		return; /* one or more suspects are still marked faulty */

	if (cstate == FMD_CASE_CLOSED)
		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
	else
		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
}

/*
 * Delete a closed case from the module's case list once the fmdo_close() entry
 * point has run to completion.  If the case is owned by a transport module,
 * tell the transport to proxy a case close on the other end of the transport.
 * If not, transition to the appropriate next state based on ci_flags.  This
 * function represents the end of CLOSE_WAIT and transitions the case to either
 * CLOSED or REPAIRED or discards it entirely because it was never solved;
 * refer to the topmost block comment explaining the state machine for details.
 */
void
fmd_case_delete(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_modstat_t *msp;
	size_t buftotal;

	ASSERT(fmd_module_locked(cip->ci_mod));
	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
	buftotal = fmd_buf_hash_destroy(&cip->ci_bufs);

	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
	msp = cip->ci_mod->mod_stats;

	ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0);
	msp->ms_caseopen.fmds_value.ui64--;

	ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal);
	msp->ms_buftotal.fmds_value.ui64 -= buftotal;

	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

	if (cip->ci_xprt == NULL)
		fmd_module_setcdirty(cip->ci_mod);

	fmd_module_rele(cip->ci_mod);
	cip->ci_mod = fmd.d_rmod;
	fmd_module_hold(cip->ci_mod);

	/*
	 * If the case is not proxied and it has been solved, then retain it
	 * on the root module's case list at least until we're transitioned.
	 * Otherwise free the case with our final fmd_case_rele() below.
	 */
	if (cip->ci_xprt == NULL && (cip->ci_flags & FMD_CF_SOLVED)) {
		fmd_module_lock(cip->ci_mod);
		fmd_list_append(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);
		fmd_case_hold(cp);
	}

	/*
	 * If a proxied case finishes CLOSE_WAIT, then it can be discarded
	 * rather than orphaned because by definition it can have no entries
	 * in the resource cache of the current fault manager.
	 */
	if (cip->ci_xprt != NULL)
		fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid);
	else if (cip->ci_flags & FMD_CF_REPAIRED)
		fmd_case_transition(cp, FMD_CASE_REPAIRED, 0);
	else if (cip->ci_flags & FMD_CF_ISOLATED)
		fmd_case_transition(cp, FMD_CASE_CLOSED, 0);

	fmd_case_rele(cp);
}

void
fmd_case_discard(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--;
	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

	ASSERT(fmd_module_locked(cip->ci_mod));
	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
	fmd_case_rele(cp);
}

static void
fmd_case_repair_containee(fmd_asru_t *ee, void *er)
{
	if ((ee->asru_flags & FMD_ASRU_FAULTY) &&
	    fmd_fmri_contains(er, ee->asru_fmri) > 0)
		(void) fmd_asru_clrflags(ee, FMD_ASRU_FAULTY, NULL, NULL);
}

/*
 * Indicate that the problem corresponding to a case has been repaired by
 * clearing the faulty bit on each ASRU named as a suspect.  If the case hasn't
 * already been closed, this function initiates the transition to CLOSE_WAIT.
 * The caller must have the case held from fmd_case_hash_lookup(), so we can
 * grab and drop ci_lock without the case being able to be freed in between.
 */
int
fmd_case_repair(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_susp_t *cis;
	nvlist_t *nvl;
	uint_t cstate;

	fmd_asru_hash_t *ahp = fmd.d_asrus;
	fmd_asru_t **aa;
	uint_t i, an;

	(void) pthread_mutex_lock(&cip->ci_lock);
	cstate = cip->ci_state;

	if (cip->ci_xprt != NULL) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return (fmd_set_errno(EFMD_CASE_OWNER));
	}

	if (cstate < FMD_CASE_SOLVED || (cip->ci_flags & FMD_CF_REPAIRING)) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return (fmd_set_errno(EFMD_CASE_STATE));
	}

	/*
	 * Take a snapshot of any ASRUs referenced by the case that are present
	 * in the resource cache.  Then drop ci_lock and clear the faulty bit
	 * on each ASRU (we can't call fmd_asru_clrflags() with ci_lock held).
	 */
	an = cip->ci_nsuspects;
	aa = alloca(sizeof (fmd_asru_t *) * an);
	bzero(aa, sizeof (fmd_asru_t *) * an);

	for (i = 0, cis = cip->ci_suspects;
	    cis != NULL; cis = cis->cis_next, i++) {
		if (nvlist_lookup_nvlist(cis->cis_nvl,
		    FM_FAULT_ASRU, &nvl) == 0)
			aa[i] = fmd_asru_hash_lookup_nvl(ahp, nvl, FMD_B_FALSE);
	}

	cip->ci_flags |= FMD_CF_REPAIRING;
	(void) pthread_mutex_unlock(&cip->ci_lock);

	/*
	 * For each suspect ASRU, if the case associated with this ASRU matches
	 * case 'cp', clear FAULTY on all ASRUs contained by aa[i] and then on
	 * aa[i] itself.  Note that at present, we're assuming that when a
	 * given resource FMRI R1 contains another R2, that any faults are
	 * related by a common diagnosis engine.  This is true in our current
	 * architecture, but may not always be true, at which point we'll need
	 * more cleverness here.
	 */
	for (i = 0; i < an; i++) {
		if (aa[i] == NULL)
			continue; /* no asru was found */

		if (aa[i]->asru_case == cp) {
			fmd_asru_hash_apply(fmd.d_asrus,
			    fmd_case_repair_containee, aa[i]->asru_fmri);
			(void) fmd_asru_clrflags(aa[i],
			    FMD_ASRU_FAULTY, NULL, NULL);
		}

		fmd_asru_hash_release(ahp, aa[i]);
	}

	(void) pthread_mutex_lock(&cip->ci_lock);
	cip->ci_flags &= ~FMD_CF_REPAIRING;
	(void) pthread_mutex_unlock(&cip->ci_lock);

	if (cstate == FMD_CASE_CLOSED)
		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
	else
		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);

	return (0);
}

int
fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit;
	uint_t state;
	int rv = 0;

	(void) pthread_mutex_lock(&cip->ci_lock);

	if (cip->ci_state >= FMD_CASE_SOLVED)
		state = FMD_EVS_DIAGNOSED;
	else
		state = FMD_EVS_ACCEPTED;

	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
		if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0)
			break;
	}

	if (rv == 0 && cip->ci_principal != NULL)
		rv = fmd_event_equal(ep, cip->ci_principal);

	(void) pthread_mutex_unlock(&cip->ci_lock);

	if (rv != 0)
		fmd_event_transition(ep, state);

	return (rv);
}

int
fmd_case_orphaned(fmd_case_t *cp)
{
	return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod);
}