/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * FMD Case Subsystem
 *
 * Diagnosis engines are expected to group telemetry events related to the
 * diagnosis of a particular problem on the system into a set of cases. The
 * diagnosis engine may have any number of cases open at a given point in time.
 * Some cases may eventually be *solved* by associating a suspect list of one
 * or more problems with the case, at which point fmd publishes a list.suspect
 * event for the case and it becomes visible to administrators and agents.
 *
 * Every case is named using a UUID, and is globally visible in the case hash.
 * Cases are reference-counted, except for the reference from the case hash
 * itself. Consumers of case references include modules, which store active
 * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
 *
 * Cases obey the following state machine.
 * In states UNSOLVED, SOLVED, and
 * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
 * or transport) and the case is referenced by the mod_cases list. Once the
 * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
 * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
 *
 *                 +------------+
 *      +----------|  UNSOLVED  |
 *      |          +------------+
 *    1 |               4 |
 *      |                 |
 * +----v---+ /-2->+------v-----+    3     +--------+
 * | SOLVED |<     | CLOSE_WAIT |--------->| CLOSED |
 * +--------+ \-5->+------------+          +--------+
 *                        |                    |
 *                      6 |                    | 7
 *                 +------v-----+              |
 *                 |  REPAIRED  |<-------------+
 *                 +------------+
 *
 * The state machine changes are triggered by calls to fmd_case_transition()
 * from various locations inside of fmd, as described below:
 *
 * [1] Called by: fmd_case_solve()
 *     Actions:   FMD_CF_SOLVED flag is set in ci_flags
 *                conviction policy is applied to suspect list
 *                suspects convicted are marked faulty (F) in R$
 *                list.suspect event logged and dispatched
 *
 * [2] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
 *     Actions:   FMD_CF_ISOLATED flag is set in ci_flags
 *                suspects convicted (F) are marked unusable (U) in R$
 *                diagnosis engine fmdo_close() entry point scheduled
 *                case transitions to CLOSED [3] upon exit from CLOSE_WAIT
 *
 * [3] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
 *     Actions:   list.isolated event dispatched
 *                case deleted from module's list of open cases
 *
 * [4] Called by: fmd_case_close(), fmd_case_uuclose()
 *     Actions:   diagnosis engine fmdo_close() entry point scheduled
 *                case is subsequently discarded by fmd_case_delete()
 *
 * [5] Called by: fmd_case_repair(), fmd_case_update()
 *     Actions:   FMD_CF_REPAIRED flag is set in ci_flags
 *                diagnosis engine fmdo_close() entry point scheduled
 *                case transitions to REPAIRED [6] upon exit from
CLOSE_WAIT 91 * 92 * [6] Called by: fmd_case_repair(), fmd_case_update() 93 * Actions: FMD_CF_REPAIR flag is set in ci_flags 94 * suspects convicted are marked non faulty (!F) in R$ 95 * list.repaired event dispatched 96 * 97 * [7] Called by: fmd_case_repair(), fmd_case_update() 98 * Actions: FMD_CF_REPAIR flag is set in ci_flags 99 * suspects convicted are marked non faulty (!F) in R$ 100 * list.repaired event dispatched 101 */ 102 103 #include <sys/fm/protocol.h> 104 #include <uuid/uuid.h> 105 #include <alloca.h> 106 107 #include <fmd_alloc.h> 108 #include <fmd_module.h> 109 #include <fmd_error.h> 110 #include <fmd_conf.h> 111 #include <fmd_case.h> 112 #include <fmd_string.h> 113 #include <fmd_subr.h> 114 #include <fmd_protocol.h> 115 #include <fmd_event.h> 116 #include <fmd_eventq.h> 117 #include <fmd_dispq.h> 118 #include <fmd_buf.h> 119 #include <fmd_log.h> 120 #include <fmd_asru.h> 121 #include <fmd_fmri.h> 122 #include <fmd_xprt.h> 123 124 #include <fmd.h> 125 126 static const char *const _fmd_case_snames[] = { 127 "UNSOLVED", /* FMD_CASE_UNSOLVED */ 128 "SOLVED", /* FMD_CASE_SOLVED */ 129 "CLOSE_WAIT", /* FMD_CASE_CLOSE_WAIT */ 130 "CLOSED", /* FMD_CASE_CLOSED */ 131 "REPAIRED" /* FMD_CASE_REPAIRED */ 132 }; 133 134 extern volatile uint32_t fmd_asru_fake_not_present; 135 136 static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *); 137 138 fmd_case_hash_t * 139 fmd_case_hash_create(void) 140 { 141 fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP); 142 143 (void) pthread_rwlock_init(&chp->ch_lock, NULL); 144 chp->ch_hashlen = fmd.d_str_buckets; 145 chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP); 146 chp->ch_count = 0; 147 148 return (chp); 149 } 150 151 /* 152 * Destroy the case hash. Unlike most of our hash tables, no active references 153 * are kept by the case hash itself; all references come from other subsystems. 
154 * The hash must be destroyed after all modules are unloaded; if anything was 155 * present in the hash it would be by definition a reference count leak. 156 */ 157 void 158 fmd_case_hash_destroy(fmd_case_hash_t *chp) 159 { 160 fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen); 161 fmd_free(chp, sizeof (fmd_case_hash_t)); 162 } 163 164 /* 165 * Take a snapshot of the case hash by placing an additional hold on each 166 * member in an auxiliary array, and then call 'func' for each case. 167 */ 168 void 169 fmd_case_hash_apply(fmd_case_hash_t *chp, 170 void (*func)(fmd_case_t *, void *), void *arg) 171 { 172 fmd_case_impl_t *cp, **cps, **cpp; 173 uint_t cpc, i; 174 175 (void) pthread_rwlock_rdlock(&chp->ch_lock); 176 177 cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP); 178 cpc = chp->ch_count; 179 180 for (i = 0; i < chp->ch_hashlen; i++) { 181 for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) { 182 fmd_case_hold((fmd_case_t *)cp); 183 *cpp++ = cp; 184 } 185 } 186 187 ASSERT(cpp == cps + cpc); 188 (void) pthread_rwlock_unlock(&chp->ch_lock); 189 190 for (i = 0; i < cpc; i++) { 191 func((fmd_case_t *)cps[i], arg); 192 fmd_case_rele((fmd_case_t *)cps[i]); 193 } 194 195 fmd_free(cps, cpc * sizeof (fmd_case_t *)); 196 } 197 198 /* 199 * Look up the diagcode for this case and cache it in ci_code. If no suspects 200 * were defined for this case or if the lookup fails, the event dictionary or 201 * module code is broken, and we set the event code to a precomputed default. 
202 */ 203 static const char * 204 fmd_case_mkcode(fmd_case_t *cp) 205 { 206 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 207 fmd_case_susp_t *cis; 208 209 char **keys, **keyp; 210 const char *s; 211 212 ASSERT(MUTEX_HELD(&cip->ci_lock)); 213 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 214 215 fmd_free(cip->ci_code, cip->ci_codelen); 216 cip->ci_codelen = cip->ci_mod->mod_codelen; 217 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 218 keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1)); 219 220 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 221 if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0) 222 keyp++; 223 } 224 225 *keyp = NULL; /* mark end of keys[] array for libdiagcode */ 226 227 if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code( 228 cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) { 229 (void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s); 230 fmd_free(cip->ci_code, cip->ci_codelen); 231 cip->ci_codelen = strlen(s) + 1; 232 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 233 (void) strcpy(cip->ci_code, s); 234 } 235 236 return (cip->ci_code); 237 } 238 239 nvlist_t * 240 fmd_case_mkevent(fmd_case_t *cp, const char *class) 241 { 242 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 243 fmd_case_susp_t *cis; 244 245 fmd_asru_hash_t *ahp = fmd.d_asrus; 246 fmd_asru_t *asru; 247 248 nvlist_t **nva, **nvp, *nvl, *fmri; 249 uint8_t *ba, *bp; 250 251 int msg = B_TRUE; 252 boolean_t b; 253 254 (void) pthread_mutex_lock(&cip->ci_lock); 255 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 256 257 nva = nvp = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); 258 ba = bp = alloca(sizeof (uint8_t) * cip->ci_nsuspects); 259 260 /* 261 * For each suspect associated with the case, store its fault event 262 * nvlist in 'nva'. We also look to see if any of the suspect faults 263 * have asked not to be messaged. If any of them have made such a 264 * request, propagate that attribute to the composite list.* event. 
265 * Finally, store each suspect's faulty status into the bitmap 'ba'. 266 */ 267 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 268 if (nvlist_lookup_boolean_value(cis->cis_nvl, 269 FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE) 270 msg = B_FALSE; 271 272 if (nvlist_lookup_nvlist(cis->cis_nvl, 273 FM_FAULT_ASRU, &fmri) == 0 && (asru = 274 fmd_asru_hash_lookup_nvl(ahp, fmri, FMD_B_FALSE)) != NULL) { 275 *bp = 0; 276 if (fmd_asru_fake_not_present || 277 !fmd_fmri_present(asru->asru_fmri)) 278 *bp |= FM_SUSPECT_NOT_PRESENT; 279 if (fmd_asru_fake_not_present || 280 fmd_fmri_unusable(asru->asru_fmri)) 281 *bp |= FM_SUSPECT_UNUSABLE; 282 if (asru->asru_flags & FMD_ASRU_FAULTY) 283 *bp |= FM_SUSPECT_FAULTY; 284 bp++; 285 fmd_asru_hash_release(ahp, asru); 286 } else 287 *bp++ = 0; 288 289 *nvp++ = cis->cis_nvl; 290 } 291 292 if (cip->ci_code == NULL) 293 (void) fmd_case_mkcode(cp); 294 295 if (msg == B_FALSE) 296 cip->ci_flags |= FMD_CF_INVISIBLE; 297 298 nvl = fmd_protocol_list(class, cip->ci_mod->mod_fmri, cip->ci_uuid, 299 cip->ci_code, cip->ci_nsuspects, nva, ba, msg, &cip->ci_tv); 300 301 (void) pthread_mutex_unlock(&cip->ci_lock); 302 return (nvl); 303 } 304 305 /* 306 * Convict suspects in a case by applying a conviction policy and updating the 307 * resource cache prior to emitting the list.suspect event for the given case. 308 * At present, our policy is very simple: convict every suspect in the case. 309 * In the future, this policy can be extended and made configurable to permit: 310 * 311 * - convicting the suspect with the highest FIT rate 312 * - convicting the suspect with the cheapest FRU 313 * - convicting the suspect with the FRU that is in a depot's inventory 314 * - convicting the suspect with the longest lifetime 315 * 316 * and so forth. A word to the wise: this problem is significantly harder that 317 * it seems at first glance. 
 * Future work should heed the following advice:
 *
 * Hacking the policy into C code here is a very bad idea. The policy needs to
 * be decided upon very carefully and fundamentally encodes knowledge of what
 * suspect list combinations can be emitted by what diagnosis engines. As such
 * fmd's code is the wrong location, because that would require fmd itself to
 * be updated for every diagnosis engine change, defeating the entire design.
 * The FMA Event Registry knows the suspect list combinations: policy inputs
 * can be derived from it and used to produce per-module policy configuration.
 *
 * If the policy needs to be dynamic and not statically fixed at either fmd
 * startup or module load time, any implementation of dynamic policy retrieval
 * must employ some kind of caching mechanism or be part of a built-in module.
 * The fmd_case_convict() function is called with locks held inside of fmd and
 * is not a place where unbounded blocking on some inter-process or inter-
 * system communication to another service (e.g. another daemon) can occur.
*/
static void
fmd_case_convict(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_asru_hash_t *ahp = fmd.d_asrus;

	fmd_case_susp_t *cis;
	fmd_asru_t *asru;
	nvlist_t *fmri;

	(void) pthread_mutex_lock(&cip->ci_lock);
	(void) fmd_case_mkcode(cp);

	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
		if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU, &fmri))
			continue; /* no ASRU provided by diagnosis engine */

		if ((asru = fmd_asru_hash_lookup_nvl(ahp,
		    fmri, FMD_B_TRUE)) == NULL) {
			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
			continue;
		}

		/* convicted: clear UNUSABLE, then mark the ASRU FAULTY */
		(void) fmd_asru_clrflags(asru,
		    FMD_ASRU_UNUSABLE, cp, cis->cis_nvl);
		(void) fmd_asru_setflags(asru,
		    FMD_ASRU_FAULTY, cp, cis->cis_nvl);

		fmd_asru_hash_release(ahp, asru);
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);
}

/*
 * Perform the publication step for the given case state.  Passing
 * FMD_CASE_CURRENT selects the case's current ci_state.
 *
 * SOLVED:     record the case time in ci_tv the first time through, convict
 *             the suspects, then create a FM_LIST_SUSPECT_CLASS event, append
 *             it to fmd.d_fltlog, dispatch it, and bump ms_casesolved.
 * CLOSE_WAIT: place a hold on the case, queue a FMD_EVT_CLOSE event at the
 *             head of the owning module's queue, and bump ms_caseclosed.
 * CLOSED:     create and dispatch a FM_LIST_ISOLATED_CLASS event.
 * REPAIRED:   create and dispatch a FM_LIST_REPAIRED_CLASS event.
 *
 * Any other state is ignored.
 */
void
fmd_case_publish(fmd_case_t *cp, uint_t state)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_event_t *e;
	nvlist_t *nvl;
	char *class;

	if (state == FMD_CASE_CURRENT)
		state = cip->ci_state; /* use current state */

	switch (state) {
	case FMD_CASE_SOLVED:
		(void) pthread_mutex_lock(&cip->ci_lock);
		if (cip->ci_tv_valid == 0) {
			fmd_time_gettimeofday(&cip->ci_tv);
			cip->ci_tv_valid = 1;
		}
		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_convict(cp);
		nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);

		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
		fmd_log_append(fmd.d_fltlog, e, cp);
		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
		fmd_dispq_dispatch(fmd.d_disp, e, class);

		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
		cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

		break;

	case FMD_CASE_CLOSE_WAIT:
		/* this hold accompanies the queued FMD_EVT_CLOSE event */
		fmd_case_hold(cp);
		e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);

		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
		cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

		break;

	case FMD_CASE_CLOSED:
		nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
		fmd_dispq_dispatch(fmd.d_disp, e, class);
		break;

	case FMD_CASE_REPAIRED:
		nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
		fmd_dispq_dispatch(fmd.d_disp, e, class);
		break;
	}
}

/*
 * Look up a case by UUID under the hash read lock.  On success the case is
 * returned with an additional hold placed by fmd_case_tryhold().  If no case
 * with that UUID is on the chain, or the matching case is being deleted,
 * errno is set to EFMD_CASE_INVAL and NULL is returned.
 */
fmd_case_t *
fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
{
	fmd_case_impl_t *cip;
	uint_t h;

	(void) pthread_rwlock_rdlock(&chp->ch_lock);
	h = fmd_strhash(uuid) % chp->ch_hashlen;

	for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
		if (strcmp(cip->ci_uuid, uuid) == 0)
			break;
	}

	/*
	 * If deleting bit is set, treat the case as if it doesn't exist.
	 */
	if (cip != NULL)
		cip = fmd_case_tryhold(cip);

	if (cip == NULL)
		(void) fmd_set_errno(EFMD_CASE_INVAL);

	(void) pthread_rwlock_unlock(&chp->ch_lock);
	return ((fmd_case_t *)cip);
}

/*
 * Insert 'cip' into the case hash under the hash write lock.  If a case with
 * the same UUID is already present and can be held (i.e. it is not being
 * deleted), that existing case is returned with the additional hold and
 * 'cip' is not inserted.  Otherwise 'cip' is linked at the head of its chain,
 * ch_count is incremented, and 'cip' itself is returned.
 */
static fmd_case_impl_t *
fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
{
	fmd_case_impl_t *eip;
	uint_t h;

	(void) pthread_rwlock_wrlock(&chp->ch_lock);
	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;

	for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
		if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 &&
		    fmd_case_tryhold(eip) != NULL) {
			(void) pthread_rwlock_unlock(&chp->ch_lock);
			return (eip); /* uuid already present */
		}
	}

	cip->ci_next = chp->ch_hash[h];
	chp->ch_hash[h] = cip;

	chp->ch_count++;
	ASSERT(chp->ch_count != 0);

	(void) pthread_rwlock_unlock(&chp->ch_lock);
	return (cip);
}

/*
 * Unlink 'cip' from the case hash.  The caller must hold ci_lock.  The case
 * is first marked FMD_CF_DELETING, then ci_lock is dropped while the chain is
 * edited under the hash write lock, and ci_lock is re-acquired before
 * returning.  It is a fatal error (fmd_panic) if 'cip' is not on its chain.
 */
static void
fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
{
	fmd_case_impl_t *cp, **pp;
	uint_t h;

	ASSERT(MUTEX_HELD(&cip->ci_lock));

	cip->ci_flags |= FMD_CF_DELETING;
	(void) pthread_mutex_unlock(&cip->ci_lock);

	(void) pthread_rwlock_wrlock(&chp->ch_lock);

	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
	pp = &chp->ch_hash[h];

	/* walk the chain keeping 'pp' pointed at the link that refers to cp */
	for (cp = *pp; cp != NULL; cp = cp->ci_next) {
		if (cp != cip)
			pp = &cp->ci_next;
		else
			break;
	}

	if (cp == NULL) {
		fmd_panic("case %p (%s) not found on hash chain %u\n",
		    (void *)cip, cip->ci_uuid, h);
	}

	*pp = cp->ci_next;
	cp->ci_next = NULL;

	ASSERT(chp->ch_count != 0);
	chp->ch_count--;

	(void) pthread_rwlock_unlock(&chp->ch_lock);

	(void) pthread_mutex_lock(&cip->ci_lock);
	ASSERT(cip->ci_flags & FMD_CF_DELETING);
}

/*
 * Create a new case in state UNSOLVED for module 'mp' with 'data' stored in
 * ci_data.  The case starts with ci_refs of one and FMD_CF_DIRTY set, takes a
 * hold on 'mp', is given a freshly generated UUID, and is entered into
 * fmd.d_cases and appended to mp->mod_cases.  The module must be locked by
 * the caller (asserted below).  The module's ms_caseopen statistic is bumped.
 */
fmd_case_t *
fmd_case_create(fmd_module_t *mp, void *data)
{
	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
	fmd_case_impl_t *eip = NULL;
	uuid_t uuid;

	(void) pthread_mutex_init(&cip->ci_lock, NULL);
	fmd_buf_hash_create(&cip->ci_bufs);

	fmd_module_hold(mp);
	cip->ci_mod = mp;
	cip->ci_refs = 1;
	cip->ci_state = FMD_CASE_UNSOLVED;
	cip->ci_flags = FMD_CF_DIRTY;
	cip->ci_data = data;

	/*
	 * Calling libuuid: get a clue. The library interfaces cleverly do not
	 * define any constant for the length of an unparse string, and do not
	 * permit the caller to specify a buffer length for safety. The spec
	 * says it will be 36 bytes, but we make it tunable just in case.
	 */
	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
	cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);

	/*
	 * We expect this loop to execute only once, but code it defensively
	 * against the possibility of libuuid bugs. Keep generating uuids and
	 * attempting to do a hash insert until we get a unique one.
	 */
	do {
		/* drop the hold the failed insert placed on the other case */
		if (eip != NULL)
			fmd_case_rele((fmd_case_t *)eip);
		uuid_generate(uuid);
		uuid_unparse(uuid, cip->ci_uuid);
	} while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);

	ASSERT(fmd_module_locked(mp));
	fmd_list_append(&mp->mod_cases, cip);
	fmd_module_setcdirty(mp);

	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

	return ((fmd_case_t *)cip);
}

/*
 * Free every entry on the case's suspect list, including the nvlist each
 * entry refers to, and reset ci_suspects and ci_nsuspects.  The caller must
 * hold ci_lock.
 */
static void
fmd_case_destroy_suspects(fmd_case_impl_t *cip)
{
	fmd_case_susp_t *cis, *ncis;

	ASSERT(MUTEX_HELD(&cip->ci_lock));

	for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
		ncis = cis->cis_next;
		nvlist_free(cis->cis_nvl);
		fmd_free(cis, sizeof (fmd_case_susp_t));
	}

	cip->ci_suspects = NULL;
	cip->ci_nsuspects = 0;
}

/*
 * Recreate a case for module 'mp' (and, optionally, transport 'xp') using a
 * caller-supplied UUID, state and diagcode.  'state' must be below
 * FMD_CASE_REPAIRED (asserted).  The module must be locked by the caller when
 * the case ends up being appended to mp->mod_cases (asserted).
 *
 * Returns the new case, or an existing case with the same UUID when that case
 * is reclaimed from the root module or when 'mp' is itself the root module;
 * returns NULL on a UUID conflict.  See the comments in the body for the
 * handling of each of these situations.
 */
fmd_case_t *
fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
    uint_t state, const char *uuid, const char *code)
{
	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
	fmd_case_impl_t *eip;

	ASSERT(state < FMD_CASE_REPAIRED);

	(void) pthread_mutex_init(&cip->ci_lock, NULL);
	fmd_buf_hash_create(&cip->ci_bufs);

	fmd_module_hold(mp);
	cip->ci_mod = mp;
	cip->ci_xprt = xp;
	cip->ci_refs = 1;
	cip->ci_state = state;
	cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
	cip->ci_uuidlen = strlen(cip->ci_uuid);
	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;

	if (state > FMD_CASE_CLOSE_WAIT)
		cip->ci_flags |= FMD_CF_SOLVED;

	/*
	 * Insert the case into the global case hash. If the specified UUID is
	 * already present, check to see if it is an orphan: if so, reclaim it;
	 * otherwise if it is owned by a different module then return NULL.
	 */
	if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
		/* the new case was never inserted: destroy it as invisible */
		(void) pthread_mutex_lock(&cip->ci_lock);
		cip->ci_refs--; /* decrement to zero */
		fmd_case_destroy((fmd_case_t *)cip, B_FALSE);

		cip = eip; /* switch 'cip' to the existing case */
		(void) pthread_mutex_lock(&cip->ci_lock);

		/*
		 * If the ASRU cache is trying to recreate an orphan, then just
		 * return the existing case that we found without changing it.
		 */
		if (mp == fmd.d_rmod) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			/* drop the hold taken by fmd_case_hash_insert() */
			fmd_case_rele((fmd_case_t *)cip);
			return ((fmd_case_t *)cip);
		}

		/*
		 * If the existing case isn't an orphan or is being proxied,
		 * then we have a UUID conflict: return failure to the caller.
		 */
		if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele((fmd_case_t *)cip);
			return (NULL);
		}

		/*
		 * If the new module is reclaiming an orphaned case, remove
		 * the case from the root module, switch ci_mod, and then fall
		 * through to adding the case to the new owner module 'mp'.
		 */
		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);

		fmd_module_rele(cip->ci_mod);
		cip->ci_mod = mp;
		fmd_module_hold(mp);

		fmd_case_destroy_suspects(cip);
		cip->ci_state = state;

		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_rele((fmd_case_t *)cip);
	}

	ASSERT(fmd_module_locked(mp));
	fmd_list_append(&mp->mod_cases, cip);

	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

	return ((fmd_case_t *)cip);
}

/*
 * Free a case whose reference count has reached zero.  The caller must hold
 * ci_lock.  If 'visible' is non-zero the case is first removed from
 * fmd.d_cases.  The holds on the case's events and principal event and on its
 * module are released, and the suspect list, UUID, code, buffer hash and the
 * case structure itself are freed.
 */
void
fmd_case_destroy(fmd_case_t *cp, int visible)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit, *ncit;

	ASSERT(MUTEX_HELD(&cip->ci_lock));
	ASSERT(cip->ci_refs == 0);

	if (visible) {
		TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
		fmd_case_hash_delete(fmd.d_cases, cip);
	}

	for (cit = cip->ci_items; cit != NULL; cit = ncit) {
		ncit = cit->cit_next;
		fmd_event_rele(cit->cit_event);
		fmd_free(cit, sizeof (fmd_case_item_t));
	}

	fmd_case_destroy_suspects(cip);

	if (cip->ci_principal != NULL)
		fmd_event_rele(cip->ci_principal);

	fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
	fmd_free(cip->ci_code, cip->ci_codelen);
	(void) fmd_buf_hash_destroy(&cip->ci_bufs);

	fmd_module_rele(cip->ci_mod);
	fmd_free(cip, sizeof (fmd_case_impl_t));
}

/*
 * Place a hold on the case, acquiring and releasing ci_lock around the
 * update.  See fmd_case_hold_locked() for the deleting-case restriction.
 */
void
fmd_case_hold(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	(void) pthread_mutex_lock(&cip->ci_lock);
	fmd_case_hold_locked(cp);
	(void) pthread_mutex_unlock(&cip->ci_lock);
}

/*
 * Place a hold on a case whose ci_lock is already held by the caller.  It is
 * a fatal error (fmd_panic) to hold a case that is marked FMD_CF_DELETING.
 */
void
fmd_case_hold_locked(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	ASSERT(MUTEX_HELD(&cip->ci_lock));
	if (cip->ci_flags & FMD_CF_DELETING)
		fmd_panic("attempt to hold a deleting case %p (%s)\n",
		    (void *)cip, cip->ci_uuid);
	cip->ci_refs++;
	ASSERT(cip->ci_refs != 0);
}

/*
 * Non-panicking variant of fmd_case_hold(): returns the case with a new hold,
 * or NULL if the case is marked FMD_CF_DELETING.
 */
static fmd_case_impl_t *
fmd_case_tryhold(fmd_case_impl_t *cip)
{
	/*
	 * If the case's "deleting" bit is unset, hold and return case,
	 * otherwise, return NULL.
	 */
	(void) pthread_mutex_lock(&cip->ci_lock);
	if (cip->ci_flags & FMD_CF_DELETING) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		cip = NULL;
	} else {
		fmd_case_hold_locked((fmd_case_t *)cip);
		(void) pthread_mutex_unlock(&cip->ci_lock);
	}
	return (cip);
}

/*
 * Release a hold on the case.  When the last hold is released the case is
 * destroyed as a visible case; in that path ci_lock is still held when
 * fmd_case_destroy() is called and is not unlocked here.
 */
void
fmd_case_rele(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	(void) pthread_mutex_lock(&cip->ci_lock);
	ASSERT(cip->ci_refs != 0);

	if (--cip->ci_refs == 0)
		fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
	else
		(void) pthread_mutex_unlock(&cip->ci_lock);
}

/*
 * Make 'ep' the principal event of the case.  A hold is placed on 'ep' and
 * the hold on the previous principal, if any, is released.  The case is
 * marked dirty and 'ep' is transitioned to FMD_EVS_DIAGNOSED if the case has
 * FMD_CF_SOLVED set, otherwise to FMD_EVS_ACCEPTED.
 *
 * Returns non-zero if 'ep' was neither already on the case's item list nor
 * the previous principal event, and zero otherwise.
 */
int
fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit;
	fmd_event_t *oep;
	uint_t state;
	int new;

	fmd_event_hold(ep);
	(void) pthread_mutex_lock(&cip->ci_lock);

	if (cip->ci_flags & FMD_CF_SOLVED)
		state = FMD_EVS_DIAGNOSED;
	else
		state = FMD_EVS_ACCEPTED;

	oep = cip->ci_principal;
	cip->ci_principal = ep;

	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
		if (cit->cit_event == ep)
			break;
	}

	cip->ci_flags |= FMD_CF_DIRTY;
	new = cit == NULL && ep != oep;

	(void) pthread_mutex_unlock(&cip->ci_lock);

	fmd_module_setcdirty(cip->ci_mod);
	fmd_event_transition(ep, state);

	if (oep != NULL)
		fmd_event_rele(oep);

	return (new);
}

/*
 * Add 'ep' to the case's list of events.  The event is transitioned to
 * FMD_EVS_DIAGNOSED if the case has FMD_CF_SOLVED set, otherwise to
 * FMD_EVS_ACCEPTED.  The event is only saved (with a hold) and the case
 * marked dirty when it is not already on the list and the case is unsolved.
 *
 * Returns non-zero if 'ep' was neither already on the item list nor the
 * case's principal event, and zero otherwise.
 */
int
fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit;
	uint_t state;
	int new;

	(void) pthread_mutex_lock(&cip->ci_lock);

	if (cip->ci_flags & FMD_CF_SOLVED)
		state = FMD_EVS_DIAGNOSED;
	else
		state = FMD_EVS_ACCEPTED;

	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
		if (cit->cit_event == ep)
			break;
	}

	new = cit == NULL && ep != cip->ci_principal;

	/*
	 * If the event is already in the case or the case is already solved,
	 * there is no reason to save it: just transition it appropriately.
	 */
	if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_event_transition(ep, state);
		return (new);
	}

	cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
	fmd_event_hold(ep);

	cit->cit_next = cip->ci_items;
	cit->cit_event = ep;

	cip->ci_items = cit;
	cip->ci_nitems++;

	cip->ci_flags |= FMD_CF_DIRTY;
	(void) pthread_mutex_unlock(&cip->ci_lock);

	fmd_module_setcdirty(cip->ci_mod);
	fmd_event_transition(ep, state);

	return (new);
}

/*
 * Prepend the fault event 'nvl' to the suspect list of a case that has not
 * yet been solved (asserted), and mark the case and its module dirty.  The
 * nvlist pointer is stored as-is rather than copied; it is later freed by
 * fmd_case_destroy_suspects().
 */
void
fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);

	(void) pthread_mutex_lock(&cip->ci_lock);
	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
	cip->ci_flags |= FMD_CF_DIRTY;

	cis->cis_next = cip->ci_suspects;
	cis->cis_nvl = nvl;

	cip->ci_suspects = cis;
	cip->ci_nsuspects++;

	(void) pthread_mutex_unlock(&cip->ci_lock);
	fmd_module_setcdirty(cip->ci_mod);
}

/*
 * Prepend the fault event 'nvl' to the suspect list of a CLOSED case owned by
 * the root module (both asserted).  Unlike fmd_case_insert_suspect(), neither
 * the case nor the module is marked dirty.  If the suspect asks not to be
 * messaged, FMD_CF_INVISIBLE is set on the case.  The nvlist pointer is
 * stored as-is rather than copied.
 */
void
fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
	boolean_t b;

	(void) pthread_mutex_lock(&cip->ci_lock);
	ASSERT(cip->ci_state == FMD_CASE_CLOSED);
	ASSERT(cip->ci_mod == fmd.d_rmod);

	cis->cis_next = cip->ci_suspects;
	cis->cis_nvl = nvl;

	if (nvlist_lookup_boolean_value(nvl,
	    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
		cip->ci_flags |= FMD_CF_INVISIBLE;

	cip->ci_suspects = cis;
	cip->ci_nsuspects++;

	(void) pthread_mutex_unlock(&cip->ci_lock);
}

/*
 * Discard the entire suspect list of a case that has not yet been solved
 * (asserted), and mark the case and its module dirty.
 */
void
fmd_case_reset_suspects(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	(void) pthread_mutex_lock(&cip->ci_lock);
	ASSERT(cip->ci_state < FMD_CASE_SOLVED);

	fmd_case_destroy_suspects(cip);
	cip->ci_flags |= FMD_CF_DIRTY;

	(void) pthread_mutex_unlock(&cip->ci_lock);
	fmd_module_setcdirty(cip->ci_mod);
}

/*
 * Grab ci_lock and update the case state and set the dirty bit. Then perform
 * whatever actions and emit whatever events are appropriate for the state.
 * Refer to the topmost block comment explaining the state machine for details.
 *
 * 'flags' is OR-ed into ci_flags even when no state change takes place, but
 * FMD_CF_ISOLATED and FMD_CF_REPAIRED are dropped from 'flags' unless the
 * case is, or is being, flagged FMD_CF_SOLVED.  If the case is already at or
 * beyond 'state', nothing else is done.
 */
void
fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	fmd_case_susp_t *cis;
	fmd_case_item_t *cit;
	fmd_asru_t *asru;
	fmd_event_t *e;
	nvlist_t *nvl;

	ASSERT(state <= FMD_CASE_REPAIRED);
	(void) pthread_mutex_lock(&cip->ci_lock);

	if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED))
		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED);

	cip->ci_flags |= flags;

	if (cip->ci_state >= state) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return; /* already in specified state */
	}

	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));

	cip->ci_state = state;
	cip->ci_flags |= FMD_CF_DIRTY;

	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
		fmd_module_setcdirty(cip->ci_mod);

	switch (state) {
	case FMD_CASE_SOLVED:
		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);

		if (cip->ci_principal != NULL) {
			fmd_event_transition(cip->ci_principal,
			    FMD_EVS_DIAGNOSED);
		}
		break;

	case FMD_CASE_CLOSE_WAIT:
		/*
		 * If the case was never solved, do not change ASRUs.
		 * If the case was never fmd_case_closed, do not change ASRUs.
		 * If the case was repaired, do not change ASRUs.
		 */
		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
		    FMD_CF_REPAIRED)) != (FMD_CF_SOLVED | FMD_CF_ISOLATED))
			goto close_wait_finish;

		/*
		 * For each fault event in the suspect list, attempt to look up
		 * the corresponding ASRU in the ASRU dictionary. If the ASRU
		 * is found there and is marked faulty, we now mark it unusable
		 * and record the case meta-data and fault event with the ASRU.
		 */
		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
			if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU,
			    &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl(
			    fmd.d_asrus, nvl, FMD_B_FALSE)) != NULL) {
				(void) fmd_asru_setflags(asru,
				    FMD_ASRU_UNUSABLE, cp, cis->cis_nvl);
				fmd_asru_hash_release(fmd.d_asrus, asru);
			}
		}

	close_wait_finish:
		/*
		 * If an orphaned case transitions to CLOSE_WAIT, the owning
		 * module is no longer loaded: continue on to CASE_CLOSED.
		 */
		if (fmd_case_orphaned(cp))
			state = cip->ci_state = FMD_CASE_CLOSED;
		break;

	case FMD_CASE_REPAIRED:
		ASSERT(fmd_case_orphaned(cp));
		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);
		break;
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);

	/*
	 * If the module has initialized, then publish the appropriate event
	 * for the new case state. If not, we are being called from the
	 * checkpoint code during module load, in which case the module's
	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
	 * may not be open yet, which will prevent us from computing the event
	 * code. Defer the call to fmd_case_publish() by enqueuing a PUBLISH
	 * event in our queue: this won't be processed until _fmd_init is done.
	 */
	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
		fmd_case_publish(cp, state);
	else {
		fmd_case_hold(cp);
		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
	}

	/*
	 * If we transitioned to REPAIRED, adjust the reference count to
	 * reflect our removal from fmd.d_rmod->mod_cases. If the caller has
	 * not placed an additional hold on the case, it will now be freed.
	 */
	if (state == FMD_CASE_REPAIRED)
		fmd_case_rele(cp);
}

/*
 * Transition the specified case to *at least* the specified state by first
 * re-validating the suspect list using the resource cache. This function is
 * employed by the checkpoint code when restoring a saved, solved case to see
 * if the state of the case has effectively changed while fmd was not running
 * or the module was not loaded. If none of the suspects are present anymore,
 * advance the state to REPAIRED. If none are usable, advance to CLOSE_WAIT.
 */
void
fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_susp_t *cis;
	fmd_asru_t *asru;
	nvlist_t *nvl;

	int faulty = 0; /* are any suspects faulty? */
	int usable = 0; /* are any suspects usable? */

	ASSERT(state >= FMD_CASE_SOLVED);
	(void) pthread_mutex_lock(&cip->ci_lock);

	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
		if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU,
		    &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl(
		    fmd.d_asrus, nvl, FMD_B_TRUE)) != NULL) {

			if (asru->asru_flags & FMD_ASRU_FAULTY)
				faulty++;

			if (fmd_asru_fake_not_present == 0 &&
			    fmd_fmri_unusable(asru->asru_fmri) <= 0)
				usable++;

			fmd_asru_hash_release(fmd.d_asrus, asru);
		}
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);

	/*
	 * If none of the suspects were faulty, it implies they were either
	 * repaired already or not present and the rsrc.age time has expired.
	 * We can move the state on to repaired.
	 */
	if (!faulty) {
		/* request at least CLOSE_WAIT, flagged as repaired */
		state = MAX(state, FMD_CASE_CLOSE_WAIT);
		flags |= FMD_CF_REPAIRED;
	} else if (!usable) {
		/* request at least CLOSE_WAIT, flagged as isolated */
		state = MAX(state, FMD_CASE_CLOSE_WAIT);
		flags |= FMD_CF_ISOLATED;
	}

	fmd_case_transition(cp, state, flags);
}

/*
 * Set FMD_CF_DIRTY on the case under ci_lock, then mark the owning module's
 * case state dirty as well.
 */
void
fmd_case_setdirty(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	(void) pthread_mutex_lock(&cip->ci_lock);
	cip->ci_flags |= FMD_CF_DIRTY;
	(void) pthread_mutex_unlock(&cip->ci_lock);

	fmd_module_setcdirty(cip->ci_mod);
}

/*
 * Clear FMD_CF_DIRTY on the case under ci_lock.  The owning module is not
 * touched.
 */
void
fmd_case_clrdirty(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;

	(void) pthread_mutex_lock(&cip->ci_lock);
	cip->ci_flags &= ~FMD_CF_DIRTY;
	(void) pthread_mutex_unlock(&cip->ci_lock);
}

void
fmd_case_commit(fmd_case_t *cp)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit;

	(void) pthread_mutex_lock(&cip->ci_lock);

	if (cip->ci_flags & FMD_CF_DIRTY) {
		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
			fmd_event_commit(cit->cit_event);
1125 if (cip->ci_principal != NULL) 1126 fmd_event_commit(cip->ci_principal); 1127 1128 fmd_buf_hash_commit(&cip->ci_bufs); 1129 cip->ci_flags &= ~FMD_CF_DIRTY; 1130 } 1131 1132 (void) pthread_mutex_unlock(&cip->ci_lock); 1133 } 1134 1135 /* 1136 * Indicate that the case may need to change state because one or more of the 1137 * ASRUs named as a suspect has changed state. We examine all the suspects 1138 * and if none are still faulty, we initiate a case close transition. 1139 */ 1140 void 1141 fmd_case_update(fmd_case_t *cp) 1142 { 1143 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1144 fmd_case_susp_t *cis; 1145 fmd_asru_t *asru; 1146 nvlist_t *nvl; 1147 1148 int astate = 0; 1149 uint_t cstate; 1150 1151 (void) pthread_mutex_lock(&cip->ci_lock); 1152 cstate = cip->ci_state; 1153 1154 if ((cip->ci_flags & FMD_CF_REPAIRING) || 1155 cip->ci_xprt != NULL || cip->ci_state < FMD_CASE_SOLVED) { 1156 (void) pthread_mutex_unlock(&cip->ci_lock); 1157 return; /* update is not appropriate */ 1158 } 1159 1160 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 1161 if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU, 1162 &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl( 1163 fmd.d_asrus, nvl, FMD_B_FALSE)) != NULL) { 1164 astate |= (asru->asru_flags & FMD_ASRU_STATE); 1165 fmd_asru_hash_release(fmd.d_asrus, asru); 1166 } 1167 } 1168 1169 (void) pthread_mutex_unlock(&cip->ci_lock); 1170 1171 if (astate & FMD_ASRU_FAULTY) 1172 return; /* one or more suspects are still marked faulty */ 1173 1174 if (cstate == FMD_CASE_CLOSED) 1175 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1176 else 1177 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1178 } 1179 1180 /* 1181 * Delete a closed case from the module's case list once the fmdo_close() entry 1182 * point has run to completion. If the case is owned by a transport module, 1183 * tell the transport to proxy a case close on the other end of the transport. 
1184 * If not, transition to the appropriate next state based on ci_flags. This 1185 * function represents the end of CLOSE_WAIT and transitions the case to either 1186 * CLOSED or REPAIRED or discards it entirely because it was never solved; 1187 * refer to the topmost block comment explaining the state machine for details. 1188 */ 1189 void 1190 fmd_case_delete(fmd_case_t *cp) 1191 { 1192 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1193 fmd_modstat_t *msp; 1194 size_t buftotal; 1195 1196 ASSERT(fmd_module_locked(cip->ci_mod)); 1197 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1198 buftotal = fmd_buf_hash_destroy(&cip->ci_bufs); 1199 1200 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1201 msp = cip->ci_mod->mod_stats; 1202 1203 ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0); 1204 msp->ms_caseopen.fmds_value.ui64--; 1205 1206 ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal); 1207 msp->ms_buftotal.fmds_value.ui64 -= buftotal; 1208 1209 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1210 1211 if (cip->ci_xprt == NULL) 1212 fmd_module_setcdirty(cip->ci_mod); 1213 1214 fmd_module_rele(cip->ci_mod); 1215 cip->ci_mod = fmd.d_rmod; 1216 fmd_module_hold(cip->ci_mod); 1217 1218 /* 1219 * If the case is not proxied and it has been solved, then retain it 1220 * on the root module's case list at least until we're transitioned. 1221 * Otherwise free the case with our final fmd_case_rele() below. 1222 */ 1223 if (cip->ci_xprt == NULL && (cip->ci_flags & FMD_CF_SOLVED)) { 1224 fmd_module_lock(cip->ci_mod); 1225 fmd_list_append(&cip->ci_mod->mod_cases, cip); 1226 fmd_module_unlock(cip->ci_mod); 1227 fmd_case_hold(cp); 1228 } 1229 1230 /* 1231 * If a proxied case finishes CLOSE_WAIT, then it can be discarded 1232 * rather than orphaned because by definition it can have no entries 1233 * in the resource cache of the current fault manager. 
1234 */ 1235 if (cip->ci_xprt != NULL) 1236 fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid); 1237 else if (cip->ci_flags & FMD_CF_REPAIRED) 1238 fmd_case_transition(cp, FMD_CASE_REPAIRED, 0); 1239 else if (cip->ci_flags & FMD_CF_ISOLATED) 1240 fmd_case_transition(cp, FMD_CASE_CLOSED, 0); 1241 1242 fmd_case_rele(cp); 1243 } 1244 1245 void 1246 fmd_case_discard(fmd_case_t *cp) 1247 { 1248 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1249 1250 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1251 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--; 1252 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1253 1254 ASSERT(fmd_module_locked(cip->ci_mod)); 1255 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1256 fmd_case_rele(cp); 1257 } 1258 1259 static void 1260 fmd_case_repair_containee(fmd_asru_t *ee, void *er) 1261 { 1262 if ((ee->asru_flags & FMD_ASRU_FAULTY) && 1263 fmd_fmri_contains(er, ee->asru_fmri) > 0) 1264 (void) fmd_asru_clrflags(ee, FMD_ASRU_FAULTY, NULL, NULL); 1265 } 1266 1267 /* 1268 * Indicate that the problem corresponding to a case has been repaired by 1269 * clearing the faulty bit on each ASRU named as a suspect. If the case hasn't 1270 * already been closed, this function initiates the transition to CLOSE_WAIT. 1271 * The caller must have the case held from fmd_case_hash_lookup(), so we can 1272 * grab and drop ci_lock without the case being able to be freed in between. 
1273 */ 1274 int 1275 fmd_case_repair(fmd_case_t *cp) 1276 { 1277 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1278 fmd_case_susp_t *cis; 1279 nvlist_t *nvl; 1280 uint_t cstate; 1281 1282 fmd_asru_hash_t *ahp = fmd.d_asrus; 1283 fmd_asru_t **aa; 1284 uint_t i, an; 1285 1286 (void) pthread_mutex_lock(&cip->ci_lock); 1287 cstate = cip->ci_state; 1288 1289 if (cip->ci_xprt != NULL) { 1290 (void) pthread_mutex_unlock(&cip->ci_lock); 1291 return (fmd_set_errno(EFMD_CASE_OWNER)); 1292 } 1293 1294 if (cstate < FMD_CASE_SOLVED || (cip->ci_flags & FMD_CF_REPAIRING)) { 1295 (void) pthread_mutex_unlock(&cip->ci_lock); 1296 return (fmd_set_errno(EFMD_CASE_STATE)); 1297 } 1298 1299 /* 1300 * Take a snapshot of any ASRUs referenced by the case that are present 1301 * in the resource cache. Then drop ci_lock and clear the faulty bit 1302 * on each ASRU (we can't call fmd_asru_clrflags() with ci_lock held). 1303 */ 1304 an = cip->ci_nsuspects; 1305 aa = alloca(sizeof (fmd_asru_t *) * an); 1306 bzero(aa, sizeof (fmd_asru_t *) * an); 1307 1308 for (i = 0, cis = cip->ci_suspects; 1309 cis != NULL; cis = cis->cis_next, i++) { 1310 if (nvlist_lookup_nvlist(cis->cis_nvl, 1311 FM_FAULT_ASRU, &nvl) == 0) 1312 aa[i] = fmd_asru_hash_lookup_nvl(ahp, nvl, FMD_B_FALSE); 1313 } 1314 1315 cip->ci_flags |= FMD_CF_REPAIRING; 1316 (void) pthread_mutex_unlock(&cip->ci_lock); 1317 1318 /* 1319 * For each suspect ASRU, if the case associated with this ASRU matches 1320 * case 'cp', close all ASRUs contained by 'ap' and clear FAULTY. Note 1321 * that at present, we're assuming that when a given resource FMRI R1 1322 * contains another R2, that any faults are related by a common 1323 * diagnosis engine. This is true in our current architecture, but may 1324 * not always be true, at which point we'll need more cleverness here. 
1325 */ 1326 for (i = 0; i < an; i++) { 1327 if (aa[i] == NULL) 1328 continue; /* no asru was found */ 1329 1330 if (aa[i]->asru_case == cp) { 1331 fmd_asru_hash_apply(fmd.d_asrus, 1332 fmd_case_repair_containee, aa[i]->asru_fmri); 1333 (void) fmd_asru_clrflags(aa[i], 1334 FMD_ASRU_FAULTY, NULL, NULL); 1335 } 1336 1337 fmd_asru_hash_release(ahp, aa[i]); 1338 } 1339 1340 (void) pthread_mutex_lock(&cip->ci_lock); 1341 cip->ci_flags &= ~FMD_CF_REPAIRING; 1342 (void) pthread_mutex_unlock(&cip->ci_lock); 1343 1344 if (cstate == FMD_CASE_CLOSED) 1345 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1346 else 1347 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1348 1349 return (0); 1350 } 1351 1352 int 1353 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep) 1354 { 1355 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1356 fmd_case_item_t *cit; 1357 uint_t state; 1358 int rv = 0; 1359 1360 (void) pthread_mutex_lock(&cip->ci_lock); 1361 1362 if (cip->ci_state >= FMD_CASE_SOLVED) 1363 state = FMD_EVS_DIAGNOSED; 1364 else 1365 state = FMD_EVS_ACCEPTED; 1366 1367 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1368 if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0) 1369 break; 1370 } 1371 1372 if (rv == 0 && cip->ci_principal != NULL) 1373 rv = fmd_event_equal(ep, cip->ci_principal); 1374 1375 (void) pthread_mutex_unlock(&cip->ci_lock); 1376 1377 if (rv != 0) 1378 fmd_event_transition(ep, state); 1379 1380 return (rv); 1381 } 1382 1383 int 1384 fmd_case_orphaned(fmd_case_t *cp) 1385 { 1386 return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod); 1387 } 1388 1389 void 1390 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec) 1391 { 1392 ((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec; 1393 ((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec; 1394 ((fmd_case_impl_t *)cp)->ci_tv_valid = 1; 1395 } 1396