1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * FMD Case Subsystem 31 * 32 * Diagnosis engines are expected to group telemetry events related to the 33 * diagnosis of a particular problem on the system into a set of cases. The 34 * diagnosis engine may have any number of cases open at a given point in time. 35 * Some cases may eventually be *solved* by associating a suspect list of one 36 * or more problems with the case, at which point fmd publishes a list.suspect 37 * event for the case and it becomes visible to administrators and agents. 38 * 39 * Every case is named using a UUID, and is globally visible in the case hash. 40 * Cases are reference-counted, except for the reference from the case hash 41 * itself. Consumers of case references include modules, which store active 42 * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code. 43 * 44 * Cases obey the following state machine. In states UNSOLVED, SOLVED, and 45 * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine 46 * or transport) and the case is referenced by the mod_cases list. Once the 47 * case reaches the CLOSED or REPAIRED states, a case's module changes to refer 48 * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases. 49 * 50 * +------------+ 51 * +----------| UNSOLVED | 52 * | +------------+ 53 * 1 | 4 | 54 * | | 55 * +----v---+ /-2->+------v-----+ 3 +--------+ 56 * | SOLVED |< | CLOSE_WAIT |--------->| CLOSED | 57 * +--------+ \-5->+------------+ +--------+ 58 * | | 59 * 6 | | 7 60 * +------v-----+ | 61 * | REPAIRED |<-------------+ 62 * +------------+ 63 * 64 * The state machine changes are triggered by calls to fmd_case_transition() 65 * from various locations inside of fmd, as described below: 66 * 67 * [1] Called by: fmd_case_solve() 68 * Actions: FMD_CF_SOLVED flag is set in ci_flags 69 * conviction policy is applied to suspect list 70 * suspects convicted are marked faulty (F) in R$ 71 * list.suspect event logged and dispatched 72 * 73 * [2] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose() 74 * Actions: FMD_CF_ISOLATED flag is set in ci_flags 75 * suspects convicted (F) are marked unusable (U) in R$ 76 * diagnosis engine fmdo_close() entry point scheduled 77 * case transitions to CLOSED [3] upon exit from CLOSE_WAIT 78 * 79 * [3] Called by: fmd_case_delete() (after fmdo_close() entry point returns) 80 * Actions: list.isolated event dispatched 81 * case deleted from module's list of open cases 82 * 83 * [4] Called by: fmd_case_close(), fmd_case_uuclose() 84 * Actions: diagnosis engine fmdo_close() entry point scheduled 85 * case is subsequently discarded by fmd_case_delete() 86 * 87 * [5] Called by: fmd_case_repair(), fmd_case_update() 88 * Actions: FMD_CF_REPAIR flag is set in ci_flags 89 * diagnosis engine fmdo_close() entry point scheduled 90 * case transitions to REPAIRED [6] upon exit from CLOSE_WAIT 91 * 92 * [6] Called by: fmd_case_repair(), fmd_case_update() 93 * Actions: FMD_CF_REPAIR flag is set in ci_flags 94 * suspects convicted are marked non faulty (!F) in R$ 95 * list.repaired event dispatched 96 * 97 * [7] Called by: fmd_case_repair(), fmd_case_update() 98 * Actions: FMD_CF_REPAIR flag is set in ci_flags 99 * suspects convicted are marked non faulty (!F) in R$ 100 * list.repaired event dispatched 101 */ 102 103 #include <sys/fm/protocol.h> 104 #include <uuid/uuid.h> 105 #include <alloca.h> 106 107 #include <fmd_alloc.h> 108 #include <fmd_module.h> 109 #include <fmd_error.h> 110 #include <fmd_conf.h> 111 #include <fmd_case.h> 112 #include <fmd_string.h> 113 #include <fmd_subr.h> 114 #include <fmd_protocol.h> 115 #include <fmd_event.h> 116 #include <fmd_eventq.h> 117 #include <fmd_dispq.h> 118 #include <fmd_buf.h> 119 #include <fmd_log.h> 120 #include <fmd_asru.h> 121 #include <fmd_fmri.h> 122 #include <fmd_xprt.h> 123 124 #include <fmd.h> 125 126 static const char *const _fmd_case_snames[] = { 127 "UNSOLVED", /* FMD_CASE_UNSOLVED */ 128 "SOLVED", /* FMD_CASE_SOLVED */ 129 "CLOSE_WAIT", /* FMD_CASE_CLOSE_WAIT */ 130 "CLOSED", /* FMD_CASE_CLOSED */ 131 "REPAIRED" /* FMD_CASE_REPAIRED */ 132 }; 133 134 extern volatile uint32_t fmd_asru_fake_not_present; 135 136 fmd_case_hash_t * 137 fmd_case_hash_create(void) 138 { 139 fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP); 140 141 (void) pthread_rwlock_init(&chp->ch_lock, NULL); 142 chp->ch_hashlen = fmd.d_str_buckets; 143 chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP); 144 chp->ch_count = 0; 145 146 return (chp); 147 } 148 149 /* 150 * Destroy the case hash. Unlike most of our hash tables, no active references 151 * are kept by the case hash itself; all references come from other subsystems. 152 * The hash must be destroyed after all modules are unloaded; if anything was 153 * present in the hash it would be by definition a reference count leak. 154 */ 155 void 156 fmd_case_hash_destroy(fmd_case_hash_t *chp) 157 { 158 fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen); 159 fmd_free(chp, sizeof (fmd_case_hash_t)); 160 } 161 162 /* 163 * Take a snapshot of the case hash by placing an additional hold on each 164 * member in an auxiliary array, and then call 'func' for each case. 165 */ 166 void 167 fmd_case_hash_apply(fmd_case_hash_t *chp, 168 void (*func)(fmd_case_t *, void *), void *arg) 169 { 170 fmd_case_impl_t *cp, **cps, **cpp; 171 uint_t cpc, i; 172 173 (void) pthread_rwlock_rdlock(&chp->ch_lock); 174 175 cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP); 176 cpc = chp->ch_count; 177 178 for (i = 0; i < chp->ch_hashlen; i++) { 179 for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) { 180 fmd_case_hold((fmd_case_t *)cp); 181 *cpp++ = cp; 182 } 183 } 184 185 ASSERT(cpp == cps + cpc); 186 (void) pthread_rwlock_unlock(&chp->ch_lock); 187 188 for (i = 0; i < cpc; i++) { 189 func((fmd_case_t *)cps[i], arg); 190 fmd_case_rele((fmd_case_t *)cps[i]); 191 } 192 193 fmd_free(cps, cpc * sizeof (fmd_case_t *)); 194 } 195 196 /* 197 * Look up the diagcode for this case and cache it in ci_code. If no suspects 198 * were defined for this case or if the lookup fails, the event dictionary or 199 * module code is broken, and we set the event code to a precomputed default. 200 */ 201 static const char * 202 fmd_case_mkcode(fmd_case_t *cp) 203 { 204 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 205 fmd_case_susp_t *cis; 206 207 char **keys, **keyp; 208 const char *s; 209 210 ASSERT(MUTEX_HELD(&cip->ci_lock)); 211 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 212 213 fmd_free(cip->ci_code, cip->ci_codelen); 214 cip->ci_codelen = cip->ci_mod->mod_codelen; 215 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 216 keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1)); 217 218 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 219 if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0) 220 keyp++; 221 } 222 223 *keyp = NULL; /* mark end of keys[] array for libdiagcode */ 224 225 if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code( 226 cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) { 227 (void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s); 228 fmd_free(cip->ci_code, cip->ci_codelen); 229 cip->ci_codelen = strlen(s) + 1; 230 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 231 (void) strcpy(cip->ci_code, s); 232 } 233 234 return (cip->ci_code); 235 } 236 237 nvlist_t * 238 fmd_case_mkevent(fmd_case_t *cp, const char *class) 239 { 240 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 241 fmd_case_susp_t *cis; 242 243 fmd_asru_hash_t *ahp = fmd.d_asrus; 244 fmd_asru_t *asru; 245 246 nvlist_t **nva, **nvp, *nvl, *fmri; 247 uint8_t *ba, *bp; 248 249 int msg = B_TRUE; 250 boolean_t b; 251 252 (void) pthread_mutex_lock(&cip->ci_lock); 253 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 254 255 nva = nvp = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); 256 ba = bp = alloca(sizeof (uint8_t) * cip->ci_nsuspects); 257 258 /* 259 * For each suspect associated with the case, store its fault event 260 * nvlist in 'nva'. We also look to see if any of the suspect faults 261 * have asked not to be messaged. If any of them have made such a 262 * request, propagate that attribute to the composite list.* event. 263 * Finally, store each suspect's faulty status into the bitmap 'ba'. 264 */ 265 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 266 if (nvlist_lookup_boolean_value(cis->cis_nvl, 267 FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE) 268 msg = B_FALSE; 269 270 if (nvlist_lookup_nvlist(cis->cis_nvl, 271 FM_FAULT_ASRU, &fmri) == 0 && (asru = 272 fmd_asru_hash_lookup_nvl(ahp, fmri, FMD_B_FALSE)) != NULL) { 273 *bp = 0; 274 if (fmd_asru_fake_not_present || 275 !fmd_fmri_present(asru->asru_fmri)) 276 *bp |= FM_SUSPECT_NOT_PRESENT; 277 if (fmd_asru_fake_not_present || 278 fmd_fmri_unusable(asru->asru_fmri)) 279 *bp |= FM_SUSPECT_UNUSABLE; 280 if (asru->asru_flags & FMD_ASRU_FAULTY) 281 *bp |= FM_SUSPECT_FAULTY; 282 bp++; 283 fmd_asru_hash_release(ahp, asru); 284 } else 285 *bp++ = 0; 286 287 *nvp++ = cis->cis_nvl; 288 } 289 290 if (cip->ci_code == NULL) 291 (void) fmd_case_mkcode(cp); 292 293 if (msg == B_FALSE) 294 cip->ci_flags |= FMD_CF_INVISIBLE; 295 296 nvl = fmd_protocol_list(class, cip->ci_mod->mod_fmri, cip->ci_uuid, 297 cip->ci_code, cip->ci_nsuspects, nva, ba, msg, &cip->ci_tv); 298 299 (void) pthread_mutex_unlock(&cip->ci_lock); 300 return (nvl); 301 } 302 303 /* 304 * Convict suspects in a case by applying a conviction policy and updating the 305 * resource cache prior to emitting the list.suspect event for the given case. 306 * At present, our policy is very simple: convict every suspect in the case. 307 * In the future, this policy can be extended and made configurable to permit: 308 * 309 * - convicting the suspect with the highest FIT rate 310 * - convicting the suspect with the cheapest FRU 311 * - convicting the suspect with the FRU that is in a depot's inventory 312 * - convicting the suspect with the longest lifetime 313 * 314 * and so forth. A word to the wise: this problem is significantly harder that 315 * it seems at first glance. Future work should heed the following advice: 316 * 317 * Hacking the policy into C code here is a very bad idea. The policy needs to 318 * be decided upon very carefully and fundamentally encodes knowledge of what 319 * suspect list combinations can be emitted by what diagnosis engines. As such 320 * fmd's code is the wrong location, because that would require fmd itself to 321 * be updated for every diagnosis engine change, defeating the entire design. 322 * The FMA Event Registry knows the suspect list combinations: policy inputs 323 * can be derived from it and used to produce per-module policy configuration. 324 * 325 * If the policy needs to be dynamic and not statically fixed at either fmd 326 * startup or module load time, any implementation of dynamic policy retrieval 327 * must employ some kind of caching mechanism or be part of a built-in module. 328 * The fmd_case_convict() function is called with locks held inside of fmd and 329 * is not a place where unbounded blocking on some inter-process or inter- 330 * system communication to another service (e.g. another daemon) can occur. 331 */ 332 static void 333 fmd_case_convict(fmd_case_t *cp) 334 { 335 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 336 fmd_asru_hash_t *ahp = fmd.d_asrus; 337 338 fmd_case_susp_t *cis; 339 fmd_asru_t *asru; 340 nvlist_t *fmri; 341 342 (void) pthread_mutex_lock(&cip->ci_lock); 343 (void) fmd_case_mkcode(cp); 344 345 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 346 if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU, &fmri)) 347 continue; /* no ASRU provided by diagnosis engine */ 348 349 if ((asru = fmd_asru_hash_lookup_nvl(ahp, 350 fmri, FMD_B_TRUE)) == NULL) { 351 fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in " 352 "%s: %s\n", cip->ci_uuid, fmd_strerror(errno)); 353 continue; 354 } 355 356 (void) fmd_asru_clrflags(asru, 357 FMD_ASRU_UNUSABLE, cp, cis->cis_nvl); 358 (void) fmd_asru_setflags(asru, 359 FMD_ASRU_FAULTY, cp, cis->cis_nvl); 360 361 fmd_asru_hash_release(ahp, asru); 362 } 363 364 (void) pthread_mutex_unlock(&cip->ci_lock); 365 } 366 367 void 368 fmd_case_publish(fmd_case_t *cp, uint_t state) 369 { 370 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 371 fmd_event_t *e; 372 nvlist_t *nvl; 373 char *class; 374 375 if (state == FMD_CASE_CURRENT) 376 state = cip->ci_state; /* use current state */ 377 378 switch (state) { 379 case FMD_CASE_SOLVED: 380 (void) pthread_mutex_lock(&cip->ci_lock); 381 if (cip->ci_tv_valid == 0) { 382 fmd_time_gettimeofday(&cip->ci_tv); 383 cip->ci_tv_valid = 1; 384 } 385 (void) pthread_mutex_unlock(&cip->ci_lock); 386 fmd_case_convict(cp); 387 nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS); 388 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 389 390 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 391 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 392 fmd_log_append(fmd.d_fltlog, e, cp); 393 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 394 fmd_dispq_dispatch(fmd.d_disp, e, class); 395 396 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 397 cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++; 398 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 399 400 break; 401 402 case FMD_CASE_CLOSE_WAIT: 403 fmd_case_hold(cp); 404 e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp); 405 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 406 407 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 408 cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++; 409 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 410 411 break; 412 413 case FMD_CASE_CLOSED: 414 nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS); 415 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 416 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 417 fmd_dispq_dispatch(fmd.d_disp, e, class); 418 break; 419 420 case FMD_CASE_REPAIRED: 421 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 422 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 423 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 424 fmd_dispq_dispatch(fmd.d_disp, e, class); 425 break; 426 } 427 } 428 429 fmd_case_t * 430 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid) 431 { 432 fmd_case_impl_t *cip; 433 uint_t h; 434 435 (void) pthread_rwlock_rdlock(&chp->ch_lock); 436 h = fmd_strhash(uuid) % chp->ch_hashlen; 437 438 for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) { 439 if (strcmp(cip->ci_uuid, uuid) == 0) 440 break; 441 } 442 443 if (cip != NULL) 444 fmd_case_hold((fmd_case_t *)cip); 445 else 446 (void) fmd_set_errno(EFMD_CASE_INVAL); 447 448 (void) pthread_rwlock_unlock(&chp->ch_lock); 449 return ((fmd_case_t *)cip); 450 } 451 452 static fmd_case_impl_t * 453 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 454 { 455 fmd_case_impl_t *eip; 456 uint_t h; 457 458 (void) pthread_rwlock_wrlock(&chp->ch_lock); 459 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 460 461 for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) { 462 if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0) { 463 fmd_case_hold((fmd_case_t *)eip); 464 (void) pthread_rwlock_unlock(&chp->ch_lock); 465 return (eip); /* uuid already present */ 466 } 467 } 468 469 cip->ci_next = chp->ch_hash[h]; 470 chp->ch_hash[h] = cip; 471 472 chp->ch_count++; 473 ASSERT(chp->ch_count != 0); 474 475 (void) pthread_rwlock_unlock(&chp->ch_lock); 476 return (cip); 477 } 478 479 static void 480 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 481 { 482 fmd_case_impl_t *cp, **pp; 483 uint_t h; 484 485 (void) pthread_rwlock_wrlock(&chp->ch_lock); 486 487 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 488 pp = &chp->ch_hash[h]; 489 490 for (cp = *pp; cp != NULL; cp = cp->ci_next) { 491 if (cp != cip) 492 pp = &cp->ci_next; 493 else 494 break; 495 } 496 497 if (cp == NULL) { 498 fmd_panic("case %p (%s) not found on hash chain %u\n", 499 (void *)cip, cip->ci_uuid, h); 500 } 501 502 *pp = cp->ci_next; 503 cp->ci_next = NULL; 504 505 ASSERT(chp->ch_count != 0); 506 chp->ch_count--; 507 508 (void) pthread_rwlock_unlock(&chp->ch_lock); 509 } 510 511 fmd_case_t * 512 fmd_case_create(fmd_module_t *mp, void *data) 513 { 514 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 515 fmd_case_impl_t *eip = NULL; 516 uuid_t uuid; 517 518 (void) pthread_mutex_init(&cip->ci_lock, NULL); 519 fmd_buf_hash_create(&cip->ci_bufs); 520 521 fmd_module_hold(mp); 522 cip->ci_mod = mp; 523 cip->ci_refs = 1; 524 cip->ci_state = FMD_CASE_UNSOLVED; 525 cip->ci_flags = FMD_CF_DIRTY; 526 cip->ci_data = data; 527 528 /* 529 * Calling libuuid: get a clue. The library interfaces cleverly do not 530 * define any constant for the length of an unparse string, and do not 531 * permit the caller to specify a buffer length for safety. The spec 532 * says it will be 36 bytes, but we make it tunable just in case. 533 */ 534 (void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen); 535 cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP); 536 537 /* 538 * We expect this loop to execute only once, but code it defensively 539 * against the possibility of libuuid bugs. Keep generating uuids and 540 * attempting to do a hash insert until we get a unique one. 541 */ 542 do { 543 if (eip != NULL) 544 fmd_case_rele((fmd_case_t *)eip); 545 uuid_generate(uuid); 546 uuid_unparse(uuid, cip->ci_uuid); 547 } while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip); 548 549 ASSERT(fmd_module_locked(mp)); 550 fmd_list_append(&mp->mod_cases, cip); 551 fmd_module_setcdirty(mp); 552 553 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 554 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 555 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 556 557 return ((fmd_case_t *)cip); 558 } 559 560 static void 561 fmd_case_destroy_suspects(fmd_case_impl_t *cip) 562 { 563 fmd_case_susp_t *cis, *ncis; 564 565 ASSERT(MUTEX_HELD(&cip->ci_lock)); 566 567 for (cis = cip->ci_suspects; cis != NULL; cis = ncis) { 568 ncis = cis->cis_next; 569 nvlist_free(cis->cis_nvl); 570 fmd_free(cis, sizeof (fmd_case_susp_t)); 571 } 572 573 cip->ci_suspects = NULL; 574 cip->ci_nsuspects = 0; 575 } 576 577 fmd_case_t * 578 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp, 579 uint_t state, const char *uuid, const char *code) 580 { 581 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 582 fmd_case_impl_t *eip; 583 584 ASSERT(state < FMD_CASE_REPAIRED); 585 586 (void) pthread_mutex_init(&cip->ci_lock, NULL); 587 fmd_buf_hash_create(&cip->ci_bufs); 588 589 fmd_module_hold(mp); 590 cip->ci_mod = mp; 591 cip->ci_xprt = xp; 592 cip->ci_refs = 1; 593 cip->ci_state = state; 594 cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP); 595 cip->ci_uuidlen = strlen(cip->ci_uuid); 596 cip->ci_code = fmd_strdup(code, FMD_SLEEP); 597 cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0; 598 599 if (state > FMD_CASE_CLOSE_WAIT) 600 cip->ci_flags |= FMD_CF_SOLVED; 601 602 /* 603 * Insert the case into the global case hash. If the specified UUID is 604 * already present, check to see if it is an orphan: if so, reclaim it; 605 * otherwise if it is owned by a different module then return NULL. 606 */ 607 if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) { 608 (void) pthread_mutex_lock(&cip->ci_lock); 609 cip->ci_refs--; /* decrement to zero */ 610 fmd_case_destroy((fmd_case_t *)cip, B_FALSE); 611 612 cip = eip; /* switch 'cip' to the existing case */ 613 (void) pthread_mutex_lock(&cip->ci_lock); 614 615 /* 616 * If the ASRU cache is trying to recreate an orphan, then just 617 * return the existing case that we found without changing it. 618 */ 619 if (mp == fmd.d_rmod) { 620 (void) pthread_mutex_unlock(&cip->ci_lock); 621 fmd_case_rele((fmd_case_t *)cip); 622 return ((fmd_case_t *)cip); 623 } 624 625 /* 626 * If the existing case isn't an orphan or is being proxied, 627 * then we have a UUID conflict: return failure to the caller. 628 */ 629 if (cip->ci_mod != fmd.d_rmod || xp != NULL) { 630 (void) pthread_mutex_unlock(&cip->ci_lock); 631 fmd_case_rele((fmd_case_t *)cip); 632 return (NULL); 633 } 634 635 /* 636 * If the new module is reclaiming an orphaned case, remove 637 * the case from the root module, switch ci_mod, and then fall 638 * through to adding the case to the new owner module 'mp'. 639 */ 640 fmd_module_lock(cip->ci_mod); 641 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 642 fmd_module_unlock(cip->ci_mod); 643 644 fmd_module_rele(cip->ci_mod); 645 cip->ci_mod = mp; 646 fmd_module_hold(mp); 647 648 fmd_case_destroy_suspects(cip); 649 cip->ci_state = state; 650 651 (void) pthread_mutex_unlock(&cip->ci_lock); 652 fmd_case_rele((fmd_case_t *)cip); 653 } 654 655 ASSERT(fmd_module_locked(mp)); 656 fmd_list_append(&mp->mod_cases, cip); 657 658 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 659 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 660 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 661 662 return ((fmd_case_t *)cip); 663 } 664 665 void 666 fmd_case_destroy(fmd_case_t *cp, int visible) 667 { 668 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 669 fmd_case_item_t *cit, *ncit; 670 671 ASSERT(MUTEX_HELD(&cip->ci_lock)); 672 ASSERT(cip->ci_refs == 0); 673 674 if (visible) { 675 TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid)); 676 fmd_case_hash_delete(fmd.d_cases, cip); 677 } 678 679 for (cit = cip->ci_items; cit != NULL; cit = ncit) { 680 ncit = cit->cit_next; 681 fmd_event_rele(cit->cit_event); 682 fmd_free(cit, sizeof (fmd_case_item_t)); 683 } 684 685 fmd_case_destroy_suspects(cip); 686 687 if (cip->ci_principal != NULL) 688 fmd_event_rele(cip->ci_principal); 689 690 fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1); 691 fmd_free(cip->ci_code, cip->ci_codelen); 692 (void) fmd_buf_hash_destroy(&cip->ci_bufs); 693 694 fmd_module_rele(cip->ci_mod); 695 fmd_free(cip, sizeof (fmd_case_impl_t)); 696 } 697 698 void 699 fmd_case_hold(fmd_case_t *cp) 700 { 701 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 702 703 (void) pthread_mutex_lock(&cip->ci_lock); 704 cip->ci_refs++; 705 ASSERT(cip->ci_refs != 0); 706 (void) pthread_mutex_unlock(&cip->ci_lock); 707 } 708 709 void 710 fmd_case_hold_locked(fmd_case_t *cp) 711 { 712 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 713 714 ASSERT(MUTEX_HELD(&cip->ci_lock)); 715 cip->ci_refs++; 716 ASSERT(cip->ci_refs != 0); 717 } 718 719 void 720 fmd_case_rele(fmd_case_t *cp) 721 { 722 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 723 724 (void) pthread_mutex_lock(&cip->ci_lock); 725 ASSERT(cip->ci_refs != 0); 726 727 if (--cip->ci_refs == 0) 728 fmd_case_destroy((fmd_case_t *)cip, B_TRUE); 729 else 730 (void) pthread_mutex_unlock(&cip->ci_lock); 731 } 732 733 int 734 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep) 735 { 736 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 737 fmd_case_item_t *cit; 738 fmd_event_t *oep; 739 uint_t state; 740 int new; 741 742 fmd_event_hold(ep); 743 (void) pthread_mutex_lock(&cip->ci_lock); 744 745 if (cip->ci_flags & FMD_CF_SOLVED) 746 state = FMD_EVS_DIAGNOSED; 747 else 748 state = FMD_EVS_ACCEPTED; 749 750 oep = cip->ci_principal; 751 cip->ci_principal = ep; 752 753 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 754 if (cit->cit_event == ep) 755 break; 756 } 757 758 cip->ci_flags |= FMD_CF_DIRTY; 759 new = cit == NULL && ep != oep; 760 761 (void) pthread_mutex_unlock(&cip->ci_lock); 762 763 fmd_module_setcdirty(cip->ci_mod); 764 fmd_event_transition(ep, state); 765 766 if (oep != NULL) 767 fmd_event_rele(oep); 768 769 return (new); 770 } 771 772 int 773 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep) 774 { 775 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 776 fmd_case_item_t *cit; 777 uint_t state; 778 int new; 779 780 (void) pthread_mutex_lock(&cip->ci_lock); 781 782 if (cip->ci_flags & FMD_CF_SOLVED) 783 state = FMD_EVS_DIAGNOSED; 784 else 785 state = FMD_EVS_ACCEPTED; 786 787 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 788 if (cit->cit_event == ep) 789 break; 790 } 791 792 new = cit == NULL && ep != cip->ci_principal; 793 794 /* 795 * If the event is already in the case or the case is already solved, 796 * there is no reason to save it: just transition it appropriately. 797 */ 798 if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) { 799 (void) pthread_mutex_unlock(&cip->ci_lock); 800 fmd_event_transition(ep, state); 801 return (new); 802 } 803 804 cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP); 805 fmd_event_hold(ep); 806 807 cit->cit_next = cip->ci_items; 808 cit->cit_event = ep; 809 810 cip->ci_items = cit; 811 cip->ci_nitems++; 812 813 cip->ci_flags |= FMD_CF_DIRTY; 814 (void) pthread_mutex_unlock(&cip->ci_lock); 815 816 fmd_module_setcdirty(cip->ci_mod); 817 fmd_event_transition(ep, state); 818 819 return (new); 820 } 821 822 void 823 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl) 824 { 825 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 826 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 827 828 (void) pthread_mutex_lock(&cip->ci_lock); 829 ASSERT(cip->ci_state < FMD_CASE_SOLVED); 830 cip->ci_flags |= FMD_CF_DIRTY; 831 832 cis->cis_next = cip->ci_suspects; 833 cis->cis_nvl = nvl; 834 835 cip->ci_suspects = cis; 836 cip->ci_nsuspects++; 837 838 (void) pthread_mutex_unlock(&cip->ci_lock); 839 fmd_module_setcdirty(cip->ci_mod); 840 } 841 842 void 843 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl) 844 { 845 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 846 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 847 boolean_t b; 848 849 (void) pthread_mutex_lock(&cip->ci_lock); 850 ASSERT(cip->ci_state == FMD_CASE_CLOSED); 851 ASSERT(cip->ci_mod == fmd.d_rmod); 852 853 cis->cis_next = cip->ci_suspects; 854 cis->cis_nvl = nvl; 855 856 if (nvlist_lookup_boolean_value(nvl, 857 FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE) 858 cip->ci_flags |= FMD_CF_INVISIBLE; 859 860 cip->ci_suspects = cis; 861 cip->ci_nsuspects++; 862 863 (void) pthread_mutex_unlock(&cip->ci_lock); 864 } 865 866 void 867 fmd_case_reset_suspects(fmd_case_t *cp) 868 { 869 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 870 871 (void) pthread_mutex_lock(&cip->ci_lock); 872 ASSERT(cip->ci_state < FMD_CASE_SOLVED); 873 874 fmd_case_destroy_suspects(cip); 875 cip->ci_flags |= FMD_CF_DIRTY; 876 877 (void) pthread_mutex_unlock(&cip->ci_lock); 878 fmd_module_setcdirty(cip->ci_mod); 879 } 880 881 /* 882 * Grab ci_lock and update the case state and set the dirty bit. Then perform 883 * whatever actions and emit whatever events are appropriate for the state. 884 * Refer to the topmost block comment explaining the state machine for details. 885 */ 886 void 887 fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) 888 { 889 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 890 891 fmd_case_susp_t *cis; 892 fmd_case_item_t *cit; 893 fmd_asru_t *asru; 894 fmd_event_t *e; 895 nvlist_t *nvl; 896 897 ASSERT(state <= FMD_CASE_REPAIRED); 898 (void) pthread_mutex_lock(&cip->ci_lock); 899 900 if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED)) 901 flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED); 902 903 cip->ci_flags |= flags; 904 905 if (cip->ci_state >= state) { 906 (void) pthread_mutex_unlock(&cip->ci_lock); 907 return; /* already in specified state */ 908 } 909 910 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid, 911 _fmd_case_snames[cip->ci_state], _fmd_case_snames[state])); 912 913 cip->ci_state = state; 914 cip->ci_flags |= FMD_CF_DIRTY; 915 916 if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod) 917 fmd_module_setcdirty(cip->ci_mod); 918 919 switch (state) { 920 case FMD_CASE_SOLVED: 921 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 922 fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED); 923 924 if (cip->ci_principal != NULL) { 925 fmd_event_transition(cip->ci_principal, 926 FMD_EVS_DIAGNOSED); 927 } 928 break; 929 930 case FMD_CASE_CLOSE_WAIT: 931 /* 932 * If the case was never solved, do not change ASRUs. 933 * If the case was never fmd_case_closed, do not change ASRUs. 934 * If the case was repaired, do not change ASRUs. 935 */ 936 if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED | 937 FMD_CF_REPAIRED)) != (FMD_CF_SOLVED | FMD_CF_ISOLATED)) 938 goto close_wait_finish; 939 940 /* 941 * For each fault event in the suspect list, attempt to look up 942 * the corresponding ASRU in the ASRU dictionary. If the ASRU 943 * is found there and is marked faulty, we now mark it unusable 944 * and record the case meta-data and fault event with the ASRU. 945 */ 946 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 947 if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU, 948 &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl( 949 fmd.d_asrus, nvl, FMD_B_FALSE)) != NULL) { 950 (void) fmd_asru_setflags(asru, 951 FMD_ASRU_UNUSABLE, cp, cis->cis_nvl); 952 fmd_asru_hash_release(fmd.d_asrus, asru); 953 } 954 } 955 956 close_wait_finish: 957 /* 958 * If an orphaned case transitions to CLOSE_WAIT, the owning 959 * module is no longer loaded: continue on to CASE_CLOSED. 960 */ 961 if (fmd_case_orphaned(cp)) 962 state = cip->ci_state = FMD_CASE_CLOSED; 963 break; 964 965 case FMD_CASE_REPAIRED: 966 ASSERT(fmd_case_orphaned(cp)); 967 fmd_module_lock(cip->ci_mod); 968 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 969 fmd_module_unlock(cip->ci_mod); 970 break; 971 } 972 973 (void) pthread_mutex_unlock(&cip->ci_lock); 974 975 /* 976 * If the module has initialized, then publish the appropriate event 977 * for the new case state. If not, we are being called from the 978 * checkpoint code during module load, in which case the module's 979 * _fmd_init() routine hasn't finished yet, and our event dictionaries 980 * may not be open yet, which will prevent us from computing the event 981 * code. Defer the call to fmd_case_publish() by enqueuing a PUBLISH 982 * event in our queue: this won't be processed until _fmd_init is done. 983 */ 984 if (cip->ci_mod->mod_flags & FMD_MOD_INIT) 985 fmd_case_publish(cp, state); 986 else { 987 fmd_case_hold(cp); 988 e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp); 989 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 990 } 991 992 /* 993 * If we transitioned to REPAIRED, adjust the reference count to 994 * reflect our removal from fmd.d_rmod->mod_cases. If the caller has 995 * not placed an additional hold on the case, it will now be freed. 996 */ 997 if (state == FMD_CASE_REPAIRED) 998 fmd_case_rele(cp); 999 } 1000 1001 /* 1002 * Transition the specified case to *at least* the specified state by first 1003 * re-validating the suspect list using the resource cache. This function is 1004 * employed by the checkpoint code when restoring a saved, solved case to see 1005 * if the state of the case has effectively changed while fmd was not running 1006 * or the module was not loaded. If none of the suspects are present anymore, 1007 * advance the state to REPAIRED. If none are usable, advance to CLOSE_WAIT. 1008 */ 1009 void 1010 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags) 1011 { 1012 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1013 fmd_case_susp_t *cis; 1014 fmd_asru_t *asru; 1015 nvlist_t *nvl; 1016 1017 int faulty = 0; /* are any suspects faulty? */ 1018 int usable = 0; /* are any suspects usable? */ 1019 1020 ASSERT(state >= FMD_CASE_SOLVED); 1021 (void) pthread_mutex_lock(&cip->ci_lock); 1022 1023 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 1024 if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU, 1025 &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl( 1026 fmd.d_asrus, nvl, FMD_B_TRUE)) != NULL) { 1027 1028 if (asru->asru_flags & FMD_ASRU_FAULTY) 1029 faulty++; 1030 1031 if (fmd_asru_fake_not_present == 0 && 1032 fmd_fmri_unusable(asru->asru_fmri) <= 0) 1033 usable++; 1034 1035 fmd_asru_hash_release(fmd.d_asrus, asru); 1036 } 1037 } 1038 1039 (void) pthread_mutex_unlock(&cip->ci_lock); 1040 1041 /* 1042 * If none of the suspects were faulty, it implies they were either 1043 * repaired already or not present and the rsrc.age time has expired. 1044 * We can move the state on to repaired. 1045 */ 1046 if (!faulty) { 1047 state = MAX(state, FMD_CASE_CLOSE_WAIT); 1048 flags |= FMD_CF_REPAIRED; 1049 } else if (!usable) { 1050 state = MAX(state, FMD_CASE_CLOSE_WAIT); 1051 flags |= FMD_CF_ISOLATED; 1052 } 1053 1054 fmd_case_transition(cp, state, flags); 1055 } 1056 1057 void 1058 fmd_case_setdirty(fmd_case_t *cp) 1059 { 1060 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1061 1062 (void) pthread_mutex_lock(&cip->ci_lock); 1063 cip->ci_flags |= FMD_CF_DIRTY; 1064 (void) pthread_mutex_unlock(&cip->ci_lock); 1065 1066 fmd_module_setcdirty(cip->ci_mod); 1067 } 1068 1069 void 1070 fmd_case_clrdirty(fmd_case_t *cp) 1071 { 1072 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1073 1074 (void) pthread_mutex_lock(&cip->ci_lock); 1075 cip->ci_flags &= ~FMD_CF_DIRTY; 1076 (void) pthread_mutex_unlock(&cip->ci_lock); 1077 } 1078 1079 void 1080 fmd_case_commit(fmd_case_t *cp) 1081 { 1082 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1083 fmd_case_item_t *cit; 1084 1085 (void) pthread_mutex_lock(&cip->ci_lock); 1086 1087 if (cip->ci_flags & FMD_CF_DIRTY) { 1088 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 1089 fmd_event_commit(cit->cit_event); 1090 1091 if (cip->ci_principal != NULL) 1092 fmd_event_commit(cip->ci_principal); 1093 1094 fmd_buf_hash_commit(&cip->ci_bufs); 1095 cip->ci_flags &= ~FMD_CF_DIRTY; 1096 } 1097 1098 (void) pthread_mutex_unlock(&cip->ci_lock); 1099 } 1100 1101 /* 1102 * Indicate that the case may need to change state because one or more of the 1103 * ASRUs named as a suspect has changed state. We examine all the suspects 1104 * and if none are still faulty, we initiate a case close transition. 1105 */ 1106 void 1107 fmd_case_update(fmd_case_t *cp) 1108 { 1109 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1110 fmd_case_susp_t *cis; 1111 fmd_asru_t *asru; 1112 nvlist_t *nvl; 1113 1114 int astate = 0; 1115 uint_t cstate; 1116 1117 (void) pthread_mutex_lock(&cip->ci_lock); 1118 cstate = cip->ci_state; 1119 1120 if ((cip->ci_flags & FMD_CF_REPAIRING) || 1121 cip->ci_xprt != NULL || cip->ci_state < FMD_CASE_SOLVED) { 1122 (void) pthread_mutex_unlock(&cip->ci_lock); 1123 return; /* update is not appropriate */ 1124 } 1125 1126 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 1127 if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU, 1128 &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl( 1129 fmd.d_asrus, nvl, FMD_B_FALSE)) != NULL) { 1130 astate |= (asru->asru_flags & FMD_ASRU_STATE); 1131 fmd_asru_hash_release(fmd.d_asrus, asru); 1132 } 1133 } 1134 1135 (void) pthread_mutex_unlock(&cip->ci_lock); 1136 1137 if (astate & FMD_ASRU_FAULTY) 1138 return; /* one or more suspects are still marked faulty */ 1139 1140 if (cstate == FMD_CASE_CLOSED) 1141 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1142 else 1143 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1144 } 1145 1146 /* 1147 * Delete a closed case from the module's case list once the fmdo_close() entry 1148 * point has run to completion. If the case is owned by a transport module, 1149 * tell the transport to proxy a case close on the other end of the transport. 1150 * If not, transition to the appropriate next state based on ci_flags. This 1151 * function represents the end of CLOSE_WAIT and transitions the case to either 1152 * CLOSED or REPAIRED or discards it entirely because it was never solved; 1153 * refer to the topmost block comment explaining the state machine for details. 1154 */ 1155 void 1156 fmd_case_delete(fmd_case_t *cp) 1157 { 1158 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1159 fmd_modstat_t *msp; 1160 size_t buftotal; 1161 1162 ASSERT(fmd_module_locked(cip->ci_mod)); 1163 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1164 buftotal = fmd_buf_hash_destroy(&cip->ci_bufs); 1165 1166 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1167 msp = cip->ci_mod->mod_stats; 1168 1169 ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0); 1170 msp->ms_caseopen.fmds_value.ui64--; 1171 1172 ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal); 1173 msp->ms_buftotal.fmds_value.ui64 -= buftotal; 1174 1175 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1176 1177 if (cip->ci_xprt == NULL) 1178 fmd_module_setcdirty(cip->ci_mod); 1179 1180 fmd_module_rele(cip->ci_mod); 1181 cip->ci_mod = fmd.d_rmod; 1182 fmd_module_hold(cip->ci_mod); 1183 1184 /* 1185 * If the case is not proxied and it has been solved, then retain it 1186 * on the root module's case list at least until we're transitioned. 1187 * Otherwise free the case with our final fmd_case_rele() below. 1188 */ 1189 if (cip->ci_xprt == NULL && (cip->ci_flags & FMD_CF_SOLVED)) { 1190 fmd_module_lock(cip->ci_mod); 1191 fmd_list_append(&cip->ci_mod->mod_cases, cip); 1192 fmd_module_unlock(cip->ci_mod); 1193 fmd_case_hold(cp); 1194 } 1195 1196 /* 1197 * If a proxied case finishes CLOSE_WAIT, then it can be discarded 1198 * rather than orphaned because by definition it can have no entries 1199 * in the resource cache of the current fault manager. 1200 */ 1201 if (cip->ci_xprt != NULL) 1202 fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid); 1203 else if (cip->ci_flags & FMD_CF_REPAIRED) 1204 fmd_case_transition(cp, FMD_CASE_REPAIRED, 0); 1205 else if (cip->ci_flags & FMD_CF_ISOLATED) 1206 fmd_case_transition(cp, FMD_CASE_CLOSED, 0); 1207 1208 fmd_case_rele(cp); 1209 } 1210 1211 void 1212 fmd_case_discard(fmd_case_t *cp) 1213 { 1214 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1215 1216 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1217 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--; 1218 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1219 1220 ASSERT(fmd_module_locked(cip->ci_mod)); 1221 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1222 fmd_case_rele(cp); 1223 } 1224 1225 static void 1226 fmd_case_repair_containee(fmd_asru_t *ee, void *er) 1227 { 1228 if ((ee->asru_flags & FMD_ASRU_FAULTY) && 1229 fmd_fmri_contains(er, ee->asru_fmri) > 0) 1230 (void) fmd_asru_clrflags(ee, FMD_ASRU_FAULTY, NULL, NULL); 1231 } 1232 1233 /* 1234 * Indicate that the problem corresponding to a case has been repaired by 1235 * clearing the faulty bit on each ASRU named as a suspect. If the case hasn't 1236 * already been closed, this function initiates the transition to CLOSE_WAIT. 1237 * The caller must have the case held from fmd_case_hash_lookup(), so we can 1238 * grab and drop ci_lock without the case being able to be freed in between. 1239 */ 1240 int 1241 fmd_case_repair(fmd_case_t *cp) 1242 { 1243 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1244 fmd_case_susp_t *cis; 1245 nvlist_t *nvl; 1246 uint_t cstate; 1247 1248 fmd_asru_hash_t *ahp = fmd.d_asrus; 1249 fmd_asru_t **aa; 1250 uint_t i, an; 1251 1252 (void) pthread_mutex_lock(&cip->ci_lock); 1253 cstate = cip->ci_state; 1254 1255 if (cip->ci_xprt != NULL) { 1256 (void) pthread_mutex_unlock(&cip->ci_lock); 1257 return (fmd_set_errno(EFMD_CASE_OWNER)); 1258 } 1259 1260 if (cstate < FMD_CASE_SOLVED || (cip->ci_flags & FMD_CF_REPAIRING)) { 1261 (void) pthread_mutex_unlock(&cip->ci_lock); 1262 return (fmd_set_errno(EFMD_CASE_STATE)); 1263 } 1264 1265 /* 1266 * Take a snapshot of any ASRUs referenced by the case that are present 1267 * in the resource cache. Then drop ci_lock and clear the faulty bit 1268 * on each ASRU (we can't call fmd_asru_clrflags() with ci_lock held). 1269 */ 1270 an = cip->ci_nsuspects; 1271 aa = alloca(sizeof (fmd_asru_t *) * an); 1272 bzero(aa, sizeof (fmd_asru_t *) * an); 1273 1274 for (i = 0, cis = cip->ci_suspects; 1275 cis != NULL; cis = cis->cis_next, i++) { 1276 if (nvlist_lookup_nvlist(cis->cis_nvl, 1277 FM_FAULT_ASRU, &nvl) == 0) 1278 aa[i] = fmd_asru_hash_lookup_nvl(ahp, nvl, FMD_B_FALSE); 1279 } 1280 1281 cip->ci_flags |= FMD_CF_REPAIRING; 1282 (void) pthread_mutex_unlock(&cip->ci_lock); 1283 1284 /* 1285 * For each suspect ASRU, if the case associated with this ASRU matches 1286 * case 'cp', close all ASRUs contained by 'ap' and clear FAULTY. Note 1287 * that at present, we're assuming that when a given resource FMRI R1 1288 * contains another R2, that any faults are related by a common 1289 * diagnosis engine. This is true in our current architecture, but may 1290 * not always be true, at which point we'll need more cleverness here. 1291 */ 1292 for (i = 0; i < an; i++) { 1293 if (aa[i] == NULL) 1294 continue; /* no asru was found */ 1295 1296 if (aa[i]->asru_case == cp) { 1297 fmd_asru_hash_apply(fmd.d_asrus, 1298 fmd_case_repair_containee, aa[i]->asru_fmri); 1299 (void) fmd_asru_clrflags(aa[i], 1300 FMD_ASRU_FAULTY, NULL, NULL); 1301 } 1302 1303 fmd_asru_hash_release(ahp, aa[i]); 1304 } 1305 1306 (void) pthread_mutex_lock(&cip->ci_lock); 1307 cip->ci_flags &= ~FMD_CF_REPAIRING; 1308 (void) pthread_mutex_unlock(&cip->ci_lock); 1309 1310 if (cstate == FMD_CASE_CLOSED) 1311 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1312 else 1313 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1314 1315 return (0); 1316 } 1317 1318 int 1319 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep) 1320 { 1321 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1322 fmd_case_item_t *cit; 1323 uint_t state; 1324 int rv = 0; 1325 1326 (void) pthread_mutex_lock(&cip->ci_lock); 1327 1328 if (cip->ci_state >= FMD_CASE_SOLVED) 1329 state = FMD_EVS_DIAGNOSED; 1330 else 1331 state = FMD_EVS_ACCEPTED; 1332 1333 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1334 if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0) 1335 break; 1336 } 1337 1338 if (rv == 0 && cip->ci_principal != NULL) 1339 rv = fmd_event_equal(ep, cip->ci_principal); 1340 1341 (void) pthread_mutex_unlock(&cip->ci_lock); 1342 1343 if (rv != 0) 1344 fmd_event_transition(ep, state); 1345 1346 return (rv); 1347 } 1348 1349 int 1350 fmd_case_orphaned(fmd_case_t *cp) 1351 { 1352 return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod); 1353 } 1354 1355 void 1356 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec) 1357 { 1358 ((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec; 1359 ((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec; 1360 ((fmd_case_impl_t *)cp)->ci_tv_valid = 1; 1361 } 1362