/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * FMD Case Subsystem
 *
 * Diagnosis engines are expected to group telemetry events related to the
 * diagnosis of a particular problem on the system into a set of cases. The
 * diagnosis engine may have any number of cases open at a given point in time.
 * Some cases may eventually be *solved* by associating a suspect list of one
 * or more problems with the case, at which point fmd publishes a list.suspect
 * event for the case and it becomes visible to administrators and agents.
 *
 * Every case is named using a UUID, and is globally visible in the case hash.
 * Cases are reference-counted, except for the reference from the case hash
 * itself. Consumers of case references include modules, which store active
 * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
 *
 * Cases obey the following state machine. In states UNSOLVED, SOLVED, and
 * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
 * or transport) and the case is referenced by the mod_cases list. Once the
 * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
 * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
 *
 *                 +------------+
 *      +----------|  UNSOLVED  |
 *      |          +------------+
 *    1 |               4 |
 *      |                 |
 * +----v---+ /-2->+------v-----+    3     +--------+
 * | SOLVED |<     | CLOSE_WAIT |--------->| CLOSED |
 * +--------+ \-5->+------------+          +--------+
 *                        |                    |
 *                      6 |                    | 7
 *                 +------v-----+              |
 *                 |  REPAIRED  |<-------------+
 *                 +------------+
 *
 * The state machine changes are triggered by calls to fmd_case_transition()
 * from various locations inside of fmd, as described below:
 *
 * [1] Called by: fmd_case_solve()
 *     Actions:   FMD_CF_SOLVED flag is set in ci_flags
 *                conviction policy is applied to suspect list
 *                suspects convicted are marked faulty (F) in R$
 *                list.suspect event logged and dispatched
 *
 * [2] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
 *     Actions:   FMD_CF_ISOLATED flag is set in ci_flags
 *                suspects convicted (F) are marked unusable (U) in R$
 *                diagnosis engine fmdo_close() entry point scheduled
 *                case transitions to CLOSED [3] upon exit from CLOSE_WAIT
 *
 * [3] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
 *     Actions:   list.isolated event dispatched
 *                case deleted from module's list of open cases
 *
 * [4] Called by: fmd_case_close(), fmd_case_uuclose()
 *     Actions:   diagnosis engine fmdo_close() entry point scheduled
 *                case is subsequently discarded by fmd_case_delete()
 *
 * [5] Called by: fmd_case_repair(), fmd_case_update()
 *     Actions:   FMD_CF_REPAIRED flag is set in ci_flags
 *                diagnosis engine fmdo_close() entry point scheduled
 *                case transitions to REPAIRED [6] upon exit from
CLOSE_WAIT 91 * 92 * [6] Called by: fmd_case_repair(), fmd_case_update() 93 * Actions: FMD_CF_REPAIR flag is set in ci_flags 94 * suspects convicted are marked non faulty (!F) in R$ 95 * list.repaired event dispatched 96 * 97 * [7] Called by: fmd_case_repair(), fmd_case_update() 98 * Actions: FMD_CF_REPAIR flag is set in ci_flags 99 * suspects convicted are marked non faulty (!F) in R$ 100 * list.repaired event dispatched 101 */ 102 103 #include <sys/fm/protocol.h> 104 #include <uuid/uuid.h> 105 #include <alloca.h> 106 107 #include <fmd_alloc.h> 108 #include <fmd_module.h> 109 #include <fmd_error.h> 110 #include <fmd_conf.h> 111 #include <fmd_case.h> 112 #include <fmd_string.h> 113 #include <fmd_subr.h> 114 #include <fmd_protocol.h> 115 #include <fmd_event.h> 116 #include <fmd_eventq.h> 117 #include <fmd_dispq.h> 118 #include <fmd_buf.h> 119 #include <fmd_log.h> 120 #include <fmd_asru.h> 121 #include <fmd_fmri.h> 122 #include <fmd_xprt.h> 123 124 #include <fmd.h> 125 126 static const char *const _fmd_case_snames[] = { 127 "UNSOLVED", /* FMD_CASE_UNSOLVED */ 128 "SOLVED", /* FMD_CASE_SOLVED */ 129 "CLOSE_WAIT", /* FMD_CASE_CLOSE_WAIT */ 130 "CLOSED", /* FMD_CASE_CLOSED */ 131 "REPAIRED" /* FMD_CASE_REPAIRED */ 132 }; 133 134 extern volatile uint32_t fmd_asru_fake_not_present; 135 136 static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *); 137 138 fmd_case_hash_t * 139 fmd_case_hash_create(void) 140 { 141 fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP); 142 143 (void) pthread_rwlock_init(&chp->ch_lock, NULL); 144 chp->ch_hashlen = fmd.d_str_buckets; 145 chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP); 146 chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, 147 FMD_SLEEP); 148 chp->ch_count = 0; 149 150 return (chp); 151 } 152 153 /* 154 * Destroy the case hash. Unlike most of our hash tables, no active references 155 * are kept by the case hash itself; all references come from other subsystems. 
156 * The hash must be destroyed after all modules are unloaded; if anything was 157 * present in the hash it would be by definition a reference count leak. 158 */ 159 void 160 fmd_case_hash_destroy(fmd_case_hash_t *chp) 161 { 162 fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen); 163 fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen); 164 fmd_free(chp, sizeof (fmd_case_hash_t)); 165 } 166 167 /* 168 * Take a snapshot of the case hash by placing an additional hold on each 169 * member in an auxiliary array, and then call 'func' for each case. 170 */ 171 void 172 fmd_case_hash_apply(fmd_case_hash_t *chp, 173 void (*func)(fmd_case_t *, void *), void *arg) 174 { 175 fmd_case_impl_t *cp, **cps, **cpp; 176 uint_t cpc, i; 177 178 (void) pthread_rwlock_rdlock(&chp->ch_lock); 179 180 cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP); 181 cpc = chp->ch_count; 182 183 for (i = 0; i < chp->ch_hashlen; i++) { 184 for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) { 185 fmd_case_hold((fmd_case_t *)cp); 186 *cpp++ = cp; 187 } 188 } 189 190 ASSERT(cpp == cps + cpc); 191 (void) pthread_rwlock_unlock(&chp->ch_lock); 192 193 for (i = 0; i < cpc; i++) { 194 func((fmd_case_t *)cps[i], arg); 195 fmd_case_rele((fmd_case_t *)cps[i]); 196 } 197 198 fmd_free(cps, cpc * sizeof (fmd_case_t *)); 199 } 200 201 static void 202 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 203 { 204 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 205 206 cip->ci_code_next = chp->ch_code_hash[h]; 207 chp->ch_code_hash[h] = cip; 208 } 209 210 static void 211 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 212 { 213 fmd_case_impl_t **pp, *cp; 214 215 if (cip->ci_code) { 216 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 217 218 pp = &chp->ch_code_hash[h]; 219 for (cp = *pp; cp != NULL; cp = cp->ci_code_next) { 220 if (cp != cip) 221 pp = &cp->ci_code_next; 222 else 223 break; 224 } 225 if (cp != NULL) { 
226 *pp = cp->ci_code_next; 227 cp->ci_code_next = NULL; 228 } 229 } 230 } 231 232 /* 233 * Look up the diagcode for this case and cache it in ci_code. If no suspects 234 * were defined for this case or if the lookup fails, the event dictionary or 235 * module code is broken, and we set the event code to a precomputed default. 236 */ 237 static const char * 238 fmd_case_mkcode(fmd_case_t *cp) 239 { 240 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 241 fmd_case_susp_t *cis; 242 fmd_case_hash_t *chp = fmd.d_cases; 243 244 char **keys, **keyp; 245 const char *s; 246 247 ASSERT(MUTEX_HELD(&cip->ci_lock)); 248 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 249 250 /* 251 * delete any existing entry from code hash if it is on it 252 */ 253 fmd_case_code_hash_delete(chp, cip); 254 255 fmd_free(cip->ci_code, cip->ci_codelen); 256 cip->ci_codelen = cip->ci_mod->mod_codelen; 257 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 258 keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1)); 259 260 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 261 if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0) 262 keyp++; 263 } 264 265 *keyp = NULL; /* mark end of keys[] array for libdiagcode */ 266 267 if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code( 268 cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) { 269 (void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s); 270 fmd_free(cip->ci_code, cip->ci_codelen); 271 cip->ci_codelen = strlen(s) + 1; 272 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 273 (void) strcpy(cip->ci_code, s); 274 } 275 276 /* 277 * add into hash of solved cases 278 */ 279 fmd_case_code_hash_insert(chp, cip); 280 281 return (cip->ci_code); 282 } 283 284 typedef struct { 285 int *fcl_countp; 286 uint8_t *fcl_ba; 287 nvlist_t **fcl_nva; 288 int *fcl_msgp; 289 } fmd_case_lst_t; 290 291 static void 292 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg) 293 { 294 fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg; 295 
boolean_t b; 296 int state; 297 298 if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE, 299 &b) == 0 && b == B_FALSE) 300 *entryp->fcl_msgp = B_FALSE; 301 entryp->fcl_ba[*entryp->fcl_countp] = 0; 302 state = fmd_asru_al_getstate(alp); 303 if (state & FMD_ASRU_UNUSABLE) 304 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE; 305 if (state & FMD_ASRU_FAULTY) 306 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY; 307 if (!(state & FMD_ASRU_PRESENT)) 308 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT; 309 entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event; 310 (*entryp->fcl_countp)++; 311 } 312 313 static void 314 fmd_case_faulty(fmd_asru_link_t *alp, void *arg) 315 { 316 int *faultyp = (int *)arg; 317 318 *faultyp |= (alp->al_flags & FMD_ASRU_FAULTY); 319 } 320 321 static void 322 fmd_case_usable(fmd_asru_link_t *alp, void *arg) 323 { 324 int *usablep = (int *)arg; 325 326 *usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE); 327 } 328 329 nvlist_t * 330 fmd_case_mkevent(fmd_case_t *cp, const char *class) 331 { 332 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 333 nvlist_t **nva, *nvl; 334 uint8_t *ba; 335 int msg = B_TRUE; 336 fmd_case_lst_t fcl; 337 int count = 0; 338 339 (void) pthread_mutex_lock(&cip->ci_lock); 340 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 341 342 nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); 343 ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects); 344 345 /* 346 * For each suspect associated with the case, store its fault event 347 * nvlist in 'nva'. We also look to see if any of the suspect faults 348 * have asked not to be messaged. If any of them have made such a 349 * request, propagate that attribute to the composite list.* event. 350 * Finally, store each suspect's faulty status into the bitmap 'ba'. 
351 */ 352 fcl.fcl_countp = &count; 353 fcl.fcl_msgp = &msg; 354 fcl.fcl_ba = ba; 355 fcl.fcl_nva = nva; 356 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl); 357 358 if (cip->ci_code == NULL) 359 (void) fmd_case_mkcode(cp); 360 361 if (msg == B_FALSE) 362 cip->ci_flags |= FMD_CF_INVISIBLE; 363 364 nvl = fmd_protocol_list(class, cip->ci_mod->mod_fmri, cip->ci_uuid, 365 cip->ci_code, cip->ci_nsuspects, nva, ba, msg, &cip->ci_tv); 366 367 (void) pthread_mutex_unlock(&cip->ci_lock); 368 return (nvl); 369 } 370 371 static boolean_t 372 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem) 373 { 374 nvlist_t *new_rsrc; 375 nvlist_t *rsrc; 376 char *new_name = NULL; 377 char *name = NULL; 378 ssize_t new_namelen; 379 ssize_t namelen; 380 int fmri_present = 1; 381 int new_fmri_present = 1; 382 int match = B_FALSE; 383 384 if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0) 385 fmri_present = 0; 386 else { 387 if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1) 388 goto done; 389 name = fmd_alloc(namelen + 1, FMD_SLEEP); 390 if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1) 391 goto done; 392 } 393 if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0) 394 new_fmri_present = 0; 395 else { 396 if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1) 397 goto done; 398 new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP); 399 if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1) 400 goto done; 401 } 402 match = (fmri_present == new_fmri_present && 403 (fmri_present == 0 || strcmp(name, new_name) == 0)); 404 done: 405 if (name != NULL) 406 fmd_free(name, namelen + 1); 407 if (new_name != NULL) 408 fmd_free(new_name, new_namelen + 1); 409 return (match); 410 } 411 412 static int 413 fmd_case_match_suspect(fmd_case_susp_t *cis, fmd_case_susp_t *xcis) 414 { 415 char *class, *new_class; 416 417 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_ASRU)) 418 return (0); 419 if (!fmd_case_compare_elem(cis->cis_nvl, 
xcis->cis_nvl, 420 FM_FAULT_RESOURCE)) 421 return (0); 422 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_FRU)) 423 return (0); 424 (void) nvlist_lookup_string(xcis->cis_nvl, FM_CLASS, &class); 425 (void) nvlist_lookup_string(cis->cis_nvl, FM_CLASS, &new_class); 426 return (strcmp(class, new_class) == 0); 427 } 428 429 /* 430 * see if an identical suspect list already exists in the cache 431 */ 432 static int 433 fmd_case_check_for_dups(fmd_case_t *cp) 434 { 435 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp, *xcip; 436 fmd_case_hash_t *chp = fmd.d_cases; 437 fmd_case_susp_t *xcis, *cis; 438 int match = 0, match_susp; 439 uint_t h; 440 441 (void) pthread_rwlock_rdlock(&chp->ch_lock); 442 443 /* 444 * Find all cases with this code 445 */ 446 h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 447 for (xcip = chp->ch_code_hash[h]; xcip != NULL; 448 xcip = xcip->ci_code_next) { 449 /* 450 * only look for any cases (apart from this one) 451 * whose code and number of suspects match 452 */ 453 if (xcip == cip || strcmp(xcip->ci_code, cip->ci_code) != 0 || 454 xcip->ci_nsuspects != cip->ci_nsuspects) 455 continue; 456 457 /* 458 * For each suspect in one list, check if there 459 * is an identical suspect in the other list 460 */ 461 match = 1; 462 fmd_case_hold((fmd_case_t *)xcip); 463 for (xcis = xcip->ci_suspects; xcis != NULL; 464 xcis = xcis->cis_next) { 465 match_susp = 0; 466 for (cis = cip->ci_suspects; cis != NULL; 467 cis = cis->cis_next) { 468 if (fmd_case_match_suspect(cis, xcis) == 1) { 469 match_susp = 1; 470 break; 471 } 472 } 473 if (match_susp == 0) { 474 match = 0; 475 break; 476 } 477 } 478 fmd_case_rele((fmd_case_t *)xcip); 479 if (match) { 480 (void) pthread_rwlock_unlock(&chp->ch_lock); 481 return (1); 482 } 483 } 484 (void) pthread_rwlock_unlock(&chp->ch_lock); 485 return (0); 486 } 487 488 /* 489 * Convict suspects in a case by applying a conviction policy and updating the 490 * resource cache prior to emitting the list.suspect event 
* for the given case.
 * At present, our policy is very simple: convict every suspect in the case.
 * In the future, this policy can be extended and made configurable to permit:
 *
 * - convicting the suspect with the highest FIT rate
 * - convicting the suspect with the cheapest FRU
 * - convicting the suspect with the FRU that is in a depot's inventory
 * - convicting the suspect with the longest lifetime
 *
 * and so forth. A word to the wise: this problem is significantly harder than
 * it seems at first glance. Future work should heed the following advice:
 *
 * Hacking the policy into C code here is a very bad idea. The policy needs to
 * be decided upon very carefully and fundamentally encodes knowledge of what
 * suspect list combinations can be emitted by what diagnosis engines. As such
 * fmd's code is the wrong location, because that would require fmd itself to
 * be updated for every diagnosis engine change, defeating the entire design.
 * The FMA Event Registry knows the suspect list combinations: policy inputs
 * can be derived from it and used to produce per-module policy configuration.
 *
 * If the policy needs to be dynamic and not statically fixed at either fmd
 * startup or module load time, any implementation of dynamic policy retrieval
 * must employ some kind of caching mechanism or be part of a built-in module.
 * The fmd_case_convict() function is called with locks held inside of fmd and
 * is not a place where unbounded blocking on some inter-process or inter-
 * system communication to another service (e.g. another daemon) can occur.
516 */ 517 static int 518 fmd_case_convict(fmd_case_t *cp) 519 { 520 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 521 fmd_asru_hash_t *ahp = fmd.d_asrus; 522 523 fmd_case_susp_t *cis; 524 fmd_asru_link_t *alp; 525 526 (void) pthread_mutex_lock(&cip->ci_lock); 527 (void) fmd_case_mkcode(cp); 528 if (fmd_case_check_for_dups(cp) == 1) { 529 (void) pthread_mutex_unlock(&cip->ci_lock); 530 return (1); 531 } 532 533 /* 534 * no suspect list already exists - allocate new cache entries 535 */ 536 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 537 if ((alp = fmd_asru_hash_create_entry(ahp, 538 cp, cis->cis_nvl)) == NULL) { 539 fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in " 540 "%s: %s\n", cip->ci_uuid, fmd_strerror(errno)); 541 continue; 542 } 543 (void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE); 544 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 545 } 546 547 (void) pthread_mutex_unlock(&cip->ci_lock); 548 return (0); 549 } 550 551 void 552 fmd_case_publish(fmd_case_t *cp, uint_t state) 553 { 554 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 555 fmd_event_t *e; 556 nvlist_t *nvl; 557 char *class; 558 559 if (state == FMD_CASE_CURRENT) 560 state = cip->ci_state; /* use current state */ 561 562 switch (state) { 563 case FMD_CASE_SOLVED: 564 (void) pthread_mutex_lock(&cip->ci_lock); 565 if (cip->ci_tv_valid == 0) { 566 fmd_time_gettimeofday(&cip->ci_tv); 567 cip->ci_tv_valid = 1; 568 } 569 (void) pthread_mutex_unlock(&cip->ci_lock); 570 571 if (fmd_case_convict(cp) == 1) { /* dupclose */ 572 cip->ci_flags &= ~FMD_CF_SOLVED; 573 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0); 574 break; 575 } 576 nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS); 577 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 578 579 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 580 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 581 fmd_log_append(fmd.d_fltlog, e, cp); 582 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 583 fmd_dispq_dispatch(fmd.d_disp, 
e, class); 584 585 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 586 cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++; 587 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 588 589 break; 590 591 case FMD_CASE_CLOSE_WAIT: 592 fmd_case_hold(cp); 593 e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp); 594 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 595 596 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 597 cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++; 598 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 599 600 break; 601 602 case FMD_CASE_CLOSED: 603 nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS); 604 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 605 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 606 fmd_dispq_dispatch(fmd.d_disp, e, class); 607 break; 608 609 case FMD_CASE_REPAIRED: 610 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 611 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 612 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 613 fmd_dispq_dispatch(fmd.d_disp, e, class); 614 break; 615 } 616 } 617 618 fmd_case_t * 619 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid) 620 { 621 fmd_case_impl_t *cip; 622 uint_t h; 623 624 (void) pthread_rwlock_rdlock(&chp->ch_lock); 625 h = fmd_strhash(uuid) % chp->ch_hashlen; 626 627 for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) { 628 if (strcmp(cip->ci_uuid, uuid) == 0) 629 break; 630 } 631 632 /* 633 * If deleting bit is set, treat the case as if it doesn't exist. 
634 */ 635 if (cip != NULL) 636 cip = fmd_case_tryhold(cip); 637 638 if (cip == NULL) 639 (void) fmd_set_errno(EFMD_CASE_INVAL); 640 641 (void) pthread_rwlock_unlock(&chp->ch_lock); 642 return ((fmd_case_t *)cip); 643 } 644 645 static fmd_case_impl_t * 646 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 647 { 648 fmd_case_impl_t *eip; 649 uint_t h; 650 651 (void) pthread_rwlock_wrlock(&chp->ch_lock); 652 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 653 654 for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) { 655 if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 && 656 fmd_case_tryhold(eip) != NULL) { 657 (void) pthread_rwlock_unlock(&chp->ch_lock); 658 return (eip); /* uuid already present */ 659 } 660 } 661 662 cip->ci_next = chp->ch_hash[h]; 663 chp->ch_hash[h] = cip; 664 665 chp->ch_count++; 666 ASSERT(chp->ch_count != 0); 667 668 (void) pthread_rwlock_unlock(&chp->ch_lock); 669 return (cip); 670 } 671 672 static void 673 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 674 { 675 fmd_case_impl_t *cp, **pp; 676 uint_t h; 677 678 ASSERT(MUTEX_HELD(&cip->ci_lock)); 679 680 cip->ci_flags |= FMD_CF_DELETING; 681 (void) pthread_mutex_unlock(&cip->ci_lock); 682 683 (void) pthread_rwlock_wrlock(&chp->ch_lock); 684 685 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 686 pp = &chp->ch_hash[h]; 687 688 for (cp = *pp; cp != NULL; cp = cp->ci_next) { 689 if (cp != cip) 690 pp = &cp->ci_next; 691 else 692 break; 693 } 694 695 if (cp == NULL) { 696 fmd_panic("case %p (%s) not found on hash chain %u\n", 697 (void *)cip, cip->ci_uuid, h); 698 } 699 700 *pp = cp->ci_next; 701 cp->ci_next = NULL; 702 703 /* 704 * delete from code hash if it is on it 705 */ 706 fmd_case_code_hash_delete(chp, cip); 707 708 ASSERT(chp->ch_count != 0); 709 chp->ch_count--; 710 711 (void) pthread_rwlock_unlock(&chp->ch_lock); 712 713 (void) pthread_mutex_lock(&cip->ci_lock); 714 ASSERT(cip->ci_flags & FMD_CF_DELETING); 715 } 716 717 fmd_case_t * 718 
fmd_case_create(fmd_module_t *mp, void *data) 719 { 720 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 721 fmd_case_impl_t *eip = NULL; 722 uuid_t uuid; 723 724 (void) pthread_mutex_init(&cip->ci_lock, NULL); 725 fmd_buf_hash_create(&cip->ci_bufs); 726 727 fmd_module_hold(mp); 728 cip->ci_mod = mp; 729 cip->ci_refs = 1; 730 cip->ci_state = FMD_CASE_UNSOLVED; 731 cip->ci_flags = FMD_CF_DIRTY; 732 cip->ci_data = data; 733 734 /* 735 * Calling libuuid: get a clue. The library interfaces cleverly do not 736 * define any constant for the length of an unparse string, and do not 737 * permit the caller to specify a buffer length for safety. The spec 738 * says it will be 36 bytes, but we make it tunable just in case. 739 */ 740 (void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen); 741 cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP); 742 743 /* 744 * We expect this loop to execute only once, but code it defensively 745 * against the possibility of libuuid bugs. Keep generating uuids and 746 * attempting to do a hash insert until we get a unique one. 
747 */ 748 do { 749 if (eip != NULL) 750 fmd_case_rele((fmd_case_t *)eip); 751 uuid_generate(uuid); 752 uuid_unparse(uuid, cip->ci_uuid); 753 } while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip); 754 755 ASSERT(fmd_module_locked(mp)); 756 fmd_list_append(&mp->mod_cases, cip); 757 fmd_module_setcdirty(mp); 758 759 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 760 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 761 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 762 763 return ((fmd_case_t *)cip); 764 } 765 766 static void 767 fmd_case_destroy_suspects(fmd_case_impl_t *cip) 768 { 769 fmd_case_susp_t *cis, *ncis; 770 771 ASSERT(MUTEX_HELD(&cip->ci_lock)); 772 773 for (cis = cip->ci_suspects; cis != NULL; cis = ncis) { 774 ncis = cis->cis_next; 775 nvlist_free(cis->cis_nvl); 776 fmd_free(cis, sizeof (fmd_case_susp_t)); 777 } 778 779 cip->ci_suspects = NULL; 780 cip->ci_nsuspects = 0; 781 } 782 783 fmd_case_t * 784 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp, 785 uint_t state, const char *uuid, const char *code) 786 { 787 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 788 fmd_case_impl_t *eip; 789 790 ASSERT(state < FMD_CASE_REPAIRED); 791 792 (void) pthread_mutex_init(&cip->ci_lock, NULL); 793 fmd_buf_hash_create(&cip->ci_bufs); 794 795 fmd_module_hold(mp); 796 cip->ci_mod = mp; 797 cip->ci_xprt = xp; 798 cip->ci_refs = 1; 799 cip->ci_state = state; 800 cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP); 801 cip->ci_uuidlen = strlen(cip->ci_uuid); 802 cip->ci_code = fmd_strdup(code, FMD_SLEEP); 803 cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0; 804 805 if (state > FMD_CASE_CLOSE_WAIT) 806 cip->ci_flags |= FMD_CF_SOLVED; 807 808 /* 809 * Insert the case into the global case hash. If the specified UUID is 810 * already present, check to see if it is an orphan: if so, reclaim it; 811 * otherwise if it is owned by a different module then return NULL. 
812 */ 813 if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) { 814 (void) pthread_mutex_lock(&cip->ci_lock); 815 cip->ci_refs--; /* decrement to zero */ 816 fmd_case_destroy((fmd_case_t *)cip, B_FALSE); 817 818 cip = eip; /* switch 'cip' to the existing case */ 819 (void) pthread_mutex_lock(&cip->ci_lock); 820 821 /* 822 * If the ASRU cache is trying to recreate an orphan, then just 823 * return the existing case that we found without changing it. 824 */ 825 if (mp == fmd.d_rmod) { 826 (void) pthread_mutex_unlock(&cip->ci_lock); 827 fmd_case_rele((fmd_case_t *)cip); 828 return ((fmd_case_t *)cip); 829 } 830 831 /* 832 * If the existing case isn't an orphan or is being proxied, 833 * then we have a UUID conflict: return failure to the caller. 834 */ 835 if (cip->ci_mod != fmd.d_rmod || xp != NULL) { 836 (void) pthread_mutex_unlock(&cip->ci_lock); 837 fmd_case_rele((fmd_case_t *)cip); 838 return (NULL); 839 } 840 841 /* 842 * If the new module is reclaiming an orphaned case, remove 843 * the case from the root module, switch ci_mod, and then fall 844 * through to adding the case to the new owner module 'mp'. 
845 */ 846 fmd_module_lock(cip->ci_mod); 847 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 848 fmd_module_unlock(cip->ci_mod); 849 850 fmd_module_rele(cip->ci_mod); 851 cip->ci_mod = mp; 852 fmd_module_hold(mp); 853 854 fmd_case_destroy_suspects(cip); 855 cip->ci_state = state; 856 857 (void) pthread_mutex_unlock(&cip->ci_lock); 858 fmd_case_rele((fmd_case_t *)cip); 859 } else { 860 /* 861 * add into hash of solved cases 862 */ 863 if (cip->ci_code) 864 fmd_case_code_hash_insert(fmd.d_cases, cip); 865 } 866 867 ASSERT(fmd_module_locked(mp)); 868 fmd_list_append(&mp->mod_cases, cip); 869 870 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 871 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 872 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 873 874 return ((fmd_case_t *)cip); 875 } 876 877 void 878 fmd_case_destroy(fmd_case_t *cp, int visible) 879 { 880 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 881 fmd_case_item_t *cit, *ncit; 882 883 ASSERT(MUTEX_HELD(&cip->ci_lock)); 884 ASSERT(cip->ci_refs == 0); 885 886 if (visible) { 887 TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid)); 888 fmd_case_hash_delete(fmd.d_cases, cip); 889 } 890 891 for (cit = cip->ci_items; cit != NULL; cit = ncit) { 892 ncit = cit->cit_next; 893 fmd_event_rele(cit->cit_event); 894 fmd_free(cit, sizeof (fmd_case_item_t)); 895 } 896 897 fmd_case_destroy_suspects(cip); 898 899 if (cip->ci_principal != NULL) 900 fmd_event_rele(cip->ci_principal); 901 902 fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1); 903 fmd_free(cip->ci_code, cip->ci_codelen); 904 (void) fmd_buf_hash_destroy(&cip->ci_bufs); 905 906 fmd_module_rele(cip->ci_mod); 907 fmd_free(cip, sizeof (fmd_case_impl_t)); 908 } 909 910 void 911 fmd_case_hold(fmd_case_t *cp) 912 { 913 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 914 915 (void) pthread_mutex_lock(&cip->ci_lock); 916 fmd_case_hold_locked(cp); 917 (void) pthread_mutex_unlock(&cip->ci_lock); 918 } 919 920 void 921 fmd_case_hold_locked(fmd_case_t *cp) 
922 { 923 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 924 925 ASSERT(MUTEX_HELD(&cip->ci_lock)); 926 if (cip->ci_flags & FMD_CF_DELETING) 927 fmd_panic("attempt to hold a deleting case %p (%s)\n", 928 (void *)cip, cip->ci_uuid); 929 cip->ci_refs++; 930 ASSERT(cip->ci_refs != 0); 931 } 932 933 static fmd_case_impl_t * 934 fmd_case_tryhold(fmd_case_impl_t *cip) 935 { 936 /* 937 * If the case's "deleting" bit is unset, hold and return case, 938 * otherwise, return NULL. 939 */ 940 (void) pthread_mutex_lock(&cip->ci_lock); 941 if (cip->ci_flags & FMD_CF_DELETING) { 942 (void) pthread_mutex_unlock(&cip->ci_lock); 943 cip = NULL; 944 } else { 945 fmd_case_hold_locked((fmd_case_t *)cip); 946 (void) pthread_mutex_unlock(&cip->ci_lock); 947 } 948 return (cip); 949 } 950 951 void 952 fmd_case_rele(fmd_case_t *cp) 953 { 954 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 955 956 (void) pthread_mutex_lock(&cip->ci_lock); 957 ASSERT(cip->ci_refs != 0); 958 959 if (--cip->ci_refs == 0) 960 fmd_case_destroy((fmd_case_t *)cip, B_TRUE); 961 else 962 (void) pthread_mutex_unlock(&cip->ci_lock); 963 } 964 965 void 966 fmd_case_rele_locked(fmd_case_t *cp) 967 { 968 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 969 970 ASSERT(MUTEX_HELD(&cip->ci_lock)); 971 --cip->ci_refs; 972 ASSERT(cip->ci_refs != 0); 973 } 974 975 int 976 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep) 977 { 978 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 979 fmd_case_item_t *cit; 980 fmd_event_t *oep; 981 uint_t state; 982 int new; 983 984 fmd_event_hold(ep); 985 (void) pthread_mutex_lock(&cip->ci_lock); 986 987 if (cip->ci_flags & FMD_CF_SOLVED) 988 state = FMD_EVS_DIAGNOSED; 989 else 990 state = FMD_EVS_ACCEPTED; 991 992 oep = cip->ci_principal; 993 cip->ci_principal = ep; 994 995 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 996 if (cit->cit_event == ep) 997 break; 998 } 999 1000 cip->ci_flags |= FMD_CF_DIRTY; 1001 new = cit == NULL && ep != oep; 1002 1003 (void) 
pthread_mutex_unlock(&cip->ci_lock); 1004 1005 fmd_module_setcdirty(cip->ci_mod); 1006 fmd_event_transition(ep, state); 1007 1008 if (oep != NULL) 1009 fmd_event_rele(oep); 1010 1011 return (new); 1012 } 1013 1014 int 1015 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep) 1016 { 1017 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1018 fmd_case_item_t *cit; 1019 uint_t state; 1020 int new; 1021 1022 (void) pthread_mutex_lock(&cip->ci_lock); 1023 1024 if (cip->ci_flags & FMD_CF_SOLVED) 1025 state = FMD_EVS_DIAGNOSED; 1026 else 1027 state = FMD_EVS_ACCEPTED; 1028 1029 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1030 if (cit->cit_event == ep) 1031 break; 1032 } 1033 1034 new = cit == NULL && ep != cip->ci_principal; 1035 1036 /* 1037 * If the event is already in the case or the case is already solved, 1038 * there is no reason to save it: just transition it appropriately. 1039 */ 1040 if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) { 1041 (void) pthread_mutex_unlock(&cip->ci_lock); 1042 fmd_event_transition(ep, state); 1043 return (new); 1044 } 1045 1046 cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP); 1047 fmd_event_hold(ep); 1048 1049 cit->cit_next = cip->ci_items; 1050 cit->cit_event = ep; 1051 1052 cip->ci_items = cit; 1053 cip->ci_nitems++; 1054 1055 cip->ci_flags |= FMD_CF_DIRTY; 1056 (void) pthread_mutex_unlock(&cip->ci_lock); 1057 1058 fmd_module_setcdirty(cip->ci_mod); 1059 fmd_event_transition(ep, state); 1060 1061 return (new); 1062 } 1063 1064 void 1065 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl) 1066 { 1067 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1068 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1069 1070 (void) pthread_mutex_lock(&cip->ci_lock); 1071 ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT); 1072 cip->ci_flags |= FMD_CF_DIRTY; 1073 1074 cis->cis_next = cip->ci_suspects; 1075 cis->cis_nvl = nvl; 1076 1077 cip->ci_suspects = cis; 1078 cip->ci_nsuspects++; 1079 1080 (void) 
pthread_mutex_unlock(&cip->ci_lock); 1081 fmd_module_setcdirty(cip->ci_mod); 1082 } 1083 1084 void 1085 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl) 1086 { 1087 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1088 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1089 boolean_t b; 1090 1091 (void) pthread_mutex_lock(&cip->ci_lock); 1092 ASSERT(cip->ci_state == FMD_CASE_CLOSED); 1093 ASSERT(cip->ci_mod == fmd.d_rmod); 1094 1095 cis->cis_next = cip->ci_suspects; 1096 cis->cis_nvl = nvl; 1097 1098 if (nvlist_lookup_boolean_value(nvl, 1099 FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE) 1100 cip->ci_flags |= FMD_CF_INVISIBLE; 1101 1102 cip->ci_suspects = cis; 1103 cip->ci_nsuspects++; 1104 1105 (void) pthread_mutex_unlock(&cip->ci_lock); 1106 } 1107 1108 void 1109 fmd_case_reset_suspects(fmd_case_t *cp) 1110 { 1111 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1112 1113 (void) pthread_mutex_lock(&cip->ci_lock); 1114 ASSERT(cip->ci_state < FMD_CASE_SOLVED); 1115 1116 fmd_case_destroy_suspects(cip); 1117 cip->ci_flags |= FMD_CF_DIRTY; 1118 1119 (void) pthread_mutex_unlock(&cip->ci_lock); 1120 fmd_module_setcdirty(cip->ci_mod); 1121 } 1122 1123 /*ARGSUSED*/ 1124 static void 1125 fmd_case_unusable(fmd_asru_link_t *alp, void *arg) 1126 { 1127 (void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE); 1128 } 1129 1130 /* 1131 * Grab ci_lock and update the case state and set the dirty bit. Then perform 1132 * whatever actions and emit whatever events are appropriate for the state. 1133 * Refer to the topmost block comment explaining the state machine for details. 
1134 */ 1135 void 1136 fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) 1137 { 1138 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1139 fmd_case_item_t *cit; 1140 fmd_event_t *e; 1141 1142 ASSERT(state <= FMD_CASE_REPAIRED); 1143 (void) pthread_mutex_lock(&cip->ci_lock); 1144 1145 if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED)) 1146 flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED); 1147 1148 cip->ci_flags |= flags; 1149 1150 if (cip->ci_state >= state) { 1151 (void) pthread_mutex_unlock(&cip->ci_lock); 1152 return; /* already in specified state */ 1153 } 1154 1155 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid, 1156 _fmd_case_snames[cip->ci_state], _fmd_case_snames[state])); 1157 1158 cip->ci_state = state; 1159 cip->ci_flags |= FMD_CF_DIRTY; 1160 1161 if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod) 1162 fmd_module_setcdirty(cip->ci_mod); 1163 1164 switch (state) { 1165 case FMD_CASE_SOLVED: 1166 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 1167 fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED); 1168 1169 if (cip->ci_principal != NULL) { 1170 fmd_event_transition(cip->ci_principal, 1171 FMD_EVS_DIAGNOSED); 1172 } 1173 break; 1174 1175 case FMD_CASE_CLOSE_WAIT: 1176 /* 1177 * If the case was never solved, do not change ASRUs. 1178 * If the case was never fmd_case_closed, do not change ASRUs. 1179 * If the case was repaired, do not change ASRUs. 1180 */ 1181 if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED | 1182 FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED)) 1183 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1184 fmd_case_unusable, NULL); 1185 1186 /* 1187 * If an orphaned case transitions to CLOSE_WAIT, the owning 1188 * module is no longer loaded: continue on to CASE_CLOSED. 
1189 */ 1190 if (fmd_case_orphaned(cp)) 1191 state = cip->ci_state = FMD_CASE_CLOSED; 1192 break; 1193 1194 case FMD_CASE_REPAIRED: 1195 ASSERT(fmd_case_orphaned(cp)); 1196 fmd_module_lock(cip->ci_mod); 1197 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1198 fmd_module_unlock(cip->ci_mod); 1199 break; 1200 } 1201 1202 (void) pthread_mutex_unlock(&cip->ci_lock); 1203 1204 /* 1205 * If the module has initialized, then publish the appropriate event 1206 * for the new case state. If not, we are being called from the 1207 * checkpoint code during module load, in which case the module's 1208 * _fmd_init() routine hasn't finished yet, and our event dictionaries 1209 * may not be open yet, which will prevent us from computing the event 1210 * code. Defer the call to fmd_case_publish() by enqueuing a PUBLISH 1211 * event in our queue: this won't be processed until _fmd_init is done. 1212 */ 1213 if (cip->ci_mod->mod_flags & FMD_MOD_INIT) 1214 fmd_case_publish(cp, state); 1215 else { 1216 fmd_case_hold(cp); 1217 e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp); 1218 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 1219 } 1220 1221 /* 1222 * If we transitioned to REPAIRED, adjust the reference count to 1223 * reflect our removal from fmd.d_rmod->mod_cases. If the caller has 1224 * not placed an additional hold on the case, it will now be freed. 1225 */ 1226 if (state == FMD_CASE_REPAIRED) { 1227 (void) pthread_mutex_lock(&cip->ci_lock); 1228 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 1229 (void) pthread_mutex_unlock(&cip->ci_lock); 1230 fmd_case_rele(cp); 1231 } 1232 } 1233 1234 /* 1235 * Transition the specified case to *at least* the specified state by first 1236 * re-validating the suspect list using the resource cache. This function is 1237 * employed by the checkpoint code when restoring a saved, solved case to see 1238 * if the state of the case has effectively changed while fmd was not running 1239 * or the module was not loaded. 
If none of the suspects are present anymore, 1240 * advance the state to REPAIRED. If none are usable, advance to CLOSE_WAIT. 1241 */ 1242 void 1243 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags) 1244 { 1245 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1246 1247 int faulty = 0; /* are any suspects faulty? */ 1248 int usable = 0; /* are any suspects usable? */ 1249 1250 ASSERT(state >= FMD_CASE_SOLVED); 1251 (void) pthread_mutex_lock(&cip->ci_lock); 1252 1253 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 1254 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable); 1255 1256 (void) pthread_mutex_unlock(&cip->ci_lock); 1257 1258 /* 1259 * If none of the suspects were faulty, it implies they were either 1260 * repaired already or not present and the rsrc.age time has expired. 1261 * We can move the state on to repaired. 1262 */ 1263 if (!faulty) { 1264 state = MAX(state, FMD_CASE_CLOSE_WAIT); 1265 flags |= FMD_CF_REPAIRED; 1266 } else if (!usable) { 1267 state = MAX(state, FMD_CASE_CLOSE_WAIT); 1268 flags |= FMD_CF_ISOLATED; 1269 } 1270 1271 fmd_case_transition(cp, state, flags); 1272 } 1273 1274 void 1275 fmd_case_setdirty(fmd_case_t *cp) 1276 { 1277 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1278 1279 (void) pthread_mutex_lock(&cip->ci_lock); 1280 cip->ci_flags |= FMD_CF_DIRTY; 1281 (void) pthread_mutex_unlock(&cip->ci_lock); 1282 1283 fmd_module_setcdirty(cip->ci_mod); 1284 } 1285 1286 void 1287 fmd_case_clrdirty(fmd_case_t *cp) 1288 { 1289 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1290 1291 (void) pthread_mutex_lock(&cip->ci_lock); 1292 cip->ci_flags &= ~FMD_CF_DIRTY; 1293 (void) pthread_mutex_unlock(&cip->ci_lock); 1294 } 1295 1296 void 1297 fmd_case_commit(fmd_case_t *cp) 1298 { 1299 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1300 fmd_case_item_t *cit; 1301 1302 (void) pthread_mutex_lock(&cip->ci_lock); 1303 1304 if (cip->ci_flags & FMD_CF_DIRTY) { 1305 for (cit = cip->ci_items; cit != 
NULL; cit = cit->cit_next) 1306 fmd_event_commit(cit->cit_event); 1307 1308 if (cip->ci_principal != NULL) 1309 fmd_event_commit(cip->ci_principal); 1310 1311 fmd_buf_hash_commit(&cip->ci_bufs); 1312 cip->ci_flags &= ~FMD_CF_DIRTY; 1313 } 1314 1315 (void) pthread_mutex_unlock(&cip->ci_lock); 1316 } 1317 1318 /* 1319 * Indicate that the case may need to change state because one or more of the 1320 * ASRUs named as a suspect has changed state. We examine all the suspects 1321 * and if none are still faulty, we initiate a case close transition. 1322 */ 1323 void 1324 fmd_case_update(fmd_case_t *cp) 1325 { 1326 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1327 uint_t cstate; 1328 int faulty = 0; 1329 1330 (void) pthread_mutex_lock(&cip->ci_lock); 1331 cstate = cip->ci_state; 1332 1333 if (cip->ci_xprt != NULL || cip->ci_state < FMD_CASE_SOLVED) { 1334 (void) pthread_mutex_unlock(&cip->ci_lock); 1335 return; /* update is not appropriate */ 1336 } 1337 1338 if (cip->ci_flags & FMD_CF_REPAIRED) { 1339 (void) pthread_mutex_unlock(&cip->ci_lock); 1340 return; /* already repaired */ 1341 } 1342 1343 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 1344 (void) pthread_mutex_unlock(&cip->ci_lock); 1345 1346 if (faulty) 1347 return; /* one or more suspects are still marked faulty */ 1348 1349 if (cstate == FMD_CASE_CLOSED) 1350 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1351 else 1352 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1353 } 1354 1355 /* 1356 * Delete a closed case from the module's case list once the fmdo_close() entry 1357 * point has run to completion. If the case is owned by a transport module, 1358 * tell the transport to proxy a case close on the other end of the transport. 1359 * If not, transition to the appropriate next state based on ci_flags. 
This 1360 * function represents the end of CLOSE_WAIT and transitions the case to either 1361 * CLOSED or REPAIRED or discards it entirely because it was never solved; 1362 * refer to the topmost block comment explaining the state machine for details. 1363 */ 1364 void 1365 fmd_case_delete(fmd_case_t *cp) 1366 { 1367 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1368 fmd_modstat_t *msp; 1369 size_t buftotal; 1370 1371 ASSERT(fmd_module_locked(cip->ci_mod)); 1372 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1373 buftotal = fmd_buf_hash_destroy(&cip->ci_bufs); 1374 1375 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1376 msp = cip->ci_mod->mod_stats; 1377 1378 ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0); 1379 msp->ms_caseopen.fmds_value.ui64--; 1380 1381 ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal); 1382 msp->ms_buftotal.fmds_value.ui64 -= buftotal; 1383 1384 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1385 1386 if (cip->ci_xprt == NULL) 1387 fmd_module_setcdirty(cip->ci_mod); 1388 1389 fmd_module_rele(cip->ci_mod); 1390 cip->ci_mod = fmd.d_rmod; 1391 fmd_module_hold(cip->ci_mod); 1392 1393 /* 1394 * If the case is not proxied and it has been solved, then retain it 1395 * on the root module's case list at least until we're transitioned. 1396 * Otherwise free the case with our final fmd_case_rele() below. 1397 */ 1398 if (cip->ci_xprt == NULL && (cip->ci_flags & FMD_CF_SOLVED)) { 1399 fmd_module_lock(cip->ci_mod); 1400 fmd_list_append(&cip->ci_mod->mod_cases, cip); 1401 fmd_module_unlock(cip->ci_mod); 1402 fmd_case_hold(cp); 1403 } 1404 1405 /* 1406 * If a proxied case finishes CLOSE_WAIT, then it can be discarded 1407 * rather than orphaned because by definition it can have no entries 1408 * in the resource cache of the current fault manager. 
1409 */ 1410 if (cip->ci_xprt != NULL) 1411 fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid); 1412 else if (cip->ci_flags & FMD_CF_REPAIRED) 1413 fmd_case_transition(cp, FMD_CASE_REPAIRED, 0); 1414 else if (cip->ci_flags & FMD_CF_ISOLATED) 1415 fmd_case_transition(cp, FMD_CASE_CLOSED, 0); 1416 1417 fmd_case_rele(cp); 1418 } 1419 1420 void 1421 fmd_case_discard(fmd_case_t *cp) 1422 { 1423 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1424 1425 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1426 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--; 1427 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1428 1429 ASSERT(fmd_module_locked(cip->ci_mod)); 1430 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1431 fmd_case_rele(cp); 1432 } 1433 1434 /* 1435 * Indicate that the problem corresponding to a case has been repaired by 1436 * clearing the faulty bit on each ASRU named as a suspect. If the case hasn't 1437 * already been closed, this function initiates the transition to CLOSE_WAIT. 1438 * The caller must have the case held from fmd_case_hash_lookup(), so we can 1439 * grab and drop ci_lock without the case being able to be freed in between. 
1440 */ 1441 int 1442 fmd_case_repair(fmd_case_t *cp) 1443 { 1444 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1445 uint_t cstate; 1446 1447 (void) pthread_mutex_lock(&cip->ci_lock); 1448 cstate = cip->ci_state; 1449 1450 if (cip->ci_xprt != NULL) { 1451 (void) pthread_mutex_unlock(&cip->ci_lock); 1452 return (fmd_set_errno(EFMD_CASE_OWNER)); 1453 } 1454 1455 if (cstate < FMD_CASE_SOLVED) { 1456 (void) pthread_mutex_unlock(&cip->ci_lock); 1457 return (fmd_set_errno(EFMD_CASE_STATE)); 1458 } 1459 1460 if (cip->ci_flags & FMD_CF_REPAIRED) { 1461 (void) pthread_mutex_unlock(&cip->ci_lock); 1462 return (0); /* already repaired */ 1463 } 1464 1465 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repair, NULL); 1466 (void) pthread_mutex_unlock(&cip->ci_lock); 1467 1468 if (cstate == FMD_CASE_CLOSED) 1469 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1470 else 1471 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1472 1473 return (0); 1474 } 1475 1476 int 1477 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep) 1478 { 1479 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1480 fmd_case_item_t *cit; 1481 uint_t state; 1482 int rv = 0; 1483 1484 (void) pthread_mutex_lock(&cip->ci_lock); 1485 1486 if (cip->ci_state >= FMD_CASE_SOLVED) 1487 state = FMD_EVS_DIAGNOSED; 1488 else 1489 state = FMD_EVS_ACCEPTED; 1490 1491 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1492 if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0) 1493 break; 1494 } 1495 1496 if (rv == 0 && cip->ci_principal != NULL) 1497 rv = fmd_event_equal(ep, cip->ci_principal); 1498 1499 (void) pthread_mutex_unlock(&cip->ci_lock); 1500 1501 if (rv != 0) 1502 fmd_event_transition(ep, state); 1503 1504 return (rv); 1505 } 1506 1507 int 1508 fmd_case_orphaned(fmd_case_t *cp) 1509 { 1510 return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod); 1511 } 1512 1513 void 1514 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec) 1515 { 1516 ((fmd_case_impl_t 
*)cp)->ci_tv.tv_sec = tv_sec; 1517 ((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec; 1518 ((fmd_case_impl_t *)cp)->ci_tv_valid = 1; 1519 } 1520