1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * FMD Case Subsystem 31 * 32 * Diagnosis engines are expected to group telemetry events related to the 33 * diagnosis of a particular problem on the system into a set of cases. The 34 * diagnosis engine may have any number of cases open at a given point in time. 35 * Some cases may eventually be *solved* by associating a suspect list of one 36 * or more problems with the case, at which point fmd publishes a list.suspect 37 * event for the case and it becomes visible to administrators and agents. 38 * 39 * Every case is named using a UUID, and is globally visible in the case hash. 40 * Cases are reference-counted, except for the reference from the case hash 41 * itself. Consumers of case references include modules, which store active 42 * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code. 43 * 44 * Cases obey the following state machine. 
In states UNSOLVED, SOLVED, and
 * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
 * or transport) and the case is referenced by the mod_cases list.  Once the
 * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
 * to the root module (fmd.d_rmod) and the case is deleted from the owner's
 * mod_cases.
 *
 *                 +------------+
 *      +----------|  UNSOLVED  |
 *      |          +------------+
 *    1 |               4 |
 *      |                 |
 * +----v---+ /-2->+------v-----+    3     +--------+
 * | SOLVED |<     | CLOSE_WAIT |--------->| CLOSED |
 * +--------+ \-5->+------------+          +--------+
 *                        |                    |
 *                      6 |                    | 7
 *                 +------v-----+              |
 *                 |  REPAIRED  |<-------------+
 *                 +------------+
 *
 * The state machine changes are triggered by calls to fmd_case_transition()
 * from various locations inside of fmd, as described below:
 *
 * [1] Called by: fmd_case_solve()
 *     Actions:   FMD_CF_SOLVED flag is set in ci_flags
 *                conviction policy is applied to suspect list
 *                suspects convicted are marked faulty (F) in R$
 *                list.suspect event logged and dispatched
 *
 * [2] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
 *     Actions:   FMD_CF_ISOLATED flag is set in ci_flags
 *                suspects convicted (F) are marked unusable (U) in R$
 *                diagnosis engine fmdo_close() entry point scheduled
 *                case transitions to CLOSED [3] upon exit from CLOSE_WAIT
 *
 * [3] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
 *     Actions:   list.isolated event dispatched
 *                case deleted from module's list of open cases
 *
 * [4] Called by: fmd_case_close(), fmd_case_uuclose()
 *     Actions:   diagnosis engine fmdo_close() entry point scheduled
 *                case is subsequently discarded by fmd_case_delete()
 *
 * [5] Called by: fmd_case_repair(), fmd_case_update()
 *     Actions:   FMD_CF_REPAIRED flag is set in ci_flags
 *                diagnosis engine fmdo_close() entry point scheduled
 *                case transitions to REPAIRED [6] upon exit from
CLOSE_WAIT 91 * 92 * [6] Called by: fmd_case_repair(), fmd_case_update() 93 * Actions: FMD_CF_REPAIR flag is set in ci_flags 94 * suspects convicted are marked non faulty (!F) in R$ 95 * list.repaired event dispatched 96 * 97 * [7] Called by: fmd_case_repair(), fmd_case_update() 98 * Actions: FMD_CF_REPAIR flag is set in ci_flags 99 * suspects convicted are marked non faulty (!F) in R$ 100 * list.repaired event dispatched 101 */ 102 103 #include <sys/fm/protocol.h> 104 #include <uuid/uuid.h> 105 #include <alloca.h> 106 107 #include <fmd_alloc.h> 108 #include <fmd_module.h> 109 #include <fmd_error.h> 110 #include <fmd_conf.h> 111 #include <fmd_case.h> 112 #include <fmd_string.h> 113 #include <fmd_subr.h> 114 #include <fmd_protocol.h> 115 #include <fmd_event.h> 116 #include <fmd_eventq.h> 117 #include <fmd_dispq.h> 118 #include <fmd_buf.h> 119 #include <fmd_log.h> 120 #include <fmd_asru.h> 121 #include <fmd_fmri.h> 122 #include <fmd_xprt.h> 123 124 #include <fmd.h> 125 126 static const char *const _fmd_case_snames[] = { 127 "UNSOLVED", /* FMD_CASE_UNSOLVED */ 128 "SOLVED", /* FMD_CASE_SOLVED */ 129 "CLOSE_WAIT", /* FMD_CASE_CLOSE_WAIT */ 130 "CLOSED", /* FMD_CASE_CLOSED */ 131 "REPAIRED" /* FMD_CASE_REPAIRED */ 132 }; 133 134 extern volatile uint32_t fmd_asru_fake_not_present; 135 136 static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *); 137 138 fmd_case_hash_t * 139 fmd_case_hash_create(void) 140 { 141 fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP); 142 143 (void) pthread_rwlock_init(&chp->ch_lock, NULL); 144 chp->ch_hashlen = fmd.d_str_buckets; 145 chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP); 146 chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, 147 FMD_SLEEP); 148 chp->ch_count = 0; 149 150 return (chp); 151 } 152 153 /* 154 * Destroy the case hash. Unlike most of our hash tables, no active references 155 * are kept by the case hash itself; all references come from other subsystems. 
156 * The hash must be destroyed after all modules are unloaded; if anything was 157 * present in the hash it would be by definition a reference count leak. 158 */ 159 void 160 fmd_case_hash_destroy(fmd_case_hash_t *chp) 161 { 162 fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen); 163 fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen); 164 fmd_free(chp, sizeof (fmd_case_hash_t)); 165 } 166 167 /* 168 * Take a snapshot of the case hash by placing an additional hold on each 169 * member in an auxiliary array, and then call 'func' for each case. 170 */ 171 void 172 fmd_case_hash_apply(fmd_case_hash_t *chp, 173 void (*func)(fmd_case_t *, void *), void *arg) 174 { 175 fmd_case_impl_t *cp, **cps, **cpp; 176 uint_t cpc, i; 177 178 (void) pthread_rwlock_rdlock(&chp->ch_lock); 179 180 cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP); 181 cpc = chp->ch_count; 182 183 for (i = 0; i < chp->ch_hashlen; i++) { 184 for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) { 185 fmd_case_hold((fmd_case_t *)cp); 186 *cpp++ = cp; 187 } 188 } 189 190 ASSERT(cpp == cps + cpc); 191 (void) pthread_rwlock_unlock(&chp->ch_lock); 192 193 for (i = 0; i < cpc; i++) { 194 func((fmd_case_t *)cps[i], arg); 195 fmd_case_rele((fmd_case_t *)cps[i]); 196 } 197 198 fmd_free(cps, cpc * sizeof (fmd_case_t *)); 199 } 200 201 static void 202 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 203 { 204 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 205 206 cip->ci_code_next = chp->ch_code_hash[h]; 207 chp->ch_code_hash[h] = cip; 208 } 209 210 static void 211 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 212 { 213 fmd_case_impl_t **pp, *cp; 214 215 if (cip->ci_code) { 216 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 217 218 pp = &chp->ch_code_hash[h]; 219 for (cp = *pp; cp != NULL; cp = cp->ci_code_next) { 220 if (cp != cip) 221 pp = &cp->ci_code_next; 222 else 223 break; 224 } 225 if (cp != NULL) { 
226 *pp = cp->ci_code_next; 227 cp->ci_code_next = NULL; 228 } 229 } 230 } 231 232 /* 233 * Look up the diagcode for this case and cache it in ci_code. If no suspects 234 * were defined for this case or if the lookup fails, the event dictionary or 235 * module code is broken, and we set the event code to a precomputed default. 236 */ 237 static const char * 238 fmd_case_mkcode(fmd_case_t *cp) 239 { 240 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 241 fmd_case_susp_t *cis; 242 fmd_case_hash_t *chp = fmd.d_cases; 243 244 char **keys, **keyp; 245 const char *s; 246 247 ASSERT(MUTEX_HELD(&cip->ci_lock)); 248 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 249 250 /* 251 * delete any existing entry from code hash if it is on it 252 */ 253 fmd_case_code_hash_delete(chp, cip); 254 255 fmd_free(cip->ci_code, cip->ci_codelen); 256 cip->ci_codelen = cip->ci_mod->mod_codelen; 257 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 258 keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1)); 259 260 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 261 if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0) 262 keyp++; 263 } 264 265 *keyp = NULL; /* mark end of keys[] array for libdiagcode */ 266 267 if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code( 268 cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) { 269 (void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s); 270 fmd_free(cip->ci_code, cip->ci_codelen); 271 cip->ci_codelen = strlen(s) + 1; 272 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 273 (void) strcpy(cip->ci_code, s); 274 } 275 276 /* 277 * add into hash of solved cases 278 */ 279 fmd_case_code_hash_insert(chp, cip); 280 281 return (cip->ci_code); 282 } 283 284 typedef struct { 285 int *fcl_countp; 286 uint8_t *fcl_ba; 287 nvlist_t **fcl_nva; 288 int *fcl_msgp; 289 } fmd_case_lst_t; 290 291 static void 292 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg) 293 { 294 fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg; 295 
boolean_t b; 296 int state; 297 298 if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE, 299 &b) == 0 && b == B_FALSE) 300 *entryp->fcl_msgp = B_FALSE; 301 entryp->fcl_ba[*entryp->fcl_countp] = 0; 302 state = fmd_asru_al_getstate(alp); 303 if (state & FMD_ASRU_UNUSABLE) 304 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE; 305 if (state & FMD_ASRU_FAULTY) 306 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY; 307 if (!(state & FMD_ASRU_PRESENT)) 308 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT; 309 entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event; 310 (*entryp->fcl_countp)++; 311 } 312 313 static void 314 fmd_case_faulty(fmd_asru_link_t *alp, void *arg) 315 { 316 int *faultyp = (int *)arg; 317 318 *faultyp |= (alp->al_flags & FMD_ASRU_FAULTY); 319 } 320 321 static void 322 fmd_case_usable(fmd_asru_link_t *alp, void *arg) 323 { 324 int *usablep = (int *)arg; 325 326 *usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE); 327 } 328 329 nvlist_t * 330 fmd_case_mkevent(fmd_case_t *cp, const char *class) 331 { 332 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 333 nvlist_t **nva, *nvl; 334 uint8_t *ba; 335 int msg = B_TRUE; 336 const char *code; 337 fmd_case_lst_t fcl; 338 int count = 0; 339 340 (void) pthread_mutex_lock(&cip->ci_lock); 341 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 342 343 nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); 344 ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects); 345 346 /* 347 * For each suspect associated with the case, store its fault event 348 * nvlist in 'nva'. We also look to see if any of the suspect faults 349 * have asked not to be messaged. If any of them have made such a 350 * request, propagate that attribute to the composite list.* event. 351 * Finally, store each suspect's faulty status into the bitmap 'ba'. 
352 */ 353 fcl.fcl_countp = &count; 354 fcl.fcl_msgp = &msg; 355 fcl.fcl_ba = ba; 356 fcl.fcl_nva = nva; 357 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl); 358 359 if (cip->ci_code == NULL) 360 (void) fmd_case_mkcode(cp); 361 /* 362 * For repair event, we lookup diagcode from dict using key 363 * "list.repaired". 364 */ 365 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 366 (void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code); 367 else 368 code = cip->ci_code; 369 370 if (msg == B_FALSE) 371 cip->ci_flags |= FMD_CF_INVISIBLE; 372 373 nvl = fmd_protocol_list(class, cip->ci_mod->mod_fmri, cip->ci_uuid, 374 code, cip->ci_nsuspects, nva, ba, msg, &cip->ci_tv); 375 376 (void) pthread_mutex_unlock(&cip->ci_lock); 377 return (nvl); 378 } 379 380 static boolean_t 381 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem) 382 { 383 nvlist_t *new_rsrc; 384 nvlist_t *rsrc; 385 char *new_name = NULL; 386 char *name = NULL; 387 ssize_t new_namelen; 388 ssize_t namelen; 389 int fmri_present = 1; 390 int new_fmri_present = 1; 391 int match = B_FALSE; 392 393 if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0) 394 fmri_present = 0; 395 else { 396 if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1) 397 goto done; 398 name = fmd_alloc(namelen + 1, FMD_SLEEP); 399 if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1) 400 goto done; 401 } 402 if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0) 403 new_fmri_present = 0; 404 else { 405 if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1) 406 goto done; 407 new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP); 408 if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1) 409 goto done; 410 } 411 match = (fmri_present == new_fmri_present && 412 (fmri_present == 0 || strcmp(name, new_name) == 0)); 413 done: 414 if (name != NULL) 415 fmd_free(name, namelen + 1); 416 if (new_name != NULL) 417 fmd_free(new_name, new_namelen + 1); 418 return (match); 419 } 420 421 static int 422 
fmd_case_match_suspect(fmd_case_susp_t *cis, fmd_case_susp_t *xcis) 423 { 424 char *class, *new_class; 425 426 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_ASRU)) 427 return (0); 428 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, 429 FM_FAULT_RESOURCE)) 430 return (0); 431 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_FRU)) 432 return (0); 433 (void) nvlist_lookup_string(xcis->cis_nvl, FM_CLASS, &class); 434 (void) nvlist_lookup_string(cis->cis_nvl, FM_CLASS, &new_class); 435 return (strcmp(class, new_class) == 0); 436 } 437 438 /* 439 * see if an identical suspect list already exists in the cache 440 */ 441 static int 442 fmd_case_check_for_dups(fmd_case_t *cp) 443 { 444 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp, *xcip; 445 fmd_case_hash_t *chp = fmd.d_cases; 446 fmd_case_susp_t *xcis, *cis; 447 int match = 0, match_susp; 448 uint_t h; 449 450 (void) pthread_rwlock_rdlock(&chp->ch_lock); 451 452 /* 453 * Find all cases with this code 454 */ 455 h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 456 for (xcip = chp->ch_code_hash[h]; xcip != NULL; 457 xcip = xcip->ci_code_next) { 458 /* 459 * only look for any cases (apart from this one) 460 * whose code and number of suspects match 461 */ 462 if (xcip == cip || strcmp(xcip->ci_code, cip->ci_code) != 0 || 463 xcip->ci_nsuspects != cip->ci_nsuspects) 464 continue; 465 466 /* 467 * For each suspect in one list, check if there 468 * is an identical suspect in the other list 469 */ 470 match = 1; 471 fmd_case_hold((fmd_case_t *)xcip); 472 for (xcis = xcip->ci_suspects; xcis != NULL; 473 xcis = xcis->cis_next) { 474 match_susp = 0; 475 for (cis = cip->ci_suspects; cis != NULL; 476 cis = cis->cis_next) { 477 if (fmd_case_match_suspect(cis, xcis) == 1) { 478 match_susp = 1; 479 break; 480 } 481 } 482 if (match_susp == 0) { 483 match = 0; 484 break; 485 } 486 } 487 fmd_case_rele((fmd_case_t *)xcip); 488 if (match) { 489 (void) pthread_rwlock_unlock(&chp->ch_lock); 490 return 
(1); 491 } 492 } 493 (void) pthread_rwlock_unlock(&chp->ch_lock); 494 return (0); 495 } 496 497 /* 498 * Convict suspects in a case by applying a conviction policy and updating the 499 * resource cache prior to emitting the list.suspect event for the given case. 500 * At present, our policy is very simple: convict every suspect in the case. 501 * In the future, this policy can be extended and made configurable to permit: 502 * 503 * - convicting the suspect with the highest FIT rate 504 * - convicting the suspect with the cheapest FRU 505 * - convicting the suspect with the FRU that is in a depot's inventory 506 * - convicting the suspect with the longest lifetime 507 * 508 * and so forth. A word to the wise: this problem is significantly harder that 509 * it seems at first glance. Future work should heed the following advice: 510 * 511 * Hacking the policy into C code here is a very bad idea. The policy needs to 512 * be decided upon very carefully and fundamentally encodes knowledge of what 513 * suspect list combinations can be emitted by what diagnosis engines. As such 514 * fmd's code is the wrong location, because that would require fmd itself to 515 * be updated for every diagnosis engine change, defeating the entire design. 516 * The FMA Event Registry knows the suspect list combinations: policy inputs 517 * can be derived from it and used to produce per-module policy configuration. 518 * 519 * If the policy needs to be dynamic and not statically fixed at either fmd 520 * startup or module load time, any implementation of dynamic policy retrieval 521 * must employ some kind of caching mechanism or be part of a built-in module. 522 * The fmd_case_convict() function is called with locks held inside of fmd and 523 * is not a place where unbounded blocking on some inter-process or inter- 524 * system communication to another service (e.g. another daemon) can occur. 
525 */ 526 static int 527 fmd_case_convict(fmd_case_t *cp) 528 { 529 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 530 fmd_asru_hash_t *ahp = fmd.d_asrus; 531 532 fmd_case_susp_t *cis; 533 fmd_asru_link_t *alp; 534 535 (void) pthread_mutex_lock(&cip->ci_lock); 536 (void) fmd_case_mkcode(cp); 537 if (fmd_case_check_for_dups(cp) == 1) { 538 (void) pthread_mutex_unlock(&cip->ci_lock); 539 return (1); 540 } 541 542 /* 543 * no suspect list already exists - allocate new cache entries 544 */ 545 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 546 if ((alp = fmd_asru_hash_create_entry(ahp, 547 cp, cis->cis_nvl)) == NULL) { 548 fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in " 549 "%s: %s\n", cip->ci_uuid, fmd_strerror(errno)); 550 continue; 551 } 552 (void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE); 553 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 554 } 555 556 (void) pthread_mutex_unlock(&cip->ci_lock); 557 return (0); 558 } 559 560 void 561 fmd_case_publish(fmd_case_t *cp, uint_t state) 562 { 563 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 564 fmd_event_t *e; 565 nvlist_t *nvl; 566 char *class; 567 568 if (state == FMD_CASE_CURRENT) 569 state = cip->ci_state; /* use current state */ 570 571 switch (state) { 572 case FMD_CASE_SOLVED: 573 (void) pthread_mutex_lock(&cip->ci_lock); 574 if (cip->ci_tv_valid == 0) { 575 fmd_time_gettimeofday(&cip->ci_tv); 576 cip->ci_tv_valid = 1; 577 } 578 (void) pthread_mutex_unlock(&cip->ci_lock); 579 580 if (fmd_case_convict(cp) == 1) { /* dupclose */ 581 cip->ci_flags &= ~FMD_CF_SOLVED; 582 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0); 583 break; 584 } 585 nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS); 586 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 587 588 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 589 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 590 fmd_log_append(fmd.d_fltlog, e, cp); 591 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 592 fmd_dispq_dispatch(fmd.d_disp, 
e, class); 593 594 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 595 cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++; 596 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 597 598 break; 599 600 case FMD_CASE_CLOSE_WAIT: 601 fmd_case_hold(cp); 602 e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp); 603 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 604 605 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 606 cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++; 607 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 608 609 break; 610 611 case FMD_CASE_CLOSED: 612 nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS); 613 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 614 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 615 fmd_dispq_dispatch(fmd.d_disp, e, class); 616 break; 617 618 case FMD_CASE_REPAIRED: 619 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 620 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 621 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 622 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 623 fmd_log_append(fmd.d_fltlog, e, cp); 624 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 625 fmd_dispq_dispatch(fmd.d_disp, e, class); 626 break; 627 } 628 } 629 630 fmd_case_t * 631 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid) 632 { 633 fmd_case_impl_t *cip; 634 uint_t h; 635 636 (void) pthread_rwlock_rdlock(&chp->ch_lock); 637 h = fmd_strhash(uuid) % chp->ch_hashlen; 638 639 for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) { 640 if (strcmp(cip->ci_uuid, uuid) == 0) 641 break; 642 } 643 644 /* 645 * If deleting bit is set, treat the case as if it doesn't exist. 
646 */ 647 if (cip != NULL) 648 cip = fmd_case_tryhold(cip); 649 650 if (cip == NULL) 651 (void) fmd_set_errno(EFMD_CASE_INVAL); 652 653 (void) pthread_rwlock_unlock(&chp->ch_lock); 654 return ((fmd_case_t *)cip); 655 } 656 657 static fmd_case_impl_t * 658 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 659 { 660 fmd_case_impl_t *eip; 661 uint_t h; 662 663 (void) pthread_rwlock_wrlock(&chp->ch_lock); 664 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 665 666 for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) { 667 if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 && 668 fmd_case_tryhold(eip) != NULL) { 669 (void) pthread_rwlock_unlock(&chp->ch_lock); 670 return (eip); /* uuid already present */ 671 } 672 } 673 674 cip->ci_next = chp->ch_hash[h]; 675 chp->ch_hash[h] = cip; 676 677 chp->ch_count++; 678 ASSERT(chp->ch_count != 0); 679 680 (void) pthread_rwlock_unlock(&chp->ch_lock); 681 return (cip); 682 } 683 684 static void 685 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 686 { 687 fmd_case_impl_t *cp, **pp; 688 uint_t h; 689 690 ASSERT(MUTEX_HELD(&cip->ci_lock)); 691 692 cip->ci_flags |= FMD_CF_DELETING; 693 (void) pthread_mutex_unlock(&cip->ci_lock); 694 695 (void) pthread_rwlock_wrlock(&chp->ch_lock); 696 697 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 698 pp = &chp->ch_hash[h]; 699 700 for (cp = *pp; cp != NULL; cp = cp->ci_next) { 701 if (cp != cip) 702 pp = &cp->ci_next; 703 else 704 break; 705 } 706 707 if (cp == NULL) { 708 fmd_panic("case %p (%s) not found on hash chain %u\n", 709 (void *)cip, cip->ci_uuid, h); 710 } 711 712 *pp = cp->ci_next; 713 cp->ci_next = NULL; 714 715 /* 716 * delete from code hash if it is on it 717 */ 718 fmd_case_code_hash_delete(chp, cip); 719 720 ASSERT(chp->ch_count != 0); 721 chp->ch_count--; 722 723 (void) pthread_rwlock_unlock(&chp->ch_lock); 724 725 (void) pthread_mutex_lock(&cip->ci_lock); 726 ASSERT(cip->ci_flags & FMD_CF_DELETING); 727 } 728 729 fmd_case_t * 730 
fmd_case_create(fmd_module_t *mp, void *data) 731 { 732 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 733 fmd_case_impl_t *eip = NULL; 734 uuid_t uuid; 735 736 (void) pthread_mutex_init(&cip->ci_lock, NULL); 737 fmd_buf_hash_create(&cip->ci_bufs); 738 739 fmd_module_hold(mp); 740 cip->ci_mod = mp; 741 cip->ci_refs = 1; 742 cip->ci_state = FMD_CASE_UNSOLVED; 743 cip->ci_flags = FMD_CF_DIRTY; 744 cip->ci_data = data; 745 746 /* 747 * Calling libuuid: get a clue. The library interfaces cleverly do not 748 * define any constant for the length of an unparse string, and do not 749 * permit the caller to specify a buffer length for safety. The spec 750 * says it will be 36 bytes, but we make it tunable just in case. 751 */ 752 (void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen); 753 cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP); 754 755 /* 756 * We expect this loop to execute only once, but code it defensively 757 * against the possibility of libuuid bugs. Keep generating uuids and 758 * attempting to do a hash insert until we get a unique one. 
759 */ 760 do { 761 if (eip != NULL) 762 fmd_case_rele((fmd_case_t *)eip); 763 uuid_generate(uuid); 764 uuid_unparse(uuid, cip->ci_uuid); 765 } while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip); 766 767 ASSERT(fmd_module_locked(mp)); 768 fmd_list_append(&mp->mod_cases, cip); 769 fmd_module_setcdirty(mp); 770 771 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 772 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 773 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 774 775 return ((fmd_case_t *)cip); 776 } 777 778 static void 779 fmd_case_destroy_suspects(fmd_case_impl_t *cip) 780 { 781 fmd_case_susp_t *cis, *ncis; 782 783 ASSERT(MUTEX_HELD(&cip->ci_lock)); 784 785 for (cis = cip->ci_suspects; cis != NULL; cis = ncis) { 786 ncis = cis->cis_next; 787 nvlist_free(cis->cis_nvl); 788 fmd_free(cis, sizeof (fmd_case_susp_t)); 789 } 790 791 cip->ci_suspects = NULL; 792 cip->ci_nsuspects = 0; 793 } 794 795 fmd_case_t * 796 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp, 797 uint_t state, const char *uuid, const char *code) 798 { 799 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 800 fmd_case_impl_t *eip; 801 802 ASSERT(state < FMD_CASE_REPAIRED); 803 804 (void) pthread_mutex_init(&cip->ci_lock, NULL); 805 fmd_buf_hash_create(&cip->ci_bufs); 806 807 fmd_module_hold(mp); 808 cip->ci_mod = mp; 809 cip->ci_xprt = xp; 810 cip->ci_refs = 1; 811 cip->ci_state = state; 812 cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP); 813 cip->ci_uuidlen = strlen(cip->ci_uuid); 814 cip->ci_code = fmd_strdup(code, FMD_SLEEP); 815 cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0; 816 817 if (state > FMD_CASE_CLOSE_WAIT) 818 cip->ci_flags |= FMD_CF_SOLVED; 819 820 /* 821 * Insert the case into the global case hash. If the specified UUID is 822 * already present, check to see if it is an orphan: if so, reclaim it; 823 * otherwise if it is owned by a different module then return NULL. 
824 */ 825 if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) { 826 (void) pthread_mutex_lock(&cip->ci_lock); 827 cip->ci_refs--; /* decrement to zero */ 828 fmd_case_destroy((fmd_case_t *)cip, B_FALSE); 829 830 cip = eip; /* switch 'cip' to the existing case */ 831 (void) pthread_mutex_lock(&cip->ci_lock); 832 833 /* 834 * If the ASRU cache is trying to recreate an orphan, then just 835 * return the existing case that we found without changing it. 836 */ 837 if (mp == fmd.d_rmod) { 838 (void) pthread_mutex_unlock(&cip->ci_lock); 839 fmd_case_rele((fmd_case_t *)cip); 840 return ((fmd_case_t *)cip); 841 } 842 843 /* 844 * If the existing case isn't an orphan or is being proxied, 845 * then we have a UUID conflict: return failure to the caller. 846 */ 847 if (cip->ci_mod != fmd.d_rmod || xp != NULL) { 848 (void) pthread_mutex_unlock(&cip->ci_lock); 849 fmd_case_rele((fmd_case_t *)cip); 850 return (NULL); 851 } 852 853 /* 854 * If the new module is reclaiming an orphaned case, remove 855 * the case from the root module, switch ci_mod, and then fall 856 * through to adding the case to the new owner module 'mp'. 
857 */ 858 fmd_module_lock(cip->ci_mod); 859 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 860 fmd_module_unlock(cip->ci_mod); 861 862 fmd_module_rele(cip->ci_mod); 863 cip->ci_mod = mp; 864 fmd_module_hold(mp); 865 866 fmd_case_destroy_suspects(cip); 867 cip->ci_state = state; 868 869 (void) pthread_mutex_unlock(&cip->ci_lock); 870 fmd_case_rele((fmd_case_t *)cip); 871 } else { 872 /* 873 * add into hash of solved cases 874 */ 875 if (cip->ci_code) 876 fmd_case_code_hash_insert(fmd.d_cases, cip); 877 } 878 879 ASSERT(fmd_module_locked(mp)); 880 fmd_list_append(&mp->mod_cases, cip); 881 882 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 883 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 884 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 885 886 return ((fmd_case_t *)cip); 887 } 888 889 void 890 fmd_case_destroy(fmd_case_t *cp, int visible) 891 { 892 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 893 fmd_case_item_t *cit, *ncit; 894 895 ASSERT(MUTEX_HELD(&cip->ci_lock)); 896 ASSERT(cip->ci_refs == 0); 897 898 if (visible) { 899 TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid)); 900 fmd_case_hash_delete(fmd.d_cases, cip); 901 } 902 903 for (cit = cip->ci_items; cit != NULL; cit = ncit) { 904 ncit = cit->cit_next; 905 fmd_event_rele(cit->cit_event); 906 fmd_free(cit, sizeof (fmd_case_item_t)); 907 } 908 909 fmd_case_destroy_suspects(cip); 910 911 if (cip->ci_principal != NULL) 912 fmd_event_rele(cip->ci_principal); 913 914 fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1); 915 fmd_free(cip->ci_code, cip->ci_codelen); 916 (void) fmd_buf_hash_destroy(&cip->ci_bufs); 917 918 fmd_module_rele(cip->ci_mod); 919 fmd_free(cip, sizeof (fmd_case_impl_t)); 920 } 921 922 void 923 fmd_case_hold(fmd_case_t *cp) 924 { 925 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 926 927 (void) pthread_mutex_lock(&cip->ci_lock); 928 fmd_case_hold_locked(cp); 929 (void) pthread_mutex_unlock(&cip->ci_lock); 930 } 931 932 void 933 fmd_case_hold_locked(fmd_case_t *cp) 
934 { 935 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 936 937 ASSERT(MUTEX_HELD(&cip->ci_lock)); 938 if (cip->ci_flags & FMD_CF_DELETING) 939 fmd_panic("attempt to hold a deleting case %p (%s)\n", 940 (void *)cip, cip->ci_uuid); 941 cip->ci_refs++; 942 ASSERT(cip->ci_refs != 0); 943 } 944 945 static fmd_case_impl_t * 946 fmd_case_tryhold(fmd_case_impl_t *cip) 947 { 948 /* 949 * If the case's "deleting" bit is unset, hold and return case, 950 * otherwise, return NULL. 951 */ 952 (void) pthread_mutex_lock(&cip->ci_lock); 953 if (cip->ci_flags & FMD_CF_DELETING) { 954 (void) pthread_mutex_unlock(&cip->ci_lock); 955 cip = NULL; 956 } else { 957 fmd_case_hold_locked((fmd_case_t *)cip); 958 (void) pthread_mutex_unlock(&cip->ci_lock); 959 } 960 return (cip); 961 } 962 963 void 964 fmd_case_rele(fmd_case_t *cp) 965 { 966 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 967 968 (void) pthread_mutex_lock(&cip->ci_lock); 969 ASSERT(cip->ci_refs != 0); 970 971 if (--cip->ci_refs == 0) 972 fmd_case_destroy((fmd_case_t *)cip, B_TRUE); 973 else 974 (void) pthread_mutex_unlock(&cip->ci_lock); 975 } 976 977 void 978 fmd_case_rele_locked(fmd_case_t *cp) 979 { 980 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 981 982 ASSERT(MUTEX_HELD(&cip->ci_lock)); 983 --cip->ci_refs; 984 ASSERT(cip->ci_refs != 0); 985 } 986 987 int 988 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep) 989 { 990 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 991 fmd_case_item_t *cit; 992 fmd_event_t *oep; 993 uint_t state; 994 int new; 995 996 fmd_event_hold(ep); 997 (void) pthread_mutex_lock(&cip->ci_lock); 998 999 if (cip->ci_flags & FMD_CF_SOLVED) 1000 state = FMD_EVS_DIAGNOSED; 1001 else 1002 state = FMD_EVS_ACCEPTED; 1003 1004 oep = cip->ci_principal; 1005 cip->ci_principal = ep; 1006 1007 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1008 if (cit->cit_event == ep) 1009 break; 1010 } 1011 1012 cip->ci_flags |= FMD_CF_DIRTY; 1013 new = cit == NULL && ep != oep; 1014 1015 (void) 
pthread_mutex_unlock(&cip->ci_lock); 1016 1017 fmd_module_setcdirty(cip->ci_mod); 1018 fmd_event_transition(ep, state); 1019 1020 if (oep != NULL) 1021 fmd_event_rele(oep); 1022 1023 return (new); 1024 } 1025 1026 int 1027 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep) 1028 { 1029 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1030 fmd_case_item_t *cit; 1031 uint_t state; 1032 int new; 1033 1034 (void) pthread_mutex_lock(&cip->ci_lock); 1035 1036 if (cip->ci_flags & FMD_CF_SOLVED) 1037 state = FMD_EVS_DIAGNOSED; 1038 else 1039 state = FMD_EVS_ACCEPTED; 1040 1041 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1042 if (cit->cit_event == ep) 1043 break; 1044 } 1045 1046 new = cit == NULL && ep != cip->ci_principal; 1047 1048 /* 1049 * If the event is already in the case or the case is already solved, 1050 * there is no reason to save it: just transition it appropriately. 1051 */ 1052 if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) { 1053 (void) pthread_mutex_unlock(&cip->ci_lock); 1054 fmd_event_transition(ep, state); 1055 return (new); 1056 } 1057 1058 cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP); 1059 fmd_event_hold(ep); 1060 1061 cit->cit_next = cip->ci_items; 1062 cit->cit_event = ep; 1063 1064 cip->ci_items = cit; 1065 cip->ci_nitems++; 1066 1067 cip->ci_flags |= FMD_CF_DIRTY; 1068 (void) pthread_mutex_unlock(&cip->ci_lock); 1069 1070 fmd_module_setcdirty(cip->ci_mod); 1071 fmd_event_transition(ep, state); 1072 1073 return (new); 1074 } 1075 1076 void 1077 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl) 1078 { 1079 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1080 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1081 1082 (void) pthread_mutex_lock(&cip->ci_lock); 1083 ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT); 1084 cip->ci_flags |= FMD_CF_DIRTY; 1085 1086 cis->cis_next = cip->ci_suspects; 1087 cis->cis_nvl = nvl; 1088 1089 cip->ci_suspects = cis; 1090 cip->ci_nsuspects++; 1091 1092 (void) 
pthread_mutex_unlock(&cip->ci_lock); 1093 fmd_module_setcdirty(cip->ci_mod); 1094 } 1095 1096 void 1097 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl) 1098 { 1099 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1100 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1101 boolean_t b; 1102 1103 (void) pthread_mutex_lock(&cip->ci_lock); 1104 ASSERT(cip->ci_state == FMD_CASE_CLOSED); 1105 ASSERT(cip->ci_mod == fmd.d_rmod); 1106 1107 cis->cis_next = cip->ci_suspects; 1108 cis->cis_nvl = nvl; 1109 1110 if (nvlist_lookup_boolean_value(nvl, 1111 FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE) 1112 cip->ci_flags |= FMD_CF_INVISIBLE; 1113 1114 cip->ci_suspects = cis; 1115 cip->ci_nsuspects++; 1116 1117 (void) pthread_mutex_unlock(&cip->ci_lock); 1118 } 1119 1120 void 1121 fmd_case_reset_suspects(fmd_case_t *cp) 1122 { 1123 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1124 1125 (void) pthread_mutex_lock(&cip->ci_lock); 1126 ASSERT(cip->ci_state < FMD_CASE_SOLVED); 1127 1128 fmd_case_destroy_suspects(cip); 1129 cip->ci_flags |= FMD_CF_DIRTY; 1130 1131 (void) pthread_mutex_unlock(&cip->ci_lock); 1132 fmd_module_setcdirty(cip->ci_mod); 1133 } 1134 1135 /*ARGSUSED*/ 1136 static void 1137 fmd_case_unusable(fmd_asru_link_t *alp, void *arg) 1138 { 1139 (void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE); 1140 } 1141 1142 /* 1143 * Grab ci_lock and update the case state and set the dirty bit. Then perform 1144 * whatever actions and emit whatever events are appropriate for the state. 1145 * Refer to the topmost block comment explaining the state machine for details. 
1146 */ 1147 void 1148 fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) 1149 { 1150 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1151 fmd_case_item_t *cit; 1152 fmd_event_t *e; 1153 1154 ASSERT(state <= FMD_CASE_REPAIRED); 1155 (void) pthread_mutex_lock(&cip->ci_lock); 1156 1157 if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED)) 1158 flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED); 1159 1160 cip->ci_flags |= flags; 1161 1162 if (cip->ci_state >= state) { 1163 (void) pthread_mutex_unlock(&cip->ci_lock); 1164 return; /* already in specified state */ 1165 } 1166 1167 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid, 1168 _fmd_case_snames[cip->ci_state], _fmd_case_snames[state])); 1169 1170 cip->ci_state = state; 1171 cip->ci_flags |= FMD_CF_DIRTY; 1172 1173 if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod) 1174 fmd_module_setcdirty(cip->ci_mod); 1175 1176 switch (state) { 1177 case FMD_CASE_SOLVED: 1178 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 1179 fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED); 1180 1181 if (cip->ci_principal != NULL) { 1182 fmd_event_transition(cip->ci_principal, 1183 FMD_EVS_DIAGNOSED); 1184 } 1185 break; 1186 1187 case FMD_CASE_CLOSE_WAIT: 1188 /* 1189 * If the case was never solved, do not change ASRUs. 1190 * If the case was never fmd_case_closed, do not change ASRUs. 1191 * If the case was repaired, do not change ASRUs. 1192 */ 1193 if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED | 1194 FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED)) 1195 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1196 fmd_case_unusable, NULL); 1197 1198 /* 1199 * If an orphaned case transitions to CLOSE_WAIT, the owning 1200 * module is no longer loaded: continue on to CASE_CLOSED. 
1201 */ 1202 if (fmd_case_orphaned(cp)) 1203 state = cip->ci_state = FMD_CASE_CLOSED; 1204 break; 1205 1206 case FMD_CASE_REPAIRED: 1207 ASSERT(fmd_case_orphaned(cp)); 1208 fmd_module_lock(cip->ci_mod); 1209 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1210 fmd_module_unlock(cip->ci_mod); 1211 break; 1212 } 1213 1214 (void) pthread_mutex_unlock(&cip->ci_lock); 1215 1216 /* 1217 * If the module has initialized, then publish the appropriate event 1218 * for the new case state. If not, we are being called from the 1219 * checkpoint code during module load, in which case the module's 1220 * _fmd_init() routine hasn't finished yet, and our event dictionaries 1221 * may not be open yet, which will prevent us from computing the event 1222 * code. Defer the call to fmd_case_publish() by enqueuing a PUBLISH 1223 * event in our queue: this won't be processed until _fmd_init is done. 1224 */ 1225 if (cip->ci_mod->mod_flags & FMD_MOD_INIT) 1226 fmd_case_publish(cp, state); 1227 else { 1228 fmd_case_hold(cp); 1229 e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp); 1230 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 1231 } 1232 1233 /* 1234 * If we transitioned to REPAIRED, adjust the reference count to 1235 * reflect our removal from fmd.d_rmod->mod_cases. If the caller has 1236 * not placed an additional hold on the case, it will now be freed. 1237 */ 1238 if (state == FMD_CASE_REPAIRED) { 1239 (void) pthread_mutex_lock(&cip->ci_lock); 1240 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 1241 (void) pthread_mutex_unlock(&cip->ci_lock); 1242 fmd_case_rele(cp); 1243 } 1244 } 1245 1246 /* 1247 * Transition the specified case to *at least* the specified state by first 1248 * re-validating the suspect list using the resource cache. This function is 1249 * employed by the checkpoint code when restoring a saved, solved case to see 1250 * if the state of the case has effectively changed while fmd was not running 1251 * or the module was not loaded. 
If none of the suspects are present anymore, 1252 * advance the state to REPAIRED. If none are usable, advance to CLOSE_WAIT. 1253 */ 1254 void 1255 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags) 1256 { 1257 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1258 1259 int faulty = 0; /* are any suspects faulty? */ 1260 int usable = 0; /* are any suspects usable? */ 1261 1262 ASSERT(state >= FMD_CASE_SOLVED); 1263 (void) pthread_mutex_lock(&cip->ci_lock); 1264 1265 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 1266 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable); 1267 1268 (void) pthread_mutex_unlock(&cip->ci_lock); 1269 1270 /* 1271 * If none of the suspects were faulty, it implies they were either 1272 * repaired already or not present and the rsrc.age time has expired. 1273 * We can move the state on to repaired. 1274 */ 1275 if (!faulty) { 1276 state = MAX(state, FMD_CASE_CLOSE_WAIT); 1277 flags |= FMD_CF_REPAIRED; 1278 } else if (!usable) { 1279 state = MAX(state, FMD_CASE_CLOSE_WAIT); 1280 flags |= FMD_CF_ISOLATED; 1281 } 1282 1283 fmd_case_transition(cp, state, flags); 1284 } 1285 1286 void 1287 fmd_case_setdirty(fmd_case_t *cp) 1288 { 1289 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1290 1291 (void) pthread_mutex_lock(&cip->ci_lock); 1292 cip->ci_flags |= FMD_CF_DIRTY; 1293 (void) pthread_mutex_unlock(&cip->ci_lock); 1294 1295 fmd_module_setcdirty(cip->ci_mod); 1296 } 1297 1298 void 1299 fmd_case_clrdirty(fmd_case_t *cp) 1300 { 1301 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1302 1303 (void) pthread_mutex_lock(&cip->ci_lock); 1304 cip->ci_flags &= ~FMD_CF_DIRTY; 1305 (void) pthread_mutex_unlock(&cip->ci_lock); 1306 } 1307 1308 void 1309 fmd_case_commit(fmd_case_t *cp) 1310 { 1311 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1312 fmd_case_item_t *cit; 1313 1314 (void) pthread_mutex_lock(&cip->ci_lock); 1315 1316 if (cip->ci_flags & FMD_CF_DIRTY) { 1317 for (cit = cip->ci_items; cit != 
NULL; cit = cit->cit_next) 1318 fmd_event_commit(cit->cit_event); 1319 1320 if (cip->ci_principal != NULL) 1321 fmd_event_commit(cip->ci_principal); 1322 1323 fmd_buf_hash_commit(&cip->ci_bufs); 1324 cip->ci_flags &= ~FMD_CF_DIRTY; 1325 } 1326 1327 (void) pthread_mutex_unlock(&cip->ci_lock); 1328 } 1329 1330 /* 1331 * Indicate that the case may need to change state because one or more of the 1332 * ASRUs named as a suspect has changed state. We examine all the suspects 1333 * and if none are still faulty, we initiate a case close transition. 1334 */ 1335 void 1336 fmd_case_update(fmd_case_t *cp) 1337 { 1338 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1339 uint_t cstate; 1340 int faulty = 0; 1341 1342 (void) pthread_mutex_lock(&cip->ci_lock); 1343 cstate = cip->ci_state; 1344 1345 if (cip->ci_xprt != NULL || cip->ci_state < FMD_CASE_SOLVED) { 1346 (void) pthread_mutex_unlock(&cip->ci_lock); 1347 return; /* update is not appropriate */ 1348 } 1349 1350 if (cip->ci_flags & FMD_CF_REPAIRED) { 1351 (void) pthread_mutex_unlock(&cip->ci_lock); 1352 return; /* already repaired */ 1353 } 1354 1355 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 1356 (void) pthread_mutex_unlock(&cip->ci_lock); 1357 1358 if (faulty) 1359 return; /* one or more suspects are still marked faulty */ 1360 1361 if (cstate == FMD_CASE_CLOSED) 1362 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1363 else 1364 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1365 } 1366 1367 /* 1368 * Delete a closed case from the module's case list once the fmdo_close() entry 1369 * point has run to completion. If the case is owned by a transport module, 1370 * tell the transport to proxy a case close on the other end of the transport. 1371 * If not, transition to the appropriate next state based on ci_flags. 
This 1372 * function represents the end of CLOSE_WAIT and transitions the case to either 1373 * CLOSED or REPAIRED or discards it entirely because it was never solved; 1374 * refer to the topmost block comment explaining the state machine for details. 1375 */ 1376 void 1377 fmd_case_delete(fmd_case_t *cp) 1378 { 1379 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1380 fmd_modstat_t *msp; 1381 size_t buftotal; 1382 1383 ASSERT(fmd_module_locked(cip->ci_mod)); 1384 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1385 buftotal = fmd_buf_hash_destroy(&cip->ci_bufs); 1386 1387 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1388 msp = cip->ci_mod->mod_stats; 1389 1390 ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0); 1391 msp->ms_caseopen.fmds_value.ui64--; 1392 1393 ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal); 1394 msp->ms_buftotal.fmds_value.ui64 -= buftotal; 1395 1396 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1397 1398 if (cip->ci_xprt == NULL) 1399 fmd_module_setcdirty(cip->ci_mod); 1400 1401 fmd_module_rele(cip->ci_mod); 1402 cip->ci_mod = fmd.d_rmod; 1403 fmd_module_hold(cip->ci_mod); 1404 1405 /* 1406 * If the case is not proxied and it has been solved, then retain it 1407 * on the root module's case list at least until we're transitioned. 1408 * Otherwise free the case with our final fmd_case_rele() below. 1409 */ 1410 if (cip->ci_xprt == NULL && (cip->ci_flags & FMD_CF_SOLVED)) { 1411 fmd_module_lock(cip->ci_mod); 1412 fmd_list_append(&cip->ci_mod->mod_cases, cip); 1413 fmd_module_unlock(cip->ci_mod); 1414 fmd_case_hold(cp); 1415 } 1416 1417 /* 1418 * If a proxied case finishes CLOSE_WAIT, then it can be discarded 1419 * rather than orphaned because by definition it can have no entries 1420 * in the resource cache of the current fault manager. 
1421 */ 1422 if (cip->ci_xprt != NULL) 1423 fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid); 1424 else if (cip->ci_flags & FMD_CF_REPAIRED) 1425 fmd_case_transition(cp, FMD_CASE_REPAIRED, 0); 1426 else if (cip->ci_flags & FMD_CF_ISOLATED) 1427 fmd_case_transition(cp, FMD_CASE_CLOSED, 0); 1428 1429 fmd_case_rele(cp); 1430 } 1431 1432 void 1433 fmd_case_discard(fmd_case_t *cp) 1434 { 1435 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1436 1437 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1438 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--; 1439 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1440 1441 ASSERT(fmd_module_locked(cip->ci_mod)); 1442 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1443 fmd_case_rele(cp); 1444 } 1445 1446 /* 1447 * Indicate that the problem corresponding to a case has been repaired by 1448 * clearing the faulty bit on each ASRU named as a suspect. If the case hasn't 1449 * already been closed, this function initiates the transition to CLOSE_WAIT. 1450 * The caller must have the case held from fmd_case_hash_lookup(), so we can 1451 * grab and drop ci_lock without the case being able to be freed in between. 
1452 */ 1453 int 1454 fmd_case_repair(fmd_case_t *cp) 1455 { 1456 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1457 uint_t cstate; 1458 1459 (void) pthread_mutex_lock(&cip->ci_lock); 1460 cstate = cip->ci_state; 1461 1462 if (cip->ci_xprt != NULL) { 1463 (void) pthread_mutex_unlock(&cip->ci_lock); 1464 return (fmd_set_errno(EFMD_CASE_OWNER)); 1465 } 1466 1467 if (cstate < FMD_CASE_SOLVED) { 1468 (void) pthread_mutex_unlock(&cip->ci_lock); 1469 return (fmd_set_errno(EFMD_CASE_STATE)); 1470 } 1471 1472 if (cip->ci_flags & FMD_CF_REPAIRED) { 1473 (void) pthread_mutex_unlock(&cip->ci_lock); 1474 return (0); /* already repaired */ 1475 } 1476 1477 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repair, NULL); 1478 (void) pthread_mutex_unlock(&cip->ci_lock); 1479 1480 if (cstate == FMD_CASE_CLOSED) 1481 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1482 else 1483 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1484 1485 return (0); 1486 } 1487 1488 int 1489 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep) 1490 { 1491 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1492 fmd_case_item_t *cit; 1493 uint_t state; 1494 int rv = 0; 1495 1496 (void) pthread_mutex_lock(&cip->ci_lock); 1497 1498 if (cip->ci_state >= FMD_CASE_SOLVED) 1499 state = FMD_EVS_DIAGNOSED; 1500 else 1501 state = FMD_EVS_ACCEPTED; 1502 1503 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1504 if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0) 1505 break; 1506 } 1507 1508 if (rv == 0 && cip->ci_principal != NULL) 1509 rv = fmd_event_equal(ep, cip->ci_principal); 1510 1511 (void) pthread_mutex_unlock(&cip->ci_lock); 1512 1513 if (rv != 0) 1514 fmd_event_transition(ep, state); 1515 1516 return (rv); 1517 } 1518 1519 int 1520 fmd_case_orphaned(fmd_case_t *cp) 1521 { 1522 return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod); 1523 } 1524 1525 void 1526 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec) 1527 { 1528 ((fmd_case_impl_t 
*)cp)->ci_tv.tv_sec = tv_sec; 1529 ((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec; 1530 ((fmd_case_impl_t *)cp)->ci_tv_valid = 1; 1531 } 1532