1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * FMD Case Subsystem 31 * 32 * Diagnosis engines are expected to group telemetry events related to the 33 * diagnosis of a particular problem on the system into a set of cases. The 34 * diagnosis engine may have any number of cases open at a given point in time. 35 * Some cases may eventually be *solved* by associating a suspect list of one 36 * or more problems with the case, at which point fmd publishes a list.suspect 37 * event for the case and it becomes visible to administrators and agents. 38 * 39 * Every case is named using a UUID, and is globally visible in the case hash. 40 * Cases are reference-counted, except for the reference from the case hash 41 * itself. Consumers of case references include modules, which store active 42 * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code. 43 * 44 * Cases obey the following state machine. In states UNSOLVED, SOLVED, and 45 * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine 46 * or transport) and the case is referenced by the mod_cases list. Once the 47 * case reaches the CLOSED or REPAIRED states, a case's module changes to refer 48 * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases. 49 * 50 * +------------+ 51 * +----------| UNSOLVED | 52 * | +------------+ 53 * | 1 | 54 * | | 55 * | +-------v----+ 56 * 2 | | SOLVED | 57 * | +------------+ 58 * | 3 | 5 | 59 * +------------+ | | 60 * | | | 61 * +-v---v----v-+ 62 * | CLOSE_WAIT | 63 * +------------+ 64 * | | | 65 * +-----------+ | +------------+ 66 * | 4 | | 67 * v +-----v------+ | 68 * discard | CLOSED | 6 | 69 * +------------+ | 70 * | | 71 * | +------------+ 72 * 7 | | 73 * +-----v----v-+ 74 * | REPAIRED | 75 * +------------+ 76 * | 77 * 8 | 78 * +-----v------+ 79 * | RESOLVED | 80 * +------------+ 81 * | 82 * v 83 * discard 84 * 85 * The state machine changes are triggered by calls to fmd_case_transition() 86 * from various locations inside of fmd, as described below: 87 * 88 * [1] Called by: fmd_case_solve() 89 * Actions: FMD_CF_SOLVED flag is set in ci_flags 90 * conviction policy is applied to suspect list 91 * suspects convicted are marked faulty (F) in R$ 92 * list.suspect event logged and dispatched 93 * 94 * [2] Called by: fmd_case_close(), fmd_case_uuclose() 95 * Actions: diagnosis engine fmdo_close() entry point scheduled 96 * case discarded upon exit from CLOSE_WAIT 97 * 98 * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose() 99 * Actions: FMD_CF_ISOLATED flag is set in ci_flags 100 * suspects convicted (F) are marked unusable (U) in R$ 101 * diagnosis engine fmdo_close() entry point scheduled 102 * case transitions to CLOSED [4] upon exit from CLOSE_WAIT 103 * 104 * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns) 105 * Actions: list.isolated event dispatched 106 * case deleted from module's list of open cases 107 * 108 * [5] Called by: fmd_case_repair(), fmd_case_update() 109 * Actions: FMD_CF_REPAIR flag is set in ci_flags 110 * diagnosis engine fmdo_close() entry point scheduled 111 * case transitions to REPAIRED [6] upon exit from CLOSE_WAIT 112 * 113 * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns) 114 * Actions: suspects convicted are marked non faulty (!F) in R$ 115 * list.repaired or list.updated event dispatched 116 * 117 * [7] Called by: fmd_case_repair(), fmd_case_update() 118 * Actions: FMD_CF_REPAIR flag is set in ci_flags 119 * suspects convicted are marked non faulty (!F) in R$ 120 * list.repaired or list.updated event dispatched 121 * 122 * [8] Called by: fmd_case_uuresolve() 123 * Actions: list.resolved event dispatched 124 * case is discarded 125 */ 126 127 #include <sys/fm/protocol.h> 128 #include <uuid/uuid.h> 129 #include <alloca.h> 130 131 #include <fmd_alloc.h> 132 #include <fmd_module.h> 133 #include <fmd_error.h> 134 #include <fmd_conf.h> 135 #include <fmd_case.h> 136 #include <fmd_string.h> 137 #include <fmd_subr.h> 138 #include <fmd_protocol.h> 139 #include <fmd_event.h> 140 #include <fmd_eventq.h> 141 #include <fmd_dispq.h> 142 #include <fmd_buf.h> 143 #include <fmd_log.h> 144 #include <fmd_asru.h> 145 #include <fmd_fmri.h> 146 #include <fmd_xprt.h> 147 148 #include <fmd.h> 149 150 static const char *const _fmd_case_snames[] = { 151 "UNSOLVED", /* FMD_CASE_UNSOLVED */ 152 "SOLVED", /* FMD_CASE_SOLVED */ 153 "CLOSE_WAIT", /* FMD_CASE_CLOSE_WAIT */ 154 "CLOSED", /* FMD_CASE_CLOSED */ 155 "REPAIRED", /* FMD_CASE_REPAIRED */ 156 "RESOLVED" /* FMD_CASE_RESOLVED */ 157 }; 158 159 static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *); 160 161 fmd_case_hash_t * 162 fmd_case_hash_create(void) 163 { 164 fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP); 165 166 (void) pthread_rwlock_init(&chp->ch_lock, NULL); 167 chp->ch_hashlen = fmd.d_str_buckets; 168 chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP); 169 chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, 170 FMD_SLEEP); 171 chp->ch_count = 0; 172 173 return (chp); 174 } 175 176 /* 177 * Destroy the case hash. Unlike most of our hash tables, no active references 178 * are kept by the case hash itself; all references come from other subsystems. 179 * The hash must be destroyed after all modules are unloaded; if anything was 180 * present in the hash it would be by definition a reference count leak. 181 */ 182 void 183 fmd_case_hash_destroy(fmd_case_hash_t *chp) 184 { 185 fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen); 186 fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen); 187 fmd_free(chp, sizeof (fmd_case_hash_t)); 188 } 189 190 /* 191 * Take a snapshot of the case hash by placing an additional hold on each 192 * member in an auxiliary array, and then call 'func' for each case. 193 */ 194 void 195 fmd_case_hash_apply(fmd_case_hash_t *chp, 196 void (*func)(fmd_case_t *, void *), void *arg) 197 { 198 fmd_case_impl_t *cp, **cps, **cpp; 199 uint_t cpc, i; 200 201 (void) pthread_rwlock_rdlock(&chp->ch_lock); 202 203 cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP); 204 cpc = chp->ch_count; 205 206 for (i = 0; i < chp->ch_hashlen; i++) { 207 for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) { 208 if (fmd_case_tryhold(cp) != NULL) 209 *cpp++ = cp; 210 } 211 } 212 213 ASSERT(cpp == cps + cpc); 214 (void) pthread_rwlock_unlock(&chp->ch_lock); 215 216 for (i = 0; i < cpc; i++) { 217 func((fmd_case_t *)cps[i], arg); 218 fmd_case_rele((fmd_case_t *)cps[i]); 219 } 220 221 fmd_free(cps, cpc * sizeof (fmd_case_t *)); 222 } 223 224 static void 225 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 226 { 227 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 228 229 cip->ci_code_next = chp->ch_code_hash[h]; 230 chp->ch_code_hash[h] = cip; 231 } 232 233 static void 234 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 235 { 236 fmd_case_impl_t **pp, *cp; 237 238 if (cip->ci_code) { 239 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 240 241 pp = &chp->ch_code_hash[h]; 242 for (cp = *pp; cp != NULL; cp = cp->ci_code_next) { 243 if (cp != cip) 244 pp = &cp->ci_code_next; 245 else 246 break; 247 } 248 if (cp != NULL) { 249 *pp = cp->ci_code_next; 250 cp->ci_code_next = NULL; 251 } 252 } 253 } 254 255 /* 256 * Look up the diagcode for this case and cache it in ci_code. If no suspects 257 * were defined for this case or if the lookup fails, the event dictionary or 258 * module code is broken, and we set the event code to a precomputed default. 259 */ 260 static const char * 261 fmd_case_mkcode(fmd_case_t *cp) 262 { 263 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 264 fmd_case_susp_t *cis; 265 fmd_case_hash_t *chp = fmd.d_cases; 266 267 char **keys, **keyp; 268 const char *s; 269 270 ASSERT(MUTEX_HELD(&cip->ci_lock)); 271 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 272 273 /* 274 * delete any existing entry from code hash if it is on it 275 */ 276 fmd_case_code_hash_delete(chp, cip); 277 278 fmd_free(cip->ci_code, cip->ci_codelen); 279 cip->ci_codelen = cip->ci_mod->mod_codelen; 280 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 281 keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1)); 282 283 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 284 if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0) 285 keyp++; 286 } 287 288 *keyp = NULL; /* mark end of keys[] array for libdiagcode */ 289 290 if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code( 291 cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) { 292 (void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s); 293 fmd_free(cip->ci_code, cip->ci_codelen); 294 cip->ci_codelen = strlen(s) + 1; 295 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 296 (void) strcpy(cip->ci_code, s); 297 } 298 299 /* 300 * add into hash of solved cases 301 */ 302 fmd_case_code_hash_insert(chp, cip); 303 304 return (cip->ci_code); 305 } 306 307 typedef struct { 308 int *fcl_countp; 309 uint8_t *fcl_ba; 310 nvlist_t **fcl_nva; 311 int *fcl_msgp; 312 } fmd_case_lst_t; 313 314 static void 315 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg) 316 { 317 fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg; 318 boolean_t b; 319 int state; 320 321 if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE, 322 &b) == 0 && b == B_FALSE) 323 *entryp->fcl_msgp = B_FALSE; 324 entryp->fcl_ba[*entryp->fcl_countp] = 0; 325 state = fmd_asru_al_getstate(alp); 326 if (state & FMD_ASRU_DEGRADED) 327 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED; 328 if (state & FMD_ASRU_UNUSABLE) 329 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE; 330 if (state & FMD_ASRU_FAULTY) 331 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY; 332 if (!(state & FMD_ASRU_PRESENT)) 333 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT; 334 if (alp->al_reason == FMD_ASRU_REPAIRED) 335 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED; 336 else if (alp->al_reason == FMD_ASRU_REPLACED) 337 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED; 338 else if (alp->al_reason == FMD_ASRU_ACQUITTED) 339 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED; 340 entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event; 341 (*entryp->fcl_countp)++; 342 } 343 344 static void 345 fmd_case_faulty(fmd_asru_link_t *alp, void *arg) 346 { 347 int *faultyp = (int *)arg; 348 349 *faultyp |= (alp->al_flags & FMD_ASRU_FAULTY); 350 } 351 352 static void 353 fmd_case_usable(fmd_asru_link_t *alp, void *arg) 354 { 355 int *usablep = (int *)arg; 356 357 *usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE); 358 } 359 360 static void 361 fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg) 362 { 363 int *not_faultyp = (int *)arg; 364 365 *not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY); 366 } 367 368 /* 369 * Have we got any suspects with an asru that are still unusable and present? 370 */ 371 static void 372 fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg) 373 { 374 int *rvalp = (int *)arg; 375 int state = fmd_asru_al_getstate(alp); 376 nvlist_t *asru; 377 378 if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0) 379 return; 380 *rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT)); 381 } 382 383 nvlist_t * 384 fmd_case_mkevent(fmd_case_t *cp, const char *class) 385 { 386 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 387 nvlist_t **nva, *nvl; 388 uint8_t *ba; 389 int msg = B_TRUE; 390 const char *code; 391 fmd_case_lst_t fcl; 392 int count = 0; 393 394 (void) pthread_mutex_lock(&cip->ci_lock); 395 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 396 397 nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); 398 ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects); 399 400 /* 401 * For each suspect associated with the case, store its fault event 402 * nvlist in 'nva'. We also look to see if any of the suspect faults 403 * have asked not to be messaged. If any of them have made such a 404 * request, propagate that attribute to the composite list.* event. 405 * Finally, store each suspect's faulty status into the bitmap 'ba'. 406 */ 407 fcl.fcl_countp = &count; 408 fcl.fcl_msgp = &msg; 409 fcl.fcl_ba = ba; 410 fcl.fcl_nva = nva; 411 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl); 412 413 if (cip->ci_code == NULL) 414 (void) fmd_case_mkcode(cp); 415 /* 416 * For repair and updated event, we lookup diagcode from dict using key 417 * "list.repaired" or "list.updated" or "list.resolved". 418 */ 419 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 420 (void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code); 421 else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) 422 (void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code); 423 else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0) 424 (void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code); 425 else 426 code = cip->ci_code; 427 428 if (msg == B_FALSE) 429 cip->ci_flags |= FMD_CF_INVISIBLE; 430 431 nvl = fmd_protocol_list(class, cip->ci_mod->mod_fmri, cip->ci_uuid, 432 code, count, nva, ba, msg, &cip->ci_tv); 433 434 (void) pthread_mutex_unlock(&cip->ci_lock); 435 return (nvl); 436 } 437 438 static boolean_t 439 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem) 440 { 441 nvlist_t *new_rsrc; 442 nvlist_t *rsrc; 443 char *new_name = NULL; 444 char *name = NULL; 445 ssize_t new_namelen; 446 ssize_t namelen; 447 int fmri_present = 1; 448 int new_fmri_present = 1; 449 int match = B_FALSE; 450 fmd_topo_t *ftp = fmd_topo_hold(); 451 452 if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0) 453 fmri_present = 0; 454 else { 455 if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1) 456 goto done; 457 name = fmd_alloc(namelen + 1, FMD_SLEEP); 458 if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1) 459 goto done; 460 } 461 if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0) 462 new_fmri_present = 0; 463 else { 464 if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1) 465 goto done; 466 new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP); 467 if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1) 468 goto done; 469 } 470 match = (fmri_present == new_fmri_present && 471 (fmri_present == 0 || 472 topo_fmri_strcmp(ftp->ft_hdl, name, new_name))); 473 done: 474 if (name != NULL) 475 fmd_free(name, namelen + 1); 476 if (new_name != NULL) 477 fmd_free(new_name, new_namelen + 1); 478 fmd_topo_rele(ftp); 479 return (match); 480 } 481 482 static int 483 fmd_case_match_suspect(fmd_case_susp_t *cis, fmd_case_susp_t *xcis) 484 { 485 char *class, *new_class; 486 487 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_ASRU)) 488 return (0); 489 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, 490 FM_FAULT_RESOURCE)) 491 return (0); 492 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_FRU)) 493 return (0); 494 (void) nvlist_lookup_string(xcis->cis_nvl, FM_CLASS, &class); 495 (void) nvlist_lookup_string(cis->cis_nvl, FM_CLASS, &new_class); 496 return (strcmp(class, new_class) == 0); 497 } 498 499 /* 500 * see if an identical suspect list already exists in the cache 501 */ 502 static int 503 fmd_case_check_for_dups(fmd_case_t *cp) 504 { 505 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp, *xcip; 506 fmd_case_hash_t *chp = fmd.d_cases; 507 fmd_case_susp_t *xcis, *cis; 508 int match = 0, match_susp; 509 uint_t h; 510 511 (void) pthread_rwlock_rdlock(&chp->ch_lock); 512 513 /* 514 * Find all cases with this code 515 */ 516 h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 517 for (xcip = chp->ch_code_hash[h]; xcip != NULL; 518 xcip = xcip->ci_code_next) { 519 /* 520 * only look for any cases (apart from this one) 521 * whose code and number of suspects match 522 */ 523 if (xcip == cip || fmd_case_tryhold(xcip) == NULL) 524 continue; 525 if (strcmp(xcip->ci_code, cip->ci_code) != 0 || 526 xcip->ci_nsuspects != cip->ci_nsuspects) { 527 fmd_case_rele((fmd_case_t *)xcip); 528 continue; 529 } 530 531 /* 532 * For each suspect in one list, check if there 533 * is an identical suspect in the other list 534 */ 535 match = 1; 536 for (xcis = xcip->ci_suspects; xcis != NULL; 537 xcis = xcis->cis_next) { 538 match_susp = 0; 539 for (cis = cip->ci_suspects; cis != NULL; 540 cis = cis->cis_next) { 541 if (fmd_case_match_suspect(cis, xcis) == 1) { 542 match_susp = 1; 543 break; 544 } 545 } 546 if (match_susp == 0) { 547 match = 0; 548 break; 549 } 550 } 551 fmd_case_rele((fmd_case_t *)xcip); 552 if (match) { 553 (void) pthread_rwlock_unlock(&chp->ch_lock); 554 return (1); 555 } 556 } 557 (void) pthread_rwlock_unlock(&chp->ch_lock); 558 return (0); 559 } 560 561 /* 562 * Convict suspects in a case by applying a conviction policy and updating the 563 * resource cache prior to emitting the list.suspect event for the given case. 564 * At present, our policy is very simple: convict every suspect in the case. 565 * In the future, this policy can be extended and made configurable to permit: 566 * 567 * - convicting the suspect with the highest FIT rate 568 * - convicting the suspect with the cheapest FRU 569 * - convicting the suspect with the FRU that is in a depot's inventory 570 * - convicting the suspect with the longest lifetime 571 * 572 * and so forth. A word to the wise: this problem is significantly harder that 573 * it seems at first glance. Future work should heed the following advice: 574 * 575 * Hacking the policy into C code here is a very bad idea. The policy needs to 576 * be decided upon very carefully and fundamentally encodes knowledge of what 577 * suspect list combinations can be emitted by what diagnosis engines. As such 578 * fmd's code is the wrong location, because that would require fmd itself to 579 * be updated for every diagnosis engine change, defeating the entire design. 580 * The FMA Event Registry knows the suspect list combinations: policy inputs 581 * can be derived from it and used to produce per-module policy configuration. 582 * 583 * If the policy needs to be dynamic and not statically fixed at either fmd 584 * startup or module load time, any implementation of dynamic policy retrieval 585 * must employ some kind of caching mechanism or be part of a built-in module. 586 * The fmd_case_convict() function is called with locks held inside of fmd and 587 * is not a place where unbounded blocking on some inter-process or inter- 588 * system communication to another service (e.g. another daemon) can occur. 589 */ 590 static int 591 fmd_case_convict(fmd_case_t *cp) 592 { 593 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 594 fmd_asru_hash_t *ahp = fmd.d_asrus; 595 596 fmd_case_susp_t *cis; 597 fmd_asru_link_t *alp; 598 599 (void) pthread_mutex_lock(&cip->ci_lock); 600 (void) fmd_case_mkcode(cp); 601 if (fmd_case_check_for_dups(cp) == 1) { 602 (void) pthread_mutex_unlock(&cip->ci_lock); 603 return (1); 604 } 605 606 /* 607 * no suspect list already exists - allocate new cache entries 608 */ 609 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 610 if ((alp = fmd_asru_hash_create_entry(ahp, 611 cp, cis->cis_nvl)) == NULL) { 612 fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in " 613 "%s: %s\n", cip->ci_uuid, fmd_strerror(errno)); 614 continue; 615 } 616 (void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0); 617 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 618 } 619 620 (void) pthread_mutex_unlock(&cip->ci_lock); 621 return (0); 622 } 623 624 void 625 fmd_case_publish(fmd_case_t *cp, uint_t state) 626 { 627 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 628 fmd_event_t *e; 629 nvlist_t *nvl; 630 char *class; 631 632 if (state == FMD_CASE_CURRENT) 633 state = cip->ci_state; /* use current state */ 634 635 switch (state) { 636 case FMD_CASE_SOLVED: 637 (void) pthread_mutex_lock(&cip->ci_lock); 638 if (cip->ci_tv_valid == 0) { 639 fmd_time_gettimeofday(&cip->ci_tv); 640 cip->ci_tv_valid = 1; 641 } 642 (void) pthread_mutex_unlock(&cip->ci_lock); 643 644 if (fmd_case_convict(cp) == 1) { /* dupclose */ 645 cip->ci_flags &= ~FMD_CF_SOLVED; 646 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0); 647 break; 648 } 649 nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS); 650 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 651 652 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 653 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 654 fmd_log_append(fmd.d_fltlog, e, cp); 655 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 656 fmd_dispq_dispatch(fmd.d_disp, e, class); 657 658 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 659 cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++; 660 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 661 662 break; 663 664 case FMD_CASE_CLOSE_WAIT: 665 fmd_case_hold(cp); 666 e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp); 667 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 668 669 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 670 cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++; 671 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 672 673 break; 674 675 case FMD_CASE_CLOSED: 676 nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS); 677 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 678 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 679 fmd_dispq_dispatch(fmd.d_disp, e, class); 680 break; 681 682 case FMD_CASE_REPAIRED: 683 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 684 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 685 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 686 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 687 fmd_log_append(fmd.d_fltlog, e, cp); 688 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 689 fmd_dispq_dispatch(fmd.d_disp, e, class); 690 break; 691 692 case FMD_CASE_RESOLVED: 693 nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS); 694 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 695 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 696 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 697 fmd_log_append(fmd.d_fltlog, e, cp); 698 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 699 fmd_dispq_dispatch(fmd.d_disp, e, class); 700 break; 701 } 702 } 703 704 fmd_case_t * 705 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid) 706 { 707 fmd_case_impl_t *cip; 708 uint_t h; 709 710 (void) pthread_rwlock_rdlock(&chp->ch_lock); 711 h = fmd_strhash(uuid) % chp->ch_hashlen; 712 713 for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) { 714 if (strcmp(cip->ci_uuid, uuid) == 0) 715 break; 716 } 717 718 /* 719 * If deleting bit is set, treat the case as if it doesn't exist. 720 */ 721 if (cip != NULL) 722 cip = fmd_case_tryhold(cip); 723 724 if (cip == NULL) 725 (void) fmd_set_errno(EFMD_CASE_INVAL); 726 727 (void) pthread_rwlock_unlock(&chp->ch_lock); 728 return ((fmd_case_t *)cip); 729 } 730 731 static fmd_case_impl_t * 732 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 733 { 734 fmd_case_impl_t *eip; 735 uint_t h; 736 737 (void) pthread_rwlock_wrlock(&chp->ch_lock); 738 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 739 740 for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) { 741 if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 && 742 fmd_case_tryhold(eip) != NULL) { 743 (void) pthread_rwlock_unlock(&chp->ch_lock); 744 return (eip); /* uuid already present */ 745 } 746 } 747 748 cip->ci_next = chp->ch_hash[h]; 749 chp->ch_hash[h] = cip; 750 751 chp->ch_count++; 752 ASSERT(chp->ch_count != 0); 753 754 (void) pthread_rwlock_unlock(&chp->ch_lock); 755 return (cip); 756 } 757 758 static void 759 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 760 { 761 fmd_case_impl_t *cp, **pp; 762 uint_t h; 763 764 ASSERT(MUTEX_HELD(&cip->ci_lock)); 765 766 cip->ci_flags |= FMD_CF_DELETING; 767 (void) pthread_mutex_unlock(&cip->ci_lock); 768 769 (void) pthread_rwlock_wrlock(&chp->ch_lock); 770 771 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 772 pp = &chp->ch_hash[h]; 773 774 for (cp = *pp; cp != NULL; cp = cp->ci_next) { 775 if (cp != cip) 776 pp = &cp->ci_next; 777 else 778 break; 779 } 780 781 if (cp == NULL) { 782 fmd_panic("case %p (%s) not found on hash chain %u\n", 783 (void *)cip, cip->ci_uuid, h); 784 } 785 786 *pp = cp->ci_next; 787 cp->ci_next = NULL; 788 789 /* 790 * delete from code hash if it is on it 791 */ 792 fmd_case_code_hash_delete(chp, cip); 793 794 ASSERT(chp->ch_count != 0); 795 chp->ch_count--; 796 797 (void) pthread_rwlock_unlock(&chp->ch_lock); 798 799 (void) pthread_mutex_lock(&cip->ci_lock); 800 ASSERT(cip->ci_flags & FMD_CF_DELETING); 801 } 802 803 fmd_case_t * 804 fmd_case_create(fmd_module_t *mp, void *data) 805 { 806 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 807 fmd_case_impl_t *eip = NULL; 808 uuid_t uuid; 809 810 (void) pthread_mutex_init(&cip->ci_lock, NULL); 811 fmd_buf_hash_create(&cip->ci_bufs); 812 813 fmd_module_hold(mp); 814 cip->ci_mod = mp; 815 cip->ci_refs = 1; 816 cip->ci_state = FMD_CASE_UNSOLVED; 817 cip->ci_flags = FMD_CF_DIRTY; 818 cip->ci_data = data; 819 820 /* 821 * Calling libuuid: get a clue. The library interfaces cleverly do not 822 * define any constant for the length of an unparse string, and do not 823 * permit the caller to specify a buffer length for safety. The spec 824 * says it will be 36 bytes, but we make it tunable just in case. 825 */ 826 (void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen); 827 cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP); 828 829 /* 830 * We expect this loop to execute only once, but code it defensively 831 * against the possibility of libuuid bugs. Keep generating uuids and 832 * attempting to do a hash insert until we get a unique one. 833 */ 834 do { 835 if (eip != NULL) 836 fmd_case_rele((fmd_case_t *)eip); 837 uuid_generate(uuid); 838 uuid_unparse(uuid, cip->ci_uuid); 839 } while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip); 840 841 ASSERT(fmd_module_locked(mp)); 842 fmd_list_append(&mp->mod_cases, cip); 843 fmd_module_setcdirty(mp); 844 845 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 846 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 847 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 848 849 return ((fmd_case_t *)cip); 850 } 851 852 static void 853 fmd_case_destroy_suspects(fmd_case_impl_t *cip) 854 { 855 fmd_case_susp_t *cis, *ncis; 856 857 ASSERT(MUTEX_HELD(&cip->ci_lock)); 858 859 for (cis = cip->ci_suspects; cis != NULL; cis = ncis) { 860 ncis = cis->cis_next; 861 nvlist_free(cis->cis_nvl); 862 fmd_free(cis, sizeof (fmd_case_susp_t)); 863 } 864 865 cip->ci_suspects = NULL; 866 cip->ci_nsuspects = 0; 867 } 868 869 fmd_case_t * 870 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp, 871 uint_t state, const char *uuid, const char *code) 872 { 873 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 874 fmd_case_impl_t *eip; 875 876 ASSERT(state < FMD_CASE_RESOLVED); 877 878 (void) pthread_mutex_init(&cip->ci_lock, NULL); 879 fmd_buf_hash_create(&cip->ci_bufs); 880 881 fmd_module_hold(mp); 882 cip->ci_mod = mp; 883 cip->ci_xprt = xp; 884 cip->ci_refs = 1; 885 cip->ci_state = state; 886 cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP); 887 cip->ci_uuidlen = strlen(cip->ci_uuid); 888 cip->ci_code = fmd_strdup(code, FMD_SLEEP); 889 cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0; 890 891 if (state > FMD_CASE_CLOSE_WAIT) 892 cip->ci_flags |= FMD_CF_SOLVED; 893 894 /* 895 * Insert the case into the global case hash. If the specified UUID is 896 * already present, check to see if it is an orphan: if so, reclaim it; 897 * otherwise if it is owned by a different module then return NULL. 898 */ 899 if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) { 900 (void) pthread_mutex_lock(&cip->ci_lock); 901 cip->ci_refs--; /* decrement to zero */ 902 fmd_case_destroy((fmd_case_t *)cip, B_FALSE); 903 904 cip = eip; /* switch 'cip' to the existing case */ 905 (void) pthread_mutex_lock(&cip->ci_lock); 906 907 /* 908 * If the ASRU cache is trying to recreate an orphan, then just 909 * return the existing case that we found without changing it. 910 */ 911 if (mp == fmd.d_rmod) { 912 /* 913 * When recreating an orphan case, state passed in may 914 * either be CLOSED (faulty) or REPAIRED (!faulty). If 915 * any suspects are still CLOSED (faulty) then the 916 * overall state needs to be CLOSED. 917 */ 918 if (state == FMD_CASE_CLOSED) 919 cip->ci_state = FMD_CASE_CLOSED; 920 (void) pthread_mutex_unlock(&cip->ci_lock); 921 fmd_case_rele((fmd_case_t *)cip); 922 return ((fmd_case_t *)cip); 923 } 924 925 /* 926 * If the existing case isn't an orphan or is being proxied, 927 * then we have a UUID conflict: return failure to the caller. 928 */ 929 if (cip->ci_mod != fmd.d_rmod || xp != NULL) { 930 (void) pthread_mutex_unlock(&cip->ci_lock); 931 fmd_case_rele((fmd_case_t *)cip); 932 return (NULL); 933 } 934 935 /* 936 * If the new module is reclaiming an orphaned case, remove 937 * the case from the root module, switch ci_mod, and then fall 938 * through to adding the case to the new owner module 'mp'. 939 */ 940 fmd_module_lock(cip->ci_mod); 941 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 942 fmd_module_unlock(cip->ci_mod); 943 944 fmd_module_rele(cip->ci_mod); 945 cip->ci_mod = mp; 946 fmd_module_hold(mp); 947 948 fmd_case_destroy_suspects(cip); 949 cip->ci_state = state; 950 951 (void) pthread_mutex_unlock(&cip->ci_lock); 952 fmd_case_rele((fmd_case_t *)cip); 953 } else { 954 /* 955 * add into hash of solved cases 956 */ 957 if (cip->ci_code) 958 fmd_case_code_hash_insert(fmd.d_cases, cip); 959 } 960 961 ASSERT(fmd_module_locked(mp)); 962 fmd_list_append(&mp->mod_cases, cip); 963 964 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 965 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 966 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 967 968 return ((fmd_case_t *)cip); 969 } 970 971 void 972 fmd_case_destroy(fmd_case_t *cp, int visible) 973 { 974 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 975 fmd_case_item_t *cit, *ncit; 976 977 ASSERT(MUTEX_HELD(&cip->ci_lock)); 978 ASSERT(cip->ci_refs == 0); 979 980 if (visible) { 981 TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid)); 982 fmd_case_hash_delete(fmd.d_cases, cip); 983 } 984 985 for (cit = cip->ci_items; cit != NULL; cit = ncit) { 986 ncit = cit->cit_next; 987 fmd_event_rele(cit->cit_event); 988 fmd_free(cit, sizeof (fmd_case_item_t)); 989 } 990 991 fmd_case_destroy_suspects(cip); 992 993 if (cip->ci_principal != NULL) 994 fmd_event_rele(cip->ci_principal); 995 996 fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1); 997 fmd_free(cip->ci_code, cip->ci_codelen); 998 (void) fmd_buf_hash_destroy(&cip->ci_bufs); 999 1000 fmd_module_rele(cip->ci_mod); 1001 fmd_free(cip, sizeof (fmd_case_impl_t)); 1002 } 1003 1004 void 1005 fmd_case_hold(fmd_case_t *cp) 1006 { 1007 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1008 1009 (void) pthread_mutex_lock(&cip->ci_lock); 1010 fmd_case_hold_locked(cp); 1011 (void) pthread_mutex_unlock(&cip->ci_lock); 1012 } 1013 1014 void 1015 fmd_case_hold_locked(fmd_case_t *cp) 1016 { 1017 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1018 1019 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1020 if (cip->ci_flags & FMD_CF_DELETING) 1021 fmd_panic("attempt to hold a deleting case %p (%s)\n", 1022 (void *)cip, cip->ci_uuid); 1023 cip->ci_refs++; 1024 ASSERT(cip->ci_refs != 0); 1025 } 1026 1027 static fmd_case_impl_t * 1028 fmd_case_tryhold(fmd_case_impl_t *cip) 1029 { 1030 /* 1031 * If the case's "deleting" bit is unset, hold and return case, 1032 * otherwise, return NULL. 1033 */ 1034 (void) pthread_mutex_lock(&cip->ci_lock); 1035 if (cip->ci_flags & FMD_CF_DELETING) { 1036 (void) pthread_mutex_unlock(&cip->ci_lock); 1037 cip = NULL; 1038 } else { 1039 fmd_case_hold_locked((fmd_case_t *)cip); 1040 (void) pthread_mutex_unlock(&cip->ci_lock); 1041 } 1042 return (cip); 1043 } 1044 1045 void 1046 fmd_case_rele(fmd_case_t *cp) 1047 { 1048 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1049 1050 (void) pthread_mutex_lock(&cip->ci_lock); 1051 ASSERT(cip->ci_refs != 0); 1052 1053 if (--cip->ci_refs == 0) 1054 fmd_case_destroy((fmd_case_t *)cip, B_TRUE); 1055 else 1056 (void) pthread_mutex_unlock(&cip->ci_lock); 1057 } 1058 1059 void 1060 fmd_case_rele_locked(fmd_case_t *cp) 1061 { 1062 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1063 1064 ASSERT(MUTEX_HELD(&cip->ci_lock)); 1065 --cip->ci_refs; 1066 ASSERT(cip->ci_refs != 0); 1067 } 1068 1069 int 1070 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep) 1071 { 1072 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1073 fmd_case_item_t *cit; 1074 fmd_event_t *oep; 1075 uint_t state; 1076 int new; 1077 1078 fmd_event_hold(ep); 1079 (void) pthread_mutex_lock(&cip->ci_lock); 1080 1081 if (cip->ci_flags & FMD_CF_SOLVED) 1082 state = FMD_EVS_DIAGNOSED; 1083 else 1084 state = FMD_EVS_ACCEPTED; 1085 1086 oep = cip->ci_principal; 1087 cip->ci_principal = ep; 1088 1089 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1090 if (cit->cit_event == ep) 1091 break; 1092 } 1093 1094 cip->ci_flags |= FMD_CF_DIRTY; 1095 new = cit == NULL && ep != oep; 1096 1097 (void) pthread_mutex_unlock(&cip->ci_lock); 1098 1099 fmd_module_setcdirty(cip->ci_mod); 1100 fmd_event_transition(ep, state); 1101 1102 if (oep != NULL) 1103 fmd_event_rele(oep); 1104 1105 return (new); 1106 } 1107 1108 int 1109 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep) 1110 { 1111 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1112 fmd_case_item_t *cit; 1113 uint_t state; 1114 int new; 1115 1116 (void) pthread_mutex_lock(&cip->ci_lock); 1117 1118 if (cip->ci_flags & FMD_CF_SOLVED) 1119 state = FMD_EVS_DIAGNOSED; 1120 else 1121 state = FMD_EVS_ACCEPTED; 1122 1123 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1124 if (cit->cit_event == ep) 1125 break; 1126 } 1127 1128 new = cit == NULL && ep != cip->ci_principal; 1129 1130 /* 1131 * If the event is already in the case or the case is already solved, 1132 * there is no reason to save it: just transition it appropriately. 1133 */ 1134 if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) { 1135 (void) pthread_mutex_unlock(&cip->ci_lock); 1136 fmd_event_transition(ep, state); 1137 return (new); 1138 } 1139 1140 cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP); 1141 fmd_event_hold(ep); 1142 1143 cit->cit_next = cip->ci_items; 1144 cit->cit_event = ep; 1145 1146 cip->ci_items = cit; 1147 cip->ci_nitems++; 1148 1149 cip->ci_flags |= FMD_CF_DIRTY; 1150 (void) pthread_mutex_unlock(&cip->ci_lock); 1151 1152 fmd_module_setcdirty(cip->ci_mod); 1153 fmd_event_transition(ep, state); 1154 1155 return (new); 1156 } 1157 1158 void 1159 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl) 1160 { 1161 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1162 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1163 1164 (void) pthread_mutex_lock(&cip->ci_lock); 1165 ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT); 1166 cip->ci_flags |= FMD_CF_DIRTY; 1167 1168 cis->cis_next = cip->ci_suspects; 1169 cis->cis_nvl = nvl; 1170 1171 cip->ci_suspects = cis; 1172 cip->ci_nsuspects++; 1173 1174 (void) pthread_mutex_unlock(&cip->ci_lock); 1175 fmd_module_setcdirty(cip->ci_mod); 1176 } 1177 1178 void 1179 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl) 1180 { 1181 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1182 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1183 boolean_t b; 1184 1185 (void) pthread_mutex_lock(&cip->ci_lock); 1186 ASSERT(cip->ci_state == FMD_CASE_CLOSED || 1187 cip->ci_state == FMD_CASE_REPAIRED); 1188 ASSERT(cip->ci_mod == fmd.d_rmod); 1189 1190 cis->cis_next = cip->ci_suspects; 1191 cis->cis_nvl = nvl; 1192 1193 if (nvlist_lookup_boolean_value(nvl, 1194 FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE) 1195 cip->ci_flags |= FMD_CF_INVISIBLE; 1196 1197 cip->ci_suspects = cis; 1198 cip->ci_nsuspects++; 1199 1200 (void) pthread_mutex_unlock(&cip->ci_lock); 1201 } 1202 1203 void 1204 fmd_case_reset_suspects(fmd_case_t *cp) 1205 { 1206 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1207 1208 (void) pthread_mutex_lock(&cip->ci_lock); 1209 ASSERT(cip->ci_state < FMD_CASE_SOLVED); 1210 1211 fmd_case_destroy_suspects(cip); 1212 cip->ci_flags |= FMD_CF_DIRTY; 1213 1214 (void) pthread_mutex_unlock(&cip->ci_lock); 1215 fmd_module_setcdirty(cip->ci_mod); 1216 } 1217 1218 /*ARGSUSED*/ 1219 static void 1220 fmd_case_unusable(fmd_asru_link_t *alp, void *arg) 1221 { 1222 (void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE); 1223 } 1224 1225 /* 1226 * Grab ci_lock and update the case state and set the dirty bit. Then perform 1227 * whatever actions and emit whatever events are appropriate for the state. 1228 * Refer to the topmost block comment explaining the state machine for details. 1229 */ 1230 void 1231 fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) 1232 { 1233 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1234 fmd_case_item_t *cit; 1235 fmd_event_t *e; 1236 int resolved = 0; 1237 int any_unusable_and_present = 0; 1238 1239 ASSERT(state <= FMD_CASE_RESOLVED); 1240 (void) pthread_mutex_lock(&cip->ci_lock); 1241 1242 if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED)) 1243 flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED); 1244 1245 cip->ci_flags |= flags; 1246 1247 if (cip->ci_state >= state) { 1248 (void) pthread_mutex_unlock(&cip->ci_lock); 1249 return; /* already in specified state */ 1250 } 1251 1252 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid, 1253 _fmd_case_snames[cip->ci_state], _fmd_case_snames[state])); 1254 1255 cip->ci_state = state; 1256 cip->ci_flags |= FMD_CF_DIRTY; 1257 1258 if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod) 1259 fmd_module_setcdirty(cip->ci_mod); 1260 1261 switch (state) { 1262 case FMD_CASE_SOLVED: 1263 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 1264 fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED); 1265 1266 if (cip->ci_principal != NULL) { 1267 fmd_event_transition(cip->ci_principal, 1268 FMD_EVS_DIAGNOSED); 1269 } 1270 break; 1271 1272 case FMD_CASE_CLOSE_WAIT: 1273 /* 1274 * If the case was never solved, do not change ASRUs. 1275 * If the case was never fmd_case_closed, do not change ASRUs. 1276 * If the case was repaired, do not change ASRUs. 1277 */ 1278 if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED | 1279 FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED)) 1280 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1281 fmd_case_unusable, NULL); 1282 1283 /* 1284 * If an orphaned case transitions to CLOSE_WAIT, the owning 1285 * module is no longer loaded: continue on to CASE_CLOSED. 1286 */ 1287 if (fmd_case_orphaned(cp)) 1288 state = cip->ci_state = FMD_CASE_CLOSED; 1289 break; 1290 1291 case FMD_CASE_REPAIRED: 1292 ASSERT(fmd_case_orphaned(cp)); 1293 1294 /* 1295 * If all suspects are already either usable or not present then 1296 * transition straight to RESOLVED state, publishing both the 1297 * list.repaired and list.resolved. 1298 */ 1299 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1300 fmd_case_unusable_and_present, &any_unusable_and_present); 1301 if (any_unusable_and_present) 1302 break; 1303 1304 fmd_module_lock(cip->ci_mod); 1305 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1306 fmd_module_unlock(cip->ci_mod); 1307 cip->ci_state = FMD_CASE_RESOLVED; 1308 (void) pthread_mutex_unlock(&cip->ci_lock); 1309 fmd_case_publish(cp, state); 1310 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid, 1311 _fmd_case_snames[FMD_CASE_REPAIRED], 1312 _fmd_case_snames[FMD_CASE_RESOLVED])); 1313 state = FMD_CASE_RESOLVED; 1314 resolved = 1; 1315 (void) pthread_mutex_lock(&cip->ci_lock); 1316 break; 1317 1318 case FMD_CASE_RESOLVED: 1319 ASSERT(fmd_case_orphaned(cp)); 1320 1321 /* 1322 * If all suspects are already either usable or not present then 1323 * carry on, publish list.resolved and discard the case. 1324 */ 1325 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1326 fmd_case_unusable_and_present, &any_unusable_and_present); 1327 if (any_unusable_and_present) { 1328 (void) pthread_mutex_unlock(&cip->ci_lock); 1329 return; 1330 } 1331 1332 fmd_module_lock(cip->ci_mod); 1333 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1334 fmd_module_unlock(cip->ci_mod); 1335 resolved = 1; 1336 break; 1337 } 1338 1339 (void) pthread_mutex_unlock(&cip->ci_lock); 1340 1341 /* 1342 * If the module has initialized, then publish the appropriate event 1343 * for the new case state. If not, we are being called from the 1344 * checkpoint code during module load, in which case the module's 1345 * _fmd_init() routine hasn't finished yet, and our event dictionaries 1346 * may not be open yet, which will prevent us from computing the event 1347 * code. Defer the call to fmd_case_publish() by enqueuing a PUBLISH 1348 * event in our queue: this won't be processed until _fmd_init is done. 1349 */ 1350 if (cip->ci_mod->mod_flags & FMD_MOD_INIT) 1351 fmd_case_publish(cp, state); 1352 else { 1353 fmd_case_hold(cp); 1354 e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp); 1355 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 1356 } 1357 1358 if (resolved) { 1359 /* 1360 * If we transitioned to RESOLVED, adjust the reference count to 1361 * reflect our removal from fmd.d_rmod->mod_cases above. If the 1362 * caller has not placed an additional hold on the case, it 1363 * will now be freed. 1364 */ 1365 (void) pthread_mutex_lock(&cip->ci_lock); 1366 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 1367 (void) pthread_mutex_unlock(&cip->ci_lock); 1368 fmd_case_rele(cp); 1369 } 1370 } 1371 1372 /* 1373 * Transition the specified case to *at least* the specified state by first 1374 * re-validating the suspect list using the resource cache. This function is 1375 * employed by the checkpoint code when restoring a saved, solved case to see 1376 * if the state of the case has effectively changed while fmd was not running 1377 * or the module was not loaded. 1378 */ 1379 void 1380 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags) 1381 { 1382 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1383 1384 int usable = 0; /* are any suspects usable? */ 1385 1386 ASSERT(state >= FMD_CASE_SOLVED); 1387 (void) pthread_mutex_lock(&cip->ci_lock); 1388 1389 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable); 1390 1391 (void) pthread_mutex_unlock(&cip->ci_lock); 1392 1393 if (!usable) { 1394 state = MAX(state, FMD_CASE_CLOSE_WAIT); 1395 flags |= FMD_CF_ISOLATED; 1396 } 1397 1398 fmd_case_transition(cp, state, flags); 1399 } 1400 1401 void 1402 fmd_case_setdirty(fmd_case_t *cp) 1403 { 1404 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1405 1406 (void) pthread_mutex_lock(&cip->ci_lock); 1407 cip->ci_flags |= FMD_CF_DIRTY; 1408 (void) pthread_mutex_unlock(&cip->ci_lock); 1409 1410 fmd_module_setcdirty(cip->ci_mod); 1411 } 1412 1413 void 1414 fmd_case_clrdirty(fmd_case_t *cp) 1415 { 1416 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1417 1418 (void) pthread_mutex_lock(&cip->ci_lock); 1419 cip->ci_flags &= ~FMD_CF_DIRTY; 1420 (void) pthread_mutex_unlock(&cip->ci_lock); 1421 } 1422 1423 void 1424 fmd_case_commit(fmd_case_t *cp) 1425 { 1426 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1427 fmd_case_item_t *cit; 1428 1429 (void) pthread_mutex_lock(&cip->ci_lock); 1430 1431 if (cip->ci_flags & FMD_CF_DIRTY) { 1432 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 1433 fmd_event_commit(cit->cit_event); 1434 1435 if (cip->ci_principal != NULL) 1436 fmd_event_commit(cip->ci_principal); 1437 1438 fmd_buf_hash_commit(&cip->ci_bufs); 1439 cip->ci_flags &= ~FMD_CF_DIRTY; 1440 } 1441 1442 (void) pthread_mutex_unlock(&cip->ci_lock); 1443 } 1444 1445 /* 1446 * Indicate that the case may need to change state because one or more of the 1447 * ASRUs named as a suspect has changed state. We examine all the suspects 1448 * and if none are still faulty, we initiate a case close transition. 1449 */ 1450 void 1451 fmd_case_update(fmd_case_t *cp) 1452 { 1453 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1454 uint_t cstate; 1455 int faulty = 0; 1456 1457 (void) pthread_mutex_lock(&cip->ci_lock); 1458 cstate = cip->ci_state; 1459 1460 if (cip->ci_xprt != NULL || cip->ci_state < FMD_CASE_SOLVED) { 1461 (void) pthread_mutex_unlock(&cip->ci_lock); 1462 return; /* update is not appropriate */ 1463 } 1464 1465 if (cip->ci_flags & FMD_CF_REPAIRED) { 1466 (void) pthread_mutex_unlock(&cip->ci_lock); 1467 return; /* already repaired */ 1468 } 1469 1470 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 1471 (void) pthread_mutex_unlock(&cip->ci_lock); 1472 1473 if (faulty) { 1474 nvlist_t *nvl; 1475 fmd_event_t *e; 1476 char *class; 1477 1478 nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS); 1479 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1480 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1481 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 1482 fmd_log_append(fmd.d_fltlog, e, cp); 1483 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 1484 fmd_dispq_dispatch(fmd.d_disp, e, class); 1485 return; /* one or more suspects are still marked faulty */ 1486 } 1487 1488 if (cstate == FMD_CASE_CLOSED) 1489 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1490 else 1491 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1492 } 1493 1494 /* 1495 * Delete a closed case from the module's case list once the fmdo_close() entry 1496 * point has run to completion. If the case is owned by a transport module, 1497 * tell the transport to proxy a case close on the other end of the transport. 1498 * If not, transition to the appropriate next state based on ci_flags. This 1499 * function represents the end of CLOSE_WAIT and transitions the case to either 1500 * CLOSED or REPAIRED or discards it entirely because it was never solved; 1501 * refer to the topmost block comment explaining the state machine for details. 1502 */ 1503 void 1504 fmd_case_delete(fmd_case_t *cp) 1505 { 1506 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1507 fmd_modstat_t *msp; 1508 size_t buftotal; 1509 1510 ASSERT(fmd_module_locked(cip->ci_mod)); 1511 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1512 buftotal = fmd_buf_hash_destroy(&cip->ci_bufs); 1513 1514 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1515 msp = cip->ci_mod->mod_stats; 1516 1517 ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0); 1518 msp->ms_caseopen.fmds_value.ui64--; 1519 1520 ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal); 1521 msp->ms_buftotal.fmds_value.ui64 -= buftotal; 1522 1523 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1524 1525 if (cip->ci_xprt == NULL) 1526 fmd_module_setcdirty(cip->ci_mod); 1527 1528 fmd_module_rele(cip->ci_mod); 1529 cip->ci_mod = fmd.d_rmod; 1530 fmd_module_hold(cip->ci_mod); 1531 1532 /* 1533 * If the case is not proxied and it has been solved, then retain it 1534 * on the root module's case list at least until we're transitioned. 1535 * Otherwise free the case with our final fmd_case_rele() below. 1536 */ 1537 if (cip->ci_xprt == NULL && (cip->ci_flags & FMD_CF_SOLVED)) { 1538 fmd_module_lock(cip->ci_mod); 1539 fmd_list_append(&cip->ci_mod->mod_cases, cip); 1540 fmd_module_unlock(cip->ci_mod); 1541 fmd_case_hold(cp); 1542 } 1543 1544 /* 1545 * If a proxied case finishes CLOSE_WAIT, then it can be discarded 1546 * rather than orphaned because by definition it can have no entries 1547 * in the resource cache of the current fault manager. 1548 */ 1549 if (cip->ci_xprt != NULL) 1550 fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid); 1551 else if (cip->ci_flags & FMD_CF_REPAIRED) 1552 fmd_case_transition(cp, FMD_CASE_REPAIRED, 0); 1553 else if (cip->ci_flags & FMD_CF_ISOLATED) 1554 fmd_case_transition(cp, FMD_CASE_CLOSED, 0); 1555 1556 fmd_case_rele(cp); 1557 } 1558 1559 void 1560 fmd_case_discard(fmd_case_t *cp) 1561 { 1562 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1563 1564 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1565 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--; 1566 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1567 1568 ASSERT(fmd_module_locked(cip->ci_mod)); 1569 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1570 fmd_case_rele(cp); 1571 } 1572 1573 /* 1574 * Indicate that the problem corresponding to a case has been repaired by 1575 * clearing the faulty bit on each ASRU named as a suspect. If the case hasn't 1576 * already been closed, this function initiates the transition to CLOSE_WAIT. 1577 * The caller must have the case held from fmd_case_hash_lookup(), so we can 1578 * grab and drop ci_lock without the case being able to be freed in between. 1579 */ 1580 int 1581 fmd_case_repair(fmd_case_t *cp) 1582 { 1583 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1584 uint_t cstate; 1585 1586 (void) pthread_mutex_lock(&cip->ci_lock); 1587 cstate = cip->ci_state; 1588 1589 if (cip->ci_xprt != NULL) { 1590 (void) pthread_mutex_unlock(&cip->ci_lock); 1591 return (fmd_set_errno(EFMD_CASE_OWNER)); 1592 } 1593 1594 if (cstate < FMD_CASE_SOLVED) { 1595 (void) pthread_mutex_unlock(&cip->ci_lock); 1596 return (fmd_set_errno(EFMD_CASE_STATE)); 1597 } 1598 1599 if (cip->ci_flags & FMD_CF_REPAIRED) { 1600 (void) pthread_mutex_unlock(&cip->ci_lock); 1601 return (0); /* already repaired */ 1602 } 1603 1604 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, NULL); 1605 (void) pthread_mutex_unlock(&cip->ci_lock); 1606 1607 if (cstate == FMD_CASE_CLOSED) 1608 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1609 else 1610 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1611 1612 return (0); 1613 } 1614 1615 int 1616 fmd_case_acquit(fmd_case_t *cp) 1617 { 1618 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1619 uint_t cstate; 1620 1621 (void) pthread_mutex_lock(&cip->ci_lock); 1622 cstate = cip->ci_state; 1623 1624 if (cip->ci_xprt != NULL) { 1625 (void) pthread_mutex_unlock(&cip->ci_lock); 1626 return (fmd_set_errno(EFMD_CASE_OWNER)); 1627 } 1628 1629 if (cstate < FMD_CASE_SOLVED) { 1630 (void) pthread_mutex_unlock(&cip->ci_lock); 1631 return (fmd_set_errno(EFMD_CASE_STATE)); 1632 } 1633 1634 if (cip->ci_flags & FMD_CF_REPAIRED) { 1635 (void) pthread_mutex_unlock(&cip->ci_lock); 1636 return (0); /* already repaired */ 1637 } 1638 1639 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_acquit, NULL); 1640 (void) pthread_mutex_unlock(&cip->ci_lock); 1641 1642 if (cstate == FMD_CASE_CLOSED) 1643 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1644 else 1645 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1646 1647 return (0); 1648 } 1649 1650 int 1651 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep) 1652 { 1653 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1654 fmd_case_item_t *cit; 1655 uint_t state; 1656 int rv = 0; 1657 1658 (void) pthread_mutex_lock(&cip->ci_lock); 1659 1660 if (cip->ci_state >= FMD_CASE_SOLVED) 1661 state = FMD_EVS_DIAGNOSED; 1662 else 1663 state = FMD_EVS_ACCEPTED; 1664 1665 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1666 if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0) 1667 break; 1668 } 1669 1670 if (rv == 0 && cip->ci_principal != NULL) 1671 rv = fmd_event_equal(ep, cip->ci_principal); 1672 1673 (void) pthread_mutex_unlock(&cip->ci_lock); 1674 1675 if (rv != 0) 1676 fmd_event_transition(ep, state); 1677 1678 return (rv); 1679 } 1680 1681 int 1682 fmd_case_orphaned(fmd_case_t *cp) 1683 { 1684 return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod); 1685 } 1686 1687 void 1688 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec) 1689 { 1690 ((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec; 1691 ((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec; 1692 ((fmd_case_impl_t *)cp)->ci_tv_valid = 1; 1693 } 1694 1695 /*ARGSUSED*/ 1696 void 1697 fmd_case_repair_replay_case(fmd_case_t *cp, void *arg) 1698 { 1699 int not_faulty = 0; 1700 int faulty = 0; 1701 nvlist_t *nvl; 1702 fmd_event_t *e; 1703 char *class; 1704 int any_unusable_and_present = 0; 1705 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1706 1707 if (cip->ci_state < FMD_CASE_SOLVED) 1708 return; 1709 1710 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 1711 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty, 1712 ¬_faulty); 1713 1714 if (!faulty) { 1715 /* 1716 * If none of the suspects is faulty, replay the list.repaired. 1717 * If all suspects are already either usable or not present then 1718 * also transition straight to RESOLVED state. 1719 */ 1720 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1721 fmd_case_unusable_and_present, &any_unusable_and_present); 1722 if (!any_unusable_and_present) { 1723 fmd_module_lock(cip->ci_mod); 1724 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1725 fmd_module_unlock(cip->ci_mod); 1726 cip->ci_state = FMD_CASE_RESOLVED; 1727 1728 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 1729 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1730 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, 1731 class); 1732 fmd_dispq_dispatch(fmd.d_disp, e, class); 1733 1734 fmd_case_publish(cp, FMD_CASE_RESOLVED); 1735 (void) pthread_mutex_lock(&cip->ci_lock); 1736 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 1737 (void) pthread_mutex_unlock(&cip->ci_lock); 1738 fmd_case_rele(cp); 1739 } else { 1740 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 1741 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1742 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, 1743 class); 1744 fmd_dispq_dispatch(fmd.d_disp, e, class); 1745 } 1746 } else if (not_faulty) { 1747 /* 1748 * if some but not all of the suspects are not faulty, replay 1749 * the list.updated. 1750 */ 1751 nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS); 1752 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 1753 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 1754 fmd_dispq_dispatch(fmd.d_disp, e, class); 1755 } 1756 } 1757 1758 void 1759 fmd_case_repair_replay() 1760 { 1761 fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL); 1762 } 1763