1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * FMD Case Subsystem 31 * 32 * Diagnosis engines are expected to group telemetry events related to the 33 * diagnosis of a particular problem on the system into a set of cases. The 34 * diagnosis engine may have any number of cases open at a given point in time. 35 * Some cases may eventually be *solved* by associating a suspect list of one 36 * or more problems with the case, at which point fmd publishes a list.suspect 37 * event for the case and it becomes visible to administrators and agents. 38 * 39 * Every case is named using a UUID, and is globally visible in the case hash. 40 * Cases are reference-counted, except for the reference from the case hash 41 * itself. Consumers of case references include modules, which store active 42 * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code. 43 * 44 * Cases obey the following state machine. 
In states UNSOLVED, SOLVED, and
 * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
 * or transport) and the case is referenced by the mod_cases list.  Once the
 * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
 * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
 *
 *                 +------------+
 *      +----------|  UNSOLVED  |
 *      |          +------------+
 *    1 |               4 |
 *      |                 |
 * +----v---+ /-2->+------v-----+    3     +--------+
 * | SOLVED |<     | CLOSE_WAIT |--------->| CLOSED |
 * +--------+ \-5->+------------+          +--------+
 *                        |                    |
 *                      6 |                    | 7
 *                 +------v-----+              |
 *                 |  REPAIRED  |<-------------+
 *                 +------------+
 *
 * The state machine changes are triggered by calls to fmd_case_transition()
 * from various locations inside of fmd, as described below:
 *
 * [1] Called by: fmd_case_solve()
 *     Actions:   FMD_CF_SOLVED flag is set in ci_flags
 *                conviction policy is applied to suspect list
 *                suspects convicted are marked faulty (F) in R$
 *                list.suspect event logged and dispatched
 *
 * [2] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
 *     Actions:   FMD_CF_ISOLATED flag is set in ci_flags
 *                suspects convicted (F) are marked unusable (U) in R$
 *                diagnosis engine fmdo_close() entry point scheduled
 *                case transitions to CLOSED [3] upon exit from CLOSE_WAIT
 *
 * [3] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
 *     Actions:   list.isolated event dispatched
 *                case deleted from module's list of open cases
 *
 * [4] Called by: fmd_case_close(), fmd_case_uuclose()
 *     Actions:   diagnosis engine fmdo_close() entry point scheduled
 *                case is subsequently discarded by fmd_case_delete()
 *
 * [5] Called by: fmd_case_repair(), fmd_case_update()
 *     Actions:   FMD_CF_REPAIRED flag is set in ci_flags
 *                diagnosis engine fmdo_close() entry point scheduled
 *                case transitions to REPAIRED [6] upon exit from
CLOSE_WAIT 91 * 92 * [6] Called by: fmd_case_repair(), fmd_case_update() 93 * Actions: FMD_CF_REPAIR flag is set in ci_flags 94 * suspects convicted are marked non faulty (!F) in R$ 95 * list.repaired event dispatched 96 * 97 * [7] Called by: fmd_case_repair(), fmd_case_update() 98 * Actions: FMD_CF_REPAIR flag is set in ci_flags 99 * suspects convicted are marked non faulty (!F) in R$ 100 * list.repaired event dispatched 101 */ 102 103 #include <sys/fm/protocol.h> 104 #include <uuid/uuid.h> 105 #include <alloca.h> 106 107 #include <fmd_alloc.h> 108 #include <fmd_module.h> 109 #include <fmd_error.h> 110 #include <fmd_conf.h> 111 #include <fmd_case.h> 112 #include <fmd_string.h> 113 #include <fmd_subr.h> 114 #include <fmd_protocol.h> 115 #include <fmd_event.h> 116 #include <fmd_eventq.h> 117 #include <fmd_dispq.h> 118 #include <fmd_buf.h> 119 #include <fmd_log.h> 120 #include <fmd_asru.h> 121 #include <fmd_fmri.h> 122 #include <fmd_xprt.h> 123 124 #include <fmd.h> 125 126 static const char *const _fmd_case_snames[] = { 127 "UNSOLVED", /* FMD_CASE_UNSOLVED */ 128 "SOLVED", /* FMD_CASE_SOLVED */ 129 "CLOSE_WAIT", /* FMD_CASE_CLOSE_WAIT */ 130 "CLOSED", /* FMD_CASE_CLOSED */ 131 "REPAIRED" /* FMD_CASE_REPAIRED */ 132 }; 133 134 extern volatile uint32_t fmd_asru_fake_not_present; 135 136 static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *); 137 138 fmd_case_hash_t * 139 fmd_case_hash_create(void) 140 { 141 fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP); 142 143 (void) pthread_rwlock_init(&chp->ch_lock, NULL); 144 chp->ch_hashlen = fmd.d_str_buckets; 145 chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP); 146 chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, 147 FMD_SLEEP); 148 chp->ch_count = 0; 149 150 return (chp); 151 } 152 153 /* 154 * Destroy the case hash. Unlike most of our hash tables, no active references 155 * are kept by the case hash itself; all references come from other subsystems. 
156 * The hash must be destroyed after all modules are unloaded; if anything was 157 * present in the hash it would be by definition a reference count leak. 158 */ 159 void 160 fmd_case_hash_destroy(fmd_case_hash_t *chp) 161 { 162 fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen); 163 fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen); 164 fmd_free(chp, sizeof (fmd_case_hash_t)); 165 } 166 167 /* 168 * Take a snapshot of the case hash by placing an additional hold on each 169 * member in an auxiliary array, and then call 'func' for each case. 170 */ 171 void 172 fmd_case_hash_apply(fmd_case_hash_t *chp, 173 void (*func)(fmd_case_t *, void *), void *arg) 174 { 175 fmd_case_impl_t *cp, **cps, **cpp; 176 uint_t cpc, i; 177 178 (void) pthread_rwlock_rdlock(&chp->ch_lock); 179 180 cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP); 181 cpc = chp->ch_count; 182 183 for (i = 0; i < chp->ch_hashlen; i++) { 184 for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) { 185 if (fmd_case_tryhold(cp) != NULL) 186 *cpp++ = cp; 187 } 188 } 189 190 ASSERT(cpp == cps + cpc); 191 (void) pthread_rwlock_unlock(&chp->ch_lock); 192 193 for (i = 0; i < cpc; i++) { 194 func((fmd_case_t *)cps[i], arg); 195 fmd_case_rele((fmd_case_t *)cps[i]); 196 } 197 198 fmd_free(cps, cpc * sizeof (fmd_case_t *)); 199 } 200 201 static void 202 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 203 { 204 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 205 206 cip->ci_code_next = chp->ch_code_hash[h]; 207 chp->ch_code_hash[h] = cip; 208 } 209 210 static void 211 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 212 { 213 fmd_case_impl_t **pp, *cp; 214 215 if (cip->ci_code) { 216 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 217 218 pp = &chp->ch_code_hash[h]; 219 for (cp = *pp; cp != NULL; cp = cp->ci_code_next) { 220 if (cp != cip) 221 pp = &cp->ci_code_next; 222 else 223 break; 224 } 225 if (cp != NULL) { 
226 *pp = cp->ci_code_next; 227 cp->ci_code_next = NULL; 228 } 229 } 230 } 231 232 /* 233 * Look up the diagcode for this case and cache it in ci_code. If no suspects 234 * were defined for this case or if the lookup fails, the event dictionary or 235 * module code is broken, and we set the event code to a precomputed default. 236 */ 237 static const char * 238 fmd_case_mkcode(fmd_case_t *cp) 239 { 240 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 241 fmd_case_susp_t *cis; 242 fmd_case_hash_t *chp = fmd.d_cases; 243 244 char **keys, **keyp; 245 const char *s; 246 247 ASSERT(MUTEX_HELD(&cip->ci_lock)); 248 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 249 250 /* 251 * delete any existing entry from code hash if it is on it 252 */ 253 fmd_case_code_hash_delete(chp, cip); 254 255 fmd_free(cip->ci_code, cip->ci_codelen); 256 cip->ci_codelen = cip->ci_mod->mod_codelen; 257 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 258 keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1)); 259 260 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 261 if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0) 262 keyp++; 263 } 264 265 *keyp = NULL; /* mark end of keys[] array for libdiagcode */ 266 267 if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code( 268 cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) { 269 (void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s); 270 fmd_free(cip->ci_code, cip->ci_codelen); 271 cip->ci_codelen = strlen(s) + 1; 272 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP); 273 (void) strcpy(cip->ci_code, s); 274 } 275 276 /* 277 * add into hash of solved cases 278 */ 279 fmd_case_code_hash_insert(chp, cip); 280 281 return (cip->ci_code); 282 } 283 284 typedef struct { 285 int *fcl_countp; 286 uint8_t *fcl_ba; 287 nvlist_t **fcl_nva; 288 int *fcl_msgp; 289 } fmd_case_lst_t; 290 291 static void 292 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg) 293 { 294 fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg; 295 
boolean_t b; 296 int state; 297 298 if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE, 299 &b) == 0 && b == B_FALSE) 300 *entryp->fcl_msgp = B_FALSE; 301 entryp->fcl_ba[*entryp->fcl_countp] = 0; 302 state = fmd_asru_al_getstate(alp); 303 if (state & FMD_ASRU_UNUSABLE) 304 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE; 305 if (state & FMD_ASRU_FAULTY) 306 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY; 307 if (!(state & FMD_ASRU_PRESENT)) 308 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT; 309 entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event; 310 (*entryp->fcl_countp)++; 311 } 312 313 static void 314 fmd_case_faulty(fmd_asru_link_t *alp, void *arg) 315 { 316 int *faultyp = (int *)arg; 317 318 *faultyp |= (alp->al_flags & FMD_ASRU_FAULTY); 319 } 320 321 static void 322 fmd_case_usable(fmd_asru_link_t *alp, void *arg) 323 { 324 int *usablep = (int *)arg; 325 326 *usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE); 327 } 328 329 nvlist_t * 330 fmd_case_mkevent(fmd_case_t *cp, const char *class) 331 { 332 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 333 nvlist_t **nva, *nvl; 334 uint8_t *ba; 335 int msg = B_TRUE; 336 const char *code; 337 fmd_case_lst_t fcl; 338 int count = 0; 339 340 (void) pthread_mutex_lock(&cip->ci_lock); 341 ASSERT(cip->ci_state >= FMD_CASE_SOLVED); 342 343 nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects); 344 ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects); 345 346 /* 347 * For each suspect associated with the case, store its fault event 348 * nvlist in 'nva'. We also look to see if any of the suspect faults 349 * have asked not to be messaged. If any of them have made such a 350 * request, propagate that attribute to the composite list.* event. 351 * Finally, store each suspect's faulty status into the bitmap 'ba'. 
352 */ 353 fcl.fcl_countp = &count; 354 fcl.fcl_msgp = &msg; 355 fcl.fcl_ba = ba; 356 fcl.fcl_nva = nva; 357 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl); 358 359 if (cip->ci_code == NULL) 360 (void) fmd_case_mkcode(cp); 361 /* 362 * For repair event, we lookup diagcode from dict using key 363 * "list.repaired". 364 */ 365 if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) 366 (void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code); 367 else 368 code = cip->ci_code; 369 370 if (msg == B_FALSE) 371 cip->ci_flags |= FMD_CF_INVISIBLE; 372 373 nvl = fmd_protocol_list(class, cip->ci_mod->mod_fmri, cip->ci_uuid, 374 code, count, nva, ba, msg, &cip->ci_tv); 375 376 (void) pthread_mutex_unlock(&cip->ci_lock); 377 return (nvl); 378 } 379 380 static boolean_t 381 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem) 382 { 383 nvlist_t *new_rsrc; 384 nvlist_t *rsrc; 385 char *new_name = NULL; 386 char *name = NULL; 387 ssize_t new_namelen; 388 ssize_t namelen; 389 int fmri_present = 1; 390 int new_fmri_present = 1; 391 int match = B_FALSE; 392 fmd_topo_t *ftp = fmd_topo_hold(); 393 394 if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0) 395 fmri_present = 0; 396 else { 397 if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1) 398 goto done; 399 name = fmd_alloc(namelen + 1, FMD_SLEEP); 400 if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1) 401 goto done; 402 } 403 if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0) 404 new_fmri_present = 0; 405 else { 406 if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1) 407 goto done; 408 new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP); 409 if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1) 410 goto done; 411 } 412 match = (fmri_present == new_fmri_present && 413 (fmri_present == 0 || 414 topo_fmri_strcmp(ftp->ft_hdl, name, new_name))); 415 done: 416 if (name != NULL) 417 fmd_free(name, namelen + 1); 418 if (new_name != NULL) 419 fmd_free(new_name, new_namelen + 1); 420 
fmd_topo_rele(ftp); 421 return (match); 422 } 423 424 static int 425 fmd_case_match_suspect(fmd_case_susp_t *cis, fmd_case_susp_t *xcis) 426 { 427 char *class, *new_class; 428 429 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_ASRU)) 430 return (0); 431 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, 432 FM_FAULT_RESOURCE)) 433 return (0); 434 if (!fmd_case_compare_elem(cis->cis_nvl, xcis->cis_nvl, FM_FAULT_FRU)) 435 return (0); 436 (void) nvlist_lookup_string(xcis->cis_nvl, FM_CLASS, &class); 437 (void) nvlist_lookup_string(cis->cis_nvl, FM_CLASS, &new_class); 438 return (strcmp(class, new_class) == 0); 439 } 440 441 /* 442 * see if an identical suspect list already exists in the cache 443 */ 444 static int 445 fmd_case_check_for_dups(fmd_case_t *cp) 446 { 447 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp, *xcip; 448 fmd_case_hash_t *chp = fmd.d_cases; 449 fmd_case_susp_t *xcis, *cis; 450 int match = 0, match_susp; 451 uint_t h; 452 453 (void) pthread_rwlock_rdlock(&chp->ch_lock); 454 455 /* 456 * Find all cases with this code 457 */ 458 h = fmd_strhash(cip->ci_code) % chp->ch_hashlen; 459 for (xcip = chp->ch_code_hash[h]; xcip != NULL; 460 xcip = xcip->ci_code_next) { 461 /* 462 * only look for any cases (apart from this one) 463 * whose code and number of suspects match 464 */ 465 if (xcip == cip || fmd_case_tryhold(xcip) == NULL) 466 continue; 467 if (strcmp(xcip->ci_code, cip->ci_code) != 0 || 468 xcip->ci_nsuspects != cip->ci_nsuspects) { 469 fmd_case_rele((fmd_case_t *)xcip); 470 continue; 471 } 472 473 /* 474 * For each suspect in one list, check if there 475 * is an identical suspect in the other list 476 */ 477 match = 1; 478 for (xcis = xcip->ci_suspects; xcis != NULL; 479 xcis = xcis->cis_next) { 480 match_susp = 0; 481 for (cis = cip->ci_suspects; cis != NULL; 482 cis = cis->cis_next) { 483 if (fmd_case_match_suspect(cis, xcis) == 1) { 484 match_susp = 1; 485 break; 486 } 487 } 488 if (match_susp == 0) { 489 match = 0; 490 
break; 491 } 492 } 493 fmd_case_rele((fmd_case_t *)xcip); 494 if (match) { 495 (void) pthread_rwlock_unlock(&chp->ch_lock); 496 return (1); 497 } 498 } 499 (void) pthread_rwlock_unlock(&chp->ch_lock); 500 return (0); 501 } 502 503 /* 504 * Convict suspects in a case by applying a conviction policy and updating the 505 * resource cache prior to emitting the list.suspect event for the given case. 506 * At present, our policy is very simple: convict every suspect in the case. 507 * In the future, this policy can be extended and made configurable to permit: 508 * 509 * - convicting the suspect with the highest FIT rate 510 * - convicting the suspect with the cheapest FRU 511 * - convicting the suspect with the FRU that is in a depot's inventory 512 * - convicting the suspect with the longest lifetime 513 * 514 * and so forth. A word to the wise: this problem is significantly harder that 515 * it seems at first glance. Future work should heed the following advice: 516 * 517 * Hacking the policy into C code here is a very bad idea. The policy needs to 518 * be decided upon very carefully and fundamentally encodes knowledge of what 519 * suspect list combinations can be emitted by what diagnosis engines. As such 520 * fmd's code is the wrong location, because that would require fmd itself to 521 * be updated for every diagnosis engine change, defeating the entire design. 522 * The FMA Event Registry knows the suspect list combinations: policy inputs 523 * can be derived from it and used to produce per-module policy configuration. 524 * 525 * If the policy needs to be dynamic and not statically fixed at either fmd 526 * startup or module load time, any implementation of dynamic policy retrieval 527 * must employ some kind of caching mechanism or be part of a built-in module. 
528 * The fmd_case_convict() function is called with locks held inside of fmd and 529 * is not a place where unbounded blocking on some inter-process or inter- 530 * system communication to another service (e.g. another daemon) can occur. 531 */ 532 static int 533 fmd_case_convict(fmd_case_t *cp) 534 { 535 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 536 fmd_asru_hash_t *ahp = fmd.d_asrus; 537 538 fmd_case_susp_t *cis; 539 fmd_asru_link_t *alp; 540 541 (void) pthread_mutex_lock(&cip->ci_lock); 542 (void) fmd_case_mkcode(cp); 543 if (fmd_case_check_for_dups(cp) == 1) { 544 (void) pthread_mutex_unlock(&cip->ci_lock); 545 return (1); 546 } 547 548 /* 549 * no suspect list already exists - allocate new cache entries 550 */ 551 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) { 552 if ((alp = fmd_asru_hash_create_entry(ahp, 553 cp, cis->cis_nvl)) == NULL) { 554 fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in " 555 "%s: %s\n", cip->ci_uuid, fmd_strerror(errno)); 556 continue; 557 } 558 (void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE); 559 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY); 560 } 561 562 (void) pthread_mutex_unlock(&cip->ci_lock); 563 return (0); 564 } 565 566 void 567 fmd_case_publish(fmd_case_t *cp, uint_t state) 568 { 569 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 570 fmd_event_t *e; 571 nvlist_t *nvl; 572 char *class; 573 574 if (state == FMD_CASE_CURRENT) 575 state = cip->ci_state; /* use current state */ 576 577 switch (state) { 578 case FMD_CASE_SOLVED: 579 (void) pthread_mutex_lock(&cip->ci_lock); 580 if (cip->ci_tv_valid == 0) { 581 fmd_time_gettimeofday(&cip->ci_tv); 582 cip->ci_tv_valid = 1; 583 } 584 (void) pthread_mutex_unlock(&cip->ci_lock); 585 586 if (fmd_case_convict(cp) == 1) { /* dupclose */ 587 cip->ci_flags &= ~FMD_CF_SOLVED; 588 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0); 589 break; 590 } 591 nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS); 592 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 593 594 e = 
fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 595 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 596 fmd_log_append(fmd.d_fltlog, e, cp); 597 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 598 fmd_dispq_dispatch(fmd.d_disp, e, class); 599 600 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 601 cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++; 602 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 603 604 break; 605 606 case FMD_CASE_CLOSE_WAIT: 607 fmd_case_hold(cp); 608 e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp); 609 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 610 611 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 612 cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++; 613 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 614 615 break; 616 617 case FMD_CASE_CLOSED: 618 nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS); 619 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 620 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 621 fmd_dispq_dispatch(fmd.d_disp, e, class); 622 break; 623 624 case FMD_CASE_REPAIRED: 625 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS); 626 (void) nvlist_lookup_string(nvl, FM_CLASS, &class); 627 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class); 628 (void) pthread_rwlock_rdlock(&fmd.d_log_lock); 629 fmd_log_append(fmd.d_fltlog, e, cp); 630 (void) pthread_rwlock_unlock(&fmd.d_log_lock); 631 fmd_dispq_dispatch(fmd.d_disp, e, class); 632 break; 633 } 634 } 635 636 fmd_case_t * 637 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid) 638 { 639 fmd_case_impl_t *cip; 640 uint_t h; 641 642 (void) pthread_rwlock_rdlock(&chp->ch_lock); 643 h = fmd_strhash(uuid) % chp->ch_hashlen; 644 645 for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) { 646 if (strcmp(cip->ci_uuid, uuid) == 0) 647 break; 648 } 649 650 /* 651 * If deleting bit is set, treat the case as if it doesn't exist. 
652 */ 653 if (cip != NULL) 654 cip = fmd_case_tryhold(cip); 655 656 if (cip == NULL) 657 (void) fmd_set_errno(EFMD_CASE_INVAL); 658 659 (void) pthread_rwlock_unlock(&chp->ch_lock); 660 return ((fmd_case_t *)cip); 661 } 662 663 static fmd_case_impl_t * 664 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 665 { 666 fmd_case_impl_t *eip; 667 uint_t h; 668 669 (void) pthread_rwlock_wrlock(&chp->ch_lock); 670 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 671 672 for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) { 673 if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 && 674 fmd_case_tryhold(eip) != NULL) { 675 (void) pthread_rwlock_unlock(&chp->ch_lock); 676 return (eip); /* uuid already present */ 677 } 678 } 679 680 cip->ci_next = chp->ch_hash[h]; 681 chp->ch_hash[h] = cip; 682 683 chp->ch_count++; 684 ASSERT(chp->ch_count != 0); 685 686 (void) pthread_rwlock_unlock(&chp->ch_lock); 687 return (cip); 688 } 689 690 static void 691 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip) 692 { 693 fmd_case_impl_t *cp, **pp; 694 uint_t h; 695 696 ASSERT(MUTEX_HELD(&cip->ci_lock)); 697 698 cip->ci_flags |= FMD_CF_DELETING; 699 (void) pthread_mutex_unlock(&cip->ci_lock); 700 701 (void) pthread_rwlock_wrlock(&chp->ch_lock); 702 703 h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen; 704 pp = &chp->ch_hash[h]; 705 706 for (cp = *pp; cp != NULL; cp = cp->ci_next) { 707 if (cp != cip) 708 pp = &cp->ci_next; 709 else 710 break; 711 } 712 713 if (cp == NULL) { 714 fmd_panic("case %p (%s) not found on hash chain %u\n", 715 (void *)cip, cip->ci_uuid, h); 716 } 717 718 *pp = cp->ci_next; 719 cp->ci_next = NULL; 720 721 /* 722 * delete from code hash if it is on it 723 */ 724 fmd_case_code_hash_delete(chp, cip); 725 726 ASSERT(chp->ch_count != 0); 727 chp->ch_count--; 728 729 (void) pthread_rwlock_unlock(&chp->ch_lock); 730 731 (void) pthread_mutex_lock(&cip->ci_lock); 732 ASSERT(cip->ci_flags & FMD_CF_DELETING); 733 } 734 735 fmd_case_t * 736 
fmd_case_create(fmd_module_t *mp, void *data) 737 { 738 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 739 fmd_case_impl_t *eip = NULL; 740 uuid_t uuid; 741 742 (void) pthread_mutex_init(&cip->ci_lock, NULL); 743 fmd_buf_hash_create(&cip->ci_bufs); 744 745 fmd_module_hold(mp); 746 cip->ci_mod = mp; 747 cip->ci_refs = 1; 748 cip->ci_state = FMD_CASE_UNSOLVED; 749 cip->ci_flags = FMD_CF_DIRTY; 750 cip->ci_data = data; 751 752 /* 753 * Calling libuuid: get a clue. The library interfaces cleverly do not 754 * define any constant for the length of an unparse string, and do not 755 * permit the caller to specify a buffer length for safety. The spec 756 * says it will be 36 bytes, but we make it tunable just in case. 757 */ 758 (void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen); 759 cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP); 760 761 /* 762 * We expect this loop to execute only once, but code it defensively 763 * against the possibility of libuuid bugs. Keep generating uuids and 764 * attempting to do a hash insert until we get a unique one. 
765 */ 766 do { 767 if (eip != NULL) 768 fmd_case_rele((fmd_case_t *)eip); 769 uuid_generate(uuid); 770 uuid_unparse(uuid, cip->ci_uuid); 771 } while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip); 772 773 ASSERT(fmd_module_locked(mp)); 774 fmd_list_append(&mp->mod_cases, cip); 775 fmd_module_setcdirty(mp); 776 777 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 778 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 779 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 780 781 return ((fmd_case_t *)cip); 782 } 783 784 static void 785 fmd_case_destroy_suspects(fmd_case_impl_t *cip) 786 { 787 fmd_case_susp_t *cis, *ncis; 788 789 ASSERT(MUTEX_HELD(&cip->ci_lock)); 790 791 for (cis = cip->ci_suspects; cis != NULL; cis = ncis) { 792 ncis = cis->cis_next; 793 nvlist_free(cis->cis_nvl); 794 fmd_free(cis, sizeof (fmd_case_susp_t)); 795 } 796 797 cip->ci_suspects = NULL; 798 cip->ci_nsuspects = 0; 799 } 800 801 fmd_case_t * 802 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp, 803 uint_t state, const char *uuid, const char *code) 804 { 805 fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP); 806 fmd_case_impl_t *eip; 807 808 ASSERT(state < FMD_CASE_REPAIRED); 809 810 (void) pthread_mutex_init(&cip->ci_lock, NULL); 811 fmd_buf_hash_create(&cip->ci_bufs); 812 813 fmd_module_hold(mp); 814 cip->ci_mod = mp; 815 cip->ci_xprt = xp; 816 cip->ci_refs = 1; 817 cip->ci_state = state; 818 cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP); 819 cip->ci_uuidlen = strlen(cip->ci_uuid); 820 cip->ci_code = fmd_strdup(code, FMD_SLEEP); 821 cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0; 822 823 if (state > FMD_CASE_CLOSE_WAIT) 824 cip->ci_flags |= FMD_CF_SOLVED; 825 826 /* 827 * Insert the case into the global case hash. If the specified UUID is 828 * already present, check to see if it is an orphan: if so, reclaim it; 829 * otherwise if it is owned by a different module then return NULL. 
830 */ 831 if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) { 832 (void) pthread_mutex_lock(&cip->ci_lock); 833 cip->ci_refs--; /* decrement to zero */ 834 fmd_case_destroy((fmd_case_t *)cip, B_FALSE); 835 836 cip = eip; /* switch 'cip' to the existing case */ 837 (void) pthread_mutex_lock(&cip->ci_lock); 838 839 /* 840 * If the ASRU cache is trying to recreate an orphan, then just 841 * return the existing case that we found without changing it. 842 */ 843 if (mp == fmd.d_rmod) { 844 (void) pthread_mutex_unlock(&cip->ci_lock); 845 fmd_case_rele((fmd_case_t *)cip); 846 return ((fmd_case_t *)cip); 847 } 848 849 /* 850 * If the existing case isn't an orphan or is being proxied, 851 * then we have a UUID conflict: return failure to the caller. 852 */ 853 if (cip->ci_mod != fmd.d_rmod || xp != NULL) { 854 (void) pthread_mutex_unlock(&cip->ci_lock); 855 fmd_case_rele((fmd_case_t *)cip); 856 return (NULL); 857 } 858 859 /* 860 * If the new module is reclaiming an orphaned case, remove 861 * the case from the root module, switch ci_mod, and then fall 862 * through to adding the case to the new owner module 'mp'. 
863 */ 864 fmd_module_lock(cip->ci_mod); 865 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 866 fmd_module_unlock(cip->ci_mod); 867 868 fmd_module_rele(cip->ci_mod); 869 cip->ci_mod = mp; 870 fmd_module_hold(mp); 871 872 fmd_case_destroy_suspects(cip); 873 cip->ci_state = state; 874 875 (void) pthread_mutex_unlock(&cip->ci_lock); 876 fmd_case_rele((fmd_case_t *)cip); 877 } else { 878 /* 879 * add into hash of solved cases 880 */ 881 if (cip->ci_code) 882 fmd_case_code_hash_insert(fmd.d_cases, cip); 883 } 884 885 ASSERT(fmd_module_locked(mp)); 886 fmd_list_append(&mp->mod_cases, cip); 887 888 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 889 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++; 890 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 891 892 return ((fmd_case_t *)cip); 893 } 894 895 void 896 fmd_case_destroy(fmd_case_t *cp, int visible) 897 { 898 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 899 fmd_case_item_t *cit, *ncit; 900 901 ASSERT(MUTEX_HELD(&cip->ci_lock)); 902 ASSERT(cip->ci_refs == 0); 903 904 if (visible) { 905 TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid)); 906 fmd_case_hash_delete(fmd.d_cases, cip); 907 } 908 909 for (cit = cip->ci_items; cit != NULL; cit = ncit) { 910 ncit = cit->cit_next; 911 fmd_event_rele(cit->cit_event); 912 fmd_free(cit, sizeof (fmd_case_item_t)); 913 } 914 915 fmd_case_destroy_suspects(cip); 916 917 if (cip->ci_principal != NULL) 918 fmd_event_rele(cip->ci_principal); 919 920 fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1); 921 fmd_free(cip->ci_code, cip->ci_codelen); 922 (void) fmd_buf_hash_destroy(&cip->ci_bufs); 923 924 fmd_module_rele(cip->ci_mod); 925 fmd_free(cip, sizeof (fmd_case_impl_t)); 926 } 927 928 void 929 fmd_case_hold(fmd_case_t *cp) 930 { 931 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 932 933 (void) pthread_mutex_lock(&cip->ci_lock); 934 fmd_case_hold_locked(cp); 935 (void) pthread_mutex_unlock(&cip->ci_lock); 936 } 937 938 void 939 fmd_case_hold_locked(fmd_case_t *cp) 
940 { 941 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 942 943 ASSERT(MUTEX_HELD(&cip->ci_lock)); 944 if (cip->ci_flags & FMD_CF_DELETING) 945 fmd_panic("attempt to hold a deleting case %p (%s)\n", 946 (void *)cip, cip->ci_uuid); 947 cip->ci_refs++; 948 ASSERT(cip->ci_refs != 0); 949 } 950 951 static fmd_case_impl_t * 952 fmd_case_tryhold(fmd_case_impl_t *cip) 953 { 954 /* 955 * If the case's "deleting" bit is unset, hold and return case, 956 * otherwise, return NULL. 957 */ 958 (void) pthread_mutex_lock(&cip->ci_lock); 959 if (cip->ci_flags & FMD_CF_DELETING) { 960 (void) pthread_mutex_unlock(&cip->ci_lock); 961 cip = NULL; 962 } else { 963 fmd_case_hold_locked((fmd_case_t *)cip); 964 (void) pthread_mutex_unlock(&cip->ci_lock); 965 } 966 return (cip); 967 } 968 969 void 970 fmd_case_rele(fmd_case_t *cp) 971 { 972 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 973 974 (void) pthread_mutex_lock(&cip->ci_lock); 975 ASSERT(cip->ci_refs != 0); 976 977 if (--cip->ci_refs == 0) 978 fmd_case_destroy((fmd_case_t *)cip, B_TRUE); 979 else 980 (void) pthread_mutex_unlock(&cip->ci_lock); 981 } 982 983 void 984 fmd_case_rele_locked(fmd_case_t *cp) 985 { 986 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 987 988 ASSERT(MUTEX_HELD(&cip->ci_lock)); 989 --cip->ci_refs; 990 ASSERT(cip->ci_refs != 0); 991 } 992 993 int 994 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep) 995 { 996 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 997 fmd_case_item_t *cit; 998 fmd_event_t *oep; 999 uint_t state; 1000 int new; 1001 1002 fmd_event_hold(ep); 1003 (void) pthread_mutex_lock(&cip->ci_lock); 1004 1005 if (cip->ci_flags & FMD_CF_SOLVED) 1006 state = FMD_EVS_DIAGNOSED; 1007 else 1008 state = FMD_EVS_ACCEPTED; 1009 1010 oep = cip->ci_principal; 1011 cip->ci_principal = ep; 1012 1013 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1014 if (cit->cit_event == ep) 1015 break; 1016 } 1017 1018 cip->ci_flags |= FMD_CF_DIRTY; 1019 new = cit == NULL && ep != oep; 1020 1021 (void) 
pthread_mutex_unlock(&cip->ci_lock); 1022 1023 fmd_module_setcdirty(cip->ci_mod); 1024 fmd_event_transition(ep, state); 1025 1026 if (oep != NULL) 1027 fmd_event_rele(oep); 1028 1029 return (new); 1030 } 1031 1032 int 1033 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep) 1034 { 1035 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1036 fmd_case_item_t *cit; 1037 uint_t state; 1038 int new; 1039 1040 (void) pthread_mutex_lock(&cip->ci_lock); 1041 1042 if (cip->ci_flags & FMD_CF_SOLVED) 1043 state = FMD_EVS_DIAGNOSED; 1044 else 1045 state = FMD_EVS_ACCEPTED; 1046 1047 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1048 if (cit->cit_event == ep) 1049 break; 1050 } 1051 1052 new = cit == NULL && ep != cip->ci_principal; 1053 1054 /* 1055 * If the event is already in the case or the case is already solved, 1056 * there is no reason to save it: just transition it appropriately. 1057 */ 1058 if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) { 1059 (void) pthread_mutex_unlock(&cip->ci_lock); 1060 fmd_event_transition(ep, state); 1061 return (new); 1062 } 1063 1064 cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP); 1065 fmd_event_hold(ep); 1066 1067 cit->cit_next = cip->ci_items; 1068 cit->cit_event = ep; 1069 1070 cip->ci_items = cit; 1071 cip->ci_nitems++; 1072 1073 cip->ci_flags |= FMD_CF_DIRTY; 1074 (void) pthread_mutex_unlock(&cip->ci_lock); 1075 1076 fmd_module_setcdirty(cip->ci_mod); 1077 fmd_event_transition(ep, state); 1078 1079 return (new); 1080 } 1081 1082 void 1083 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl) 1084 { 1085 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1086 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1087 1088 (void) pthread_mutex_lock(&cip->ci_lock); 1089 ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT); 1090 cip->ci_flags |= FMD_CF_DIRTY; 1091 1092 cis->cis_next = cip->ci_suspects; 1093 cis->cis_nvl = nvl; 1094 1095 cip->ci_suspects = cis; 1096 cip->ci_nsuspects++; 1097 1098 (void) 
pthread_mutex_unlock(&cip->ci_lock); 1099 fmd_module_setcdirty(cip->ci_mod); 1100 } 1101 1102 void 1103 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl) 1104 { 1105 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1106 fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP); 1107 boolean_t b; 1108 1109 (void) pthread_mutex_lock(&cip->ci_lock); 1110 ASSERT(cip->ci_state == FMD_CASE_CLOSED); 1111 ASSERT(cip->ci_mod == fmd.d_rmod); 1112 1113 cis->cis_next = cip->ci_suspects; 1114 cis->cis_nvl = nvl; 1115 1116 if (nvlist_lookup_boolean_value(nvl, 1117 FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE) 1118 cip->ci_flags |= FMD_CF_INVISIBLE; 1119 1120 cip->ci_suspects = cis; 1121 cip->ci_nsuspects++; 1122 1123 (void) pthread_mutex_unlock(&cip->ci_lock); 1124 } 1125 1126 void 1127 fmd_case_reset_suspects(fmd_case_t *cp) 1128 { 1129 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1130 1131 (void) pthread_mutex_lock(&cip->ci_lock); 1132 ASSERT(cip->ci_state < FMD_CASE_SOLVED); 1133 1134 fmd_case_destroy_suspects(cip); 1135 cip->ci_flags |= FMD_CF_DIRTY; 1136 1137 (void) pthread_mutex_unlock(&cip->ci_lock); 1138 fmd_module_setcdirty(cip->ci_mod); 1139 } 1140 1141 /*ARGSUSED*/ 1142 static void 1143 fmd_case_unusable(fmd_asru_link_t *alp, void *arg) 1144 { 1145 (void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE); 1146 } 1147 1148 /* 1149 * Grab ci_lock and update the case state and set the dirty bit. Then perform 1150 * whatever actions and emit whatever events are appropriate for the state. 1151 * Refer to the topmost block comment explaining the state machine for details. 
1152 */ 1153 void 1154 fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags) 1155 { 1156 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1157 fmd_case_item_t *cit; 1158 fmd_event_t *e; 1159 1160 ASSERT(state <= FMD_CASE_REPAIRED); 1161 (void) pthread_mutex_lock(&cip->ci_lock); 1162 1163 if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED)) 1164 flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED); 1165 1166 cip->ci_flags |= flags; 1167 1168 if (cip->ci_state >= state) { 1169 (void) pthread_mutex_unlock(&cip->ci_lock); 1170 return; /* already in specified state */ 1171 } 1172 1173 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid, 1174 _fmd_case_snames[cip->ci_state], _fmd_case_snames[state])); 1175 1176 cip->ci_state = state; 1177 cip->ci_flags |= FMD_CF_DIRTY; 1178 1179 if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod) 1180 fmd_module_setcdirty(cip->ci_mod); 1181 1182 switch (state) { 1183 case FMD_CASE_SOLVED: 1184 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) 1185 fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED); 1186 1187 if (cip->ci_principal != NULL) { 1188 fmd_event_transition(cip->ci_principal, 1189 FMD_EVS_DIAGNOSED); 1190 } 1191 break; 1192 1193 case FMD_CASE_CLOSE_WAIT: 1194 /* 1195 * If the case was never solved, do not change ASRUs. 1196 * If the case was never fmd_case_closed, do not change ASRUs. 1197 * If the case was repaired, do not change ASRUs. 1198 */ 1199 if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED | 1200 FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED)) 1201 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, 1202 fmd_case_unusable, NULL); 1203 1204 /* 1205 * If an orphaned case transitions to CLOSE_WAIT, the owning 1206 * module is no longer loaded: continue on to CASE_CLOSED. 
1207 */ 1208 if (fmd_case_orphaned(cp)) 1209 state = cip->ci_state = FMD_CASE_CLOSED; 1210 break; 1211 1212 case FMD_CASE_REPAIRED: 1213 ASSERT(fmd_case_orphaned(cp)); 1214 fmd_module_lock(cip->ci_mod); 1215 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1216 fmd_module_unlock(cip->ci_mod); 1217 break; 1218 } 1219 1220 (void) pthread_mutex_unlock(&cip->ci_lock); 1221 1222 /* 1223 * If the module has initialized, then publish the appropriate event 1224 * for the new case state. If not, we are being called from the 1225 * checkpoint code during module load, in which case the module's 1226 * _fmd_init() routine hasn't finished yet, and our event dictionaries 1227 * may not be open yet, which will prevent us from computing the event 1228 * code. Defer the call to fmd_case_publish() by enqueuing a PUBLISH 1229 * event in our queue: this won't be processed until _fmd_init is done. 1230 */ 1231 if (cip->ci_mod->mod_flags & FMD_MOD_INIT) 1232 fmd_case_publish(cp, state); 1233 else { 1234 fmd_case_hold(cp); 1235 e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp); 1236 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e); 1237 } 1238 1239 /* 1240 * If we transitioned to REPAIRED, adjust the reference count to 1241 * reflect our removal from fmd.d_rmod->mod_cases. If the caller has 1242 * not placed an additional hold on the case, it will now be freed. 1243 */ 1244 if (state == FMD_CASE_REPAIRED) { 1245 (void) pthread_mutex_lock(&cip->ci_lock); 1246 fmd_asru_hash_delete_case(fmd.d_asrus, cp); 1247 (void) pthread_mutex_unlock(&cip->ci_lock); 1248 fmd_case_rele(cp); 1249 } 1250 } 1251 1252 /* 1253 * Transition the specified case to *at least* the specified state by first 1254 * re-validating the suspect list using the resource cache. This function is 1255 * employed by the checkpoint code when restoring a saved, solved case to see 1256 * if the state of the case has effectively changed while fmd was not running 1257 * or the module was not loaded. 
If none of the suspects are present anymore, 1258 * advance the state to REPAIRED. If none are usable, advance to CLOSE_WAIT. 1259 */ 1260 void 1261 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags) 1262 { 1263 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1264 1265 int faulty = 0; /* are any suspects faulty? */ 1266 int usable = 0; /* are any suspects usable? */ 1267 1268 ASSERT(state >= FMD_CASE_SOLVED); 1269 (void) pthread_mutex_lock(&cip->ci_lock); 1270 1271 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 1272 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable); 1273 1274 (void) pthread_mutex_unlock(&cip->ci_lock); 1275 1276 /* 1277 * If none of the suspects were faulty, it implies they were either 1278 * repaired already or not present and the rsrc.age time has expired. 1279 * We can move the state on to repaired. 1280 */ 1281 if (!faulty) { 1282 state = MAX(state, FMD_CASE_CLOSE_WAIT); 1283 flags |= FMD_CF_REPAIRED; 1284 } else if (!usable) { 1285 state = MAX(state, FMD_CASE_CLOSE_WAIT); 1286 flags |= FMD_CF_ISOLATED; 1287 } 1288 1289 fmd_case_transition(cp, state, flags); 1290 } 1291 1292 void 1293 fmd_case_setdirty(fmd_case_t *cp) 1294 { 1295 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1296 1297 (void) pthread_mutex_lock(&cip->ci_lock); 1298 cip->ci_flags |= FMD_CF_DIRTY; 1299 (void) pthread_mutex_unlock(&cip->ci_lock); 1300 1301 fmd_module_setcdirty(cip->ci_mod); 1302 } 1303 1304 void 1305 fmd_case_clrdirty(fmd_case_t *cp) 1306 { 1307 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1308 1309 (void) pthread_mutex_lock(&cip->ci_lock); 1310 cip->ci_flags &= ~FMD_CF_DIRTY; 1311 (void) pthread_mutex_unlock(&cip->ci_lock); 1312 } 1313 1314 void 1315 fmd_case_commit(fmd_case_t *cp) 1316 { 1317 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1318 fmd_case_item_t *cit; 1319 1320 (void) pthread_mutex_lock(&cip->ci_lock); 1321 1322 if (cip->ci_flags & FMD_CF_DIRTY) { 1323 for (cit = cip->ci_items; cit != 
NULL; cit = cit->cit_next) 1324 fmd_event_commit(cit->cit_event); 1325 1326 if (cip->ci_principal != NULL) 1327 fmd_event_commit(cip->ci_principal); 1328 1329 fmd_buf_hash_commit(&cip->ci_bufs); 1330 cip->ci_flags &= ~FMD_CF_DIRTY; 1331 } 1332 1333 (void) pthread_mutex_unlock(&cip->ci_lock); 1334 } 1335 1336 /* 1337 * Indicate that the case may need to change state because one or more of the 1338 * ASRUs named as a suspect has changed state. We examine all the suspects 1339 * and if none are still faulty, we initiate a case close transition. 1340 */ 1341 void 1342 fmd_case_update(fmd_case_t *cp) 1343 { 1344 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1345 uint_t cstate; 1346 int faulty = 0; 1347 1348 (void) pthread_mutex_lock(&cip->ci_lock); 1349 cstate = cip->ci_state; 1350 1351 if (cip->ci_xprt != NULL || cip->ci_state < FMD_CASE_SOLVED) { 1352 (void) pthread_mutex_unlock(&cip->ci_lock); 1353 return; /* update is not appropriate */ 1354 } 1355 1356 if (cip->ci_flags & FMD_CF_REPAIRED) { 1357 (void) pthread_mutex_unlock(&cip->ci_lock); 1358 return; /* already repaired */ 1359 } 1360 1361 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty); 1362 (void) pthread_mutex_unlock(&cip->ci_lock); 1363 1364 if (faulty) 1365 return; /* one or more suspects are still marked faulty */ 1366 1367 if (cstate == FMD_CASE_CLOSED) 1368 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1369 else 1370 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1371 } 1372 1373 /* 1374 * Delete a closed case from the module's case list once the fmdo_close() entry 1375 * point has run to completion. If the case is owned by a transport module, 1376 * tell the transport to proxy a case close on the other end of the transport. 1377 * If not, transition to the appropriate next state based on ci_flags. 
This 1378 * function represents the end of CLOSE_WAIT and transitions the case to either 1379 * CLOSED or REPAIRED or discards it entirely because it was never solved; 1380 * refer to the topmost block comment explaining the state machine for details. 1381 */ 1382 void 1383 fmd_case_delete(fmd_case_t *cp) 1384 { 1385 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1386 fmd_modstat_t *msp; 1387 size_t buftotal; 1388 1389 ASSERT(fmd_module_locked(cip->ci_mod)); 1390 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1391 buftotal = fmd_buf_hash_destroy(&cip->ci_bufs); 1392 1393 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1394 msp = cip->ci_mod->mod_stats; 1395 1396 ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0); 1397 msp->ms_caseopen.fmds_value.ui64--; 1398 1399 ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal); 1400 msp->ms_buftotal.fmds_value.ui64 -= buftotal; 1401 1402 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1403 1404 if (cip->ci_xprt == NULL) 1405 fmd_module_setcdirty(cip->ci_mod); 1406 1407 fmd_module_rele(cip->ci_mod); 1408 cip->ci_mod = fmd.d_rmod; 1409 fmd_module_hold(cip->ci_mod); 1410 1411 /* 1412 * If the case is not proxied and it has been solved, then retain it 1413 * on the root module's case list at least until we're transitioned. 1414 * Otherwise free the case with our final fmd_case_rele() below. 1415 */ 1416 if (cip->ci_xprt == NULL && (cip->ci_flags & FMD_CF_SOLVED)) { 1417 fmd_module_lock(cip->ci_mod); 1418 fmd_list_append(&cip->ci_mod->mod_cases, cip); 1419 fmd_module_unlock(cip->ci_mod); 1420 fmd_case_hold(cp); 1421 } 1422 1423 /* 1424 * If a proxied case finishes CLOSE_WAIT, then it can be discarded 1425 * rather than orphaned because by definition it can have no entries 1426 * in the resource cache of the current fault manager. 
1427 */ 1428 if (cip->ci_xprt != NULL) 1429 fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid); 1430 else if (cip->ci_flags & FMD_CF_REPAIRED) 1431 fmd_case_transition(cp, FMD_CASE_REPAIRED, 0); 1432 else if (cip->ci_flags & FMD_CF_ISOLATED) 1433 fmd_case_transition(cp, FMD_CASE_CLOSED, 0); 1434 1435 fmd_case_rele(cp); 1436 } 1437 1438 void 1439 fmd_case_discard(fmd_case_t *cp) 1440 { 1441 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1442 1443 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock); 1444 cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--; 1445 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock); 1446 1447 ASSERT(fmd_module_locked(cip->ci_mod)); 1448 fmd_list_delete(&cip->ci_mod->mod_cases, cip); 1449 fmd_case_rele(cp); 1450 } 1451 1452 /* 1453 * Indicate that the problem corresponding to a case has been repaired by 1454 * clearing the faulty bit on each ASRU named as a suspect. If the case hasn't 1455 * already been closed, this function initiates the transition to CLOSE_WAIT. 1456 * The caller must have the case held from fmd_case_hash_lookup(), so we can 1457 * grab and drop ci_lock without the case being able to be freed in between. 
1458 */ 1459 int 1460 fmd_case_repair(fmd_case_t *cp) 1461 { 1462 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1463 uint_t cstate; 1464 1465 (void) pthread_mutex_lock(&cip->ci_lock); 1466 cstate = cip->ci_state; 1467 1468 if (cip->ci_xprt != NULL) { 1469 (void) pthread_mutex_unlock(&cip->ci_lock); 1470 return (fmd_set_errno(EFMD_CASE_OWNER)); 1471 } 1472 1473 if (cstate < FMD_CASE_SOLVED) { 1474 (void) pthread_mutex_unlock(&cip->ci_lock); 1475 return (fmd_set_errno(EFMD_CASE_STATE)); 1476 } 1477 1478 if (cip->ci_flags & FMD_CF_REPAIRED) { 1479 (void) pthread_mutex_unlock(&cip->ci_lock); 1480 return (0); /* already repaired */ 1481 } 1482 1483 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repair, NULL); 1484 (void) pthread_mutex_unlock(&cip->ci_lock); 1485 1486 if (cstate == FMD_CASE_CLOSED) 1487 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED); 1488 else 1489 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED); 1490 1491 return (0); 1492 } 1493 1494 int 1495 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep) 1496 { 1497 fmd_case_impl_t *cip = (fmd_case_impl_t *)cp; 1498 fmd_case_item_t *cit; 1499 uint_t state; 1500 int rv = 0; 1501 1502 (void) pthread_mutex_lock(&cip->ci_lock); 1503 1504 if (cip->ci_state >= FMD_CASE_SOLVED) 1505 state = FMD_EVS_DIAGNOSED; 1506 else 1507 state = FMD_EVS_ACCEPTED; 1508 1509 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) { 1510 if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0) 1511 break; 1512 } 1513 1514 if (rv == 0 && cip->ci_principal != NULL) 1515 rv = fmd_event_equal(ep, cip->ci_principal); 1516 1517 (void) pthread_mutex_unlock(&cip->ci_lock); 1518 1519 if (rv != 0) 1520 fmd_event_transition(ep, state); 1521 1522 return (rv); 1523 } 1524 1525 int 1526 fmd_case_orphaned(fmd_case_t *cp) 1527 { 1528 return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod); 1529 } 1530 1531 void 1532 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec) 1533 { 1534 ((fmd_case_impl_t 
*)cp)->ci_tv.tv_sec = tv_sec; 1535 ((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec; 1536 ((fmd_case_impl_t *)cp)->ci_tv_valid = 1; 1537 } 1538