/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * PM8001 device state recovery routines
 */

#include <sys/scsi/adapters/pmcs/pmcs.h>

/*
 * Forward declarations of file-local helpers
 */
static void pmcs_ds_operational(pmcs_phy_t *pptr, pmcs_xscsi_t *tgt);
static void pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp,
    pmcs_xscsi_t *tgt, pmcs_hw_t *pwp, const char *func_name,
    char *reason_string);

/*
 * Get device state.  Called with the PHY lock held and, if the target
 * exists (xp != NULL), with its statlock held as well.
 */
static int
pmcs_get_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp,
    uint8_t *ds)
{
	uint32_t htag, *ptr, msg[PMCS_MSG_SIZE];
	int result;
	struct pmcwork *pwrk;

	pmcs_prt(pwp, PMCS_PRT_DEBUG3, phyp, xp, "%s: tgt(0x%p)", __func__,
	    (void *)xp);

	if (xp != NULL) {
		ASSERT(mutex_owned(&xp->statlock));
	}

	if (phyp == NULL) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, xp,
		    "%s: PHY is NULL", __func__);
		return (-1);
	}
	ASSERT(mutex_owned(&phyp->phy_lock));

	pwrk = pmcs_gwork(pwp, PMCS_TAG_TYPE_WAIT, phyp);
	if (pwrk == NULL) {
		pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__);
		return (-1);
	}
	/*
	 * Point the work structure's arg at our local msg buffer; the
	 * completion path presumably copies the reply IOMB there so we can
	 * examine it after WAIT_FOR below.
	 */
	pwrk->arg = msg;
	pwrk->dtype = phyp->dtype;

	if (phyp->valid_device_id == 0) {
		pmcs_pwork(pwp, pwrk);
		pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, xp,
		    "%s: Invalid DeviceID", __func__);
		return (-1);
	}
	/* Build the GET_DEVICE_STATE IOMB */
	htag = pwrk->htag;
	msg[0] = LE_32(PMCS_HIPRI(pwp, PMCS_OQ_GENERAL,
	    PMCIN_GET_DEVICE_STATE));
	msg[1] = LE_32(pwrk->htag);
	msg[2] = LE_32(phyp->device_id);
	CLEAN_MESSAGE(msg, 3);

	mutex_enter(&pwp->iqp_lock[PMCS_IQ_OTHER]);
	ptr = GET_IQ_ENTRY(pwp, PMCS_IQ_OTHER);
	if (ptr == NULL) {
		mutex_exit(&pwp->iqp_lock[PMCS_IQ_OTHER]);
		pmcs_pwork(pwp, pwrk);
		pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__);
		return (-1);
	}
	COPY_MESSAGE(ptr, msg, PMCS_MSG_SIZE);
	pwrk->state = PMCS_WORK_STATE_ONCHIP;
	INC_IQ_ENTRY(pwp, PMCS_IQ_OTHER);

	/*
	 * Drop statlock (if held) and the PHY lock across the wait; they
	 * are reacquired below, in the opposite order they were dropped.
	 */
	if (xp != NULL) {
		mutex_exit(&xp->statlock);
	}
	pmcs_unlock_phy(phyp);
	WAIT_FOR(pwrk, 1000, result);
	pmcs_pwork(pwp, pwrk);
	pmcs_lock_phy(phyp);

	if (xp != NULL) {
		mutex_enter(&xp->statlock);
	}

	if (result) {
		pmcs_timed_out(pwp, htag, __func__);
		pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, xp,
		    "%s: cmd timed out, returning", __func__);
		return (-1);
	}
	/* msg[2] holds the reply status; msg[4] the device state */
	if (LE_32(msg[2]) == 0) {
		*ds = (uint8_t)(LE_32(msg[4]));
		if (xp == NULL) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
			    "%s: retrieved_ds=0x%x", __func__, *ds);
		} else if (*ds != xp->dev_state) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
			    "%s: retrieved_ds=0x%x, target_ds=0x%x", __func__,
			    *ds, xp->dev_state);
		}
		return (0);
	} else {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: cmd failed Status(0x%x), returning ", __func__,
		    LE_32(msg[2]));
		return (-1);
	}
}

/*
 * Set device state.  Called with target's statlock and PHY lock held.
 * On success, updates xp->dev_state to the new state reported by the chip.
 * Note: both locks are dropped and reacquired across the wait for command
 * completion.
 */
static int
pmcs_set_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp,
    uint8_t ds)
{
	uint32_t htag, *ptr, msg[PMCS_MSG_SIZE];
	int result;
	uint8_t pds, nds;
	struct pmcwork *pwrk;

	pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
	    "%s: ds: 0x%x tgt: 0x%p phy: 0x%p", __func__, ds, (void *)xp,
	    (void *)phyp);

	if (phyp == NULL) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, xp,
		    "%s: PHY is NULL", __func__);
		return (-1);
	}

	pwrk = pmcs_gwork(pwp, PMCS_TAG_TYPE_WAIT, phyp);
	if (pwrk == NULL) {
		pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__);
		return (-1);
	}
	if (phyp->valid_device_id == 0) {
		pmcs_pwork(pwp, pwrk);
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: Invalid DeviceID", __func__);
		return (-1);
	}
	/* Reply IOMB will be copied into msg via pwrk->arg */
	pwrk->arg = msg;
	pwrk->dtype = phyp->dtype;
	/* Build the SET_DEVICE_STATE IOMB */
	htag = pwrk->htag;
	msg[0] = LE_32(PMCS_HIPRI(pwp, PMCS_OQ_GENERAL,
	    PMCIN_SET_DEVICE_STATE));
	msg[1] = LE_32(pwrk->htag);
	msg[2] = LE_32(phyp->device_id);
	msg[3] = LE_32(ds);
	CLEAN_MESSAGE(msg, 4);

	mutex_enter(&pwp->iqp_lock[PMCS_IQ_OTHER]);
	ptr = GET_IQ_ENTRY(pwp, PMCS_IQ_OTHER);
	if (ptr == NULL) {
		mutex_exit(&pwp->iqp_lock[PMCS_IQ_OTHER]);
		pmcs_pwork(pwp, pwrk);
		pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__);
		return (-1);
	}
	COPY_MESSAGE(ptr, msg, PMCS_MSG_SIZE);
	pwrk->state = PMCS_WORK_STATE_ONCHIP;
	INC_IQ_ENTRY(pwp, PMCS_IQ_OTHER);

	/* Drop locks across the wait; reacquired below */
	if (xp != NULL) {
		mutex_exit(&xp->statlock);
	}
	pmcs_unlock_phy(phyp);
	WAIT_FOR(pwrk, 1000, result);
	pmcs_pwork(pwp, pwrk);
	pmcs_lock_phy(phyp);
	if (xp != NULL) {
		mutex_enter(&xp->statlock);
	}

	if (result) {
		pmcs_timed_out(pwp, htag, __func__);
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: cmd timed out, returning", __func__);
		return (-1);
	}
	if (LE_32(msg[2]) == 0) {
		/* Reply word 4: previous state in bits 4-7, new in bits 0-3 */
		pds = (uint8_t)(LE_32(msg[4]) >> 4);
		nds = (uint8_t)(LE_32(msg[4]) & 0x0000000f);
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: previous_ds=0x%x, new_ds=0x%x", __func__, pds, nds);
		if (xp != NULL) {
			xp->dev_state = nds;
		}
		return (0);
	} else {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: cmd failed Status(0x%x), returning ", __func__,
		    LE_32(msg[2]));
		return (-1);
	}
}

/*
 * The device is (back) in the OPERATIONAL state: clear the target's
 * recover_wait flag, zero the PHY's retry counter, and track how often
 * recoveries succeed.  If recoveries keep succeeding in rapid succession
 * (tracked via ds_prev_good_recoveries/last_good_recovery against
 * PMCS_MAX_DS_RECOVERY_TIME), hand off to the error path, which may
 * declare the PHY dead.  Finally, kick the work queues unless the PHY
 * was just declared dead.  Called with PHY lock held and, if tgt is
 * non-NULL, its statlock held.
 */
static void
pmcs_ds_operational(pmcs_phy_t *pptr, pmcs_xscsi_t *tgt)
{
	pmcs_hw_t *pwp;

	ASSERT(pptr);
	pwp = pptr->pwp;

	if (tgt != NULL) {
		tgt->recover_wait = 0;
	}
	pptr->ds_recovery_retries = 0;

	if ((pptr->ds_prev_good_recoveries == 0) ||
	    (ddi_get_lbolt() - pptr->last_good_recovery >
	    drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME))) {
		/* First recovery, or last one was long ago: restart window */
		pptr->last_good_recovery = ddi_get_lbolt();
		pptr->ds_prev_good_recoveries = 1;
	} else if (ddi_get_lbolt() < pptr->last_good_recovery +
	    drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME)) {
		/* Still inside the window: count it */
		pptr->ds_prev_good_recoveries++;
	} else {
		pmcs_handle_ds_recovery_error(pptr, tgt, pwp, __func__,
		    "Max recovery attempts reached. Declaring PHY dead");
	}

	/* Don't bother to run the work queues if the PHY is dead */
	if (!pptr->dead) {
		SCHEDULE_WORK(pwp, PMCS_WORK_RUN_QUEUES);
		(void) ddi_taskq_dispatch(pwp->tq, pmcs_worker,
		    pwp, DDI_NOSLEEP);
	}
}

/*
 * Device state recovery.  Walks the PHY tree depth-first (recursing into
 * children before handling each PHY), starting from phyp, or from the root
 * PHYs when phyp is NULL.  For each PHY whose target has recover_wait set,
 * performs a four-step recovery: (1) read the device state and, if needed,
 * put the device IN_RECOVERY, (2) hard-reset the PHY, (3) abort outstanding
 * I/O, (4) set the device back to OPERATIONAL.  Top-level (phyp == NULL)
 * invocations serialize on pwp->ds_err_recovering; a concurrent invocation
 * just reschedules itself and returns.
 */
void
pmcs_dev_state_recovery(pmcs_hw_t *pwp, pmcs_phy_t *phyp)
{
	boolean_t reschedule = B_FALSE;
	uint8_t ds, tgt_dev_state;
	int rc;
	pmcs_xscsi_t *tgt;
	pmcs_phy_t *pptr, *pnext, *pchild;

	/*
	 * First time, check to see if we're already performing recovery
	 */
	if (phyp == NULL) {
		mutex_enter(&pwp->lock);
		if (pwp->ds_err_recovering) {
			mutex_exit(&pwp->lock);
			SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY);
			return;
		}

		pwp->ds_err_recovering = 1;
		pptr = pwp->root_phys;
		mutex_exit(&pwp->lock);
	} else {
		pptr = phyp;
	}

	while (pptr) {
		/*
		 * Since ds_err_recovering is set, we can be assured these
		 * PHYs won't disappear on us while we do this.
		 */
		pmcs_lock_phy(pptr);
		pchild = pptr->children;
		pnext = pptr->sibling;
		pmcs_unlock_phy(pptr);

		/* Depth-first: recover children before this PHY */
		if (pchild) {
			pmcs_dev_state_recovery(pwp, pchild);
		}

		tgt = NULL;
		pmcs_lock_phy(pptr);

		if (pptr->dead || !pptr->valid_device_id) {
			goto next_phy;
		}

		if (pptr->iport && (pptr->iport->ua_state != UA_ACTIVE)) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, pptr->target,
			    "%s: No DS recovery on PHY %s, iport not active",
			    __func__, pptr->path);
			goto next_phy;
		}

		tgt = pptr->target;

		if (tgt != NULL) {
			mutex_enter(&tgt->statlock);
			if (tgt->recover_wait == 0) {
				goto next_phy;
			}
			tgt_dev_state = tgt->dev_state;
		} else {
			tgt_dev_state = PMCS_DEVICE_STATE_NOT_AVAILABLE;
		}

		/*
		 * Rate-limit: skip PHYs recovered within the last
		 * PMCS_DS_RECOVERY_INTERVAL, rescheduling ourselves if a
		 * target is still waiting on recovery.
		 */
		if (pptr->prev_recovery) {
			if (ddi_get_lbolt() - pptr->prev_recovery <
			    drv_usectohz(PMCS_DS_RECOVERY_INTERVAL)) {
				pmcs_prt(pwp, PMCS_PRT_DEBUG2, pptr, tgt,
				    "%s: DS recovery on PHY %s "
				    "re-invoked too soon. Skipping...",
				    __func__, pptr->path);
				if ((tgt) && (tgt->recover_wait)) {
					reschedule = B_TRUE;
				}
				goto next_phy;
			}
		}
		pptr->prev_recovery = ddi_get_lbolt();

		/*
		 * Step 1: Put the device into the IN_RECOVERY state
		 */
		rc = pmcs_get_dev_state(pwp, pptr, tgt, &ds);
		if (rc != 0) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
			    "%s: pmcs_get_dev_state on PHY %s "
			    "failed (rc=%d)",
			    __func__, pptr->path, rc);

			pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
			    __func__, "pmcs_get_dev_state");

			goto next_phy;
		}

		/* If the chip says it's operational, we're done */
		if (ds == PMCS_DEVICE_STATE_OPERATIONAL) {
			pmcs_ds_operational(pptr, tgt);
			goto next_phy;
		}

		if ((tgt_dev_state == ds) &&
		    (ds == PMCS_DEVICE_STATE_IN_RECOVERY)) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
			    "%s: Target 0x%p already IN_RECOVERY", __func__,
			    (void *)tgt);
		} else {
			if (tgt != NULL) {
				tgt->dev_state = ds;
			}
			tgt_dev_state = ds;
			ds = PMCS_DEVICE_STATE_IN_RECOVERY;
			rc = pmcs_send_err_recovery_cmd(pwp, ds, pptr, tgt);
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
			    "%s: pmcs_send_err_recovery_cmd "
			    "result(%d) tgt(0x%p) ds(0x%x) tgt->ds(0x%x)",
			    __func__, rc, (void *)tgt, ds, tgt_dev_state);

			if (rc) {
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
				    "%s: pmcs_send_err_recovery_cmd to PHY %s "
				    "failed (rc=%d)",
				    __func__, pptr->path, rc);

				pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
				    __func__, "pmcs_send_err_recovery_cmd");

				goto next_phy;
			}
		}

		/*
		 * Step 2: Perform a hard reset on the PHY.
		 */
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
		    "%s: Issue HARD_RESET to PHY %s", __func__,
		    pptr->path);
		/*
		 * Must release statlock here because pmcs_reset_phy
		 * will drop and reacquire the PHY lock.
		 */
		if (tgt != NULL) {
			mutex_exit(&tgt->statlock);
		}
		rc = pmcs_reset_phy(pwp, pptr, PMCS_PHYOP_HARD_RESET);
		if (tgt != NULL) {
			mutex_enter(&tgt->statlock);
		}
		if (rc) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
			    "%s: HARD_RESET to PHY %s failed (rc=%d)",
			    __func__, pptr->path, rc);

			pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
			    __func__, "HARD_RESET");

			goto next_phy;
		}

		/*
		 * Step 3: Abort all I/Os to the device
		 */
		if (pptr->abort_all_start) {
			/* An ABORT_ALL is already in flight; wait it out */
			while (pptr->abort_all_start) {
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
				    "%s: Waiting for outstanding ABORT_ALL on "
				    "PHY 0x%p", __func__, (void *)pptr);
				cv_wait(&pptr->abort_all_cv, &pptr->phy_lock);
			}
		} else {
			if (tgt != NULL) {
				mutex_exit(&tgt->statlock);
			}
			rc = pmcs_abort(pwp, pptr, pptr->device_id, 1, 1);
			if (tgt != NULL) {
				mutex_enter(&tgt->statlock);
			}
			if (rc != 0) {
				pptr->abort_pending = 1;
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
				    "%s: pmcs_abort to PHY %s failed (rc=%d)",
				    __func__, pptr->path, rc);

				pmcs_handle_ds_recovery_error(pptr, tgt,
				    pwp, __func__, "pmcs_abort");

				goto next_phy;
			}
		}

		/*
		 * Step 4: Set the device back to OPERATIONAL state
		 */
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
		    "%s: Set PHY/tgt 0x%p/0x%p to OPERATIONAL state",
		    __func__, (void *)pptr, (void *)tgt);
		rc = pmcs_set_dev_state(pwp, pptr, tgt,
		    PMCS_DEVICE_STATE_OPERATIONAL);
		if (rc == 0) {
			pmcs_ds_operational(pptr, tgt);
		} else {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
			    "%s: Failed to SET tgt 0x%p to OPERATIONAL state",
			    __func__, (void *)tgt);

			pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
			    __func__, "SET tgt to OPERATIONAL state");

			goto next_phy;
		}

next_phy:
		if (tgt) {
			mutex_exit(&tgt->statlock);
		}
		pmcs_unlock_phy(pptr);
		pptr = pnext;
	}

	/*
	 * Only clear ds_err_recovering if we're exiting for good and not
	 * just unwinding from recursion
	 */
	if (phyp == NULL) {
		mutex_enter(&pwp->lock);
		pwp->ds_err_recovering = 0;
		mutex_exit(&pwp->lock);
	}

	if (reschedule) {
		SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY);
	}
}

/*
 * Send a device state change request for the given dev_state
 * (IN_RECOVERY, OPERATIONAL, or NON_OPERATIONAL).  Returns 0 on success
 * (including the benign "already in requested state" cases), -1 on failure.
 * Reentry is guarded by tgt->recovering.
 * Called with target's statlock held (if target is non-NULL) and PHY lock
 * held.
 */
int
pmcs_send_err_recovery_cmd(pmcs_hw_t *pwp, uint8_t dev_state, pmcs_phy_t *phyp,
    pmcs_xscsi_t *tgt)
{
	int rc = -1;
	uint8_t tgt_dev_state = PMCS_DEVICE_STATE_NOT_AVAILABLE;

	if (tgt != NULL) {
		ASSERT(mutex_owned(&tgt->statlock));
		if (tgt->recovering) {
			return (0);
		}

		tgt->recovering = 1;
		tgt_dev_state = tgt->dev_state;
	}

	if (phyp == NULL) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, tgt,
		    "%s: PHY is NULL", __func__);
		/*
		 * NOTE(review): this early return leaves tgt->recovering set
		 * (it is only cleared at no_action below) -- confirm callers
		 * never pass a non-NULL tgt with a NULL phyp.
		 */
		return (-1);
	}

	ASSERT(mutex_owned(&phyp->phy_lock));

	pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
	    "%s: ds: 0x%x, tgt ds(0x%x)", __func__, dev_state, tgt_dev_state);

	switch (dev_state) {
	case PMCS_DEVICE_STATE_IN_RECOVERY:
		if (tgt_dev_state == PMCS_DEVICE_STATE_IN_RECOVERY) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
			    "%s: Target 0x%p already IN_RECOVERY", __func__,
			    (void *)tgt);
			rc = 0;	/* This is not an error */
			goto no_action;
		}

		rc = pmcs_set_dev_state(pwp, phyp, tgt,
		    PMCS_DEVICE_STATE_IN_RECOVERY);
		if (rc != 0) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
			    "%s(1): Failed to set tgt(0x%p) to IN_RECOVERY",
			    __func__, (void *)tgt);
		}

		break;

	case PMCS_DEVICE_STATE_OPERATIONAL:
		/* OPERATIONAL is only reachable from IN_RECOVERY */
		if (tgt_dev_state != PMCS_DEVICE_STATE_IN_RECOVERY) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
			    "%s: Target 0x%p not ready to go OPERATIONAL",
			    __func__, (void *)tgt);
			goto no_action;
		}

		rc = pmcs_set_dev_state(pwp, phyp, tgt,
		    PMCS_DEVICE_STATE_OPERATIONAL);
		if (tgt != NULL) {
			tgt->reset_success = 1;
		}
		if (rc != 0) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
			    "%s(2): Failed to SET tgt(0x%p) to OPERATIONAL",
			    __func__, (void *)tgt);
			if (tgt != NULL) {
				tgt->reset_success = 0;
			}
		}

		break;

	case PMCS_DEVICE_STATE_NON_OPERATIONAL:
		/* No IOMB is sent; just mark and rediscover */
		PHY_CHANGED(pwp, phyp);
		RESTART_DISCOVERY(pwp);
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
		    "%s: Device at %s is non-operational",
		    __func__, phyp->path);
		if (tgt != NULL) {
			tgt->dev_state = PMCS_DEVICE_STATE_NON_OPERATIONAL;
		}
		rc = 0;

		break;

	default:
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
		    "%s: Invalid state requested (%d)", __func__,
		    dev_state);
		break;

	}

no_action:
	if (tgt != NULL) {
		tgt->recovering = 0;
	}
	return (rc);
}

/*
 * Start ssp event recovery. We have to schedule recovery operation because
 * it involves sending multiple commands to device and we should not do it
 * in the interrupt context.
 * If it is failure of a recovery command, let the recovery thread deal with it.
 * Called with the work lock held.
 */
void
pmcs_start_ssp_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk, uint32_t *iomb,
    size_t amt)
{
	pmcs_xscsi_t *tgt = pwrk->xp;
	uint32_t event = LE_32(iomb[2]);
	pmcs_phy_t *pptr = pwrk->phy;
	pmcs_cb_t callback;
	uint32_t tag;

	/* If the target is no longer assigned, drop our PHY reference */
	if (tgt != NULL) {
		mutex_enter(&tgt->statlock);
		if (!tgt->assigned) {
			if (pptr) {
				pmcs_dec_phy_ref_count(pptr);
			}
			pptr = NULL;
			pwrk->phy = NULL;
		}
		mutex_exit(&tgt->statlock);
	}

	if (pptr == NULL) {
		/*
		 * No target, need to run RE-DISCOVERY here.
		 */
		if (pwrk->state != PMCS_WORK_STATE_TIMED_OUT) {
			pwrk->state = PMCS_WORK_STATE_INTR;
		}
		/*
		 * Although we cannot mark phy to force abort nor mark phy
		 * as changed, killing of a target would take care of aborting
		 * commands for the device.
		 */
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
		    "%s: No valid target for event processing. Reconfigure.",
		    __func__);
		pmcs_pwork(pwp, pwrk);
		RESTART_DISCOVERY(pwp);
		return;
	} else {
		/*
		 * We have a phy pointer, we'll need to lock it.  The work
		 * lock must be dropped to respect lock ordering with the
		 * PHY lock, then retaken.
		 */
		mutex_exit(&pwrk->lock);
		pmcs_lock_phy(pptr);
		mutex_enter(&pwrk->lock);
		if (tgt != NULL) {
			mutex_enter(&tgt->statlock);
		}
		if (event == PMCOUT_STATUS_OPEN_CNX_ERROR_IT_NEXUS_LOSS) {
			/* I_T nexus loss: device is gone, schedule aborts */
			if ((tgt != NULL) && (tgt->dev_state !=
			    PMCS_DEVICE_STATE_NON_OPERATIONAL)) {
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
				    "%s: Device at %s is non-operational",
				    __func__, pptr->path);
				tgt->dev_state =
				    PMCS_DEVICE_STATE_NON_OPERATIONAL;
			}
			pptr->abort_pending = 1;
			if (tgt != NULL) {
				mutex_exit(&tgt->statlock);
			}
			mutex_exit(&pwrk->lock);
			pmcs_unlock_phy(pptr);
			SCHEDULE_WORK(pwp, PMCS_WORK_ABORT_HANDLE);
			RESTART_DISCOVERY(pwp);
			return;
		}

		/*
		 * If this command is run in WAIT mode, it is a failing recovery
		 * command. If so, just wake up recovery thread waiting for
		 * command completion.
		 */
		tag = PMCS_TAG_TYPE(pwrk->htag);
		if (tag == PMCS_TAG_TYPE_WAIT) {
			pwrk->htag |= PMCS_TAG_DONE;
			if (pwrk->arg && amt) {
				(void) memcpy(pwrk->arg, iomb, amt);
			}
			cv_signal(&pwrk->sleep_cv);
			if (tgt != NULL) {
				mutex_exit(&tgt->statlock);
			}
			mutex_exit(&pwrk->lock);
			pmcs_unlock_phy(pptr);
			return;
		}

		if (tgt == NULL) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG1, pptr, NULL,
			    "%s: Not scheduling SSP event recovery for NULL tgt"
			    " pwrk(%p) tag(0x%x)", __func__, (void *)pwrk,
			    pwrk->htag);
			mutex_exit(&pwrk->lock);
			pmcs_unlock_phy(pptr);
			return;
		}

		/*
		 * If the SSP event was an OPEN_RETRY_TIMEOUT, we don't want
		 * to go through the recovery (abort/LU reset) process.
		 * Simply complete the command and return it as STATUS_BUSY.
		 * This will cause the target driver to simply retry.
		 */
		if (event == PMCOUT_STATUS_IO_XFER_OPEN_RETRY_TIMEOUT) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
			    "%s: Got OPEN_RETRY_TIMEOUT event (htag 0x%08x)",
			    __func__, pwrk->htag);

			mutex_exit(&tgt->statlock);
			/* Note: work remains locked for the callback */
			pmcs_unlock_phy(pptr);
			pwrk->ssp_event = event;
			callback = (pmcs_cb_t)pwrk->ptr;
			(*callback)(pwp, pwrk, iomb);
			return;
		}

		/*
		 * To recover from primary failures,
		 * we need to schedule handling events recovery.
		 */
		tgt->event_recovery = 1;
		mutex_exit(&tgt->statlock);
		pwrk->ssp_event = event;
		mutex_exit(&pwrk->lock);
		pmcs_unlock_phy(pptr);
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
		    "%s: Scheduling SSP event recovery for tgt(0x%p) "
		    "pwrk(%p) tag(0x%x)", __func__, (void *)tgt, (void *)pwrk,
		    pwrk->htag);
		SCHEDULE_WORK(pwp, PMCS_WORK_SSP_EVT_RECOVERY);
	}

	/* Work cannot be completed until event recovery is completed. */
}

/*
 * SSP target event recovery
 * Entered with a phy lock held
 * Pwrk lock is not needed - pwrk is on the target aq and no other thread
 * will do anything with it until this thread starts the chain of recovery.
 * Statlock may be acquired and released.
 */
void
pmcs_tgt_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk)
{
	pmcs_phy_t *pptr = pwrk->phy;
	pmcs_cmd_t *sp = pwrk->arg;
	pmcs_lun_t *lun = sp->cmd_lun;
	pmcs_xscsi_t *tgt = pwrk->xp;
	uint32_t event;
	uint32_t htag;
	uint32_t status;
	uint8_t dstate;
	int rv;

	ASSERT(pwrk->arg != NULL);
	ASSERT(pwrk->xp != NULL);
	pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
	    "%s: event recovery for target 0x%p", __func__, (void *)pwrk->xp);
	htag = pwrk->htag;
	event = pwrk->ssp_event;
	pwrk->ssp_event = 0xffffffff;	/* consume the event */

	if (event == PMCOUT_STATUS_XFER_ERR_BREAK ||
	    event == PMCOUT_STATUS_XFER_ERR_PHY_NOT_READY ||
	    event == PMCOUT_STATUS_XFER_ERROR_CMD_ISSUE_ACK_NAK_TIMEOUT) {
		/* Command may be still pending on device */
		rv = pmcs_ssp_tmf(pwp, pptr, SAS_QUERY_TASK, htag,
		    lun->lun_num, &status);
		if (rv != 0) {
			goto out;
		}
		if (status == SAS_RSP_TMF_COMPLETE) {
			/* Command NOT pending on a device */
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
			    "%s: No pending command for tgt 0x%p",
			    __func__, (void *)tgt);
			/* Nothing more to do, just abort it on chip */
			htag = 0;
		}
	}
	/*
	 * All other events left the command pending in the host
	 * Send abort task and abort it on the chip
	 */
	if (htag != 0) {
		if (pmcs_ssp_tmf(pwp, pptr, SAS_ABORT_TASK, htag,
		    lun->lun_num, &status))
			goto out;
	}
	(void) pmcs_abort(pwp, pptr, pwrk->htag, 0, 1);
	/*
	 * Abort either took care of work completion, or put device in
	 * a recovery state
	 */
	return;
out:
	/* Abort failed, do full device recovery */
	ASSERT(tgt != NULL);
	mutex_enter(&tgt->statlock);
	if (!pmcs_get_dev_state(pwp, pptr, tgt, &dstate))
		tgt->dev_state = dstate;

	if ((tgt->dev_state != PMCS_DEVICE_STATE_IN_RECOVERY) &&
	    (tgt->dev_state != PMCS_DEVICE_STATE_NON_OPERATIONAL)) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
		    "%s: Setting IN_RECOVERY for tgt 0x%p",
		    __func__, (void *)tgt);
		(void) pmcs_send_err_recovery_cmd(pwp,
		    PMCS_DEVICE_STATE_IN_RECOVERY, pptr, tgt);
	}
	mutex_exit(&tgt->statlock);
}

/*
 * SSP event recovery task.  Scans all targets looking for those with
 * event_recovery set, then walks each such target's active queue for
 * commands whose work structures carry a pending SSP event, handing them
 * to pmcs_tgt_event_recovery().  The scan restarts from the top whenever
 * statlock has to be dropped mid-queue.
 */
void
pmcs_ssp_event_recovery(pmcs_hw_t *pwp)
{
	int idx;
	pmcs_xscsi_t *tgt;
	pmcs_cmd_t *cp;
	pmcwork_t *pwrk;
	pmcs_phy_t *pphy;
	int er_flag;
	uint32_t idxpwrk;

restart:
	for (idx = 0; idx < pwp->max_dev; idx++) {
		mutex_enter(&pwp->lock);
		tgt = pwp->targets[idx];
		mutex_exit(&pwp->lock);
		if (tgt == NULL) {
			continue;
		}

		mutex_enter(&tgt->statlock);
		if (!tgt->assigned) {
			mutex_exit(&tgt->statlock);
			continue;
		}
		pphy = tgt->phy;
		er_flag = tgt->event_recovery;
		mutex_exit(&tgt->statlock);

		if ((pphy == NULL) || (er_flag == 0)) {
			continue;
		}

		/* PHY lock before statlock, per lock ordering */
		pmcs_lock_phy(pphy);
		mutex_enter(&tgt->statlock);
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt,
		    "%s: found target(0x%p)", __func__, (void *) tgt);

		/* Check what cmd expects recovery */
		mutex_enter(&tgt->aqlock);
		STAILQ_FOREACH(cp, &tgt->aq, cmd_next) {
			/*
			 * Since work structure is on this target aq, and only
			 * this thread is accessing it now, we do not need
			 * to lock it
			 */
			idxpwrk = PMCS_TAG_INDEX(cp->cmd_tag);
			pwrk = &pwp->work[idxpwrk];
			if (pwrk->htag != cp->cmd_tag) {
				/*
				 * aq may contain TMF commands, so we
				 * may not find work structure with htag
				 */
				break;
			}
			if ((pwrk->ssp_event != 0) &&
			    (pwrk->ssp_event != PMCS_REC_EVENT)) {
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt,
				    "%s: pwrk(%p) htag(0x%x)",
				    __func__, (void *) pwrk, cp->cmd_tag);
				mutex_exit(&tgt->aqlock);
				mutex_exit(&tgt->statlock);
				pmcs_tgt_event_recovery(pwp, pwrk);
				/*
				 * We dropped statlock, so restart the scan
				 */
				pmcs_unlock_phy(pphy);
				goto restart;
			}
		}
		mutex_exit(&tgt->aqlock);
		tgt->event_recovery = 0;
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt,
		    "%s: end of SSP event recovery for target(0x%p)",
		    __func__, (void *) tgt);
		mutex_exit(&tgt->statlock);
		pmcs_unlock_phy(pphy);
	}
	pmcs_prt(pwp, PMCS_PRT_DEBUG, NULL, NULL,
	    "%s: end of SSP event recovery for pwp(0x%p)", __func__,
	    (void *) pwp);
}

/*
 * Flag the target for device state recovery and kick the worker thread
 * so recovery starts immediately rather than on the next watchdog tick.
 * Idempotent: does nothing if recover_wait is already set.
 * Called with the target's statlock held.
 */
void
pmcs_start_dev_state_recovery(pmcs_xscsi_t *xp, pmcs_phy_t *phyp)
{
	ASSERT(mutex_owned(&xp->statlock));
	ASSERT(xp->pwp != NULL);

	if (xp->recover_wait == 0) {
		pmcs_prt(xp->pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: Start ds_recovery for tgt 0x%p/PHY 0x%p (%s)",
		    __func__, (void *)xp, (void *)phyp, phyp->path);
		xp->recover_wait = 1;

		/*
		 * Rather than waiting for the watchdog timer, we'll
		 * kick it right now.
		 */
		SCHEDULE_WORK(xp->pwp, PMCS_WORK_DS_ERR_RECOVERY);
		(void) ddi_taskq_dispatch(xp->pwp->tq, pmcs_worker, xp->pwp,
		    DDI_NOSLEEP);
	}
}

/*
 * Increment the phy ds error retry count.
 * If too many retries, mark phy dead and restart discovery;
 * otherwise schedule ds recovery.
932 */ 933 static void 934 pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp, pmcs_xscsi_t *tgt, 935 pmcs_hw_t *pwp, const char *func_name, char *reason_string) 936 { 937 ASSERT(mutex_owned(&phyp->phy_lock)); 938 ASSERT((tgt == NULL) || mutex_owned(&tgt->statlock)); 939 940 phyp->ds_recovery_retries++; 941 942 if (phyp->ds_recovery_retries > PMCS_MAX_DS_RECOVERY_RETRIES) { 943 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt, 944 "%s: retry limit reached after %s to PHY %s failed", 945 func_name, reason_string, phyp->path); 946 if (tgt != NULL) { 947 tgt->recover_wait = 0; 948 } 949 /* 950 * Mark the PHY as dead and it and its parent as changed, 951 * then restart discovery 952 */ 953 phyp->dead = 1; 954 PHY_CHANGED(pwp, phyp); 955 if (phyp->parent) 956 PHY_CHANGED(pwp, phyp->parent); 957 RESTART_DISCOVERY(pwp); 958 } else if ((phyp->ds_prev_good_recoveries > 959 PMCS_MAX_DS_RECOVERY_RETRIES) && 960 (phyp->last_good_recovery + drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME) 961 < ddi_get_lbolt())) { 962 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt, "%s: max number of " 963 "successful recoveries reached, declaring PHY %s dead", 964 __func__, phyp->path); 965 if (tgt != NULL) { 966 tgt->recover_wait = 0; 967 } 968 /* 969 * Mark the PHY as dead and its parent as changed, 970 * then restart discovery 971 */ 972 phyp->dead = 1; 973 PHY_CHANGED(pwp, phyp); 974 if (phyp->parent) 975 PHY_CHANGED(pwp, phyp->parent); 976 RESTART_DISCOVERY(pwp); 977 } else { 978 SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY); 979 } 980 } 981